diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 1e6c2d0aa2..667ff7f92e 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -32,3 +32,4 @@ config-variables:
   - NEON_DEV_AWS_ACCOUNT_ID
   - NEON_PROD_AWS_ACCOUNT_ID
   - AWS_ECR_REGION
+  - BENCHMARK_LARGE_OLTP_PROJECTID
diff --git a/.github/actions/neon-branch-create/action.yml b/.github/actions/neon-branch-create/action.yml
index 9f752d5a89..71dd6f3af2 100644
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -84,7 +84,13 @@ runs:
           --header "Authorization: Bearer ${API_KEY}"
           )
 
-        role_name=$(echo $roles | jq --raw-output '.roles[] | select(.protected == false) | .name')
+        role_name=$(echo "$roles" | jq --raw-output '
+          (.roles | map(select(.protected == false))) as $roles |
+          if any($roles[]; .name == "neondb_owner")
+          then "neondb_owner"
+          else $roles[0].name
+          end
+        ')
         echo "role_name=${role_name}" >> $GITHUB_OUTPUT
       env:
         API_HOST: ${{ inputs.api_host }}
@@ -107,13 +113,13 @@ runs:
             )
 
           if [ -z "${reset_password}" ]; then
-            sleep 1
+            sleep $i
             continue
           fi
 
           password=$(echo $reset_password | jq --raw-output '.role.password')
           if [ "${password}" == "null" ]; then
-            sleep 1
+            sleep $i # increasing backoff
             continue
           fi
 
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 122fe48b68..fa6f882161 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -44,6 +44,11 @@ inputs:
     description: 'Postgres version to use for tests'
     required: false
     default: 'v16'
+  sanitizers:
+    description: 'enabled or disabled'
+    required: false
+    default: 'disabled'
+    type: string
   benchmark_durations:
     description: 'benchmark durations JSON'
     required: false
@@ -59,7 +64,7 @@ runs:
       if: inputs.build_type != 'remote'
       uses: ./.github/actions/download
       with:
-        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}-artifact
+        name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build_type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
         path: /tmp/neon
         aws-oicd-role-arn: ${{ inputs.aws-oicd-role-arn }}
 
@@ -112,6 +117,7 @@ runs:
         ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
         RERUN_FAILED: ${{ inputs.rerun_failed }}
         PG_VERSION: ${{ inputs.pg_version }}
+        SANITIZERS: ${{ inputs.sanitizers }}
       shell: bash -euxo pipefail {0}
       run: |
         # PLATFORM will be embedded in the perf test report
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index 30fde127b0..6a2070424a 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -280,7 +280,7 @@ jobs:
       - name: Upload Neon artifact
         uses: ./.github/actions/upload
         with:
-          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-artifact
+          name: neon-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}${{ inputs.sanitizers == 'enabled' && '-sanitized' || '' }}-artifact
           path: /tmp/neon
           aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
 
@@ -347,6 +347,7 @@ jobs:
           real_s3_region: eu-central-1
           rerun_failed: true
           pg_version: ${{ matrix.pg_version }}
+          sanitizers: ${{ inputs.sanitizers }}
           aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
           # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
           # Attempt to stop tests gracefully to generate test reports
@@ -359,7 +360,6 @@ jobs:
           PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
           PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
           USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
-          SANITIZERS: ${{ inputs.sanitizers }}
 
       # Temporary disable this step until we figure out why it's so flaky
       # Ref https://github.com/neondatabase/neon/issues/4540
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index ffb6c65af9..ff7db02e42 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -141,6 +141,8 @@ jobs:
           --ignore test_runner/performance/test_physical_replication.py
           --ignore test_runner/performance/test_perf_ingest_using_pgcopydb.py
           --ignore test_runner/performance/test_cumulative_statistics_persistence.py
+          --ignore test_runner/performance/test_perf_many_relations.py
+          --ignore test_runner/performance/test_perf_oltp_large_tenant.py
       env:
         BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 639c258c5c..66758ca49f 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -692,15 +692,15 @@ jobs:
                                              neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-x64 \
                                              neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.version.debian }}-arm64
 
-  vm-compute-node-image:
+  vm-compute-node-image-arch:
     needs: [ check-permissions, meta, compute-node-image ]
     if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
-    runs-on: [ self-hosted, large ]
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'large-arm64' || 'large')) }}
     strategy:
       fail-fast: false
       matrix:
+        arch: [ amd64, arm64 ]
         version:
-          # see the comment for `compute-node-image-arch` job
           - pg: v14
             debian: bullseye
           - pg: v15
@@ -717,7 +717,7 @@ jobs:
 
       - name: Downloading vm-builder
         run: |
-          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
+          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-${{ matrix.arch }} -o vm-builder
           chmod +x vm-builder
 
       - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -738,12 +738,37 @@ jobs:
             -size=2G \
             -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
             -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
-            -target-arch=linux/amd64
+            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }} \
+            -target-arch=linux/${{ matrix.arch }}
 
       - name: Pushing vm-compute-node image
         run: |
-          docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}
+          docker push neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-${{ matrix.arch }}
+
+  vm-compute-node-image:
+    needs: [ vm-compute-node-image-arch, meta ]
+    if: ${{ contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    runs-on: ubuntu-22.04
+    strategy:
+      matrix:
+        version:
+          # see the comment for `compute-node-image-arch` job
+          - pg: v14
+          - pg: v15
+          - pg: v16
+          - pg: v17
+    steps:
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+
+      - name: Create multi-arch vm-compute-node image # joins the -amd64 and -arm64 tags under the plain build tag
+        run: |
+          docker buildx imagetools create -t neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }} \
+                                             neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-amd64 \
+                                             neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.meta.outputs.build-tag }}-arm64
+
 
   test-images:
     needs: [ check-permissions, meta, neon-image, compute-node-image ]
@@ -831,7 +856,7 @@ jobs:
               || needs.meta.outputs.run-kind == 'pr' && needs.meta.outputs.build-tag
               || needs.meta.outputs.run-kind == 'compute-rc-pr' && needs.meta.outputs.previous-storage-release
             }}
-          TEST_EXTENSIONS_TAG: latest
+          TEST_EXTENSIONS_TAG: ${{ needs.meta.outputs.previous-compute-release }}
           NEW_COMPUTE_TAG: ${{ needs.meta.outputs.build-tag }}
           OLD_COMPUTE_TAG: ${{ needs.meta.outputs.previous-compute-release }}
         run: ./docker-compose/test_extensions_upgrade.sh
@@ -1036,7 +1061,7 @@ jobs:
           exit 1
 
   deploy:
-    needs: [ check-permissions, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
+    needs: [ check-permissions, push-neon-image-dev, push-compute-image-dev, push-neon-image-prod, push-compute-image-prod, meta, build-and-test-locally, trigger-custom-extensions-build-and-wait ]
     # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-neon-image-prod` and `push-compute-image-prod`
     if: ${{ contains(fromJSON('["push-main", "storage-release", "proxy-release", "compute-release"]'), needs.meta.outputs.run-kind) && !failure() && !cancelled() }}
     permissions:
diff --git a/.github/workflows/force-test-extensions-upgrade.yml b/.github/workflows/force-test-extensions-upgrade.yml
index 71c5158ef6..f2376306dc 100644
--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -52,8 +52,9 @@ jobs:
       - name: Test extension upgrade
         timeout-minutes: 20
         env:
-          NEWTAG: latest
-          OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
+          NEW_COMPUTE_TAG: latest
+          OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
+          TEST_EXTENSIONS_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
           PG_VERSION: ${{ matrix.pg-version }}
           FORCE_ALL_UPGRADE_TESTS: true
         run: ./docker-compose/test_extensions_upgrade.sh
diff --git a/.github/workflows/large_oltp_benchmark.yml b/.github/workflows/large_oltp_benchmark.yml
new file mode 100644
index 0000000000..f33e11cd08
--- /dev/null
+++ b/.github/workflows/large_oltp_benchmark.yml
@@ -0,0 +1,147 @@
+name: large oltp benchmark
+
+on:
+  # uncomment to run on push for debugging your PR
+  # push:
+  #   branches: [ bodobolero/synthetic_oltp_workload ]
+
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │  ┌───────────── day of the month (1 - 31)
+    #          │ │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:   '0 15 * * *' # run once a day, timezone is utc, avoid conflict with other benchmarks
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow globally because we need dedicated resources which only exist once
+  group: large-oltp-bench-workflow # constant name, so runs from every ref and trigger share one group
+  cancel-in-progress: true # a newly started run cancels the run already in progress in this group
+
+jobs:
+  oltp:
+    strategy:
+      fail-fast: false # allow other variants to continue even if one fails
+      matrix:
+        include:
+          - target: new_branch # benchmark a branch created by the neon-branch-create step of this run
+            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
+          - target: reuse_branch # benchmark the existing branch addressed by the BENCHMARK_LARGE_OLTP_REUSE_CONNSTR secret
+            custom_scripts: insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4
+      max-parallel: 1 # run the targets one at a time so that their results are comparable
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      TEST_PG_BENCH_DURATIONS_MATRIX: "1h" # TODO: increase to more than 1h
+      TEST_PGBENCH_CUSTOM_SCRIPTS: ${{ matrix.custom_scripts }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      PG_VERSION: 16 # fixed by the pre-existing benchmark project
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.ref_name == 'main' }}
+      PLATFORM: ${{ matrix.target }}
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    # Increase timeout to 8h, default timeout is 6h
+    timeout-minutes: 480
+
+    steps:
+    - uses: actions/checkout@v4
+
+    - name: Configure AWS credentials # necessary to download artefacts
+      uses: aws-actions/configure-aws-credentials@v4
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Create Neon Branch for large tenant
+      if: ${{ matrix.target == 'new_branch' }}
+      id: create-neon-branch-oltp-target
+      uses: ./.github/actions/neon-branch-create
+      with:
+          project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
+          api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Set up Connection String # selects the DSN that matches the matrix target
+      id: set-up-connstr
+      run: |
+          case "${{ matrix.target }}" in
+              new_branch)
+              CONNSTR="${{ steps.create-neon-branch-oltp-target.outputs.dsn }}" # quoted: a DSN may contain & or ?
+              ;;
+              reuse_branch)
+              CONNSTR="${{ secrets.BENCHMARK_LARGE_OLTP_REUSE_CONNSTR }}" # quoted for the same reason
+              ;;
+              *)
+              echo >&2 "Unknown target=${{ matrix.target }}"
+              exit 1
+              ;;
+          esac
+          # Publish the DSN to later steps as steps.set-up-connstr.outputs.connstr
+          echo "connstr=${CONNSTR}" >> "$GITHUB_OUTPUT"
+
+    - name: Benchmark pgbench with custom-scripts
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 21600 -k test_perf_oltp_large_tenant
+        pg_version: ${{ env.PG_VERSION }}
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        BENCHMARK_CONNSTR: ${{ steps.set-up-connstr.outputs.connstr }}
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+
+    - name: Delete Neon Branch for large tenant
+      if: ${{ always() && matrix.target == 'new_branch' }}
+      uses: ./.github/actions/neon-branch-delete
+      with:
+        project_id: ${{ vars.BENCHMARK_LARGE_OLTP_PROJECTID }}
+        branch_id: ${{ steps.create-neon-branch-oltp-target.outputs.branch_id }}
+        api_key: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+  
+    - name: Post to a Slack channel
+      if: ${{ github.event.schedule && failure() }}
+      uses: slackapi/slack-github-action@v1
+      with:
+        channel-id: "C06KHQVQ7U3" # on-call-qa-staging-stream
+        slack-message: |
+          Periodic large oltp perf testing: ${{ job.status }}
+          <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+          <${{ steps.create-allure-report.outputs.report-url }}|Allure report>
+      env:
+        SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
index af877029e4..f854bf3212 100644
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -3,12 +3,12 @@ name: Periodic pagebench performance test on dedicated EC2 machine in eu-central
 on:
   schedule:
     # * is a special character in YAML so you have to quote this string
-    #          ┌───────────── minute (0 - 59)
-    #          │ ┌───────────── hour (0 - 23)
-    #          │ │ ┌───────────── day of the month (1 - 31)
-    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
-    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
-    - cron:  '0 18 * * *' # Runs at 6 PM UTC every day
+    #        ┌───────────── minute (0 - 59)
+    #        │   ┌───────────── hour (0 - 23)
+    #        │   │ ┌───────────── day of the month (1 - 31)
+    #        │   │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #        │   │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron: '0 */3 * * *' # Runs every 3 hours
   workflow_dispatch: # Allows manual triggering of the workflow
     inputs:
       commit_hash:
@@ -78,8 +78,10 @@ jobs:
       run: |
         if [ -z "$INPUT_COMMIT_HASH" ]; then
           echo "COMMIT_HASH=$(curl -s https://api.github.com/repos/neondatabase/neon/commits/main | jq -r '.sha')" >> $GITHUB_ENV
+          echo "COMMIT_HASH_TYPE=latest" >> $GITHUB_ENV
         else
           echo "COMMIT_HASH=$INPUT_COMMIT_HASH" >> $GITHUB_ENV
+          echo "COMMIT_HASH_TYPE=manual" >> $GITHUB_ENV
         fi
 
     - name: Start Bench with run_id
@@ -89,7 +91,7 @@ jobs:
         -H 'accept: application/json' \
         -H 'Content-Type: application/json' \
         -H "Authorization: Bearer $API_KEY" \
-        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\"}"
+        -d "{\"neonRepoCommitHash\": \"${COMMIT_HASH}\", \"neonRepoCommitHashType\": \"${COMMIT_HASH_TYPE}\"}"
 
     - name: Poll Test Status
       id: poll_step
diff --git a/CODEOWNERS b/CODEOWNERS
index 71b5e65f94..ab6d2257a4 100644
--- a/CODEOWNERS
+++ b/CODEOWNERS
@@ -1,8 +1,9 @@
 # Autoscaling
 /libs/vm_monitor/ @neondatabase/autoscaling
 
-# DevProd
-/.github/ @neondatabase/developer-productivity
+# DevProd & PerfCorr
+/.github/ @neondatabase/developer-productivity @neondatabase/performance-correctness
+/test_runner/ @neondatabase/performance-correctness
 
 # Compute
 /pgxn/ @neondatabase/compute
diff --git a/Cargo.lock b/Cargo.lock
index 293ed465ff..d3b09fa360 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -783,6 +783,28 @@ dependencies = [
  "tracing",
 ]
 
+[[package]]
+name = "axum-extra"
+version = "0.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "460fc6f625a1f7705c6cf62d0d070794e94668988b1c38111baeec177c715f7b"
+dependencies = [
+ "axum",
+ "axum-core",
+ "bytes",
+ "futures-util",
+ "headers",
+ "http 1.1.0",
+ "http-body 1.0.0",
+ "http-body-util",
+ "mime",
+ "pin-project-lite",
+ "serde",
+ "tower 0.5.2",
+ "tower-layer",
+ "tower-service",
+]
+
 [[package]]
 name = "azure_core"
 version = "0.21.0"
@@ -925,9 +947,9 @@ checksum = "0ea22880d78093b0cbe17c89f64a7d457941e65759157ec6cb31a31d652b05e5"
 
 [[package]]
 name = "base64"
-version = "0.21.1"
+version = "0.21.7"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3f1e31e207a6b8fb791a38ea3105e6cb541f55e4d029902d3039a4ad07cc4105"
+checksum = "9d297deb1925b89f2ccc13d7635fa0714f12c87adce1c75356b39ca9b7178567"
 
 [[package]]
 name = "base64"
@@ -1105,9 +1127,9 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5"
 
 [[package]]
 name = "cc"
-version = "1.1.30"
+version = "1.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945"
+checksum = "be714c154be609ec7f5dad223a33bf1482fff90472de28f7362806e6d4832b8c"
 dependencies = [
  "jobserver",
  "libc",
@@ -1305,6 +1327,7 @@ dependencies = [
  "aws-sdk-s3",
  "aws-smithy-types",
  "axum",
+ "axum-extra",
  "base64 0.13.1",
  "bytes",
  "camino",
@@ -1316,6 +1339,7 @@ dependencies = [
  "flate2",
  "futures",
  "http 1.1.0",
+ "jsonwebtoken",
  "metrics",
  "nix 0.27.1",
  "notify",
@@ -2297,7 +2321,7 @@ name = "framed-websockets"
 version = "0.1.0"
 source = "git+https://github.com/neondatabase/framed-websockets#34eff3d6f8cfccbc5f35e4f65314ff7328621127"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
  "bytemuck",
  "bytes",
  "futures-core",
@@ -2410,9 +2434,9 @@ checksum = "f90f7dce0722e95104fcb095585910c0977252f286e354b5e3bd38902cd99988"
 
 [[package]]
 name = "futures-timer"
-version = "3.0.2"
+version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e64b03909df88034c26dc1547e8970b91f98bdb65165d6a4e9110d94263dbb2c"
+checksum = "f288b0a4f20f9a56b5d1da57e2227c661b7b16168e2f72365f57b63326e29b24"
 
 [[package]]
 name = "futures-util"
@@ -2515,6 +2539,27 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
+[[package]]
+name = "governor"
+version = "0.8.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "842dc78579ce01e6a1576ad896edc92fca002dd60c9c3746b7fc2bec6fb429d0"
+dependencies = [
+ "cfg-if",
+ "dashmap 6.1.0",
+ "futures-sink",
+ "futures-timer",
+ "futures-util",
+ "no-std-compat",
+ "nonzero_ext",
+ "parking_lot 0.12.1",
+ "portable-atomic",
+ "quanta",
+ "rand 0.8.5",
+ "smallvec",
+ "spinning_top",
+]
+
 [[package]]
 name = "group"
 version = "0.12.1"
@@ -2632,7 +2677,7 @@ version = "7.5.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "765c9198f173dd59ce26ff9f95ef0aafd0a0fe01fb9d72841bc5066a4c06511d"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
  "byteorder",
  "crossbeam-channel",
  "flate2",
@@ -2640,6 +2685,30 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "headers"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "322106e6bd0cba2d5ead589ddb8150a13d7c4217cf80d7c4f682ca994ccc6aa9"
+dependencies = [
+ "base64 0.21.7",
+ "bytes",
+ "headers-core",
+ "http 1.1.0",
+ "httpdate",
+ "mime",
+ "sha1",
+]
+
+[[package]]
+name = "headers-core"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "54b4a22553d4242c49fddb9ba998a99962b5cc6f22cb5a3482bec22522403ce4"
+dependencies = [
+ "http 1.1.0",
+]
+
 [[package]]
 name = "heck"
 version = "0.5.0"
@@ -2777,12 +2846,9 @@ name = "http-utils"
 version = "0.1.0"
 dependencies = [
  "anyhow",
- "backtrace",
  "bytes",
  "fail",
- "flate2",
  "hyper 0.14.30",
- "inferno 0.12.0",
  "itertools 0.10.5",
  "jemalloc_pprof",
  "metrics",
@@ -3281,9 +3347,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b"
 
 [[package]]
 name = "jemalloc_pprof"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb"
+checksum = "5622af6d21ff86ed7797ef98e11b8f302da25ec69a7db9f6cde8e2e1c8df9992"
 dependencies = [
  "anyhow",
  "libc",
@@ -3367,7 +3433,7 @@ version = "9.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "5c7ea04a7c5c055c175f189b6dc6ba036fd62306b58c66c9f6389036c503a3f4"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
  "js-sys",
  "pem",
  "ring",
@@ -3482,9 +3548,9 @@ dependencies = [
 
 [[package]]
 name = "mappings"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e"
+checksum = "e434981a332777c2b3062652d16a55f8e74fa78e6b1882633f0d77399c84fc2a"
 dependencies = [
  "anyhow",
  "libc",
@@ -3725,6 +3791,12 @@ dependencies = [
  "memoffset 0.9.0",
 ]
 
+[[package]]
+name = "no-std-compat"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b93853da6d84c2e3c7d730d6473e8817692dd89be387eb01b94d7f108ecb5b8c"
+
 [[package]]
 name = "nom"
 version = "7.1.3"
@@ -3735,6 +3807,12 @@ dependencies = [
  "minimal-lexical",
 ]
 
+[[package]]
+name = "nonzero_ext"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "38bf9645c8b145698bb0b18a4637dcacbc421ea49bef2317e4fd8065a387cf21"
+
 [[package]]
 name = "notify"
 version = "8.0.0"
@@ -4225,6 +4303,7 @@ dependencies = [
  "tracing",
  "url",
  "utils",
+ "uuid",
  "wal_decoder",
  "walkdir",
  "workspace_hack",
@@ -4307,9 +4386,9 @@ dependencies = [
 
 [[package]]
 name = "papaya"
-version = "0.1.8"
+version = "0.2.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "dc7c76487f7eaa00a0fc1d7f88dc6b295aec478d11b0fc79f857b62c2874124c"
+checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
 dependencies = [
  "equivalent",
  "seize",
@@ -4437,7 +4516,7 @@ version = "3.0.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1b8fcc794035347fb64beda2d3b462595dd2753e3f268d89c5aae77e8cf2c310"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
  "serde",
 ]
 
@@ -4591,6 +4670,12 @@ dependencies = [
  "never-say-never",
 ]
 
+[[package]]
+name = "portable-atomic"
+version = "1.10.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
+
 [[package]]
 name = "postgres"
 version = "0.19.7"
@@ -4755,12 +4840,14 @@ dependencies = [
 
 [[package]]
 name = "pprof_util"
-version = "0.6.0"
+version = "0.7.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781"
+checksum = "9fa015c78eed2130951e22c58d2095849391e73817ab2e74f71b0b9f63dd8416"
 dependencies = [
  "anyhow",
+ "backtrace",
  "flate2",
+ "inferno 0.12.0",
  "num",
  "paste",
  "prost",
@@ -5052,6 +5139,21 @@ dependencies = [
  "zerocopy",
 ]
 
+[[package]]
+name = "quanta"
+version = "0.12.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "3bd1fe6824cea6538803de3ff1bc0cf3949024db3d43c9643024bfb33a807c0e"
+dependencies = [
+ "crossbeam-utils",
+ "libc",
+ "once_cell",
+ "raw-cpuid",
+ "wasi 0.11.0+wasi-snapshot-preview1",
+ "web-sys",
+ "winapi",
+]
+
 [[package]]
 name = "quick-xml"
 version = "0.26.0"
@@ -5182,6 +5284,15 @@ dependencies = [
  "num-traits",
 ]
 
+[[package]]
+name = "raw-cpuid"
+version = "11.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c6928fa44c097620b706542d428957635951bade7143269085389d42c8a4927e"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "rayon"
 version = "1.7.0"
@@ -5516,16 +5627,16 @@ dependencies = [
 
 [[package]]
 name = "ring"
-version = "0.17.6"
+version = "0.17.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "684d5e6e18f669ccebf64a92236bb7db9a34f07be010e3627368182027180866"
+checksum = "70ac5d832aa16abd7d1def883a8545280c20a60f523a370aa3a9617c2b8550ee"
 dependencies = [
  "cc",
+ "cfg-if",
  "getrandom 0.2.11",
  "libc",
- "spin",
  "untrusted",
- "windows-sys 0.48.0",
+ "windows-sys 0.52.0",
 ]
 
 [[package]]
@@ -5752,7 +5863,7 @@ version = "1.0.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d194b56d58803a43635bdc398cd17e383d6f71f9182b9a192c127ca42494a59b"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
 ]
 
 [[package]]
@@ -5761,7 +5872,7 @@ version = "2.1.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "f48172685e6ff52a556baa527774f61fcaa884f59daf3375c62a3f1cd2549dab"
 dependencies = [
- "base64 0.21.1",
+ "base64 0.21.7",
  "rustls-pki-types",
 ]
 
@@ -6000,9 +6111,9 @@ dependencies = [
 
 [[package]]
 name = "seize"
-version = "0.4.9"
+version = "0.5.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d84b0c858bdd30cb56f5597f8b3bf702ec23829e652cc636a1e5a7b9de46ae93"
+checksum = "e4b8d813387d566f627f3ea1b914c068aac94c40ae27ec43f5f33bde65abefe7"
 dependencies = [
  "libc",
  "windows-sys 0.52.0",
@@ -6395,6 +6506,15 @@ version = "0.9.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "6980e8d7511241f8acf4aebddbb1ff938df5eebe98691418c4468d0b72a96a67"
 
+[[package]]
+name = "spinning_top"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d96d2d1d716fb500937168cc09353ffdc7a012be8475ac7308e1bdf0e3923300"
+dependencies = [
+ "lock_api",
+]
+
 [[package]]
 name = "spki"
 version = "0.6.0"
@@ -6471,6 +6591,7 @@ dependencies = [
  "diesel_migrations",
  "fail",
  "futures",
+ "governor",
  "hex",
  "http-utils",
  "humantime",
@@ -7285,10 +7406,12 @@ version = "0.6.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "403fa3b783d4b626a8ad51d766ab03cb6d2dbfc46b1c5d4448395e6628dc9697"
 dependencies = [
+ "base64 0.22.1",
  "bitflags 2.8.0",
  "bytes",
  "http 1.1.0",
  "http-body 1.0.0",
+ "mime",
  "pin-project-lite",
  "tower-layer",
  "tower-service",
@@ -7642,7 +7765,6 @@ dependencies = [
  "anyhow",
  "arc-swap",
  "async-compression",
- "backtrace",
  "bincode",
  "byteorder",
  "bytes",
@@ -8196,7 +8318,7 @@ dependencies = [
  "ahash",
  "anyhow",
  "base64 0.13.1",
- "base64 0.21.1",
+ "base64 0.21.7",
  "base64ct",
  "bytes",
  "camino",
diff --git a/Cargo.toml b/Cargo.toml
index ff45d46a47..d11fe4f449 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,7 +53,6 @@ anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
-backtrace = "0.3.74"
 flate2 = "1.0.26"
 assert-json-diff = "2"
 async-stream = "0.3"
@@ -68,6 +67,7 @@ aws-credential-types = "1.2.0"
 aws-sigv4 = { version = "1.2", features = ["sign-http"] }
 aws-types = "1.3"
 axum = { version = "0.8.1", features = ["ws"] }
+axum-extra = { version = "0.10.0", features = ["typed-header"] }
 base64 = "0.13.0"
 bincode = "1.3"
 bindgen = "0.71"
@@ -95,6 +95,7 @@ futures = "0.3"
 futures-core = "0.3"
 futures-util = "0.3"
 git-version = "0.3"
+governor = "0.8"
 hashbrown = "0.14"
 hashlink = "0.9.1"
 hdrhistogram = "7.5.2"
@@ -113,11 +114,10 @@ hyper-util = "0.1"
 tokio-tungstenite = "0.21.0"
 indexmap = "2"
 indoc = "2"
-inferno = "0.12.0"
 ipnet = "2.10.0"
 itertools = "0.10"
 itoa = "1.0.11"
-jemalloc_pprof = "0.6"
+jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
@@ -192,7 +192,7 @@ toml = "0.8"
 toml_edit = "0.22"
 tonic = {version = "0.12.3", default-features = false, features = ["channel", "tls", "tls-roots"]}
 tower = { version = "0.5.2", default-features = false }
-tower-http = { version = "0.6.2", features = ["request-id", "trace"] }
+tower-http = { version = "0.6.2", features = ["auth", "request-id", "trace"] }
 
 # This revision uses opentelemetry 0.27. There's no tag for it.
 tower-otel = { git = "https://github.com/mattiapenati/tower-otel", rev = "56a7321053bcb72443888257b622ba0d43a11fcd" }
diff --git a/Makefile b/Makefile
index 42ee643bb5..0911465fb8 100644
--- a/Makefile
+++ b/Makefile
@@ -11,15 +11,16 @@ ICU_PREFIX_DIR := /usr/local/icu
 #
 BUILD_TYPE ?= debug
 WITH_SANITIZERS ?= no
+PG_CFLAGS = -fsigned-char
 ifeq ($(BUILD_TYPE),release)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl
-	PG_CFLAGS = -O2 -g3 $(CFLAGS)
+	PG_CFLAGS += -O2 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 	# Unfortunately, `--profile=...` is a nightly feature
 	CARGO_BUILD_FLAGS += --release
 else ifeq ($(BUILD_TYPE),debug)
 	PG_CONFIGURE_OPTS = --enable-debug --with-openssl --enable-cassert --enable-depend
-	PG_CFLAGS = -O0 -g3 $(CFLAGS)
+	PG_CFLAGS += -O0 -g3 $(CFLAGS)
 	PG_LDFLAGS = $(LDFLAGS)
 else
 	$(error Bad build type '$(BUILD_TYPE)', see Makefile for options)
@@ -159,6 +160,8 @@ postgres-%: postgres-configure-% \
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_visibility install
 	+@echo "Compiling pageinspect $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install
+	+@echo "Compiling pg_trgm $*"
+	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_trgm install
 	+@echo "Compiling amcheck $*"
 	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install
 	+@echo "Compiling test_decoding $*"
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 0cdb44853f..6e46185e36 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -162,7 +162,7 @@ FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION:?} postgres
 RUN cd postgres && \
-    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp \
+    export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
     --with-icu --with-libxml --with-libxslt --with-lz4" && \
     if [ "${PG_VERSION:?}" != "v14" ]; then \
         # zstd is available only from PG15
@@ -1484,7 +1484,7 @@ WORKDIR /ext-src
 COPY compute/patches/pg_duckdb_v031.patch .
 COPY compute/patches/duckdb_v120.patch .
 # pg_duckdb build requires source dir to be a git repo to get submodules
-# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: 
+# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
 # - extension management function duckdb.install_extension()
 # - access to duckdb.extensions table and its sequence
 RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
@@ -1499,8 +1499,8 @@ ARG PG_VERSION
 COPY --from=pg_duckdb-src /ext-src/ /ext-src/
 WORKDIR /ext-src/pg_duckdb-src
 RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control 
-        
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control
+
 #########################################################################################
 #
 # Layer "pg_repack"
@@ -1758,15 +1758,15 @@ ARG TARGETARCH
 # test_runner/regress/test_compute_metrics.py
 # See comment on the top of the file regading `echo`, `-e` and `\n`
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
-        postgres_exporter_sha256='027e75dda7af621237ff8f5ac66b78a40b0093595f06768612b92b1374bd3105';\
+        postgres_exporter_sha256='59aa4a7bb0f7d361f5e05732f5ed8c03cc08f78449cef5856eadec33a627694b';\
         pgbouncer_exporter_sha256='c9f7cf8dcff44f0472057e9bf52613d93f3ffbc381ad7547a959daa63c5e84ac';\
         sql_exporter_sha256='38e439732bbf6e28ca4a94d7bc3686d3fa1abdb0050773d5617a9efdb9e64d08';\
     else\
-        postgres_exporter_sha256='131a376d25778ff9701a4c81f703f179e0b58db5c2c496e66fa43f8179484786';\
+        postgres_exporter_sha256='d1dedea97f56c6d965837bfd1fbb3e35a3b4a4556f8cccee8bd513d8ee086124';\
         pgbouncer_exporter_sha256='217c4afd7e6492ae904055bc14fe603552cf9bac458c063407e991d68c519da3';\
         sql_exporter_sha256='11918b00be6e2c3a67564adfdb2414fdcbb15a5db76ea17d1d1a944237a893c6';\
     fi\
-    && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.16.0/postgres_exporter-0.16.0.linux-${TARGETARCH}.tar.gz\
+    && curl -sL https://github.com/prometheus-community/postgres_exporter/releases/download/v0.17.1/postgres_exporter-0.17.1.linux-${TARGETARCH}.tar.gz\
      | tar xzf - --strip-components=1 -C.\
     && curl -sL https://github.com/prometheus-community/pgbouncer_exporter/releases/download/v0.10.2/pgbouncer_exporter-0.10.2.linux-${TARGETARCH}.tar.gz\
      | tar xzf - --strip-components=1 -C.\
@@ -1933,6 +1933,7 @@ RUN apt update && \
         locales \
         procps \
         ca-certificates \
+        rsyslog \
         $VERSION_INSTALLS && \
     apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
@@ -1978,6 +1979,13 @@ COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neo
 # Make the libraries we built available
 RUN echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig
 
+# rsyslog config permissions
+# directory for rsyslogd pid file
+RUN mkdir /var/run/rsyslogd && \
+    chown -R postgres:postgres /var/run/rsyslogd && \
+    chown -R postgres:postgres /etc/rsyslog.d/
+
+
 ENV LANG=en_US.utf8
 USER postgres
 ENTRYPOINT ["/usr/local/bin/compute_ctl"]
diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet
index f8f4cab63b..da2b86d542 100644
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -29,6 +29,7 @@
     import 'sql_exporter/lfc_approximate_working_set_size.libsonnet',
     import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet',
     import 'sql_exporter/lfc_cache_size_limit.libsonnet',
+    import 'sql_exporter/lfc_chunk_size.libsonnet',
     import 'sql_exporter/lfc_hits.libsonnet',
     import 'sql_exporter/lfc_misses.libsonnet',
     import 'sql_exporter/lfc_used.libsonnet',
diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql
index 9cbbdfd8a3..fe0360ab5c 100644
--- a/compute/etc/sql_exporter/db_total_size.sql
+++ b/compute/etc/sql_exporter/db_total_size.sql
@@ -1 +1,5 @@
-SELECT sum(pg_database_size(datname)) AS total FROM pg_database;
+SELECT sum(pg_database_size(datname)) AS total
+FROM pg_database
+-- Ignore invalid databases, as we will likely have problems with
+-- getting their size from the Pageserver.
+WHERE datconnlimit != -2;
diff --git a/compute/etc/sql_exporter/lfc_chunk_size.libsonnet b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet
new file mode 100644
index 0000000000..bbe56f869f
--- /dev/null
+++ b/compute/etc/sql_exporter/lfc_chunk_size.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_chunk_size',
+  type: 'gauge',
+  help: 'LFC chunk size, measured in 8KiB pages',
+  key_labels: null,
+  values: [
+    'lfc_chunk_size_pages',
+  ],
+  query: importstr 'sql_exporter/lfc_chunk_size.sql',
+}
diff --git a/compute/etc/sql_exporter/lfc_chunk_size.sql b/compute/etc/sql_exporter/lfc_chunk_size.sql
new file mode 100644
index 0000000000..0905870064
--- /dev/null
+++ b/compute/etc/sql_exporter/lfc_chunk_size.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_chunk_size_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_chunk_size_pages';
diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql
index 00ada87370..12e6c4ae59 100644
--- a/compute/etc/sql_exporter/pg_stats_userdb.sql
+++ b/compute/etc/sql_exporter/pg_stats_userdb.sql
@@ -1,10 +1,20 @@
 -- We export stats for 10 non-system databases. Without this limit it is too
 -- easy to abuse the system by creating lots of databases.
 
-SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted,
-  tup_updated AS updated, tup_deleted AS deleted, datname
+SELECT pg_database_size(datname) AS db_size,
+  deadlocks,
+  tup_inserted AS inserted,
+  tup_updated AS updated,
+  tup_deleted AS deleted,
+  datname
 FROM pg_stat_database
 WHERE datname IN (
   SELECT datname FROM pg_database
-  WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10
+  -- Ignore invalid databases, as we will likely have problems with
+  -- getting their size from the Pageserver.
+  WHERE datconnlimit != -2
+    AND datname <> 'postgres'
+    AND NOT datistemplate
+  ORDER BY oid
+  LIMIT 10
 );
diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml
index ff4c3387d9..e6707381ac 100644
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -39,6 +39,10 @@ commands:
     user: nobody
     sysvInitAction: respawn
     shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
+  - name: rsyslogd
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
 shutdownHook: |
   su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
@@ -54,7 +58,7 @@ files:
       # regardless of hostname (ALL)
       #
       # Also allow it to shut down the VM. The fast_import job does that when it's finished.
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
   - filename: cgconfig.conf
     content: |
       # Configuration for cgroups in VM compute nodes
@@ -69,6 +73,12 @@ files:
           }
           memory {}
       }
+  # Create a dummy rsyslog config, because rsyslogd refuses to start without at least one action configured.
+  # compute_ctl will rewrite this file with the actual configuration, if needed.
+  - filename: compute_rsyslog.conf
+    content: |
+      *.*    /dev/null
+      $IncludeConfig /etc/rsyslog.d/*.conf
 build: |
   # Build cgroup-tools
   #
@@ -132,6 +142,12 @@ merge: |
   RUN set -e \
       && chmod 0644 /etc/cgconfig.conf
 
+  # World-writable so non-root processes can rewrite the config and create logs; a directory also needs the x bit.
+  COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
+  RUN chmod 0666 /etc/compute_rsyslog.conf
+  RUN chmod 1777 /var/log/
+
+
   COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
   COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
   COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml
index c001040bc9..c89ee112dc 100644
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -39,6 +39,10 @@ commands:
     user: nobody
     sysvInitAction: respawn
     shell: '/bin/sql_exporter -config.file=/etc/sql_exporter_autoscaling.yml -web.listen-address=:9499'
+  - name: rsyslogd
+    user: postgres
+    sysvInitAction: respawn
+    shell: '/usr/sbin/rsyslogd -n -i /var/run/rsyslogd/rsyslogd.pid -f /etc/compute_rsyslog.conf'
 shutdownHook: |
   su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
@@ -54,7 +58,7 @@ files:
       # regardless of hostname (ALL)
       #
       # Also allow it to shut down the VM. The fast_import job does that when it's finished.
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff, /usr/sbin/rsyslogd
   - filename: cgconfig.conf
     content: |
       # Configuration for cgroups in VM compute nodes
@@ -69,6 +73,12 @@ files:
           }
           memory {}
       }
+  # Create a dummy rsyslog config, because rsyslogd refuses to start without at least one action configured.
+  # compute_ctl will rewrite this file with the actual configuration, if needed.
+  - filename: compute_rsyslog.conf
+    content: |
+      *.*    /dev/null
+      $IncludeConfig /etc/rsyslog.d/*.conf
 build: |
   # Build cgroup-tools
   #
@@ -128,6 +138,11 @@ merge: |
   RUN set -e \
       && chmod 0644 /etc/cgconfig.conf
 
+  # World-writable so non-root processes can rewrite the config and create logs; a directory also needs the x bit.
+  COPY compute_rsyslog.conf /etc/compute_rsyslog.conf
+  RUN chmod 0666 /etc/compute_rsyslog.conf
+  RUN chmod 1777 /var/log/
+
   COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
   COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
   COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 8f3bcbeef8..dd2896714d 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -17,6 +17,7 @@ aws-sdk-kms.workspace = true
 aws-smithy-types.workspace = true
 anyhow.workspace = true
 axum = { workspace = true, features = [] }
+axum-extra.workspace = true
 camino.workspace = true
 chrono.workspace = true
 cfg-if.workspace = true
@@ -25,6 +26,7 @@ fail.workspace = true
 flate2.workspace = true
 futures.workspace = true
 http.workspace = true
+jsonwebtoken.workspace = true
 metrics.workspace = true
 nix.workspace = true
 notify.workspace = true
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 6dae1a2753..fc7a3e2827 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -33,39 +33,27 @@
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
 //! ```
-use std::collections::HashMap;
 use std::ffi::OsString;
 use std::fs::File;
 use std::path::Path;
 use std::process::exit;
-use std::str::FromStr;
-use std::sync::atomic::Ordering;
-use std::sync::{Arc, Condvar, Mutex, RwLock, mpsc};
+use std::sync::mpsc;
 use std::thread;
 use std::time::Duration;
 
 use anyhow::{Context, Result};
-use chrono::Utc;
 use clap::Parser;
-use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
+use compute_api::responses::ComputeCtlConfig;
 use compute_api::spec::ComputeSpec;
-use compute_tools::compute::{
-    ComputeNode, ComputeState, PG_PID, ParsedSpec, forward_termination_signal,
-};
-use compute_tools::configurator::launch_configurator;
-use compute_tools::disk_quota::set_disk_quota;
+use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
 use compute_tools::extension_server::get_pg_version_string;
-use compute_tools::http::server::Server;
 use compute_tools::logger::*;
-use compute_tools::lsn_lease::launch_lsn_lease_bg_task_for_static;
-use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
-use compute_tools::swap::resize_swap;
 use rlimit::{Resource, setrlimit};
 use signal_hook::consts::{SIGINT, SIGQUIT, SIGTERM};
 use signal_hook::iterator::Signals;
-use tracing::{error, info, warn};
+use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;
 
@@ -164,29 +152,41 @@ fn main() -> Result<()> {
     // enable core dumping for all child processes
     setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
 
-    let (pg_handle, start_pg_result) = {
-        // Enter startup tracing context
-        let _startup_context_guard = startup_context_from_env();
+    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
 
-        let cli_spec = try_spec_from_cli(&cli)?;
+    let cli_spec = try_spec_from_cli(&cli)?;
 
-        let compute = wait_spec(build_tag, &cli, cli_spec)?;
+    let compute_node = ComputeNode::new(
+        ComputeNodeParams {
+            compute_id: cli.compute_id,
+            connstr,
+            pgdata: cli.pgdata.clone(),
+            pgbin: cli.pgbin.clone(),
+            pgversion: get_pg_version_string(&cli.pgbin),
+            external_http_port: cli.external_http_port,
+            internal_http_port: cli.internal_http_port,
+            ext_remote_storage: cli.remote_ext_config.clone(),
+            resize_swap_on_bind: cli.resize_swap_on_bind,
+            set_disk_quota_for_fs: cli.set_disk_quota_for_fs,
+            #[cfg(target_os = "linux")]
+            filecache_connstr: cli.filecache_connstr,
+            #[cfg(target_os = "linux")]
+            cgroup: cli.cgroup,
+            #[cfg(target_os = "linux")]
+            vm_monitor_addr: cli.vm_monitor_addr,
+            build_tag,
 
-        start_postgres(&cli, compute)?
+            live_config_allowed: cli_spec.live_config_allowed,
+        },
+        cli_spec.spec,
+        cli_spec.compute_ctl_config,
+    )?;
 
-        // Startup is finished, exit the startup tracing span
-    };
-
-    // PostgreSQL is now running, if startup was successful. Wait until it exits.
-    let wait_pg_result = wait_postgres(pg_handle)?;
-
-    let delay_exit = cleanup_after_postgres_exit(start_pg_result)?;
-
-    maybe_delay_exit(delay_exit);
+    let exit_code = compute_node.run()?;
 
     scenario.teardown();
 
-    deinit_and_exit(wait_pg_result);
+    deinit_and_exit(exit_code);
 }
 
 async fn init() -> Result<String> {
@@ -207,56 +207,6 @@ async fn init() -> Result<String> {
     Ok(build_tag)
 }
 
-fn startup_context_from_env() -> Option<opentelemetry::ContextGuard> {
-    // Extract OpenTelemetry context for the startup actions from the
-    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
-    // tracing context.
-    //
-    // This is used to propagate the context for the 'start_compute' operation
-    // from the neon control plane. This allows linking together the wider
-    // 'start_compute' operation that creates the compute container, with the
-    // startup actions here within the container.
-    //
-    // There is no standard for passing context in env variables, but a lot of
-    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
-    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
-    //
-    // Switch to the startup context here, and exit it once the startup has
-    // completed and Postgres is up and running.
-    //
-    // If this pod is pre-created without binding it to any particular endpoint
-    // yet, this isn't the right place to enter the startup context. In that
-    // case, the control plane should pass the tracing context as part of the
-    // /configure API call.
-    //
-    // NOTE: This is supposed to only cover the *startup* actions. Once
-    // postgres is configured and up-and-running, we exit this span. Any other
-    // actions that are performed on incoming HTTP requests, for example, are
-    // performed in separate spans.
-    //
-    // XXX: If the pod is restarted, we perform the startup actions in the same
-    // context as the original startup actions, which probably doesn't make
-    // sense.
-    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
-    if let Ok(val) = std::env::var("TRACEPARENT") {
-        startup_tracing_carrier.insert("traceparent".to_string(), val);
-    }
-    if let Ok(val) = std::env::var("TRACESTATE") {
-        startup_tracing_carrier.insert("tracestate".to_string(), val);
-    }
-    if !startup_tracing_carrier.is_empty() {
-        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry_sdk::propagation::TraceContextPropagator;
-        let guard = TraceContextPropagator::new()
-            .extract(&startup_tracing_carrier)
-            .attach();
-        info!("startup tracing context attached");
-        Some(guard)
-    } else {
-        None
-    }
-}
-
 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
     // First, try to get cluster spec from the cli argument
     if let Some(ref spec_json) = cli.spec_json {
@@ -307,357 +257,7 @@ struct CliSpecParams {
     live_config_allowed: bool,
 }
 
-fn wait_spec(
-    build_tag: String,
-    cli: &Cli,
-    CliSpecParams {
-        spec,
-        live_config_allowed,
-        compute_ctl_config: _,
-    }: CliSpecParams,
-) -> Result<Arc<ComputeNode>> {
-    let mut new_state = ComputeState::new();
-    let spec_set;
-
-    if let Some(spec) = spec {
-        let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?;
-        info!("new pspec.spec: {:?}", pspec.spec);
-        new_state.pspec = Some(pspec);
-        spec_set = true;
-    } else {
-        spec_set = false;
-    }
-    let connstr = Url::parse(&cli.connstr).context("cannot parse connstr as a URL")?;
-    let conn_conf = postgres::config::Config::from_str(connstr.as_str())
-        .context("cannot build postgres config from connstr")?;
-    let tokio_conn_conf = tokio_postgres::config::Config::from_str(connstr.as_str())
-        .context("cannot build tokio postgres config from connstr")?;
-    let compute_node = ComputeNode {
-        compute_id: cli.compute_id.clone(),
-        connstr,
-        conn_conf,
-        tokio_conn_conf,
-        pgdata: cli.pgdata.clone(),
-        pgbin: cli.pgbin.clone(),
-        pgversion: get_pg_version_string(&cli.pgbin),
-        external_http_port: cli.external_http_port,
-        internal_http_port: cli.internal_http_port,
-        live_config_allowed,
-        state: Mutex::new(new_state),
-        state_changed: Condvar::new(),
-        ext_remote_storage: cli.remote_ext_config.clone(),
-        ext_download_progress: RwLock::new(HashMap::new()),
-        build_tag,
-    };
-    let compute = Arc::new(compute_node);
-
-    // If this is a pooled VM, prewarm before starting HTTP server and becoming
-    // available for binding. Prewarming helps Postgres start quicker later,
-    // because QEMU will already have its memory allocated from the host, and
-    // the necessary binaries will already be cached.
-    if !spec_set {
-        compute.prewarm_postgres()?;
-    }
-
-    // Launch the external HTTP server first, so that we can serve control plane
-    // requests while configuration is still in progress.
-    Server::External(cli.external_http_port).launch(&compute);
-
-    // The internal HTTP server could be launched later, but there isn't much
-    // sense in waiting.
-    Server::Internal(cli.internal_http_port).launch(&compute);
-
-    if !spec_set {
-        // No spec provided, hang waiting for it.
-        info!("no compute spec provided, waiting");
-
-        let mut state = compute.state.lock().unwrap();
-        while state.status != ComputeStatus::ConfigurationPending {
-            state = compute.state_changed.wait(state).unwrap();
-
-            if state.status == ComputeStatus::ConfigurationPending {
-                info!("got spec, continue configuration");
-                // Spec is already set by the http server handler.
-                break;
-            }
-        }
-
-        // Record for how long we slept waiting for the spec.
-        let now = Utc::now();
-        state.metrics.wait_for_spec_ms = now
-            .signed_duration_since(state.start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-
-        // Reset start time, so that the total startup time that is calculated later will
-        // not include the time that we waited for the spec.
-        state.start_time = now;
-    }
-
-    launch_lsn_lease_bg_task_for_static(&compute);
-
-    Ok(compute)
-}
-
-fn start_postgres(
-    cli: &Cli,
-    compute: Arc<ComputeNode>,
-) -> Result<(Option<PostgresHandle>, StartPostgresResult)> {
-    // We got all we need, update the state.
-    let mut state = compute.state.lock().unwrap();
-
-    // Create a tracing span for the startup operation.
-    //
-    // We could otherwise just annotate the function with #[instrument], but if
-    // we're being configured from a /configure HTTP request, we want the
-    // startup to be considered part of the /configure request.
-    let _this_entered = {
-        // Temporarily enter the /configure request's span, so that the new span
-        // becomes its child.
-        let _parent_entered = state.startup_span.take().map(|p| p.entered());
-
-        tracing::info_span!("start_postgres")
-    }
-    .entered();
-
-    state.set_status(ComputeStatus::Init, &compute.state_changed);
-
-    info!(
-        "running compute with features: {:?}",
-        state.pspec.as_ref().unwrap().spec.features
-    );
-    // before we release the mutex, fetch some parameters for later.
-    let &ComputeSpec {
-        swap_size_bytes,
-        disk_quota_bytes,
-        #[cfg(target_os = "linux")]
-        disable_lfc_resizing,
-        ..
-    } = &state.pspec.as_ref().unwrap().spec;
-    drop(state);
-
-    // Launch remaining service threads
-    let _monitor_handle = launch_monitor(&compute);
-    let _configurator_handle = launch_configurator(&compute);
-
-    let mut prestartup_failed = false;
-    let mut delay_exit = false;
-
-    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, cli.resize_swap_on_bind) {
-        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
-        // *before* starting postgres.
-        //
-        // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
-        // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
-        // OOM-killed during startup because swap wasn't available yet.
-        match resize_swap(size_bytes) {
-            Ok(()) => {
-                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%size_bytes, %size_mib, "resized swap");
-            }
-            Err(err) => {
-                let err = err.context("failed to resize swap");
-                error!("{err:#}");
-
-                // Mark compute startup as failed; don't try to start postgres, and report this
-                // error to the control plane when it next asks.
-                prestartup_failed = true;
-                compute.set_failed_status(err);
-                delay_exit = true;
-            }
-        }
-    }
-
-    // Set disk quota if the compute spec says so
-    if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) =
-        (disk_quota_bytes, cli.set_disk_quota_for_fs.as_ref())
-    {
-        match set_disk_quota(disk_quota_bytes, disk_quota_fs_mountpoint) {
-            Ok(()) => {
-                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%disk_quota_bytes, %size_mib, "set disk quota");
-            }
-            Err(err) => {
-                let err = err.context("failed to set disk quota");
-                error!("{err:#}");
-
-                // Mark compute startup as failed; don't try to start postgres, and report this
-                // error to the control plane when it next asks.
-                prestartup_failed = true;
-                compute.set_failed_status(err);
-                delay_exit = true;
-            }
-        }
-    }
-
-    // Start Postgres
-    let mut pg = None;
-    if !prestartup_failed {
-        pg = match compute.start_compute() {
-            Ok(pg) => {
-                info!(postmaster_pid = %pg.0.id(), "Postgres was started");
-                Some(pg)
-            }
-            Err(err) => {
-                error!("could not start the compute node: {:#}", err);
-                compute.set_failed_status(err);
-                delay_exit = true;
-                None
-            }
-        };
-    } else {
-        warn!("skipping postgres startup because pre-startup step failed");
-    }
-
-    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
-    // because it requires cgroups.
-    cfg_if::cfg_if! {
-        if #[cfg(target_os = "linux")] {
-            use std::env;
-            use tokio_util::sync::CancellationToken;
-
-            // This token is used internally by the monitor to clean up all threads
-            let token = CancellationToken::new();
-
-            // don't pass postgres connection string to vm-monitor if we don't want it to resize LFC
-            let pgconnstr = if disable_lfc_resizing.unwrap_or(false) {
-                None
-            } else {
-                Some(cli.filecache_connstr.clone())
-            };
-
-            let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
-                let vm_monitor = tokio::spawn(vm_monitor::start(
-                    Box::leak(Box::new(vm_monitor::Args {
-                        cgroup: Some(cli.cgroup.clone()),
-                        pgconnstr,
-                        addr: cli.vm_monitor_addr.clone(),
-                    })),
-                    token.clone(),
-                ));
-                Some(vm_monitor)
-            } else {
-                None
-            };
-        }
-    }
-
-    Ok((
-        pg,
-        StartPostgresResult {
-            delay_exit,
-            compute,
-            #[cfg(target_os = "linux")]
-            token,
-            #[cfg(target_os = "linux")]
-            vm_monitor,
-        },
-    ))
-}
-
-type PostgresHandle = (std::process::Child, tokio::task::JoinHandle<Result<()>>);
-
-struct StartPostgresResult {
-    delay_exit: bool,
-    // passed through from WaitSpecResult
-    compute: Arc<ComputeNode>,
-
-    #[cfg(target_os = "linux")]
-    token: tokio_util::sync::CancellationToken,
-    #[cfg(target_os = "linux")]
-    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
-}
-
-fn wait_postgres(pg: Option<PostgresHandle>) -> Result<WaitPostgresResult> {
-    // Wait for the child Postgres process forever. In this state Ctrl+C will
-    // propagate to Postgres and it will be shut down as well.
-    let mut exit_code = None;
-    if let Some((mut pg, logs_handle)) = pg {
-        info!(postmaster_pid = %pg.id(), "Waiting for Postgres to exit");
-
-        let ecode = pg
-            .wait()
-            .expect("failed to start waiting on Postgres process");
-        PG_PID.store(0, Ordering::SeqCst);
-
-        // Process has exited. Wait for the log collecting task to finish.
-        let _ = tokio::runtime::Handle::current()
-            .block_on(logs_handle)
-            .map_err(|e| tracing::error!("log task panicked: {:?}", e));
-
-        info!("Postgres exited with code {}, shutting down", ecode);
-        exit_code = ecode.code()
-    }
-
-    Ok(WaitPostgresResult { exit_code })
-}
-
-struct WaitPostgresResult {
-    exit_code: Option<i32>,
-}
-
-fn cleanup_after_postgres_exit(
-    StartPostgresResult {
-        mut delay_exit,
-        compute,
-        #[cfg(target_os = "linux")]
-        vm_monitor,
-        #[cfg(target_os = "linux")]
-        token,
-    }: StartPostgresResult,
-) -> Result<bool> {
-    // Terminate the vm_monitor so it releases the file watcher on
-    // /sys/fs/cgroup/neon-postgres.
-    // Note: the vm-monitor only runs on linux because it requires cgroups.
-    cfg_if::cfg_if! {
-        if #[cfg(target_os = "linux")] {
-            if let Some(handle) = vm_monitor {
-                // Kills all threads spawned by the monitor
-                token.cancel();
-                // Kills the actual task running the monitor
-                handle.abort();
-            }
-        }
-    }
-
-    // Maybe sync safekeepers again, to speed up next startup
-    let compute_state = compute.state.lock().unwrap().clone();
-    let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-    if matches!(pspec.spec.mode, compute_api::spec::ComputeMode::Primary) {
-        info!("syncing safekeepers on shutdown");
-        let storage_auth_token = pspec.storage_auth_token.clone();
-        let lsn = compute.sync_safekeepers(storage_auth_token)?;
-        info!("synced safekeepers at lsn {lsn}");
-    }
-
-    let mut state = compute.state.lock().unwrap();
-    if state.status == ComputeStatus::TerminationPending {
-        state.status = ComputeStatus::Terminated;
-        compute.state_changed.notify_all();
-        // we were asked to terminate gracefully, don't exit to avoid restart
-        delay_exit = true
-    }
-    drop(state);
-
-    if let Err(err) = compute.check_for_core_dumps() {
-        error!("error while checking for core dumps: {err:?}");
-    }
-
-    Ok(delay_exit)
-}
-
-fn maybe_delay_exit(delay_exit: bool) {
-    // If launch failed, keep serving HTTP requests for a while, so the cloud
-    // control plane can get the actual error.
-    if delay_exit {
-        info!("giving control plane 30s to collect the error before shutdown");
-        thread::sleep(Duration::from_secs(30));
-    }
-}
-
-fn deinit_and_exit(WaitPostgresResult { exit_code }: WaitPostgresResult) -> ! {
+fn deinit_and_exit(exit_code: Option<i32>) -> ! {
     // Shutdown trace pipeline gracefully, so that it has a chance to send any
     // pending traces before we exit. Shutting down OTEL tracing provider may
     // hang for quite some time, see, for example:
diff --git a/compute_tools/src/catalog.rs b/compute_tools/src/catalog.rs
index 2a7f56e6fc..db3e07e086 100644
--- a/compute_tools/src/catalog.rs
+++ b/compute_tools/src/catalog.rs
@@ -58,14 +58,14 @@ pub async fn get_database_schema(
     compute: &Arc<ComputeNode>,
     dbname: &str,
 ) -> Result<impl Stream<Item = Result<bytes::Bytes, std::io::Error>> + use<>, SchemaDumpError> {
-    let pgbin = &compute.pgbin;
+    let pgbin = &compute.params.pgbin;
     let basepath = Path::new(pgbin).parent().unwrap();
     let pgdump = basepath.join("pg_dump");
 
     // Replace the DB in the connection string and disable it to parts.
     // This is the only option to handle DBs with special characters.
-    let conf =
-        postgres_conf_for_db(&compute.connstr, dbname).map_err(|_| SchemaDumpError::Unexpected)?;
+    let conf = postgres_conf_for_db(&compute.params.connstr, dbname)
+        .map_err(|_| SchemaDumpError::Unexpected)?;
     let host = conf
         .get_hosts()
         .first()
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index c0e28790d6..354528e2cd 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,8 +11,10 @@ use std::{env, fs};
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
-use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent};
+use compute_api::responses::{ComputeCtlConfig, ComputeMetrics, ComputeStatus};
+use compute_api::spec::{
+    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
+};
 use futures::StreamExt;
 use futures::future::join_all;
 use futures::stream::FuturesUnordered;
@@ -23,33 +25,59 @@ use postgres::NoTls;
 use postgres::error::SqlState;
 use remote_storage::{DownloadError, RemotePath};
 use tokio::spawn;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{Instrument, debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::measured_stream::MeasuredReader;
 
+use crate::configurator::launch_configurator;
+use crate::disk_quota::set_disk_quota;
 use crate::installed_extensions::get_installed_extensions;
+use crate::logger::startup_context_from_env;
+use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
+use crate::monitor::launch_monitor;
 use crate::pg_helpers::*;
+use crate::rsyslog::configure_audit_rsyslog;
 use crate::spec::*;
+use crate::swap::resize_swap;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
 use crate::{config, extension_server, local_proxy};
 
 pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
 pub static PG_PID: AtomicU32 = AtomicU32::new(0);
 
-/// Compute node info shared across several `compute_ctl` threads.
-pub struct ComputeNode {
+/// Static configuration params that don't change after startup. These mostly
+/// come from the CLI args, or are derived from them.
+pub struct ComputeNodeParams {
     /// The ID of the compute
     pub compute_id: String,
     // Url type maintains proper escaping
     pub connstr: url::Url,
-    // We connect to Postgres from many different places, so build configs once
-    // and reuse them where needed.
-    pub conn_conf: postgres::config::Config,
-    pub tokio_conn_conf: tokio_postgres::config::Config,
+
+    pub resize_swap_on_bind: bool,
+    pub set_disk_quota_for_fs: Option<String>,
+
+    // VM monitor parameters
+    #[cfg(target_os = "linux")]
+    pub filecache_connstr: String,
+    #[cfg(target_os = "linux")]
+    pub cgroup: String,
+    #[cfg(target_os = "linux")]
+    pub vm_monitor_addr: String,
+
     pub pgdata: String,
     pub pgbin: String,
     pub pgversion: String,
+    pub build_tag: String,
+
+    /// The port that the compute's external HTTP server listens on
+    pub external_http_port: u16,
+    /// The port that the compute's internal HTTP server listens on
+    pub internal_http_port: u16,
+
+    /// the address of extension storage proxy gateway
+    pub ext_remote_storage: Option<String>,
+
     /// We should only allow live re- / configuration of the compute node if
     /// it uses 'pull model', i.e. it can go to control-plane and fetch
     /// the latest configuration. Otherwise, there could be a case:
@@ -63,10 +91,17 @@ pub struct ComputeNode {
     /// - we push spec and it does configuration
     /// - but then it is restarted without any spec again
     pub live_config_allowed: bool,
-    /// The port that the compute's external HTTP server listens on
-    pub external_http_port: u16,
-    /// The port that the compute's internal HTTP server listens on
-    pub internal_http_port: u16,
+}
+
+/// Compute node info shared across several `compute_ctl` threads.
+pub struct ComputeNode {
+    pub params: ComputeNodeParams,
+
+    // We connect to Postgres from many different places, so build configs once
+    // and reuse them where needed. These are derived from 'params.connstr'
+    pub conn_conf: postgres::config::Config,
+    pub tokio_conn_conf: tokio_postgres::config::Config,
+
     /// Volatile part of the `ComputeNode`, which should be used under `Mutex`.
     /// To allow HTTP API server to serving status requests, while configuration
     /// is in progress, lock should be held only for short periods of time to do
@@ -74,11 +109,9 @@ pub struct ComputeNode {
     pub state: Mutex<ComputeState>,
     /// `Condvar` to allow notifying waiters about state changes.
     pub state_changed: Condvar,
-    /// the address of extension storage proxy gateway
-    pub ext_remote_storage: Option<String>,
+
     // key: ext_archive_name, value: started download time, download_completed?
     pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
-    pub build_tag: String,
 }
 
 // store some metrics about download size that might impact startup time
@@ -102,6 +135,8 @@ pub struct ComputeState {
     /// passed by the control plane with a /configure HTTP request.
     pub pspec: Option<ParsedSpec>,
 
+    pub compute_ctl_config: ComputeCtlConfig,
+
     /// If the spec is passed by a /configure request, 'startup_span' is the
     /// /configure request's tracing span. The main thread enters it when it
     /// processes the compute startup, so that the compute startup is considered
@@ -125,6 +160,7 @@ impl ComputeState {
             last_active: None,
             error: None,
             pspec: None,
+            compute_ctl_config: ComputeCtlConfig::default(),
             startup_span: None,
             metrics: ComputeMetrics::default(),
         }
@@ -242,80 +278,518 @@ fn maybe_cgexec(cmd: &str) -> Command {
     }
 }
 
-pub(crate) fn construct_superuser_query(spec: &ComputeSpec) -> String {
-    let roles = spec
-        .cluster
-        .roles
-        .iter()
-        .map(|r| escape_literal(&r.name))
-        .collect::<Vec<_>>();
+struct PostgresHandle {
+    postgres: std::process::Child,
+    log_collector: tokio::task::JoinHandle<Result<()>>,
+}
 
-    let dbs = spec
-        .cluster
-        .databases
-        .iter()
-        .map(|db| escape_literal(&db.name))
-        .collect::<Vec<_>>();
+impl PostgresHandle {
+    /// Return PID of the postgres (postmaster) process
+    fn pid(&self) -> Pid {
+        Pid::from_raw(self.postgres.id() as i32)
+    }
+}
 
-    let roles_decl = if roles.is_empty() {
-        String::from("roles text[] := NULL;")
-    } else {
-        format!(
-            r#"
-               roles text[] := ARRAY(SELECT rolname
-                                     FROM pg_catalog.pg_roles
-                                     WHERE rolname IN ({}));"#,
-            roles.join(", ")
-        )
-    };
-
-    let database_decl = if dbs.is_empty() {
-        String::from("dbs text[] := NULL;")
-    } else {
-        format!(
-            r#"
-               dbs text[] := ARRAY(SELECT datname
-                                   FROM pg_catalog.pg_database
-                                   WHERE datname IN ({}));"#,
-            dbs.join(", ")
-        )
-    };
-
-    // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on all databases
-    // (see https://www.postgresql.org/docs/current/ddl-priv.html)
-    let query = format!(
-        r#"
-            DO $$
-                DECLARE
-                    r text;
-                    {}
-                    {}
-                BEGIN
-                    IF NOT EXISTS (
-                        SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
-                    THEN
-                        CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
-                        IF array_length(roles, 1) IS NOT NULL THEN
-                            EXECUTE format('GRANT neon_superuser TO %s',
-                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
-                            FOREACH r IN ARRAY roles LOOP
-                                EXECUTE format('ALTER ROLE %s CREATEROLE CREATEDB', quote_ident(r));
-                            END LOOP;
-                        END IF;
-                        IF array_length(dbs, 1) IS NOT NULL THEN
-                            EXECUTE format('GRANT ALL PRIVILEGES ON DATABASE %s TO neon_superuser',
-                                           array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(dbs) as x), ', '));
-                        END IF;
-                    END IF;
-                END
-            $$;"#,
-        roles_decl, database_decl,
-    );
-
-    query
+struct StartVmMonitorResult {
+    #[cfg(target_os = "linux")]
+    token: tokio_util::sync::CancellationToken,
+    #[cfg(target_os = "linux")]
+    vm_monitor: Option<tokio::task::JoinHandle<Result<()>>>,
 }
 
 impl ComputeNode {
+    /// Create a compute node from static params, an optional CLI spec and the ctl config.
+    pub fn new(
+        params: ComputeNodeParams,
+        cli_spec: Option<ComputeSpec>,
+        compute_ctl_config: ComputeCtlConfig,
+    ) -> Result<Self> {
+        let raw_connstr = params.connstr.as_str();
+        let conn_conf = postgres::config::Config::from_str(raw_connstr)
+            .context("cannot build postgres config from connstr")?;
+        let tokio_conn_conf = tokio_postgres::config::Config::from_str(raw_connstr)
+            .context("cannot build tokio postgres config from connstr")?;
+
+        let mut initial_state = ComputeState::new();
+        initial_state.pspec = cli_spec
+            .map(|spec| ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg)))
+            .transpose()?;
+        initial_state.compute_ctl_config = compute_ctl_config;
+
+        Ok(ComputeNode {
+            params,
+            conn_conf,
+            tokio_conn_conf,
+            state: Mutex::new(initial_state),
+            state_changed: Condvar::new(),
+            ext_download_progress: RwLock::new(HashMap::new()),
+        })
+    }
+
+    /// Top-level control flow of compute_ctl. Returns the exit code of Postgres, if it
+    /// was started and reported one, for the process to exit with.
+    pub fn run(self) -> Result<Option<i32>> {
+        let this = Arc::new(self);
+
+        let cli_spec = this.state.lock().unwrap().pspec.clone();
+
+        // If this is a pooled VM, prewarm before starting HTTP server and becoming
+        // available for binding. Prewarming helps Postgres start quicker later,
+        // because QEMU will already have its memory allocated from the host, and
+        // the necessary binaries will already be cached.
+        if cli_spec.is_none() {
+            this.prewarm_postgres()?;
+        }
+
+        // Launch the external HTTP server first, so that we can serve control plane
+        // requests while configuration is still in progress.
+        crate::http::server::Server::External {
+            port: this.params.external_http_port,
+            jwks: this.state.lock().unwrap().compute_ctl_config.jwks.clone(),
+            compute_id: this.params.compute_id.clone(),
+        }
+        .launch(&this);
+
+        // The internal HTTP server could be launched later, but there isn't much
+        // sense in waiting.
+        crate::http::server::Server::Internal {
+            port: this.params.internal_http_port,
+        }
+        .launch(&this);
+
+        // If we got a spec from the CLI already, use that. Otherwise wait for the
+        // control plane to pass it to us with a /configure HTTP request
+        let pspec = if let Some(cli_spec) = cli_spec {
+            cli_spec
+        } else {
+            this.wait_spec()?
+        };
+
+        launch_lsn_lease_bg_task_for_static(&this);
+
+        // We have a spec, start the compute
+        let mut delay_exit = false;
+        let mut vm_monitor = None;
+        let mut pg_process: Option<PostgresHandle> = None;
+
+        match this.start_compute(&mut pg_process) {
+            Ok(()) => {
+                // Success! Launch remaining services (just vm-monitor currently)
+                vm_monitor =
+                    Some(this.start_vm_monitor(pspec.spec.disable_lfc_resizing.unwrap_or(false)));
+            }
+            Err(err) => {
+                // Something went wrong with the startup. Log it and expose the error to
+                // HTTP status requests.
+                error!("could not start the compute node: {:#}", err);
+                this.set_failed_status(err);
+                delay_exit = true;
+
+                // If the error happened after starting PostgreSQL, kill it
+                if let Some(ref pg_process) = pg_process {
+                    kill(pg_process.pid(), Signal::SIGQUIT).ok();
+                }
+            }
+        }
+
+        // If startup was successful, or it failed in the late stages,
+        // PostgreSQL is now running. Wait until it exits.
+        let exit_code = if let Some(pg_handle) = pg_process {
+            let exit_status = this.wait_postgres(pg_handle);
+            info!("Postgres exited with code {}, shutting down", exit_status);
+            exit_status.code()
+        } else {
+            None
+        };
+
+        // Terminate the vm_monitor so it releases the file watcher on
+        // /sys/fs/cgroup/neon-postgres.
+        // Note: the vm-monitor only runs on linux because it requires cgroups.
+        if let Some(vm_monitor) = vm_monitor {
+            cfg_if::cfg_if! {
+                if #[cfg(target_os = "linux")] {
+                    // Kills all threads spawned by the monitor
+                    vm_monitor.token.cancel();
+                    if let Some(handle) = vm_monitor.vm_monitor {
+                        // Kills the actual task running the monitor
+                        handle.abort();
+                    }
+                } else {
+                    _ = vm_monitor; // appease unused lint on macOS
+                }
+            }
+        }
+
+        // Post-exit housekeeping: sync safekeepers, finalize status, check for core dumps
+        delay_exit |= this.cleanup_after_postgres_exit()?;
+
+        // If launch failed, keep serving HTTP requests for a while, so the cloud
+        // control plane can get the actual error.
+        if delay_exit {
+            info!("giving control plane 30s to collect the error before shutdown");
+            std::thread::sleep(Duration::from_secs(30));
+        }
+        Ok(exit_code)
+    }
+
+    /// Block until the status becomes `ConfigurationPending`, then return a copy of the spec.
+    pub fn wait_spec(&self) -> Result<ParsedSpec> {
+        info!("no compute spec provided, waiting");
+        let mut guard = self
+            .state_changed
+            .wait_while(self.state.lock().unwrap(), |st| {
+                st.status != ComputeStatus::ConfigurationPending
+            })
+            .unwrap();
+
+        info!("got spec, continue configuration");
+        let pspec = guard.pspec.clone().unwrap();
+
+        // Remember how long we were idle waiting for the spec.
+        let woke_at = Utc::now();
+        let waited = woke_at.signed_duration_since(guard.start_time);
+        guard.metrics.wait_for_spec_ms = waited.to_std().unwrap().as_millis() as u64;
+
+        // Restart the startup clock from this moment, so that the total startup time
+        // computed later excludes the time spent waiting for the spec.
+        guard.start_time = woke_at;
+
+        Ok(pspec)
+    }
+
+    /// Start compute.
+    ///
+    /// Prerequisites:
+    /// - the compute spec has been placed in self.state.pspec
+    ///
+    /// On success:
+    /// - status is set to ComputeStatus::Running
+    /// - `*pg_handle` is set to the running Postgres process
+    ///
+    /// On error:
+    /// - status is left in ComputeStatus::Init. The caller is responsible for setting it to Failed
+    /// - if Postgres was started before the fatal error happened, `*pg_handle` is
+    ///   set. The caller is responsible for killing it.
+    ///
+    /// Note that this is in the critical path of a compute cold start. Keep this fast.
+    /// Try to do things concurrently, to hide the latencies.
+    fn start_compute(self: &Arc<Self>, pg_handle: &mut Option<PostgresHandle>) -> Result<()> {
+        let compute_state: ComputeState;
+
+        let start_compute_span;
+        let _this_entered;
+        {
+            let mut state_guard = self.state.lock().unwrap();
+
+            // Create a tracing span for the startup operation.
+            //
+            // We could otherwise just annotate the function with #[instrument], but if
+            // we're being configured from a /configure HTTP request, we want the
+            // startup to be considered part of the /configure request.
+            //
+            // Similarly, if a trace ID was passed in env variables, attach it to the span.
+            start_compute_span = {
+                // Temporarily enter the parent span, so that the new span becomes its child.
+                if let Some(p) = state_guard.startup_span.take() {
+                    let _parent_entered = p.entered();
+                    tracing::info_span!("start_compute")
+                } else if let Some(otel_context) = startup_context_from_env() {
+                    use tracing_opentelemetry::OpenTelemetrySpanExt;
+                    let span = tracing::info_span!("start_compute");
+                    span.set_parent(otel_context);
+                    span
+                } else {
+                    tracing::info_span!("start_compute")
+                }
+            };
+            _this_entered = start_compute_span.enter();
+
+            state_guard.set_status(ComputeStatus::Init, &self.state_changed);
+            compute_state = state_guard.clone()
+        }
+
+        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
+        info!(
+            "starting compute for project {}, operation {}, tenant {}, timeline {}, features {:?}, spec.remote_extensions {:?}",
+            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
+            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
+            pspec.tenant_id,
+            pspec.timeline_id,
+            pspec.spec.features,
+            pspec.spec.remote_extensions,
+        );
+
+        ////// PRE-STARTUP PHASE: things that need to be finished before we start the Postgres process
+
+        // Collect all the tasks that must finish here
+        let mut pre_tasks = tokio::task::JoinSet::new();
+
+        // If there are any remote extensions in shared_preload_libraries, start downloading them
+        if pspec.spec.remote_extensions.is_some() {
+            let (this, spec) = (self.clone(), pspec.spec.clone());
+            // Capture the span here, while `start_compute` is entered: `in_current_span()`
+            // evaluated inside the spawned task would run where no span has been entered.
+            pre_tasks.spawn(
+                async move { this.download_preload_extensions(&spec).await }.in_current_span(),
+            );
+        }
+
+        // Prepare pgdata directory. This downloads the basebackup, among other things.
+        {
+            let (this, cs) = (self.clone(), compute_state.clone());
+            pre_tasks.spawn_blocking_child(move || this.prepare_pgdata(&cs));
+        }
+
+        // Resize swap to the desired size if the compute spec says so
+        if let (Some(size_bytes), true) =
+            (pspec.spec.swap_size_bytes, self.params.resize_swap_on_bind)
+        {
+            pre_tasks.spawn_blocking_child(move || {
+                // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
+                // *before* starting postgres.
+                //
+                // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
+                // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
+                // OOM-killed during startup because swap wasn't available yet.
+                resize_swap(size_bytes).context("failed to resize swap")?;
+                let size_mib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%size_bytes, %size_mib, "resized swap");
+
+                Ok::<(), anyhow::Error>(())
+            });
+        }
+
+        // Set disk quota if the compute spec says so
+        if let (Some(disk_quota_bytes), Some(disk_quota_fs_mountpoint)) = (
+            pspec.spec.disk_quota_bytes,
+            self.params.set_disk_quota_for_fs.as_ref(),
+        ) {
+            let disk_quota_fs_mountpoint = disk_quota_fs_mountpoint.clone();
+            pre_tasks.spawn_blocking_child(move || {
+                set_disk_quota(disk_quota_bytes, &disk_quota_fs_mountpoint)
+                    .context("failed to set disk quota")?;
+                let size_mib = disk_quota_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
+                info!(%disk_quota_bytes, %size_mib, "set disk quota");
+
+                Ok::<(), anyhow::Error>(())
+            });
+        }
+
+        // tune pgbouncer
+        if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
+            info!("tuning pgbouncer");
+
+            // Spawn a background task to do the tuning,
+            // so that we don't block the main thread that starts Postgres.
+            let pgbouncer_settings = pgbouncer_settings.clone();
+            let _handle = tokio::spawn(async move {
+                let res = tune_pgbouncer(pgbouncer_settings).await;
+                if let Err(err) = res {
+                    error!("error while tuning pgbouncer: {err:?}");
+                    // Continue with the startup anyway
+                }
+            });
+        }
+
+        // configure local_proxy
+        if let Some(local_proxy) = &pspec.spec.local_proxy_config {
+            info!("configuring local_proxy");
+
+            // Spawn a background task to do the configuration,
+            // so that we don't block the main thread that starts Postgres.
+            let local_proxy = local_proxy.clone();
+            let _handle = tokio::spawn(async move {
+                if let Err(err) = local_proxy::configure(&local_proxy) {
+                    error!("error while configuring local_proxy: {err:?}");
+                    // Continue with the startup anyway
+                }
+            });
+        }
+
+        // Configure and start rsyslog if necessary
+        if let ComputeAudit::Hipaa = pspec.spec.audit_log_level {
+            let remote_endpoint = std::env::var("AUDIT_LOGGING_ENDPOINT").unwrap_or_default();
+            if remote_endpoint.is_empty() {
+                anyhow::bail!("AUDIT_LOGGING_ENDPOINT is empty");
+            }
+
+            let log_directory_path = Path::new(&self.params.pgdata).join("log");
+            // TODO: make this more robust
+            // now rsyslog starts once and there is no monitoring or restart if it fails
+            configure_audit_rsyslog(
+                log_directory_path.to_str().unwrap(),
+                "hipaa",
+                &remote_endpoint,
+            )?;
+        }
+
+        // Launch remaining service threads
+        let _monitor_handle = launch_monitor(self);
+        let _configurator_handle = launch_configurator(self);
+
+        // Wait for all the pre-tasks to finish before starting postgres
+        let rt = tokio::runtime::Handle::current();
+        while let Some(res) = rt.block_on(pre_tasks.join_next()) {
+            res??;
+        }
+
+        ////// START POSTGRES
+        let start_time = Utc::now();
+        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
+        let postmaster_pid = pg_process.pid();
+        *pg_handle = Some(pg_process);
+
+        // If this is a primary endpoint, perform some post-startup configuration before
+        // opening it up for the world.
+        let config_time = Utc::now();
+        if pspec.spec.mode == ComputeMode::Primary {
+            self.configure_as_primary(&compute_state)?;
+
+            let conf = self.get_conn_conf(None);
+            tokio::task::spawn_blocking(|| {
+                let res = get_installed_extensions(conf);
+                match res {
+                    Ok(extensions) => {
+                        info!(
+                            "[NEON_EXT_STAT] {}",
+                            serde_json::to_string(&extensions)
+                                .expect("failed to serialize extensions list")
+                        );
+                    }
+                    Err(err) => error!("could not get installed extensions: {err:?}"),
+                }
+            });
+        }
+
+        // All done!
+        let startup_end_time = Utc::now();
+        let metrics = {
+            let mut state = self.state.lock().unwrap();
+            state.metrics.start_postgres_ms = config_time
+                .signed_duration_since(start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            state.metrics.config_ms = startup_end_time
+                .signed_duration_since(config_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            state.metrics.total_startup_ms = startup_end_time
+                .signed_duration_since(compute_state.start_time)
+                .to_std()
+                .unwrap()
+                .as_millis() as u64;
+            state.metrics.clone()
+        };
+        self.set_status(ComputeStatus::Running);
+
+        // Log metrics so that we can search for slow operations in logs
+        info!(?metrics, postmaster_pid = %postmaster_pid, "compute start finished");
+
+        Ok(())
+    }
+
+    /// Create extension control files, prepare the preload libraries, and record load metrics.
+    #[instrument(skip_all)]
+    async fn download_preload_extensions(&self, spec: &ComputeSpec) -> Result<()> {
+        // Without remote extensions in the spec there is nothing to do.
+        let Some(remote_extensions) = spec.remote_extensions.as_ref() else {
+            return Ok(());
+        };
+
+        // Control files for all available extensions come first
+        extension_server::create_control_files(remote_extensions, &self.params.pgbin);
+
+        let load_started_at = Utc::now();
+        let remote_ext_metrics = self.prepare_preload_libraries(spec).await?;
+        let load_elapsed = Utc::now().signed_duration_since(load_started_at);
+        let library_load_time = load_elapsed.to_std().unwrap().as_millis() as u64;
+
+        // Publish the numbers into the shared metrics; the lock is taken only after the
+        // await above, so it is never held across a suspension point.
+        let mut state = self.state.lock().unwrap();
+        let metrics = &mut state.metrics;
+        metrics.load_ext_ms = library_load_time;
+        metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
+        metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
+        metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
+        info!(
+            "Loading shared_preload_libraries took {:?}ms",
+            library_load_time
+        );
+        info!("{:?}", remote_ext_metrics);
+
+        Ok(())
+    }
+
+    /// Start the vm-monitor if the AUTOSCALING environment variable is set. The
+    /// vm-monitor only runs on linux because it requires cgroups.
+    fn start_vm_monitor(&self, disable_lfc_resizing: bool) -> StartVmMonitorResult {
+        cfg_if::cfg_if! {
+            if #[cfg(target_os = "linux")] {
+                use std::env;
+                use tokio_util::sync::CancellationToken;
+
+                // This token is used internally by the monitor to clean up all threads
+                let token = CancellationToken::new();
+
+                // Don't pass the postgres connection string to vm-monitor if it must not resize LFC
+                let pgconnstr = if disable_lfc_resizing {
+                    None
+                } else {
+                    Some(self.params.filecache_connstr.clone())
+                };
+
+                let vm_monitor = if env::var_os("AUTOSCALING").is_some() {
+                    let vm_monitor = tokio::spawn(vm_monitor::start(
+                        Box::leak(Box::new(vm_monitor::Args {
+                            cgroup: Some(self.params.cgroup.clone()),
+                            pgconnstr,
+                            addr: self.params.vm_monitor_addr.clone(),
+                        })),
+                        token.clone(),
+                    ));
+                    Some(vm_monitor)
+                } else {
+                    None
+                };
+                StartVmMonitorResult { token, vm_monitor }
+            } else {
+                _ = disable_lfc_resizing; // appease unused lint on macOS
+                StartVmMonitorResult { }
+            }
+        }
+    }
+
+    /// Housekeeping after Postgres has exited. Returns true if the caller should delay its exit.
+    fn cleanup_after_postgres_exit(&self) -> Result<bool> {
+        // On a primary, maybe sync safekeepers again, to speed up next startup
+        let snapshot = self.state.lock().unwrap().clone();
+        let parsed = snapshot.pspec.as_ref().expect("spec must be set");
+        if parsed.spec.mode == ComputeMode::Primary {
+            info!("syncing safekeepers on shutdown");
+            let lsn = self.sync_safekeepers(parsed.storage_auth_token.clone())?;
+            info!("synced safekeepers at lsn {lsn}");
+        }
+
+        // If we were asked to terminate gracefully, mark it done and don't exit, to avoid restart
+        let delay_exit = {
+            let mut guard = self.state.lock().unwrap();
+            let graceful = guard.status == ComputeStatus::TerminationPending;
+            if graceful {
+                guard.status = ComputeStatus::Terminated;
+                self.state_changed.notify_all();
+            }
+            graceful
+        };
+        if let Err(err) = self.check_for_core_dumps() {
+            error!("error while checking for core dumps: {err:?}");
+        }
+
+        Ok(delay_exit)
+    }
+
     /// Check that compute node has corresponding feature enabled.
     pub fn has_feature(&self, feature: ComputeFeature) -> bool {
         let state = self.state.lock().unwrap();
@@ -354,9 +828,10 @@ impl ComputeNode {
     fn create_pgdata(&self) -> Result<()> {
         // Ignore removal error, likely it is a 'No such file or directory (os error 2)'.
         // If it is something different then create_dir() will error out anyway.
-        let _ok = fs::remove_dir_all(&self.pgdata);
-        fs::create_dir(&self.pgdata)?;
-        fs::set_permissions(&self.pgdata, fs::Permissions::from_mode(0o700))?;
+        let pgdata = &self.params.pgdata;
+        let _ok = fs::remove_dir_all(pgdata);
+        fs::create_dir(pgdata)?;
+        fs::set_permissions(pgdata, fs::Permissions::from_mode(0o700))?;
 
         Ok(())
     }
@@ -421,7 +896,7 @@ impl ComputeNode {
         // sends an Error after finishing the tarball, we will not notice it.
         let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader));
         ar.set_ignore_zeros(true);
-        ar.unpack(&self.pgdata)?;
+        ar.unpack(&self.params.pgdata)?;
 
         // Report metrics
         let mut state = self.state.lock().unwrap();
@@ -566,9 +1041,9 @@ impl ComputeNode {
     pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
         let start_time = Utc::now();
 
-        let mut sync_handle = maybe_cgexec(&self.pgbin)
+        let mut sync_handle = maybe_cgexec(&self.params.pgbin)
             .args(["--sync-safekeepers"])
-            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
+            .env("PGDATA", &self.params.pgdata) // we cannot use -D in this mode
             .envs(if let Some(storage_auth_token) = &storage_auth_token {
                 vec![("NEON_AUTH_TOKEN", storage_auth_token)]
             } else {
@@ -625,14 +1100,14 @@ impl ComputeNode {
     pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> {
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
         let spec = &pspec.spec;
-        let pgdata_path = Path::new(&self.pgdata);
+        let pgdata_path = Path::new(&self.params.pgdata);
 
         // Remove/create an empty pgdata directory and put configuration there.
         self.create_pgdata()?;
         config::write_postgres_conf(
             &pgdata_path.join("postgresql.conf"),
             &pspec.spec,
-            self.internal_http_port,
+            self.params.internal_http_port,
         )?;
 
         // Syncing safekeepers is only safe with primary nodes: if a primary
@@ -732,12 +1207,15 @@ impl ComputeNode {
         info!("prewarming");
 
         // Create pgdata
-        let pgdata = &format!("{}.warmup", self.pgdata);
+        let pgdata = &format!("{}.warmup", self.params.pgdata);
         create_pgdata(pgdata)?;
 
         // Run initdb to completion
         info!("running initdb");
-        let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb");
+        let initdb_bin = Path::new(&self.params.pgbin)
+            .parent()
+            .unwrap()
+            .join("initdb");
         Command::new(initdb_bin)
             .args(["--pgdata", pgdata])
             .output()
@@ -753,7 +1231,7 @@ impl ComputeNode {
 
         // Start postgres
         info!("starting postgres");
-        let mut pg = maybe_cgexec(&self.pgbin)
+        let mut pg = maybe_cgexec(&self.params.pgbin)
             .args(["-D", pgdata])
             .spawn()
             .expect("cannot start postgres process");
@@ -780,15 +1258,12 @@ impl ComputeNode {
     ///
     /// Returns a handle to the child process and a handle to the logs thread.
     #[instrument(skip_all)]
-    pub fn start_postgres(
-        &self,
-        storage_auth_token: Option<String>,
-    ) -> Result<(std::process::Child, tokio::task::JoinHandle<Result<()>>)> {
-        let pgdata_path = Path::new(&self.pgdata);
+    pub fn start_postgres(&self, storage_auth_token: Option<String>) -> Result<PostgresHandle> {
+        let pgdata_path = Path::new(&self.params.pgdata);
 
         // Run postgres as a child process.
-        let mut pg = maybe_cgexec(&self.pgbin)
-            .args(["-D", &self.pgdata])
+        let mut pg = maybe_cgexec(&self.params.pgbin)
+            .args(["-D", &self.params.pgdata])
             .envs(if let Some(storage_auth_token) = &storage_auth_token {
                 vec![("NEON_AUTH_TOKEN", storage_auth_token)]
             } else {
@@ -805,7 +1280,29 @@ impl ComputeNode {
 
         wait_for_postgres(&mut pg, pgdata_path)?;
 
-        Ok((pg, logs_handle))
+        Ok(PostgresHandle {
+            postgres: pg,
+            log_collector: logs_handle,
+        })
+    }
+
+    /// Block until the child Postgres process exits, however long that takes. While we wait,
+    /// Ctrl+C propagates to Postgres, so it gets shut down as well. Returns its exit status.
+    fn wait_postgres(&self, mut pg_handle: PostgresHandle) -> std::process::ExitStatus {
+        info!(postmaster_pid = %pg_handle.postgres.id(), "Waiting for Postgres to exit");
+
+        let waited = pg_handle.postgres.wait();
+        let exit_status = waited.expect("failed to start waiting on Postgres process");
+        // Postgres is no longer running, so reset the stored PID to 0.
+        PG_PID.store(0, Ordering::SeqCst);
+
+        // The process is gone; let the log collecting task finish and report a failed join.
+        let joined = tokio::runtime::Handle::current().block_on(pg_handle.log_collector);
+        if let Err(e) = joined {
+            tracing::error!("log task panicked: {:?}", e);
+        }
+
+        exit_status
+    }
 
     /// Do post configuration of the already started Postgres. This function spawns a background task to
@@ -972,9 +1469,12 @@ impl ComputeNode {
     // `pg_ctl` for start / stop.
     #[instrument(skip_all)]
     fn pg_reload_conf(&self) -> Result<()> {
-        let pgctl_bin = Path::new(&self.pgbin).parent().unwrap().join("pg_ctl");
+        let pgctl_bin = Path::new(&self.params.pgbin)
+            .parent()
+            .unwrap()
+            .join("pg_ctl");
         Command::new(pgctl_bin)
-            .args(["reload", "-D", &self.pgdata])
+            .args(["reload", "-D", &self.params.pgdata])
             .output()
             .expect("cannot run pg_ctl process");
         Ok(())
@@ -1014,9 +1514,9 @@ impl ComputeNode {
         }
 
         // Write new config
-        let pgdata_path = Path::new(&self.pgdata);
+        let pgdata_path = Path::new(&self.params.pgdata);
         let postgresql_conf_path = pgdata_path.join("postgresql.conf");
-        config::write_postgres_conf(&postgresql_conf_path, &spec, self.internal_http_port)?;
+        config::write_postgres_conf(&postgresql_conf_path, &spec, self.params.internal_http_port)?;
 
         if !spec.skip_pg_catalog_updates {
             let max_concurrent_connections = spec.reconfigure_concurrency;
@@ -1027,7 +1527,8 @@ impl ComputeNode {
                 self.pg_reload_conf()?;
 
                 if spec.mode == ComputeMode::Primary {
-                    let mut conf = tokio_postgres::Config::from_str(self.connstr.as_str()).unwrap();
+                    let mut conf =
+                        tokio_postgres::Config::from_str(self.params.connstr.as_str()).unwrap();
                     conf.application_name("apply_config");
                     let conf = Arc::new(conf);
 
@@ -1053,166 +1554,37 @@ impl ComputeNode {
     }
 
     #[instrument(skip_all)]
-    pub fn start_compute(
-        &self,
-    ) -> Result<(std::process::Child, tokio::task::JoinHandle<Result<()>>)> {
-        let compute_state = self.state.lock().unwrap().clone();
+    pub fn configure_as_primary(&self, compute_state: &ComputeState) -> Result<()> {
         let pspec = compute_state.pspec.as_ref().expect("spec must be set");
-        info!(
-            "starting compute for project {}, operation {}, tenant {}, timeline {}",
-            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None"),
-            pspec.spec.operation_uuid.as_deref().unwrap_or("None"),
-            pspec.tenant_id,
-            pspec.timeline_id,
-        );
 
-        // tune pgbouncer
-        if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
-            info!("tuning pgbouncer");
-
-            // Spawn a background task to do the tuning,
-            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = pgbouncer_settings.clone();
-            let _handle = tokio::spawn(async move {
-                let res = tune_pgbouncer(pgbouncer_settings).await;
-                if let Err(err) = res {
-                    error!("error while tuning pgbouncer: {err:?}");
-                }
-            });
-        }
-
-        if let Some(local_proxy) = &pspec.spec.local_proxy_config {
-            info!("configuring local_proxy");
-
-            // Spawn a background task to do the configuration,
-            // so that we don't block the main thread that starts Postgres.
-            let local_proxy = local_proxy.clone();
-            let _handle = tokio::spawn(async move {
-                if let Err(err) = local_proxy::configure(&local_proxy) {
-                    error!("error while configuring local_proxy: {err:?}");
-                }
-            });
-        }
-
-        info!(
-            "start_compute spec.remote_extensions {:?}",
-            pspec.spec.remote_extensions
-        );
-
-        // This part is sync, because we need to download
-        // remote shared_preload_libraries before postgres start (if any)
-        if let Some(remote_extensions) = &pspec.spec.remote_extensions {
-            // First, create control files for all availale extensions
-            extension_server::create_control_files(remote_extensions, &self.pgbin);
-
-            let library_load_start_time = Utc::now();
-            let rt = tokio::runtime::Handle::current();
-            let remote_ext_metrics = rt.block_on(self.prepare_preload_libraries(&pspec.spec))?;
-
-            let library_load_time = Utc::now()
-                .signed_duration_since(library_load_start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
-            let mut state = self.state.lock().unwrap();
-            state.metrics.load_ext_ms = library_load_time;
-            state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded;
-            state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size;
-            state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size;
-            info!(
-                "Loading shared_preload_libraries took {:?}ms",
-                library_load_time
-            );
-            info!("{:?}", remote_ext_metrics);
-        }
-
-        self.prepare_pgdata(&compute_state)?;
-
-        let start_time = Utc::now();
-        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;
-
-        let config_time = Utc::now();
-        if pspec.spec.mode == ComputeMode::Primary {
-            if !pspec.spec.skip_pg_catalog_updates {
-                let pgdata_path = Path::new(&self.pgdata);
-                // temporarily reset max_cluster_size in config
-                // to avoid the possibility of hitting the limit, while we are applying config:
-                // creating new extensions, roles, etc...
-                config::with_compute_ctl_tmp_override(
-                    pgdata_path,
-                    "neon.max_cluster_size=-1",
-                    || {
-                        self.pg_reload_conf()?;
-
-                        self.apply_config(&compute_state)?;
-
-                        Ok(())
-                    },
-                )?;
-
-                let postgresql_conf_path = pgdata_path.join("postgresql.conf");
-                if config::line_in_file(
-                    &postgresql_conf_path,
-                    "neon.disable_logical_replication_subscribers=false",
-                )? {
-                    info!(
-                        "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"
-                    );
-                }
+        assert!(pspec.spec.mode == ComputeMode::Primary);
+        if !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.params.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::with_compute_ctl_tmp_override(pgdata_path, "neon.max_cluster_size=-1", || {
                 self.pg_reload_conf()?;
+
+                self.apply_config(compute_state)?;
+
+                Ok(())
+            })?;
+
+            let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+            if config::line_in_file(
+                &postgresql_conf_path,
+                "neon.disable_logical_replication_subscribers=false",
+            )? {
+                info!(
+                    "updated postgresql.conf to set neon.disable_logical_replication_subscribers=false"
+                );
             }
-            self.post_apply_config()?;
-
-            let conf = self.get_conn_conf(None);
-            tokio::task::spawn_blocking(|| {
-                let res = get_installed_extensions(conf);
-                match res {
-                    Ok(extensions) => {
-                        info!(
-                            "[NEON_EXT_STAT] {}",
-                            serde_json::to_string(&extensions)
-                                .expect("failed to serialize extensions list")
-                        );
-                    }
-                    Err(err) => error!("could not get installed extensions: {err:?}"),
-                }
-            });
+            self.pg_reload_conf()?;
         }
+        self.post_apply_config()?;
 
-        let startup_end_time = Utc::now();
-        {
-            let mut state = self.state.lock().unwrap();
-            state.metrics.start_postgres_ms = config_time
-                .signed_duration_since(start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
-            state.metrics.config_ms = startup_end_time
-                .signed_duration_since(config_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
-            state.metrics.total_startup_ms = startup_end_time
-                .signed_duration_since(compute_state.start_time)
-                .to_std()
-                .unwrap()
-                .as_millis() as u64;
-        }
-        self.set_status(ComputeStatus::Running);
-
-        info!(
-            "finished configuration of compute for project {}",
-            pspec.spec.cluster.cluster_id.as_deref().unwrap_or("None")
-        );
-
-        // Log metrics so that we can search for slow operations in logs
-        let metrics = {
-            let state = self.state.lock().unwrap();
-            state.metrics.clone()
-        };
-        info!(?metrics, "compute start finished");
-
-        Ok(pg_process)
+        Ok(())
     }
 
     /// Update the `last_active` in the shared state, but ensure that it's a more recent one.
@@ -1241,7 +1613,7 @@ impl ComputeNode {
     pub fn check_for_core_dumps(&self) -> Result<()> {
         let core_dump_dir = match std::env::consts::OS {
             "macos" => Path::new("/cores/"),
-            _ => Path::new(&self.pgdata),
+            _ => Path::new(&self.params.pgdata),
         };
 
         // Collect core dump paths if any
@@ -1271,7 +1643,7 @@ impl ComputeNode {
 
             // Try first with gdb
             let backtrace = Command::new("gdb")
-                .args(["--batch", "-q", "-ex", "bt", &self.pgbin])
+                .args(["--batch", "-q", "-ex", "bt", &self.params.pgbin])
                 .arg(&core_path)
                 .output();
 
@@ -1348,7 +1720,8 @@ LIMIT 100",
         ext_path: RemotePath,
     ) -> Result<u64, DownloadError> {
         let ext_remote_storage =
-            self.ext_remote_storage
+            self.params
+                .ext_remote_storage
                 .as_ref()
                 .ok_or(DownloadError::BadInput(anyhow::anyhow!(
                     "Remote extensions storage is not configured",
@@ -1411,7 +1784,7 @@ LIMIT 100",
             &real_ext_name,
             &ext_path,
             ext_remote_storage,
-            &self.pgbin,
+            &self.params.pgbin,
         )
         .await
         .map_err(DownloadError::Other);
@@ -1519,7 +1892,7 @@ LIMIT 100",
         &self,
         spec: &ComputeSpec,
     ) -> Result<RemoteExtensionMetrics> {
-        if self.ext_remote_storage.is_none() {
+        if self.params.ext_remote_storage.is_none() {
             return Ok(RemoteExtensionMetrics {
                 num_ext_downloaded: 0,
                 largest_ext_size: 0,
@@ -1570,8 +1943,12 @@ LIMIT 100",
 
         let mut download_tasks = Vec::new();
         for library in &libs_vec {
-            let (ext_name, ext_path) =
-                remote_extensions.get_ext(library, true, &self.build_tag, &self.pgversion)?;
+            let (ext_name, ext_path) = remote_extensions.get_ext(
+                library,
+                true,
+                &self.params.build_tag,
+                &self.params.pgversion,
+            )?;
             download_tasks.push(self.download_extension(ext_name, ext_path));
         }
         let results = join_all(download_tasks).await;
@@ -1648,3 +2025,26 @@ pub fn forward_termination_signal() {
         kill(pg_pid, Signal::SIGINT).ok();
     }
 }
+
+/// Helper trait to call `JoinSet::spawn_blocking(f)` while propagating the current
+/// tracing span to the thread that runs `f`.
+trait JoinSetExt<T> {
+    fn spawn_blocking_child<F>(&mut self, f: F) -> tokio::task::AbortHandle
+    where
+        F: FnOnce() -> T + Send + 'static,
+        T: Send;
+}
+
+impl<T: 'static> JoinSetExt<T> for tokio::task::JoinSet<T> {
+    fn spawn_blocking_child<F>(&mut self, f: F) -> tokio::task::AbortHandle
+    where
+        F: FnOnce() -> T + Send + 'static,
+        T: Send,
+    {
+        // Capture the caller's span here; the closure runs later on another thread, so the
+        // span has to be moved into it explicitly.
+        let parent = tracing::Span::current();
+        // Run `f` with that span entered for the whole duration of the call.
+        self.spawn_blocking(move || parent.in_scope(f))
+    }
+}
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index e8056ec7eb..0760568ff8 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -1,12 +1,16 @@
+use anyhow::Result;
+use std::fmt::Write as FmtWrite;
 use std::fs::{File, OpenOptions};
 use std::io;
+use std::io::Write;
 use std::io::prelude::*;
 use std::path::Path;
 
-use anyhow::Result;
-use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
+use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
 
-use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize, escape_conf_value};
+use crate::pg_helpers::{
+    GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
+};
 
 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -55,10 +59,20 @@ pub fn write_postgres_conf(
         writeln!(file, "neon.stripe_size={stripe_size}")?;
     }
     if !spec.safekeeper_connstrings.is_empty() {
+        let mut neon_safekeepers_value = String::new();
+        tracing::info!(
+            "safekeepers_connstrings is not zero, gen: {:?}",
+            spec.safekeepers_generation
+        );
+        // If generation is given, prepend sk list with g#number:
+        if let Some(generation) = spec.safekeepers_generation {
+            write!(neon_safekeepers_value, "g#{}:", generation)?;
+        }
+        neon_safekeepers_value.push_str(&spec.safekeeper_connstrings.join(","));
         writeln!(
             file,
             "neon.safekeepers={}",
-            escape_conf_value(&spec.safekeeper_connstrings.join(","))
+            escape_conf_value(&neon_safekeepers_value)
         )?;
     }
     if let Some(s) = &spec.tenant_id {
@@ -126,6 +140,54 @@ pub fn write_postgres_conf(
         writeln!(file, "# Managed by compute_ctl: end")?;
     }
 
+    // If audit logging is enabled, configure pgaudit.
+    //
+    // Note, that this is called after the settings from spec are written.
+    // This way we always override the settings from the spec
+    // and don't allow the user or the control plane admin to change them.
+    if let ComputeAudit::Hipaa = spec.audit_log_level {
+        writeln!(file, "# Managed by compute_ctl audit settings: begin")?;
+        // This log level is very verbose
+        // but this is necessary for HIPAA compliance.
+        writeln!(file, "pgaudit.log='all'")?;
+        writeln!(file, "pgaudit.log_parameter=on")?;
+        // Disable logging of catalog queries
+        // The catalog doesn't contain sensitive data, so we don't need to audit it.
+        writeln!(file, "pgaudit.log_catalog=off")?;
+        // Set log rotation to 5 minutes
+        // TODO: tune this after performance testing
+        writeln!(file, "pgaudit.log_rotation_age=5")?;
+
+        // Add audit shared_preload_libraries, if they are not present.
+        //
+        // The caller who sets the flag is responsible for ensuring that the necessary
+        // shared_preload_libraries are present in the compute image,
+        // otherwise the compute start will fail.
+        if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") {
+            let mut extra_shared_preload_libraries = String::new();
+            if !libs.contains("pgaudit") {
+                extra_shared_preload_libraries.push_str(",pgaudit");
+            }
+            if !libs.contains("pgauditlogtofile") {
+                extra_shared_preload_libraries.push_str(",pgauditlogtofile");
+            }
+            writeln!(
+                file,
+                "shared_preload_libraries='{}{}'",
+                libs, extra_shared_preload_libraries
+            )?;
+        } else {
+            // Typically, this should be unreachable,
+            // because we always set at least some shared_preload_libraries in the spec
+            // but let's handle it explicitly anyway.
+            writeln!(
+                file,
+                "shared_preload_libraries='neon,pgaudit,pgauditlogtofile'"
+            )?;
+        }
+        writeln!(file, "# Managed by compute_ctl audit settings: end")?;
+    }
+
     writeln!(file, "neon.extension_server_port={}", extension_server_port)?;
 
     if spec.drop_subscriptions_before_start {
diff --git a/compute_tools/src/config_template/compute_audit_rsyslog_template.conf b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf
new file mode 100644
index 0000000000..bef3c36446
--- /dev/null
+++ b/compute_tools/src/config_template/compute_audit_rsyslog_template.conf
@@ -0,0 +1,10 @@
+# Load imfile module to read log files
+module(load="imfile")
+
+# Input configuration for log files in the specified directory
+# Replace {log_directory} with the directory containing the log files
+input(type="imfile" File="{log_directory}/*.log" Tag="{tag}" Severity="info" Facility="local0")
+global(workDirectory="/var/log")
+
+# Forward logs to remote syslog server
+*.* @@{remote_endpoint}
\ No newline at end of file
diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs
index 77e98359ab..b4de786b00 100644
--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -253,27 +253,31 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
     }
 }
 
-// Do request to extension storage proxy, i.e.
+// Do request to extension storage proxy, e.g.,
 // curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
-// using HHTP GET
-// and return the response body as bytes
-//
+// using HTTP GET and return the response body as bytes.
 async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
     let uri = format!("{}/{}", ext_remote_storage, ext_path);
+    let filename = Path::new(ext_path)
+        .file_name()
+        .unwrap_or_else(|| std::ffi::OsStr::new("unknown"))
+        .to_str()
+        .unwrap_or("unknown")
+        .to_string();
 
-    info!("Download extension {} from uri {}", ext_path, uri);
+    info!("Downloading extension file '{}' from uri {}", filename, uri);
 
     match do_extension_server_request(&uri).await {
         Ok(resp) => {
             info!("Successfully downloaded remote extension data {}", ext_path);
             REMOTE_EXT_REQUESTS_TOTAL
-                .with_label_values(&[&StatusCode::OK.to_string()])
+                .with_label_values(&[&StatusCode::OK.to_string(), &filename])
                 .inc();
             Ok(resp)
         }
         Err((msg, status)) => {
             REMOTE_EXT_REQUESTS_TOTAL
-                .with_label_values(&[&status])
+                .with_label_values(&[&status, &filename])
                 .inc();
             bail!(msg);
         }
diff --git a/compute_tools/src/http/extract/mod.rs b/compute_tools/src/http/extract/mod.rs
index 1b690e444d..589681cfe2 100644
--- a/compute_tools/src/http/extract/mod.rs
+++ b/compute_tools/src/http/extract/mod.rs
@@ -1,7 +1,9 @@
 pub(crate) mod json;
 pub(crate) mod path;
 pub(crate) mod query;
+pub(crate) mod request_id;
 
 pub(crate) use json::Json;
 pub(crate) use path::Path;
 pub(crate) use query::Query;
+pub(crate) use request_id::RequestId;
diff --git a/compute_tools/src/http/extract/request_id.rs b/compute_tools/src/http/extract/request_id.rs
new file mode 100644
index 0000000000..d911921a05
--- /dev/null
+++ b/compute_tools/src/http/extract/request_id.rs
@@ -0,0 +1,86 @@
+use std::{
+    fmt::Display,
+    ops::{Deref, DerefMut},
+};
+
+use axum::{extract::FromRequestParts, response::IntoResponse};
+use http::{StatusCode, request::Parts};
+
+use crate::http::{JsonResponse, headers::X_REQUEST_ID};
+
+/// Extract the request ID from the `X-Request-Id` header.
+#[derive(Debug, Clone, Default)]
+pub(crate) struct RequestId(pub String);
+
+/// Rejection used for [`RequestId`].
+///
+/// Contains one variant for each way the [`RequestId`] extractor can
+/// fail.
+#[derive(Debug)]
+pub(crate) enum RequestIdRejection {
+    /// The request has no `X-Request-Id` header.
+    MissingRequestId,
+
+    /// The header is present, but its value could not be read as a string.
+    InvalidUtf8,
+}
+
+impl RequestIdRejection {
+    /// HTTP status code to respond with for this rejection.
+    pub fn status(&self) -> StatusCode {
+        match self {
+            Self::MissingRequestId => StatusCode::INTERNAL_SERVER_ERROR,
+            Self::InvalidUtf8 => StatusCode::BAD_REQUEST,
+        }
+    }
+    /// Error message to respond with for this rejection.
+    pub fn message(&self) -> String {
+        String::from(match self {
+            Self::MissingRequestId => "request ID is missing",
+            Self::InvalidUtf8 => "request ID is invalid UTF-8",
+        })
+    }
+}
+
+impl IntoResponse for RequestIdRejection {
+    fn into_response(self) -> axum::response::Response {
+        JsonResponse::error(self.status(), self.message())
+    }
+}
+
+impl<S> FromRequestParts<S> for RequestId
+where
+    S: Send + Sync,
+{
+    type Rejection = RequestIdRejection;
+
+    async fn from_request_parts(parts: &mut Parts, _state: &S) -> Result<Self, Self::Rejection> {
+        // A missing header and an unreadable header value map to distinct rejections.
+        let raw = parts
+            .headers
+            .get(X_REQUEST_ID)
+            .ok_or(RequestIdRejection::MissingRequestId)?;
+        let id = raw.to_str().map_err(|_| RequestIdRejection::InvalidUtf8)?;
+        Ok(Self(id.to_string()))
+    }
+}
+
+impl Deref for RequestId {
+    type Target = String;
+
+    fn deref(&self) -> &Self::Target {
+        &self.0
+    }
+}
+
+impl DerefMut for RequestId {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self.0
+    }
+}
+
+impl Display for RequestId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(&self.0)
+    }
+}
diff --git a/compute_tools/src/http/headers.rs b/compute_tools/src/http/headers.rs
new file mode 100644
index 0000000000..a11638e203
--- /dev/null
+++ b/compute_tools/src/http/headers.rs
@@ -0,0 +1,2 @@
+/// Constant for `X-Request-Id` header.
+pub const X_REQUEST_ID: &str = "x-request-id";
diff --git a/compute_tools/src/http/middleware/authorize.rs b/compute_tools/src/http/middleware/authorize.rs
new file mode 100644
index 0000000000..798dd1179b
--- /dev/null
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -0,0 +1,145 @@
+use std::{collections::HashSet, net::SocketAddr};
+
+use anyhow::{Result, anyhow};
+use axum::{RequestExt, body::Body, extract::ConnectInfo};
+use axum_extra::{
+    TypedHeader,
+    headers::{Authorization, authorization::Bearer},
+};
+use futures::future::BoxFuture;
+use http::{Request, Response, StatusCode};
+use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
+use serde::Deserialize;
+use tower_http::auth::AsyncAuthorizeRequest;
+use tracing::warn;
+
+use crate::http::{JsonResponse, extract::RequestId};
+
+#[derive(Clone, Debug, Deserialize)]
+pub(in crate::http) struct Claims {
+    compute_id: String,
+}
+
+#[derive(Clone, Debug)]
+pub(in crate::http) struct Authorize {
+    compute_id: String,
+    jwks: JwkSet,
+    validation: Validation,
+}
+
+impl Authorize {
+    pub fn new(compute_id: String, jwks: JwkSet) -> Self {
+        let mut validation = Validation::new(Algorithm::EdDSA);
+        // Nothing is currently required
+        validation.required_spec_claims = HashSet::new();
+        validation.validate_exp = true;
+        // Unused by the control plane
+        validation.validate_aud = false;
+        // Unused by the control plane
+        validation.validate_nbf = false;
+
+        Self {
+            compute_id,
+            jwks,
+            validation,
+        }
+    }
+}
+
+impl AsyncAuthorizeRequest<Body> for Authorize {
+    type RequestBody = Body;
+    type ResponseBody = Body;
+    type Future = BoxFuture<'static, Result<Request<Body>, Response<Self::ResponseBody>>>;
+
+    /// Let `request` through (storing verified claims in its extensions) or reject it with an
+    /// error response. No checks are made when no keys are configured or for loopback peers.
+    fn authorize(&mut self, mut request: Request<Body>) -> Self::Future {
+        let compute_id = self.compute_id.clone();
+        let jwks = self.jwks.clone();
+        let validation = self.validation.clone();
+
+        Box::pin(async move {
+            // Respond with the rejection rather than panicking on a missing or malformed header.
+            let request_id = request
+                .extract_parts::<RequestId>()
+                .await
+                .map_err(axum::response::IntoResponse::into_response)?;
+
+            // TODO: Remove this check after a successful rollout
+            if jwks.keys.is_empty() {
+                warn!(%request_id, "Authorization has not been configured");
+                return Ok(request);
+            }
+
+            let connect_info = request
+                .extract_parts::<ConnectInfo<SocketAddr>>()
+                .await
+                .unwrap();
+
+            // Requests coming from the loopback interface are always allowed
+            if connect_info.ip().is_loopback() {
+                warn!(%request_id, "Bypassed authorization because request is coming from the loopback interface");
+                return Ok(request);
+            }
+
+            let TypedHeader(Authorization(bearer)) = request
+                .extract_parts::<TypedHeader<Authorization<Bearer>>>()
+                .await
+                .map_err(|_| {
+                    JsonResponse::error(StatusCode::BAD_REQUEST, "invalid authorization token")
+                })?;
+
+            let data = Self::verify(&jwks, bearer.token(), &validation)
+                .map_err(|e| JsonResponse::error(StatusCode::UNAUTHORIZED, e))?;
+
+            if data.claims.compute_id != compute_id {
+                return Err(JsonResponse::error(
+                    StatusCode::UNAUTHORIZED,
+                    "invalid claims in authorization token",
+                ));
+            }
+
+            // Make claims available to any subsequent middleware or request handlers
+            request.extensions_mut().insert(data.claims);
+
+            Ok(request)
+        })
+    }
+}
+
+impl Authorize {
+    /// Verify the token using the JSON Web Key set and return the token data.
+    ///
+    /// Every key in `jwks` is tried in order and the first successful decode wins; a key
+    /// that cannot be used, or that fails to decode the token, is logged and skipped.
+    /// Returns an error when no key verifies the token.
+    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
+        debug_assert!(!jwks.keys.is_empty());
+
+        for jwk in jwks.keys.iter() {
+            // `kid` is optional in a JWK; it is only used for logging, so a key without
+            // one must not make verification panic.
+            let key_id = jwk.common.key_id.as_deref().unwrap_or("<missing kid>");
+
+            let decoding_key = match DecodingKey::from_jwk(jwk) {
+                Ok(key) => key,
+                Err(e) => {
+                    warn!("Failed to construct decoding key from {}: {}", key_id, e);
+
+                    continue;
+                }
+            };
+
+            match jsonwebtoken::decode::<Claims>(token, &decoding_key, validation) {
+                Ok(data) => return Ok(data),
+                Err(e) => {
+                    warn!("Failed to decode authorization token using {}: {}", key_id, e);
+
+                    continue;
+                }
+            }
+        }
+
+        Err(anyhow!("Failed to verify authorization token"))
+    }
+}
diff --git a/compute_tools/src/http/middleware/mod.rs b/compute_tools/src/http/middleware/mod.rs
new file mode 100644
index 0000000000..caeeeedfe5
--- /dev/null
+++ b/compute_tools/src/http/middleware/mod.rs
@@ -0,0 +1 @@
+pub(in crate::http) mod authorize;
diff --git a/compute_tools/src/http/mod.rs b/compute_tools/src/http/mod.rs
index d182278174..9ecc1b0093 100644
--- a/compute_tools/src/http/mod.rs
+++ b/compute_tools/src/http/mod.rs
@@ -7,6 +7,8 @@ use serde::Serialize;
 use tracing::error;
 
 mod extract;
+mod headers;
+mod middleware;
 mod routes;
 pub mod server;
 
diff --git a/compute_tools/src/http/routes/configure.rs b/compute_tools/src/http/routes/configure.rs
index 63d428fff4..3c5a6a6d41 100644
--- a/compute_tools/src/http/routes/configure.rs
+++ b/compute_tools/src/http/routes/configure.rs
@@ -22,7 +22,7 @@ pub(in crate::http) async fn configure(
     State(compute): State<Arc<ComputeNode>>,
     request: Json<ConfigurationRequest>,
 ) -> Response {
-    if !compute.live_config_allowed {
+    if !compute.params.live_config_allowed {
         return JsonResponse::error(
             StatusCode::PRECONDITION_FAILED,
             "live configuration is not allowed for this compute node".to_string(),
diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs
index b0265d1e99..563b73ae65 100644
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -18,11 +18,11 @@ pub(in crate::http) struct ExtensionServerParams {
 /// Download a remote extension.
 pub(in crate::http) async fn download_extension(
     Path(filename): Path<String>,
-    params: Query<ExtensionServerParams>,
+    ext_server_params: Query<ExtensionServerParams>,
     State(compute): State<Arc<ComputeNode>>,
 ) -> Response {
     // Don't even try to download extensions if no remote storage is configured
-    if compute.ext_remote_storage.is_none() {
+    if compute.params.ext_remote_storage.is_none() {
         return JsonResponse::error(
             StatusCode::PRECONDITION_FAILED,
             "remote storage is not configured",
@@ -46,9 +46,9 @@ pub(in crate::http) async fn download_extension(
 
         remote_extensions.get_ext(
             &filename,
-            params.is_library,
-            &compute.build_tag,
-            &compute.pgversion,
+            ext_server_params.is_library,
+            &compute.params.build_tag,
+            &compute.params.pgversion,
         )
     };
 
diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs
index 7283401bb5..126fa86d1c 100644
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -10,48 +10,58 @@ use axum::middleware::{self, Next};
 use axum::response::{IntoResponse, Response};
 use axum::routing::{get, post};
 use http::StatusCode;
+use jsonwebtoken::jwk::JwkSet;
 use tokio::net::TcpListener;
 use tower::ServiceBuilder;
-use tower_http::request_id::PropagateRequestIdLayer;
-use tower_http::trace::TraceLayer;
-use tracing::{Span, debug, error, info};
+use tower_http::{
+    auth::AsyncRequireAuthorizationLayer, request_id::PropagateRequestIdLayer, trace::TraceLayer,
+};
+use tracing::{Span, error, info};
 use uuid::Uuid;
 
-use super::routes::{
-    check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-    grants, insights, metrics, metrics_json, status, terminate,
+use super::{
+    headers::X_REQUEST_ID,
+    middleware::authorize::Authorize,
+    routes::{
+        check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
+        grants, insights, metrics, metrics_json, status, terminate,
+    },
 };
 use crate::compute::ComputeNode;
 
-const X_REQUEST_ID: &str = "x-request-id";
-
 /// `compute_ctl` has two servers: internal and external. The internal server
 /// binds to the loopback interface and handles communication from clients on
 /// the compute. The external server is what receives communication from the
 /// control plane, the metrics scraper, etc. We make the distinction because
 /// certain routes in `compute_ctl` only need to be exposed to local processes
 /// like Postgres via the neon extension and local_proxy.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Debug)]
 pub enum Server {
-    Internal(u16),
-    External(u16),
+    Internal {
+        port: u16,
+    },
+    External {
+        port: u16,
+        jwks: JwkSet,
+        compute_id: String,
+    },
 }
 
 impl Display for Server {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         match self {
-            Server::Internal(_) => f.write_str("internal"),
-            Server::External(_) => f.write_str("external"),
+            Server::Internal { .. } => f.write_str("internal"),
+            Server::External { .. } => f.write_str("external"),
         }
     }
 }
 
-impl From<Server> for Router<Arc<ComputeNode>> {
-    fn from(server: Server) -> Self {
+impl From<&Server> for Router<Arc<ComputeNode>> {
+    fn from(server: &Server) -> Self {
         let mut router = Router::<Arc<ComputeNode>>::new();
 
         router = match server {
-            Server::Internal(_) => {
+            Server::Internal { .. } => {
                 router = router
                     .route(
                         "/extension_server/{*filename}",
@@ -69,59 +79,71 @@ impl From<Server> for Router<Arc<ComputeNode>> {
 
                 router
             }
-            Server::External(_) => router
-                .route("/check_writability", post(check_writability::is_writable))
-                .route("/configure", post(configure::configure))
-                .route("/database_schema", get(database_schema::get_schema_dump))
-                .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
-                .route("/insights", get(insights::get_insights))
-                .route("/metrics", get(metrics::get_metrics))
-                .route("/metrics.json", get(metrics_json::get_metrics))
-                .route("/status", get(status::get_status))
-                .route("/terminate", post(terminate::terminate)),
+            Server::External {
+                jwks, compute_id, ..
+            } => {
+                let unauthenticated_router =
+                    Router::<Arc<ComputeNode>>::new().route("/metrics", get(metrics::get_metrics));
+
+                let authenticated_router = Router::<Arc<ComputeNode>>::new()
+                    .route("/check_writability", post(check_writability::is_writable))
+                    .route("/configure", post(configure::configure))
+                    .route("/database_schema", get(database_schema::get_schema_dump))
+                    .route("/dbs_and_roles", get(dbs_and_roles::get_catalog_objects))
+                    .route("/insights", get(insights::get_insights))
+                    .route("/metrics.json", get(metrics_json::get_metrics))
+                    .route("/status", get(status::get_status))
+                    .route("/terminate", post(terminate::terminate))
+                    .layer(AsyncRequireAuthorizationLayer::new(Authorize::new(
+                        compute_id.clone(),
+                        jwks.clone(),
+                    )));
+
+                router
+                    .merge(unauthenticated_router)
+                    .merge(authenticated_router)
+            }
         };
 
-        router.fallback(Server::handle_404).method_not_allowed_fallback(Server::handle_405).layer(
-            ServiceBuilder::new()
-                // Add this middleware since we assume the request ID exists
-                .layer(middleware::from_fn(maybe_add_request_id_header))
-                .layer(
-                    TraceLayer::new_for_http()
-                        .on_request(|request: &http::Request<_>, _span: &Span| {
-                            let request_id = request
-                                .headers()
-                                .get(X_REQUEST_ID)
-                                .unwrap()
-                                .to_str()
-                                .unwrap();
-
-                            match request.uri().path() {
-                                "/metrics" => {
-                                    debug!(%request_id, "{} {}", request.method(), request.uri())
-                                }
-                                _ => info!(%request_id, "{} {}", request.method(), request.uri()),
-                            };
-                        })
-                        .on_response(
-                            |response: &http::Response<_>, latency: Duration, _span: &Span| {
-                                let request_id = response
+        router
+            .fallback(Server::handle_404)
+            .method_not_allowed_fallback(Server::handle_405)
+            .layer(
+                ServiceBuilder::new()
+                    .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
+                    // Add this middleware since we assume the request ID exists
+                    .layer(middleware::from_fn(maybe_add_request_id_header))
+                    .layer(
+                        TraceLayer::new_for_http()
+                            .on_request(|request: &http::Request<_>, _span: &Span| {
+                                let request_id = request
                                     .headers()
                                     .get(X_REQUEST_ID)
                                     .unwrap()
                                     .to_str()
                                     .unwrap();
 
-                                info!(
-                                    %request_id,
-                                    code = response.status().as_u16(),
-                                    latency = latency.as_millis()
-                                )
-                            },
-                        ),
-                )
-                .layer(PropagateRequestIdLayer::x_request_id()),
-        )
-            .layer(tower_otel::trace::HttpLayer::server(tracing::Level::INFO))
+                                info!(%request_id, "{} {}", request.method(), request.uri());
+                            })
+                            .on_response(
+                                |response: &http::Response<_>, latency: Duration, _span: &Span| {
+                                    let request_id = response
+                                        .headers()
+                                        .get(X_REQUEST_ID)
+                                        .unwrap()
+                                        .to_str()
+                                        .unwrap();
+
+                                    info!(
+                                        %request_id,
+                                        code = response.status().as_u16(),
+                                        latency = latency.as_millis()
+                                    );
+                                },
+                            ),
+                    )
+                    .layer(PropagateRequestIdLayer::x_request_id()),
+            )
     }
 }
 
@@ -145,15 +167,15 @@ impl Server {
         match self {
             // TODO: Change this to Ipv6Addr::LOCALHOST when the GitHub runners
             // allow binding to localhost
-            Server::Internal(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
-            Server::External(_) => IpAddr::from(Ipv6Addr::UNSPECIFIED),
+            Server::Internal { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
+            Server::External { .. } => IpAddr::from(Ipv6Addr::UNSPECIFIED),
         }
     }
 
-    fn port(self) -> u16 {
+    fn port(&self) -> u16 {
         match self {
-            Server::Internal(port) => port,
-            Server::External(port) => port,
+            Server::Internal { port, .. } => *port,
+            Server::External { port, .. } => *port,
         }
     }
 
@@ -180,7 +202,9 @@ impl Server {
             );
         }
 
-        let router = Router::from(self).with_state(compute);
+        let router = Router::from(&self)
+            .with_state(compute)
+            .into_make_service_with_connect_info::<SocketAddr>();
 
         if let Err(e) = axum::serve(listener, router).await {
             error!("compute_ctl {} HTTP server error: {}", self, e);
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index b08df22134..5c78bbcd02 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -21,6 +21,7 @@ mod migration;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
+pub mod rsyslog;
 pub mod spec;
 mod spec_apply;
 pub mod swap;
diff --git a/compute_tools/src/logger.rs b/compute_tools/src/logger.rs
index 3749dfc844..a65614e94e 100644
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -1,3 +1,5 @@
+use std::collections::HashMap;
+use tracing::info;
 use tracing_subscriber::layer::SubscriberExt;
 use tracing_subscriber::prelude::*;
 
@@ -42,3 +44,50 @@ pub async fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result
 pub fn inlinify(s: &str) -> String {
     s.replace('\n', "\u{200B}")
 }
+
+/// Extract the OpenTelemetry context for the startup actions from the
+/// TRACEPARENT and TRACESTATE env variables. Returns `None` when neither
+/// variable is set. The function only builds the context; entering it (and
+/// exiting it once the startup has completed and Postgres is up and running)
+/// is left to the caller.
+///
+/// This is used to propagate the context for the 'start_compute' operation
+/// from the neon control plane. This allows linking together the wider
+/// 'start_compute' operation that creates the compute container, with the
+/// startup actions here within the container.
+///
+/// There is no standard for passing context in env variables, but a lot of
+/// tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
+/// <https://github.com/open-telemetry/opentelemetry-specification/issues/740>
+///
+/// If this pod is pre-created without binding it to any particular endpoint
+/// yet, the env variables are not the right source for the startup context.
+/// In that case, the control plane should pass the tracing context as part of
+/// the /configure API call.
+///
+/// NOTE: The context is supposed to only cover the *startup* actions. Once
+/// postgres is configured and up-and-running, the startup span is exited. Any
+/// other actions that are performed on incoming HTTP requests, for example,
+/// are performed in separate spans.
+///
+/// XXX: If the pod is restarted, we perform the startup actions in the same
+/// context as the original startup actions, which probably doesn't make
+/// sense.
+pub fn startup_context_from_env() -> Option<opentelemetry::Context> {
+    use opentelemetry::propagation::TextMapPropagator;
+    use opentelemetry_sdk::propagation::TraceContextPropagator;
+
+    let carrier: HashMap<String, String> = [
+        ("TRACEPARENT", "traceparent"),
+        ("TRACESTATE", "tracestate"),
+    ]
+    .into_iter()
+    .filter_map(|(var, key)| Some((key.to_string(), std::env::var(var).ok()?)))
+    .collect();
+
+    if carrier.is_empty() {
+        return None;
+    }
+    info!("got startup tracing context from env variables");
+    Some(TraceContextPropagator::new().extract(&carrier))
+}
diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index bc96e5074c..dab32d5dc1 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -54,9 +54,7 @@ pub(crate) static REMOTE_EXT_REQUESTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(||
     register_int_counter_vec!(
         "compute_ctl_remote_ext_requests_total",
         "Total number of requests made by compute_ctl to download extensions from S3 proxy by status",
-        // Do not use any labels like extension name yet.
-        // We can add them later if needed.
-        &["http_status"]
+        &["http_status", "filename"]
     )
     .expect("failed to define a metric")
 });
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 248505e473..83318538cd 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -18,7 +18,7 @@ const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);
 // should be handled gracefully.
 fn watch_compute_activity(compute: &ComputeNode) {
     // Suppose that `connstr` doesn't change
-    let connstr = compute.connstr.clone();
+    let connstr = compute.params.connstr.clone();
     let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
 
     // During startup and configuration we connect to every Postgres database,
diff --git a/compute_tools/src/pg_helpers.rs b/compute_tools/src/pg_helpers.rs
index 5a2e305e1d..dd8d8e9b8b 100644
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -186,15 +186,40 @@ impl DatabaseExt for Database {
 /// Postgres SQL queries and DATABASE_URL.
 pub trait Escaping {
     fn pg_quote(&self) -> String;
+    fn pg_quote_dollar(&self) -> (String, String);
 }
 
 impl Escaping for PgIdent {
     /// This is intended to mimic Postgres quote_ident(), but for simplicity it
     /// always quotes provided string with `""` and escapes every `"`.
     /// **Not idempotent**, i.e. if string is already escaped it will be escaped again.
+    /// N.B. it's not useful for escaping identifiers that are used inside WHERE
+    /// clause, use `escape_literal()` instead.
     fn pg_quote(&self) -> String {
-        let result = format!("\"{}\"", self.replace('"', "\"\""));
-        result
+        format!("\"{}\"", self.replace('"', "\"\""))
+    }
+
+    /// Dollar-quote the string for usage inside PL/pgSQL procedures. Returns
+    /// the quoted string plus a tag for the outer PL/pgSQL block (discard it
+    /// if unneeded). A tag is skipped when its delimiter occurs in the string
+    /// or is completed by an adjacent `$` (`name$` in `$$` would end early).
+    /// Here we somewhat mimic the logic of Postgres' `pg_get_functiondef()`,
+    /// <https://github.com/postgres/postgres/blob/8b49392b270b4ac0b9f5c210e2a503546841e832/src/backend/utils/adt/ruleutils.c#L2924>
+    fn pg_quote_dollar(&self) -> (String, String) {
+        let mut tag = String::new();
+        let clashes = |t: &str| {
+            let outer = format!("{t}x");
+            self.contains(&format!("${t}$"))
+                || self.ends_with(&format!("${t}"))
+                || self.contains(&format!("${outer}$"))
+                || self.starts_with(&format!("{outer}$"))
+                || self.ends_with(&format!("${outer}"))
+        };
+        // Postgres' max role/DB name length is 63 bytes, so this is short.
+        while clashes(&tag) {
+            tag += "x";
+        }
+        (format!("${tag}${self}${tag}$"), tag + "x")
+    }
 }
 
@@ -226,10 +251,13 @@ pub async fn get_existing_dbs_async(
     // invalid state. See:
     //   https://github.com/postgres/postgres/commit/a4b4cc1d60f7e8ccfcc8ff8cb80c28ee411ad9a9
     let rowstream = client
+        // We use a subquery instead of a fancy `datdba::regrole::text AS owner`,
+        // because the latter automatically wraps the result in double quotes,
+        // if the role name contains special characters.
         .query_raw::<str, &String, &[String; 0]>(
             "SELECT
                 datname AS name,
-                datdba::regrole::text AS owner,
+                (SELECT rolname FROM pg_roles WHERE oid = datdba) AS owner,
                 NOT datallowconn AS restrict_conn,
                 datconnlimit = - 2 AS invalid
             FROM
diff --git a/compute_tools/src/rsyslog.rs b/compute_tools/src/rsyslog.rs
new file mode 100644
index 0000000000..c8fba4fdcd
--- /dev/null
+++ b/compute_tools/src/rsyslog.rs
@@ -0,0 +1,77 @@
+use std::process::Command;
+use std::{fs::OpenOptions, io::Write};
+
+use anyhow::{Context, Result};
+use tracing::info;
+
+/// Returns the trimmed stdout of `pgrep rsyslogd`, or `None` if it printed
+/// nothing. Panics if `pgrep` cannot be run or prints invalid UTF-8.
+fn get_rsyslog_pid() -> Option<String> {
+    let pgrep = Command::new("pgrep")
+        .arg("rsyslogd")
+        .output()
+        .expect("Failed to execute pgrep");
+
+    if pgrep.stdout.is_empty() {
+        return None;
+    }
+
+    // Only surrounding whitespace is stripped; the rest is returned verbatim.
+    let stdout = std::str::from_utf8(&pgrep.stdout).expect("Invalid UTF-8 in process output");
+    Some(stdout.trim().to_string())
+}
+
+// Restart rsyslogd to apply the new configuration.
+// This is necessary because there is no other way to reload the rsyslog
+// configuration.
+//
+// Rsyslogd shouldn't lose any messages because of the restart: it tracks
+// the last read position in the log files and will continue reading from
+// that position.
+// TODO: test it properly
+fn restart_rsyslog() -> Result<()> {
+    let running_pid = get_rsyslog_pid().context("rsyslogd is not running")?;
+    info!("rsyslogd is running with pid: {}, restart it", running_pid);
+
+    // Only kill the process here; bringing rsyslogd back up is presumably
+    // done by whatever supervises it -- TODO confirm.
+    Command::new("pkill")
+        .arg("rsyslogd")
+        .output()
+        .map(drop)
+        .context("Failed to stop rsyslogd")
+}
+
+/// Render the audit rsyslog config, write it to disk and restart rsyslogd.
+pub fn configure_audit_rsyslog(
+    log_directory: &str,
+    tag: &str,
+    remote_endpoint: &str,
+) -> Result<()> {
+    const RSYSLOG_CONF_PATH: &str = "/etc/rsyslog.d/compute_audit_rsyslog.conf";
+
+    // Fill the named placeholders of the bundled template.
+    let rendered: String = format!(
+        include_str!("config_template/compute_audit_rsyslog_template.conf"),
+        log_directory = log_directory,
+        tag = tag,
+        remote_endpoint = remote_endpoint
+    );
+    info!("rsyslog config_content: {}", rendered);
+
+    // Create the config file, or overwrite it if it already exists.
+    OpenOptions::new()
+        .create(true)
+        .write(true)
+        .truncate(true)
+        .open(RSYSLOG_CONF_PATH)?
+        .write_all(rendered.as_bytes())?;
+
+    info!(
+        "rsyslog configuration file {} added successfully. Starting rsyslogd",
+        RSYSLOG_CONF_PATH
+    );
+
+    // start the service, using the configuration
+    restart_rsyslog()
+}
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index f9a37c5c98..e5f7aebbf8 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -6,21 +6,22 @@ use std::sync::Arc;
 
 use anyhow::{Context, Result};
 use compute_api::responses::ComputeStatus;
-use compute_api::spec::{ComputeFeature, ComputeSpec, Database, PgIdent, Role};
+use compute_api::spec::{ComputeAudit, ComputeFeature, ComputeSpec, Database, PgIdent, Role};
 use futures::future::join_all;
 use tokio::sync::RwLock;
 use tokio_postgres::Client;
 use tokio_postgres::error::SqlState;
 use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 
-use crate::compute::{ComputeNode, ComputeState, construct_superuser_query};
+use crate::compute::{ComputeNode, ComputeState};
 use crate::pg_helpers::{
-    DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, escape_literal, get_existing_dbs_async,
+    DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
     get_existing_roles_async,
 };
 use crate::spec_apply::ApplySpecPhase::{
-    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateSchemaNeon,
-    CreateSuperUser, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
+    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
+    CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
+    DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
     HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
     RunInEachDatabase,
 };
@@ -187,7 +188,7 @@ impl ComputeNode {
             }
 
             for phase in [
-                CreateSuperUser,
+                CreateNeonSuperuser,
                 DropInvalidDatabases,
                 RenameRoles,
                 CreateAndAlterRoles,
@@ -277,6 +278,19 @@ impl ComputeNode {
                 phases.push(FinalizeDropLogicalSubscriptions);
             }
 
+            // Keep DisablePostgresDBPgAudit phase at the end,
+            // so that all config operations are audit logged.
+            match spec.audit_log_level
+            {
+                ComputeAudit::Hipaa => {
+                    phases.push(CreatePgauditExtension);
+                    phases.push(CreatePgauditlogtofileExtension);
+                    phases.push(DisablePostgresDBPgAudit);
+                }
+                ComputeAudit::Log => { /* not implemented yet */ }
+                ComputeAudit::Disabled => {}
+            }
+
             for phase in phases {
                 debug!("Applying phase {:?}", &phase);
                 apply_operations(
@@ -455,7 +469,7 @@ pub enum PerDatabasePhase {
 
 #[derive(Clone, Debug)]
 pub enum ApplySpecPhase {
-    CreateSuperUser,
+    CreateNeonSuperuser,
     DropInvalidDatabases,
     RenameRoles,
     CreateAndAlterRoles,
@@ -463,6 +477,9 @@ pub enum ApplySpecPhase {
     CreateAndAlterDatabases,
     CreateSchemaNeon,
     RunInEachDatabase { db: DB, subphase: PerDatabasePhase },
+    CreatePgauditExtension,
+    CreatePgauditlogtofileExtension,
+    DisablePostgresDBPgAudit,
     HandleOtherExtensions,
     HandleNeonExtension,
     CreateAvailabilityCheck,
@@ -579,14 +596,10 @@ async fn get_operations<'a>(
     apply_spec_phase: &'a ApplySpecPhase,
 ) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
     match apply_spec_phase {
-        ApplySpecPhase::CreateSuperUser => {
-            let query = construct_superuser_query(spec);
-
-            Ok(Box::new(once(Operation {
-                query,
-                comment: None,
-            })))
-        }
+        ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
+            query: include_str!("sql/create_neon_superuser.sql").to_string(),
+            comment: None,
+        }))),
         ApplySpecPhase::DropInvalidDatabases => {
             let mut ctx = ctx.write().await;
             let databases = &mut ctx.dbs;
@@ -720,14 +733,15 @@ async fn get_operations<'a>(
                         // We do not check whether the DB exists or not,
                         // Postgres will take care of it for us
                         "delete_db" => {
+                            let (db_name, outer_tag) = op.name.pg_quote_dollar();
                             // In Postgres we can't drop a database if it is a template.
                             // So we need to unset the template flag first, but it could
                             // be a retry, so we could've already dropped the database.
                             // Check that database exists first to make it idempotent.
                             let unset_template_query: String = format!(
                                 include_str!("sql/unset_template_for_drop_dbs.sql"),
-                                datname_str = escape_literal(&op.name),
-                                datname = &op.name.pg_quote()
+                                datname = db_name,
+                                outer_tag = outer_tag,
                             );
 
                             // Use FORCE to drop database even if there are active connections.
@@ -834,6 +848,8 @@ async fn get_operations<'a>(
                                 comment: None,
                             },
                             Operation {
+                                // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
+                                // (see https://www.postgresql.org/docs/current/ddl-priv.html)
                                 query: format!(
                                     "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
                                     db.name.pg_quote()
@@ -893,9 +909,11 @@ async fn get_operations<'a>(
                 PerDatabasePhase::DropLogicalSubscriptions => {
                     match &db {
                         DB::UserDB(db) => {
+                            let (db_name, outer_tag) = db.name.pg_quote_dollar();
                             let drop_subscription_query: String = format!(
                                 include_str!("sql/drop_subscriptions.sql"),
-                                datname_str = escape_literal(&db.name),
+                                datname_str = db_name,
+                                outer_tag = outer_tag,
                             );
 
                             let operations = vec![Operation {
@@ -934,6 +952,7 @@ async fn get_operations<'a>(
                                     DB::SystemDB => PgIdent::from("cloud_admin").pg_quote(),
                                     DB::UserDB(db) => db.owner.pg_quote(),
                                 };
+                                let (escaped_role, outer_tag) = op.name.pg_quote_dollar();
 
                                 Some(vec![
                                     // This will reassign all dependent objects to the db owner
@@ -948,7 +967,9 @@ async fn get_operations<'a>(
                                     Operation {
                                         query: format!(
                                             include_str!("sql/pre_drop_role_revoke_privileges.sql"),
-                                            role_name = quoted,
+                                            // N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
+                                            role_name = escaped_role,
+                                            outer_tag = outer_tag,
                                         ),
                                         comment: None,
                                     },
@@ -973,12 +994,14 @@ async fn get_operations<'a>(
                         DB::SystemDB => return Ok(Box::new(empty())),
                         DB::UserDB(db) => db,
                     };
+                    let (db_owner, outer_tag) = db.owner.pg_quote_dollar();
 
                     let operations = vec![
                         Operation {
                             query: format!(
                                 include_str!("sql/set_public_schema_owner.sql"),
-                                db_owner = db.owner.pg_quote()
+                                db_owner = db_owner,
+                                outer_tag = outer_tag,
                             ),
                             comment: None,
                         },
@@ -1098,6 +1121,25 @@ async fn get_operations<'a>(
             }
             Ok(Box::new(empty()))
         }
+        ApplySpecPhase::CreatePgauditExtension => Ok(Box::new(once(Operation {
+            query: String::from("CREATE EXTENSION IF NOT EXISTS pgaudit"),
+            comment: Some(String::from("create pgaudit extensions")),
+        }))),
+        ApplySpecPhase::CreatePgauditlogtofileExtension => Ok(Box::new(once(Operation {
+            query: String::from("CREATE EXTENSION IF NOT EXISTS pgauditlogtofile"),
+            comment: Some(String::from("create pgauditlogtofile extensions")),
+        }))),
+        // Disable pgaudit logging for postgres database.
+        // Postgres is neon system database used by monitors
+        // and compute_ctl tuning functions and thus generates a lot of noise.
+        // We do not consider data stored in this database as sensitive.
+        ApplySpecPhase::DisablePostgresDBPgAudit => {
+            let query = "ALTER DATABASE postgres SET pgaudit.log to 'none'";
+            Ok(Box::new(once(Operation {
+                query: query.to_string(),
+                comment: Some(query.to_string()),
+            })))
+        }
         ApplySpecPhase::HandleNeonExtension => {
             let operations = vec![
                 Operation {
diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql
new file mode 100644
index 0000000000..300645627b
--- /dev/null
+++ b/compute_tools/src/sql/create_neon_superuser.sql
@@ -0,0 +1,8 @@
+DO $$
+    BEGIN
+        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
+        THEN
+            CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
+        END IF;
+    END
+$$;
diff --git a/compute_tools/src/sql/drop_subscriptions.sql b/compute_tools/src/sql/drop_subscriptions.sql
index 03e8e158fa..f5d9420130 100644
--- a/compute_tools/src/sql/drop_subscriptions.sql
+++ b/compute_tools/src/sql/drop_subscriptions.sql
@@ -1,4 +1,4 @@
-DO $$
+DO ${outer_tag}$
 DECLARE
     subname TEXT;
 BEGIN
@@ -9,4 +9,4 @@ BEGIN
         EXECUTE format('DROP SUBSCRIPTION %I;', subname);
     END LOOP;
 END;
-$$;
+${outer_tag}$;
diff --git a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
index cdaa7071d3..4342650591 100644
--- a/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
+++ b/compute_tools/src/sql/pre_drop_role_revoke_privileges.sql
@@ -1,6 +1,6 @@
 SET SESSION ROLE neon_superuser;
 
-DO $$
+DO ${outer_tag}$
 DECLARE
     schema TEXT;
     revoke_query TEXT;
@@ -16,13 +16,15 @@ BEGIN
         WHERE schema_name IN ('public')
     LOOP
         revoke_query := format(
-            'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM {role_name} GRANTED BY neon_superuser;',
-            schema
+            'REVOKE ALL PRIVILEGES ON ALL TABLES IN SCHEMA %I FROM %I GRANTED BY neon_superuser;',
+            schema,
+            -- N.B. this has to be properly dollar-escaped with `pg_quote_dollar()`
+            {role_name}
         );
 
         EXECUTE revoke_query;
     END LOOP;
 END;
-$$;
+${outer_tag}$;
 
 RESET ROLE;
diff --git a/compute_tools/src/sql/set_public_schema_owner.sql b/compute_tools/src/sql/set_public_schema_owner.sql
index fd061a713e..dc502c6d2d 100644
--- a/compute_tools/src/sql/set_public_schema_owner.sql
+++ b/compute_tools/src/sql/set_public_schema_owner.sql
@@ -1,5 +1,4 @@
-DO
-$$
+DO ${outer_tag}$
     DECLARE
         schema_owner TEXT;
     BEGIN
@@ -16,8 +15,8 @@ $$
 
             IF schema_owner = 'cloud_admin' OR schema_owner = 'zenith_admin'
             THEN
-                ALTER SCHEMA public OWNER TO {db_owner};
+                EXECUTE format('ALTER SCHEMA public OWNER TO %I', {db_owner});
             END IF;
         END IF;
     END
-$$;
\ No newline at end of file
+${outer_tag}$;
\ No newline at end of file
diff --git a/compute_tools/src/sql/unset_template_for_drop_dbs.sql b/compute_tools/src/sql/unset_template_for_drop_dbs.sql
index 6c4343a589..36dc648beb 100644
--- a/compute_tools/src/sql/unset_template_for_drop_dbs.sql
+++ b/compute_tools/src/sql/unset_template_for_drop_dbs.sql
@@ -1,12 +1,12 @@
-DO $$
+DO ${outer_tag}$
     BEGIN
         IF EXISTS(
             SELECT 1
             FROM pg_catalog.pg_database
-            WHERE datname = {datname_str}
+            WHERE datname = {datname}
         )
         THEN
-            ALTER DATABASE {datname} is_template false;
+            EXECUTE format('ALTER DATABASE %I is_template false', {datname});
         END IF;
     END
-$$;
\ No newline at end of file
+${outer_tag}$;
diff --git a/compute_tools/tests/pg_helpers_tests.rs b/compute_tools/tests/pg_helpers_tests.rs
index 4961bc293d..f2d74ff384 100644
--- a/compute_tools/tests/pg_helpers_tests.rs
+++ b/compute_tools/tests/pg_helpers_tests.rs
@@ -61,6 +61,23 @@ test.escaping = 'here''s a backslash \\ and a quote '' and a double-quote " hoor
         assert_eq!(ident.pg_quote(), "\"\"\"name\"\";\\n select 1;\"");
     }
 
+    #[test]
+    fn ident_pg_quote_dollar() {
+        let test_cases = vec![
+            ("name", ("$$name$$", "x")),
+            ("name$$", ("$x$name$$$x$", "xx")),
+            ("name$$$", ("$x$name$$$$x$", "xx")),
+            ("name$$$$", ("$x$name$$$$$x$", "xx")),
+            ("name$x$", ("$xx$name$x$$xx$", "xxx")),
+        ];
+
+        for (input, expected) in test_cases {
+            let (escaped, tag) = PgIdent::from(input).pg_quote_dollar();
+            assert_eq!(escaped, expected.0);
+            assert_eq!(tag, expected.1);
+        }
+    }
+
     #[test]
     fn generic_options_search() {
         let generic_options: GenericOptions = Some(vec![
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index f258025428..375b5d87d0 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -40,6 +40,7 @@ use pageserver_api::models::{ShardParameters, TimelineCreateRequest, TimelineInf
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
+use safekeeper_api::membership::SafekeeperGeneration;
 use safekeeper_api::{
     DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
     DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -596,7 +597,15 @@ struct EndpointStartCmdArgs {
     #[clap(long = "pageserver-id")]
     endpoint_pageserver_id: Option<NodeId>,
 
-    #[clap(long)]
+    #[clap(
+        long,
+        help = "Safekeepers membership generation to prefix neon.safekeepers with. Normally neon_local sets it on its own, but this option allows to override. Non zero value forces endpoint to use membership configurations."
+    )]
+    safekeepers_generation: Option<u32>,
+    #[clap(
+        long,
+        help = "List of safekeepers endpoint will talk to. Normally neon_local chooses them on its own, but this option allows to override."
+    )]
     safekeepers: Option<String>,
 
     #[clap(
@@ -617,9 +626,9 @@ struct EndpointStartCmdArgs {
     )]
     allow_multiple: bool,
 
-    #[clap(short = 't', long, help = "timeout until we fail the command")]
-    #[arg(default_value = "10s")]
-    start_timeout: humantime::Duration,
+    #[clap(short = 't', long, value_parser= humantime::parse_duration, help = "timeout until we fail the command")]
+    #[arg(default_value = "90s")]
+    start_timeout: Duration,
 }
 
 #[derive(clap::Args)]
@@ -1350,6 +1359,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
             let pageserver_id = args.endpoint_pageserver_id;
             let remote_ext_config = &args.remote_ext_config;
 
+            let safekeepers_generation = args.safekeepers_generation.map(SafekeeperGeneration::new);
             // If --safekeepers argument is given, use only the listed
             // safekeeper nodes; otherwise all from the env.
             let safekeepers = if let Some(safekeepers) = parse_safekeepers(&args.safekeepers)? {
@@ -1425,11 +1435,13 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
             endpoint
                 .start(
                     &auth_token,
+                    safekeepers_generation,
                     safekeepers,
                     pageservers,
                     remote_ext_config.as_ref(),
                     stripe_size.0 as usize,
                     args.create_test_user,
+                    args.start_timeout,
                 )
                 .await?;
         }
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 50ccca36fe..b46d616827 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -42,17 +42,19 @@ use std::path::PathBuf;
 use std::process::Command;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, SystemTime, UNIX_EPOCH};
+use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
 
 use anyhow::{Context, Result, anyhow, bail};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeCtlConfig, ComputeStatus, ComputeStatusResponse};
 use compute_api::spec::{
-    Cluster, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent, RemoteExtSpec, Role,
+    Cluster, ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, Database, PgIdent,
+    RemoteExtSpec, Role,
 };
 use nix::sys::signal::{Signal, kill};
 use pageserver_api::shard::ShardStripeSize;
 use reqwest::header::CONTENT_TYPE;
+use safekeeper_api::membership::SafekeeperGeneration;
 use serde::{Deserialize, Serialize};
 use tracing::debug;
 use url::Host;
@@ -576,14 +578,17 @@ impl Endpoint {
         Ok(safekeeper_connstrings)
     }
 
+    #[allow(clippy::too_many_arguments)]
     pub async fn start(
         &self,
         auth_token: &Option<String>,
+        safekeepers_generation: Option<SafekeeperGeneration>,
         safekeepers: Vec<NodeId>,
         pageservers: Vec<(Host, u16)>,
         remote_ext_config: Option<&String>,
         shard_stripe_size: usize,
         create_test_user: bool,
+        start_timeout: Duration,
     ) -> Result<()> {
         if self.status() == EndpointStatus::Running {
             anyhow::bail!("The endpoint is already running");
@@ -655,6 +660,7 @@ impl Endpoint {
             timeline_id: Some(self.timeline_id),
             mode: self.mode,
             pageserver_connstring: Some(pageserver_connstring),
+            safekeepers_generation: safekeepers_generation.map(|g| g.into_inner()),
             safekeeper_connstrings,
             storage_auth_token: auth_token.clone(),
             remote_extensions,
@@ -663,6 +669,7 @@ impl Endpoint {
             local_proxy_config: None,
             reconfigure_concurrency: self.reconfigure_concurrency,
             drop_subscriptions_before_start: self.drop_subscriptions_before_start,
+            audit_log_level: ComputeAudit::Disabled,
         };
 
         // this strange code is needed to support respec() in tests
@@ -770,17 +777,18 @@ impl Endpoint {
         std::fs::write(pidfile_path, pid.to_string())?;
 
         // Wait for it to start
-        let mut attempt = 0;
         const ATTEMPT_INTERVAL: Duration = Duration::from_millis(100);
-        const MAX_ATTEMPTS: u32 = 10 * 90; // Wait up to 1.5 min
+        let start_at = Instant::now();
         loop {
-            attempt += 1;
             match self.get_status().await {
                 Ok(state) => {
                     match state.status {
                         ComputeStatus::Init => {
-                            if attempt == MAX_ATTEMPTS {
-                                bail!("compute startup timed out; still in Init state");
+                            if Instant::now().duration_since(start_at) > start_timeout {
+                                bail!(
+                                    "compute startup timed out {:?}; still in Init state",
+                                    start_timeout
+                                );
                             }
                             // keep retrying
                         }
@@ -807,8 +815,11 @@ impl Endpoint {
                     }
                 }
                 Err(e) => {
-                    if attempt == MAX_ATTEMPTS {
-                        return Err(e).context("timed out waiting to connect to compute_ctl HTTP");
+                    if Instant::now().duration_since(start_at) > start_timeout {
+                        return Err(e).context(format!(
+                            "timed out {:?} waiting to connect to compute_ctl HTTP",
+                            start_timeout,
+                        ));
                     }
                 }
             }
diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh
index 6e6c41538d..51d1e40802 100755
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -6,8 +6,11 @@ generate_id() {
     local -n resvar=$1
     printf -v resvar '%08x%08x%08x%08x' $SRANDOM $SRANDOM $SRANDOM $SRANDOM
 }
-if [ -z ${OLD_COMPUTE_TAG+x} ] || [ -z ${NEW_COMPUTE_TAG+x} ] || [ -z "${OLD_COMPUTE_TAG}" ] || [ -z "${NEW_COMPUTE_TAG}" ]; then
-  echo OLD_COMPUTE_TAG and NEW_COMPUTE_TAG must be defined
+echo "${OLD_COMPUTE_TAG}"
+echo "${NEW_COMPUTE_TAG}"
+echo "${TEST_EXTENSIONS_TAG}"
+if [ -z "${OLD_COMPUTE_TAG:-}" ] || [ -z "${NEW_COMPUTE_TAG:-}" ] || [ -z "${TEST_EXTENSIONS_TAG:-}" ]; then
+  echo OLD_COMPUTE_TAG, NEW_COMPUTE_TAG and TEST_EXTENSIONS_TAG must be set
   exit 1
 fi
 export PG_VERSION=${PG_VERSION:-16}
@@ -58,7 +61,7 @@ function check_timeline() {
 # Accepts the tag for the compute node and the timeline as parameters.
 function restart_compute() {
   docker compose down compute compute_is_ready
-  COMPUTE_TAG=${1} TAG=${OLD_COMPUTE_TAG} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
+  COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
   wait_for_ready
   check_timeline ${2}
 }
@@ -82,7 +85,7 @@ EXTENSIONS='[
 {"extname": "pg_repack", "extdir": "pg_repack-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
-TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
+COMPUTE_TAG=${NEW_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d
 wait_for_ready
 docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
 docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
@@ -90,7 +93,7 @@ create_extensions "${EXTNAMES}"
 query="select json_object_agg(extname,extversion) from pg_extension where extname in ('${EXTNAMES// /\',\'}')"
 new_vers=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
 docker compose --profile test-extensions down
-TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
+COMPUTE_TAG=${OLD_COMPUTE_TAG} docker compose --profile test-extensions up --quiet-pull --build -d --force-recreate
 wait_for_ready
 docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
 docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
diff --git a/docs/rfcs/041-rel-sparse-keyspace.md b/docs/rfcs/041-rel-sparse-keyspace.md
new file mode 100644
index 0000000000..03e68bd5c1
--- /dev/null
+++ b/docs/rfcs/041-rel-sparse-keyspace.md
@@ -0,0 +1,201 @@
+# Sparse Keyspace for Relation Directories
+
+## Summary
+
+This is an RFC describing a new storage strategy for storing relation directories.
+
+## Motivation
+
+Postgres maintains a directory structure for databases and relations. In Neon, we store this information
+by serializing the directory data in a single key (see `pgdatadir_mapping.rs`).
+
+```rust
+// DbDir:
+// 00 00000000 00000000 00000000 00   00000000
+
+// RelDir:
+// 00 SPCNODE  DBNODE   00000000 00   00000001 (Postgres never uses relfilenode 0)
+```
+
+We have a dedicated structure on the ingestion path to serialize the relation directory into this single key.
+
+```rust
+#[derive(Debug, Serialize, Deserialize, Default)]
+pub(crate) struct RelDirectory {
+    // Set of relations that exist. (relfilenode, forknum)
+    //
+    // TODO: Store it as a btree or radix tree or something else that spans multiple
+    // key-value pairs, if you have a lot of relations
+    pub(crate) rels: HashSet<(Oid, u8)>,
+}
+```
+
+The current codebase has the following three access patterns for the relation directory.
+
+1. Check if a relation exists.
+2. List all relations.
+3. Create/drop a relation.
+
+For (1), we currently have to get the reldir key, deserialize it, and check whether the relation exists in the
+hash set. For (2), we get the reldir key and the hash set. For (3), we need first to get
+and deserialize the key, add the new relation record to the hash set, and then serialize it and write it back.
+
+If we have 100k relations in a database, we would have a 100k-large hash set. Then, every
+relation create or drop would have to deserialize and serialize this 100k-large hash set. This makes the
+relation create/drop process quadratic. When we check if a relation exists in the ingestion path,
+we would have to deserialize this super big 100k-large key before checking if a single relation exists.
+
+In this RFC, we will propose a new way to store the reldir data in the sparse keyspace and propose how
+to seamlessly migrate users to use the new keyspace.
+
+The PoC patch is implemented in [PR10316](https://github.com/neondatabase/neon/pull/10316).
+
+## Key Mapping
+
+We will use the recently introduced sparse keyspace to store actual data. Sparse keyspace was proposed in
+[038-aux-file-v2.md](038-aux-file-v2.md). The original reldir has one single value of `HashSet<(Oid, u8)>`
+for each of the databases (identified as `spcnode, dbnode`). We encode the `Oid` (`relnode, forknum`),
+into the key.
+
+```plain
+(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> deleted
+(REL_DIR_KEY_PREFIX, spcnode, dbnode, relnode, forknum, 1) -> exists
+```
+
+Assume all reldir data are stored in this new keyspace; the 3 reldir operations we mentioned before can be
+implemented as follows.
+
+1. Check if a relation exists: check if the key maps to "exists".
+2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix`. Extract relnode and forknum from the key.
+3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation. The delete tombstone will
+   be removed during image layer generation upon compaction.
+
+Note that "exists" and "deleted" will be encoded as a single byte as two variants of an enum.
+The mapping is implemented as `rel_tag_sparse_key` in the PoC patch.
+
+## Changes to Sparse Keyspace
+
+Previously, we only used sparse keyspaces for the aux files, which did not carry over when branching. The reldir
+information needs to be preserved from the parent branch to the child branch. Therefore, the read path needs
+to be updated accordingly to accommodate such "inherited sparse keys". This is done in
+[PR#10313](https://github.com/neondatabase/neon/pull/10313).
+
+## Coexistence of the Old and New Keyspaces
+
+Migrating to the new keyspace will be done gradually: when we flip a config item to enable the new reldir keyspace, the
+ingestion path will start to write to the new keyspace and the old reldir data will be kept in the old one. The read
+path needs to combine the data from both keyspaces.
+
+Theoretically, we could do a rewrite at the startup time that scans all relation directories and copies that data into the
+new keyspace. However, this could take a long time, especially if we have thousands of tenants doing the migration
+process simultaneously after the pageserver restarts. Therefore, we propose the coexistence strategy so that the
+migration can happen seamlessly and imposes no potential downtime for the user.
+
+With the coexistence assumption, the 3 reldir operations will be implemented as follows:
+
+1. Check if a relation exists
+   - Check the new keyspace if the key maps to any value. If it maps to "exists" or "deleted", directly
+    return it to the user.
+   - Otherwise, deserialize the old reldir key and get the result.
+2. List all relations: scan the sparse keyspace over the `rel_dir_key_prefix` and deserialize the old reldir key.
+   Combine them to obtain the final result.
+3. Create/drop a relation: write "exists" or "deleted" to the corresponding key of the relation into the new keyspace.
+   - We assume no overwrite of relations will happen (i.e., the user won't create a relation at the same Oid). This will be implemented as a runtime check.
+   - For relation creation, we add `sparse_reldir_tableX -> exists` to the keyspace.
+   - For relation drop, we first check if the relation is recorded in the old keyspace. If yes, we deserialize the old reldir key,
+    remove the relation, and then write it back. Otherwise, we put `sparse_reldir_tableX -> deleted` to the keyspace.
+   - The delete tombstone will be removed during image layer generation upon compaction.
+
+This process ensures that the transition will not introduce any downtime and all new updates are written to the new keyspace. The total
+amount of data in the storage would be `O(relations_modifications)` and we can guarantee `O(current_relations)` after compaction.
+There could be some relations that exist in the old reldir key for a long time. Refer to the "Full Migration" section on how to deal
+with them. Plus, for relation modifications, it will have `O(old_relations)` complexity until we do the full migration, which gives
+us `O(1)` complexity after fully opting in to the sparse keyspace.
+
+The process also implies that a relation will only exist either in the old reldir key or in the new sparse keyspace. It is not possible
+for a table to be recorded in the old reldir key while later having a delete tombstone for it in the sparse keyspace at any LSN.
+
+We will introduce a config item and an index_part record to record the current status of the migration process.
+
+- Config item `enable_reldir_v2`: controls whether the ingestion path writes the reldir info into the new keyspace.
+- `index_part.json` field `reldir_v2_status`: whether the timeline has written any key into the new reldir keyspace.
+
+If `enable_reldir_v2` is set to `true` and the timeline ingests the first key into the new reldir keyspace, it will update
+`index_part.json` to set `reldir_v2_status` to `Status::Migrating`. Even if `enable_reldir_v2` gets flipped back to
+`false` (i.e., when the pageserver restarts and such config isn't persisted), the read/write path will still
+read/write to the new keyspace to avoid data inconsistency. This also indicates that the migration is one-way only:
+once v2 is enabled, the user cannot go back to v1.
+
+## Next Steps
+
+### Full Migration
+
+This won't be implemented in the project's first phase but might be implemented in the future. Having both v1 and
+v2 existing in the system would force us to keep the code to deserialize the old reldir key forever. To entirely deprecate this
+code path, we must ensure the timeline has no old reldir data.
+
+We can trigger a special image layer generation process at the gc-horizon. The generated image layers will cover several keyspaces:
+the old reldir key in each of the databases, and the new reldir sparse keyspace. It will remove the old reldir key while
+copying them into the corresponding keys in the sparse keyspace in the resulting image. This special process happens in
+the background during compaction. For example, assume this special process is triggered at LSN 0/180. The `create_image_layers`
+process discovers the following keys at this LSN.
+
+```plain
+db1/reldir_key -> (table 1, table 2, table 3)
+...db1 rel keys
+db2/reldir_key -> (table 4, table 5, table 6)
+...db2 rel keys
+sparse_reldir_db2_table7 -> exists
+sparse_reldir_db1_table8 -> deleted
+```
+
+It will generate the following keys:
+
+```plain
+db1/reldir_key -> () # we have to keep the key because it is part of `collect_keyspace`.
+...db1 rel keys
+db2/reldir_key -> ()
+...db2 rel keys
+
+-- start image layer for the sparse keyspace at sparse_reldir_prefix at LSN 0/180
+sparse_reldir_db1_table1 -> exists
+sparse_reldir_db1_table2 -> exists
+sparse_reldir_db1_table3 -> exists
+sparse_reldir_db2_table4 -> exists
+sparse_reldir_db2_table5 -> exists
+sparse_reldir_db2_table6 -> exists
+sparse_reldir_db2_table7 -> exists
+-- end image layer for the sparse keyspace at sparse_reldir_prefix+1
+
+# The `sparse_reldir_db1_table8` key gets dropped as part of the image layer generation code for the sparse keyspace.
+# Note that the read path will stop reading if a key is not found in the image layer covering the key range so there
+# are no correctness issues.
+```
+
+We must verify that no pending modifications to the old reldir exist in the delta/image layers above the gc-horizon before
+we start this process (we can do a vectored read to get the full key history of the old reldir key and ensure there are no more images
+above the gc-horizon). Otherwise, it will violate the property that "a relation will only exist either in the old reldir key or
+in the new sparse keyspace". After we run this migration process, we can mark `reldir_v2_status` in the `index_part.json` to
+`Status::Migrated`, and the read path won't need to read from the old reldir anymore. Once the status is set to `Migrated`, we
+don't need to add the key into `collect_keyspace` and therefore all of them will be removed from all future image layers.
+
+The migration process can be proactively triggered across all attached/detached tenants to help us fully remove the old reldir code.
+
+### Consolidate Relation Size Keys
+
+We have relsize at the end of all relation nodes.
+
+```plain
+// RelSize:
+// 00 SPCNODE  DBNODE   RELNODE  FORK FFFFFFFF
+```
+
+This means that computing logical size requires us to do several single-key gets across the keyspace,
+potentially requiring downloading many layer files. We could consolidate them into a single
+keyspace, improving logical size calculation performance.
+
+### Migrate DBDir Keys
+
+We assume the number of databases created by the users will be small, and therefore, the current way
+of storing the database directory would be acceptable. In the future, we could also migrate DBDir keys into
+the sparse keyspace to support a large number of databases.
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 35c580bd37..3300fbf7dd 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -134,8 +134,10 @@ pub struct CatalogObjects {
     pub databases: Vec<Database>,
 }
 
-#[derive(Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Serialize)]
 pub struct ComputeCtlConfig {
+    /// Set of JSON web keys that the compute can use to authenticate
+    /// communication from the control plane.
     pub jwks: JwkSet,
 }
 
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index d02bfd6814..77f2e1e631 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -101,6 +101,17 @@ pub struct ComputeSpec {
     pub timeline_id: Option<TimelineId>,
     pub pageserver_connstring: Option<String>,
 
+    /// Safekeeper membership config generation. It is put in
+    /// neon.safekeepers GUC and serves two purposes:
+    /// 1) Non zero value forces walproposer to use membership configurations.
+    /// 2) If walproposer wants to update list of safekeepers to connect to
+    ///    taking them from some safekeeper mconf, it should check what value
+    ///    is newer by comparing the generation.
+    ///
+    /// Note: it could be SafekeeperGeneration, but this needs linking
+    /// compute_ctl with postgres_ffi.
+    #[serde(default)]
+    pub safekeepers_generation: Option<u32>,
     #[serde(default)]
     pub safekeeper_connstrings: Vec<String>,
 
@@ -144,6 +155,16 @@ pub struct ComputeSpec {
     /// over the same replication content from publisher.
     #[serde(default)] // Default false
     pub drop_subscriptions_before_start: bool,
+
+    /// Log level for audit logging:
+    ///
+    /// Disabled - no audit logging. This is the default.
+    /// log - log masked statements to the postgres log using pgaudit extension
+    /// hipaa - log unmasked statements to a file using the pgaudit and pgauditlogtofile extensions
+    ///
+    /// Extensions should be present in shared_preload_libraries
+    #[serde(default)]
+    pub audit_log_level: ComputeAudit,
 }
 
 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -251,6 +272,17 @@ pub enum ComputeMode {
     Replica,
 }
 
+/// Log level for audit logging
+/// Disabled, log, hipaa
+/// Default is Disabled
+#[derive(Clone, Debug, Default, Eq, PartialEq, Deserialize, Serialize)]
+pub enum ComputeAudit {
+    #[default]
+    Disabled,
+    Log,
+    Hipaa,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
 pub struct Cluster {
     pub cluster_id: Option<String>,
diff --git a/libs/http-utils/Cargo.toml b/libs/http-utils/Cargo.toml
index d72e4bd012..d16dac7876 100644
--- a/libs/http-utils/Cargo.toml
+++ b/libs/http-utils/Cargo.toml
@@ -6,11 +6,8 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
-backtrace.workspace = true
 bytes.workspace = true
-inferno.workspace = true
 fail.workspace = true
-flate2.workspace = true
 hyper0.workspace = true
 itertools.workspace = true
 jemalloc_pprof.workspace = true
diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs
index 6128113580..f4f93df62f 100644
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -3,8 +3,6 @@ use std::io::Write as _;
 use std::str::FromStr;
 use std::time::Duration;
 
-use ::pprof::ProfilerGuardBuilder;
-use ::pprof::protos::Message as _;
 use anyhow::{Context, anyhow};
 use bytes::{Bytes, BytesMut};
 use hyper::header::{AUTHORIZATION, CONTENT_DISPOSITION, CONTENT_TYPE, HeaderName};
@@ -12,7 +10,8 @@ use hyper::http::HeaderValue;
 use hyper::{Body, Method, Request, Response};
 use metrics::{Encoder, IntCounter, TextEncoder, register_int_counter};
 use once_cell::sync::Lazy;
-use regex::Regex;
+use pprof::ProfilerGuardBuilder;
+use pprof::protos::Message as _;
 use routerify::ext::RequestExt;
 use routerify::{Middleware, RequestInfo, Router, RouterBuilder};
 use tokio::sync::{Mutex, Notify, mpsc};
@@ -22,7 +21,6 @@ use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};
 
 use crate::error::{ApiError, api_error_handler, route_error_handler};
-use crate::pprof;
 use crate::request::{get_query_param, parse_query_param};
 
 static SERVE_METRICS_COUNT: Lazy<IntCounter> = Lazy::new(|| {
@@ -449,20 +447,6 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
         Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))),
     };
 
-    // Functions and mappings to strip when symbolizing pprof profiles. If true,
-    // also remove child frames.
-    static STRIP_FUNCTIONS: Lazy<Vec<(Regex, bool)>> = Lazy::new(|| {
-        vec![
-            (Regex::new("^__rust").unwrap(), false),
-            (Regex::new("^_start$").unwrap(), false),
-            (Regex::new("^irallocx_prof").unwrap(), true),
-            (Regex::new("^prof_alloc_prep").unwrap(), true),
-            (Regex::new("^std::rt::lang_start").unwrap(), false),
-            (Regex::new("^std::sys::backtrace::__rust").unwrap(), false),
-        ]
-    });
-    const STRIP_MAPPINGS: &[&str] = &["libc", "libgcc", "pthread", "vdso"];
-
     // Obtain profiler handle.
     let mut prof_ctl = jemalloc_pprof::PROF_CTL
         .as_ref()
@@ -495,45 +479,27 @@ pub async fn profile_heap_handler(req: Request<Body>) -> Result<Response<Body>,
         }
 
         Format::Pprof => {
-            let data = tokio::task::spawn_blocking(move || {
-                let bytes = prof_ctl.dump_pprof()?;
-                // Symbolize the profile.
-                // TODO: consider moving this upstream to jemalloc_pprof and avoiding the
-                // serialization roundtrip.
-                let profile = pprof::decode(&bytes)?;
-                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
-                pprof::encode(&profile)
-            })
-            .await
-            .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-            .map_err(ApiError::InternalServerError)?;
+            let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof())
+                .await
+                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+                .map_err(ApiError::InternalServerError)?;
             Response::builder()
                 .status(200)
                 .header(CONTENT_TYPE, "application/octet-stream")
-                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"")
+                .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb.gz\"")
                 .body(Body::from(data))
                 .map_err(|err| ApiError::InternalServerError(err.into()))
         }
 
         Format::Svg => {
-            let body = tokio::task::spawn_blocking(move || {
-                let bytes = prof_ctl.dump_pprof()?;
-                let profile = pprof::decode(&bytes)?;
-                let profile = pprof::symbolize(profile)?;
-                let profile = pprof::strip_locations(profile, STRIP_MAPPINGS, &STRIP_FUNCTIONS);
-                let mut opts = inferno::flamegraph::Options::default();
-                opts.title = "Heap inuse".to_string();
-                opts.count_name = "bytes".to_string();
-                pprof::flamegraph(profile, &mut opts)
-            })
-            .await
-            .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
-            .map_err(ApiError::InternalServerError)?;
+            let svg = tokio::task::spawn_blocking(move || prof_ctl.dump_flamegraph())
+                .await
+                .map_err(|join_err| ApiError::InternalServerError(join_err.into()))?
+                .map_err(ApiError::InternalServerError)?;
             Response::builder()
                 .status(200)
                 .header(CONTENT_TYPE, "image/svg+xml")
-                .body(Body::from(body))
+                .body(Body::from(svg))
                 .map_err(|err| ApiError::InternalServerError(err.into()))
         }
     }
diff --git a/libs/http-utils/src/lib.rs b/libs/http-utils/src/lib.rs
index c692a54257..1e9b3c761a 100644
--- a/libs/http-utils/src/lib.rs
+++ b/libs/http-utils/src/lib.rs
@@ -2,7 +2,6 @@ pub mod endpoint;
 pub mod error;
 pub mod failpoints;
 pub mod json;
-pub mod pprof;
 pub mod request;
 
 extern crate hyper0 as hyper;
diff --git a/libs/http-utils/src/pprof.rs b/libs/http-utils/src/pprof.rs
deleted file mode 100644
index 529017f350..0000000000
--- a/libs/http-utils/src/pprof.rs
+++ /dev/null
@@ -1,238 +0,0 @@
-use std::borrow::Cow;
-use std::collections::{HashMap, HashSet};
-use std::ffi::c_void;
-use std::io::Write as _;
-
-use anyhow::bail;
-use flate2::Compression;
-use flate2::write::{GzDecoder, GzEncoder};
-use itertools::Itertools as _;
-use pprof::protos::{Function, Line, Location, Message as _, Profile};
-use regex::Regex;
-
-/// Decodes a gzip-compressed Protobuf-encoded pprof profile.
-pub fn decode(bytes: &[u8]) -> anyhow::Result<Profile> {
-    let mut gz = GzDecoder::new(Vec::new());
-    gz.write_all(bytes)?;
-    Ok(Profile::parse_from_bytes(&gz.finish()?)?)
-}
-
-/// Encodes a pprof profile as gzip-compressed Protobuf.
-pub fn encode(profile: &Profile) -> anyhow::Result<Vec<u8>> {
-    let mut gz = GzEncoder::new(Vec::new(), Compression::default());
-    profile.write_to_writer(&mut gz)?;
-    Ok(gz.finish()?)
-}
-
-/// Symbolizes a pprof profile using the current binary.
-pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {
-    if !profile.function.is_empty() {
-        return Ok(profile); // already symbolized
-    }
-
-    // Collect function names.
-    let mut functions: HashMap<String, Function> = HashMap::new();
-    let mut strings: HashMap<String, i64> = profile
-        .string_table
-        .into_iter()
-        .enumerate()
-        .map(|(i, s)| (s, i as i64))
-        .collect();
-
-    // Helper to look up or register a string.
-    let mut string_id = |s: &str| -> i64 {
-        // Don't use .entry() to avoid unnecessary allocations.
-        if let Some(id) = strings.get(s) {
-            return *id;
-        }
-        let id = strings.len() as i64;
-        strings.insert(s.to_string(), id);
-        id
-    };
-
-    for loc in &mut profile.location {
-        if !loc.line.is_empty() {
-            continue;
-        }
-
-        // Resolve the line and function for each location.
-        backtrace::resolve(loc.address as *mut c_void, |symbol| {
-            let Some(symbol_name) = symbol.name() else {
-                return;
-            };
-
-            let function_name = format!("{symbol_name:#}");
-            let functions_len = functions.len();
-            let function_id = functions
-                .entry(function_name)
-                .or_insert_with_key(|function_name| {
-                    let function_id = functions_len as u64 + 1;
-                    let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
-                    let filename = symbol
-                        .filename()
-                        .map(|path| path.to_string_lossy())
-                        .unwrap_or(Cow::Borrowed(""));
-                    Function {
-                        id: function_id,
-                        name: string_id(function_name),
-                        system_name: string_id(&system_name),
-                        filename: string_id(&filename),
-                        ..Default::default()
-                    }
-                })
-                .id;
-            loc.line.push(Line {
-                function_id,
-                line: symbol.lineno().unwrap_or(0) as i64,
-                ..Default::default()
-            });
-        });
-    }
-
-    // Store the resolved functions, and mark the mapping as resolved.
-    profile.function = functions.into_values().sorted_by_key(|f| f.id).collect();
-    profile.string_table = strings
-        .into_iter()
-        .sorted_by_key(|(_, i)| *i)
-        .map(|(s, _)| s)
-        .collect();
-
-    for mapping in &mut profile.mapping {
-        mapping.has_functions = true;
-        mapping.has_filenames = true;
-    }
-
-    Ok(profile)
-}
-
-/// Strips locations (stack frames) matching the given mappings (substring) or function names
-/// (regex). The function bool specifies whether child frames should be stripped as well.
-///
-/// The string definitions are left behind in the profile for simplicity, to avoid rewriting all
-/// string references.
-pub fn strip_locations(
-    mut profile: Profile,
-    mappings: &[&str],
-    functions: &[(Regex, bool)],
-) -> Profile {
-    // Strip mappings.
-    let mut strip_mappings: HashSet<u64> = HashSet::new();
-
-    profile.mapping.retain(|mapping| {
-        let Some(name) = profile.string_table.get(mapping.filename as usize) else {
-            return true;
-        };
-        if mappings.iter().any(|substr| name.contains(substr)) {
-            strip_mappings.insert(mapping.id);
-            return false;
-        }
-        true
-    });
-
-    // Strip functions.
-    let mut strip_functions: HashMap<u64, bool> = HashMap::new();
-
-    profile.function.retain(|function| {
-        let Some(name) = profile.string_table.get(function.name as usize) else {
-            return true;
-        };
-        for (regex, strip_children) in functions {
-            if regex.is_match(name) {
-                strip_functions.insert(function.id, *strip_children);
-                return false;
-            }
-        }
-        true
-    });
-
-    // Strip locations. The bool specifies whether child frames should be stripped too.
-    let mut strip_locations: HashMap<u64, bool> = HashMap::new();
-
-    profile.location.retain(|location| {
-        for line in &location.line {
-            if let Some(strip_children) = strip_functions.get(&line.function_id) {
-                strip_locations.insert(location.id, *strip_children);
-                return false;
-            }
-        }
-        if strip_mappings.contains(&location.mapping_id) {
-            strip_locations.insert(location.id, false);
-            return false;
-        }
-        true
-    });
-
-    // Strip sample locations.
-    for sample in &mut profile.sample {
-        // First, find the uppermost function with child removal and truncate the stack.
-        if let Some(truncate) = sample
-            .location_id
-            .iter()
-            .rposition(|id| strip_locations.get(id) == Some(&true))
-        {
-            sample.location_id.drain(..=truncate);
-        }
-        // Next, strip any individual frames without child removal.
-        sample
-            .location_id
-            .retain(|id| !strip_locations.contains_key(id));
-    }
-
-    profile
-}
-
-/// Generates an SVG flamegraph from a symbolized pprof profile.
-pub fn flamegraph(
-    profile: Profile,
-    opts: &mut inferno::flamegraph::Options,
-) -> anyhow::Result<Vec<u8>> {
-    if profile.mapping.iter().any(|m| !m.has_functions) {
-        bail!("profile not symbolized");
-    }
-
-    // Index locations, functions, and strings.
-    let locations: HashMap<u64, Location> =
-        profile.location.into_iter().map(|l| (l.id, l)).collect();
-    let functions: HashMap<u64, Function> =
-        profile.function.into_iter().map(|f| (f.id, f)).collect();
-    let strings = profile.string_table;
-
-    // Resolve stacks as function names, and sum sample values per stack. Also reverse the stack,
-    // since inferno expects it bottom-up.
-    let mut stacks: HashMap<Vec<&str>, i64> = HashMap::new();
-    for sample in profile.sample {
-        let mut stack = Vec::with_capacity(sample.location_id.len());
-        for location in sample.location_id.into_iter().rev() {
-            let Some(location) = locations.get(&location) else {
-                bail!("missing location {location}");
-            };
-            for line in location.line.iter().rev() {
-                let Some(function) = functions.get(&line.function_id) else {
-                    bail!("missing function {}", line.function_id);
-                };
-                let Some(name) = strings.get(function.name as usize) else {
-                    bail!("missing string {}", function.name);
-                };
-                stack.push(name.as_str());
-            }
-        }
-        let Some(&value) = sample.value.first() else {
-            bail!("missing value");
-        };
-        *stacks.entry(stack).or_default() += value;
-    }
-
-    // Construct stack lines for inferno.
-    let lines = stacks
-        .into_iter()
-        .map(|(stack, value)| (stack.into_iter().join(";"), value))
-        .map(|(stack, value)| format!("{stack} {value}"))
-        .sorted()
-        .collect_vec();
-
-    // Construct the flamegraph.
-    let mut bytes = Vec::new();
-    let lines = lines.iter().map(|line| line.as_str());
-    inferno::flamegraph::from_lines(opts, lines, &mut bytes)?;
-    Ok(bytes)
-}
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ea565e7769..749a8acc4e 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1146,6 +1146,15 @@ pub struct TimelineArchivalConfigRequest {
     pub state: TimelineArchivalState,
 }
 
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone)]
+pub struct TimelinePatchIndexPartRequest {
+    pub rel_size_migration: Option<RelSizeMigration>,
+    pub gc_compaction_last_completed_lsn: Option<Lsn>,
+    pub applied_gc_cutoff_lsn: Option<Lsn>,
+    #[serde(default)]
+    pub force_index_update: bool, // `#[serde(default)]`: false when absent from the payload
+}
+
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelinesInfoAndOffloaded {
     pub timelines: Vec<TimelineInfo>,
@@ -1165,6 +1174,21 @@ pub struct OffloadedTimelineInfo {
     pub archived_at: chrono::DateTime<chrono::Utc>,
 }
 
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")] // serialized as "legacy" / "migrating" / "migrated"
+pub enum RelSizeMigration {
+    /// The tenant is using the old rel_size format.
+    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
+    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
+    Legacy,
+    /// The tenant is migrating to the new rel_size format. Both the old and new rel_size formats
+    /// are persisted in the index part. The read path will read both formats and merge them.
+    Migrating,
+    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
+    /// in the index part, and the read path will not read the old format.
+    Migrated,
+}
+
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
@@ -1243,7 +1267,11 @@ pub struct TimelineInfo {
     // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does
     // not deny unknown fields by default so it's safe to set the field to some value, though it won't be
     // read.
+    /// Whether the timeline is archived.
     pub is_archived: Option<bool>,
+
+    /// The status of the rel_size migration.
+    pub rel_size_migration: Option<RelSizeMigration>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/libs/proxy/tokio-postgres2/src/cancel_query.rs b/libs/proxy/tokio-postgres2/src/cancel_query.rs
index b65fb571e6..0bdad0b554 100644
--- a/libs/proxy/tokio-postgres2/src/cancel_query.rs
+++ b/libs/proxy/tokio-postgres2/src/cancel_query.rs
@@ -34,8 +34,13 @@ where
         .make_tls_connect(hostname)
         .map_err(|e| Error::tls(e.into()))?;
 
-    let socket =
-        connect_socket::connect_socket(&config.host, config.port, config.connect_timeout).await?;
+    let socket = connect_socket::connect_socket(
+        config.host_addr,
+        &config.host,
+        config.port,
+        config.connect_timeout,
+    )
+    .await?;
 
     cancel_query_raw::cancel_query_raw(socket, ssl_mode, tls, process_id, secret_key).await
 }
diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs
index 39b1db75da..c70cb598de 100644
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::fmt;
+use std::net::IpAddr;
 use std::sync::Arc;
 use std::task::{Context, Poll};
 use std::time::Duration;
@@ -137,6 +138,7 @@ impl InnerClient {
 
 #[derive(Clone, Serialize, Deserialize)]
 pub struct SocketConfig {
+    pub host_addr: Option<IpAddr>,
     pub host: Host,
     pub port: u16,
     pub connect_timeout: Option<Duration>,
diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs
index 4c25491b67..978d348741 100644
--- a/libs/proxy/tokio-postgres2/src/config.rs
+++ b/libs/proxy/tokio-postgres2/src/config.rs
@@ -1,5 +1,6 @@
 //! Connection configuration.
 
+use std::net::IpAddr;
 use std::time::Duration;
 use std::{fmt, str};
 
@@ -65,6 +66,7 @@ pub enum AuthKeys {
 /// Connection configuration.
 #[derive(Clone, PartialEq, Eq)]
 pub struct Config {
+    pub(crate) host_addr: Option<IpAddr>,
     pub(crate) host: Host,
     pub(crate) port: u16,
 
@@ -83,6 +85,7 @@ impl Config {
     /// Creates a new configuration.
     pub fn new(host: String, port: u16) -> Config {
         Config {
+            host_addr: None,
             host: Host::Tcp(host),
             port,
             password: None,
@@ -163,6 +166,15 @@ impl Config {
         self
     }
 
+    pub fn set_host_addr(&mut self, addr: IpAddr) -> &mut Config {
+        self.host_addr = Some(addr); // when set, connect_socket skips the DNS lookup
+        self
+    }
+
+    pub fn get_host_addr(&self) -> Option<IpAddr> {
+        self.host_addr // `None` until an address is set (Config::new leaves it unset)
+    }
+
     /// Sets the SSL configuration.
     ///
     /// Defaults to `prefer`.
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs
index d2bd0dfbcd..7c3a358bba 100644
--- a/libs/proxy/tokio-postgres2/src/connect.rs
+++ b/libs/proxy/tokio-postgres2/src/connect.rs
@@ -1,3 +1,5 @@
+use std::net::IpAddr;
+
 use postgres_protocol2::message::backend::Message;
 use tokio::net::TcpStream;
 use tokio::sync::mpsc;
@@ -25,13 +27,14 @@ where
         .make_tls_connect(hostname)
         .map_err(|e| Error::tls(e.into()))?;
 
-    match connect_once(&config.host, config.port, tls, config).await {
+    match connect_once(config.host_addr, &config.host, config.port, tls, config).await {
         Ok((client, connection)) => Ok((client, connection)),
         Err(e) => Err(e),
     }
 }
 
 async fn connect_once<T>(
+    host_addr: Option<IpAddr>,
     host: &Host,
     port: u16,
     tls: T,
@@ -40,7 +43,7 @@ async fn connect_once<T>(
 where
     T: TlsConnect<TcpStream>,
 {
-    let socket = connect_socket(host, port, config.connect_timeout).await?;
+    let socket = connect_socket(host_addr, host, port, config.connect_timeout).await?;
     let RawConnection {
         stream,
         parameters,
@@ -50,6 +53,7 @@ where
     } = connect_raw(socket, tls, config).await?;
 
     let socket_config = SocketConfig {
+        host_addr,
         host: host.clone(),
         port,
         connect_timeout: config.connect_timeout,
diff --git a/libs/proxy/tokio-postgres2/src/connect_socket.rs b/libs/proxy/tokio-postgres2/src/connect_socket.rs
index 15411f7ef3..8c7d300451 100644
--- a/libs/proxy/tokio-postgres2/src/connect_socket.rs
+++ b/libs/proxy/tokio-postgres2/src/connect_socket.rs
@@ -1,5 +1,6 @@
 use std::future::Future;
 use std::io;
+use std::net::{IpAddr, SocketAddr};
 use std::time::Duration;
 
 use tokio::net::{self, TcpStream};
@@ -9,15 +10,20 @@ use crate::Error;
 use crate::config::Host;
 
 pub(crate) async fn connect_socket(
+    host_addr: Option<IpAddr>,
     host: &Host,
     port: u16,
     connect_timeout: Option<Duration>,
 ) -> Result<TcpStream, Error> {
     match host {
         Host::Tcp(host) => {
-            let addrs = net::lookup_host((&**host, port))
-                .await
-                .map_err(Error::connect)?;
+            let addrs = match host_addr {
+                Some(addr) => vec![SocketAddr::new(addr, port)],
+                None => net::lookup_host((&**host, port))
+                    .await
+                    .map_err(Error::connect)?
+                    .collect(),
+            };
 
             let mut last_err = None;
 
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 5020d82adf..ac44300a51 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -15,7 +15,6 @@ arc-swap.workspace = true
 sentry.workspace = true
 async-compression.workspace = true
 anyhow.workspace = true
-backtrace.workspace = true
 bincode.workspace = true
 bytes.workspace = true
 camino.workspace = true
diff --git a/libs/utils/src/sentry_init.rs b/libs/utils/src/sentry_init.rs
index d77dbba087..72d192a591 100644
--- a/libs/utils/src/sentry_init.rs
+++ b/libs/utils/src/sentry_init.rs
@@ -3,20 +3,24 @@ use std::env;
 
 use sentry::ClientInitGuard;
 pub use sentry::release_name;
+use tracing::{error, info};
 
 #[must_use]
 pub fn init_sentry(
     release_name: Option<Cow<'static, str>>,
     extra_options: &[(&str, &str)],
 ) -> Option<ClientInitGuard> {
-    let dsn = env::var("SENTRY_DSN").ok()?;
+    let Ok(dsn) = env::var("SENTRY_DSN") else {
+        info!("not initializing Sentry, no SENTRY_DSN given");
+        return None;
+    };
     let environment = env::var("SENTRY_ENVIRONMENT").unwrap_or_else(|_| "development".into());
 
     let guard = sentry::init((
         dsn,
         sentry::ClientOptions {
-            release: release_name,
-            environment: Some(environment.into()),
+            release: release_name.clone(),
+            environment: Some(environment.clone().into()),
             ..Default::default()
         },
     ));
@@ -25,5 +29,19 @@ pub fn init_sentry(
             scope.set_extra(key, value.into());
         }
     });
+
+    if let Some(dsn) = guard.dsn() {
+        info!(
+            "initialized Sentry for project {}, environment {}, release {} (using API {})",
+            dsn.project_id(),
+            environment,
+            release_name.unwrap_or(Cow::Borrowed("None")),
+            dsn.envelope_api_url(),
+        );
+    } else {
+        // sentry::init() should already have panicked on an invalid DSN, but cover it anyway.
+        error!("failed to initialize Sentry, invalid DSN");
+    }
+
     Some(guard)
 }
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 7330856be4..fa16090170 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -98,6 +98,7 @@ criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
 indoc.workspace = true
+uuid.workspace = true
 
 [[bench]]
 name = "bench_layer_map"
diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index e11af49449..e1444778b8 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -7,7 +7,6 @@ use std::time::Instant;
 
 use criterion::measurement::WallTime;
 use criterion::{BenchmarkGroup, Criterion, black_box, criterion_group, criterion_main};
-use pageserver::keyspace::{KeyPartitioning, KeySpace};
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::{LayerName, PersistentLayerDesc};
 use pageserver_api::key::Key;
@@ -72,41 +71,6 @@ fn uniform_query_pattern(layer_map: &LayerMap) -> Vec<(Key, Lsn)> {
         .collect()
 }
 
-// Construct a partitioning for testing get_difficulty map when we
-// don't have an exact result of `collect_keyspace` to work with.
-fn uniform_key_partitioning(layer_map: &LayerMap, _lsn: Lsn) -> KeyPartitioning {
-    let mut parts = Vec::new();
-
-    // We add a partition boundary at the start of each image layer,
-    // no matter what lsn range it covers. This is just the easiest
-    // thing to do. A better thing to do would be to get a real
-    // partitioning from some database. Even better, remove the need
-    // for key partitions by deciding where to create image layers
-    // directly based on a coverage-based difficulty map.
-    let mut keys: Vec<_> = layer_map
-        .iter_historic_layers()
-        .filter_map(|l| {
-            if l.is_incremental() {
-                None
-            } else {
-                let kr = l.get_key_range();
-                Some(kr.start.next())
-            }
-        })
-        .collect();
-    keys.sort();
-
-    let mut current_key = Key::from_hex("000000000000000000000000000000000000").unwrap();
-    for key in keys {
-        parts.push(KeySpace {
-            ranges: vec![current_key..key],
-        });
-        current_key = key;
-    }
-
-    KeyPartitioning { parts }
-}
-
 // Benchmark using metadata extracted from our performance test environment, from
 // a project where we have run pgbench many timmes. The pgbench database was initialized
 // between each test run.
@@ -148,41 +112,6 @@ fn bench_from_real_project(c: &mut Criterion) {
     // Choose uniformly distributed queries
     let queries: Vec<(Key, Lsn)> = uniform_query_pattern(&layer_map);
 
-    // Choose inputs for get_difficulty_map
-    let latest_lsn = layer_map
-        .iter_historic_layers()
-        .map(|l| l.get_lsn_range().end)
-        .max()
-        .unwrap();
-    let partitioning = uniform_key_partitioning(&layer_map, latest_lsn);
-
-    // Check correctness of get_difficulty_map
-    // TODO put this in a dedicated test outside of this mod
-    {
-        println!("running correctness check");
-
-        let now = Instant::now();
-        let result_bruteforce = layer_map.get_difficulty_map_bruteforce(latest_lsn, &partitioning);
-        assert!(result_bruteforce.len() == partitioning.parts.len());
-        println!("Finished bruteforce in {:?}", now.elapsed());
-
-        let now = Instant::now();
-        let result_fast = layer_map.get_difficulty_map(latest_lsn, &partitioning, None);
-        assert!(result_fast.len() == partitioning.parts.len());
-        println!("Finished fast in {:?}", now.elapsed());
-
-        // Assert results are equal. Manually iterate for easier debugging.
-        let zip = std::iter::zip(
-            &partitioning.parts,
-            std::iter::zip(result_bruteforce, result_fast),
-        );
-        for (_part, (bruteforce, fast)) in zip {
-            assert_eq!(bruteforce, fast);
-        }
-
-        println!("No issues found");
-    }
-
     // Define and name the benchmark function
     let mut group = c.benchmark_group("real_map");
     group.bench_function("uniform_queries", |b| {
@@ -192,11 +121,6 @@ fn bench_from_real_project(c: &mut Criterion) {
             }
         });
     });
-    group.bench_function("get_difficulty_map", |b| {
-        b.iter(|| {
-            layer_map.get_difficulty_map(latest_lsn, &partitioning, Some(3));
-        });
-    });
     group.finish();
 }
 
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index f19b4e964d..37c914c4e9 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -480,6 +480,7 @@ impl Client {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         concurrency: Option<usize>,
+        recurse: bool,
     ) -> Result<()> {
         let mut path = reqwest::Url::parse(&format!(
             "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
@@ -487,6 +488,9 @@ impl Client {
         ))
         .expect("Cannot build URL");
 
+        path.query_pairs_mut()
+            .append_pair("recurse", &format!("{}", recurse));
+
         if let Some(concurrency) = concurrency {
             path.query_pairs_mut()
                 .append_pair("concurrency", &format!("{}", concurrency));
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index ce54bd9c1c..de527e307b 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -33,8 +33,9 @@ use utils::lsn::Lsn;
 
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::Version;
-use crate::tenant::Timeline;
 use crate::tenant::storage_layer::IoConcurrency;
+use crate::tenant::timeline::GetVectoredError;
+use crate::tenant::{PageReconstructError, Timeline};
 
 #[derive(Debug, thiserror::Error)]
 pub enum BasebackupError {
@@ -42,6 +43,26 @@ pub enum BasebackupError {
     Server(#[from] anyhow::Error),
     #[error("basebackup client error {0:#} when {1}")]
     Client(#[source] io::Error, &'static str),
+    #[error("basebackup during shutdown")]
+    Shutdown,
+}
+
+impl From<PageReconstructError> for BasebackupError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => BasebackupError::Shutdown, // not a Server error
+            err => BasebackupError::Server(err.into()), // every other variant is wrapped
+        }
+    }
+}
+
+impl From<GetVectoredError> for BasebackupError {
+    fn from(value: GetVectoredError) -> Self {
+        match value {
+            GetVectoredError::Cancelled => BasebackupError::Shutdown, // not a Server error
+            err => BasebackupError::Server(err.into()), // every other variant is wrapped
+        }
+    }
 }
 
 /// Create basebackup with non-rel data in it.
@@ -127,7 +148,7 @@ where
             timeline
                 .gate
                 .enter()
-                .map_err(|e| BasebackupError::Server(e.into()))?,
+                .map_err(|_| BasebackupError::Shutdown)?,
         ),
     };
     basebackup
@@ -323,8 +344,7 @@ where
             let slru_partitions = self
                 .timeline
                 .get_slru_keyspace(Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?
+                .await?
                 .partition(
                     self.timeline.get_shard_identity(),
                     Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
@@ -336,11 +356,10 @@ where
                 let blocks = self
                     .timeline
                     .get_vectored(part, self.lsn, self.io_concurrency.clone(), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;
 
                 for (key, block) in blocks {
-                    let block = block.map_err(|e| BasebackupError::Server(e.into()))?;
+                    let block = block?;
                     slru_builder.add_block(&key, block).await?;
                 }
             }
@@ -349,11 +368,8 @@ where
 
         let mut min_restart_lsn: Lsn = Lsn::MAX;
         // Create tablespace directories
-        for ((spcnode, dbnode), has_relmap_file) in self
-            .timeline
-            .list_dbdirs(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+        for ((spcnode, dbnode), has_relmap_file) in
+            self.timeline.list_dbdirs(self.lsn, self.ctx).await?
         {
             self.add_dbdir(spcnode, dbnode, has_relmap_file).await?;
 
@@ -362,8 +378,7 @@ where
             let rels = self
                 .timeline
                 .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
             for &rel in rels.iter() {
                 // Send init fork as main fork to provide well formed empty
                 // contents of UNLOGGED relations. Postgres copies it in
@@ -391,8 +406,7 @@ where
         let aux_files = self
             .timeline
             .list_aux_files(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
         let aux_scan_time = start_time.elapsed();
         let aux_estimated_size = aux_files
             .values()
@@ -451,16 +465,14 @@ where
         for xid in self
             .timeline
             .list_twophase_files(self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?
+            .await?
         {
             self.add_twophase_file(xid).await?;
         }
         let repl_origins = self
             .timeline
             .get_replorigins(self.lsn, self.ctx, self.io_concurrency.clone())
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
         let n_origins = repl_origins.len();
         if n_origins != 0 {
             //
@@ -505,8 +517,7 @@ where
         let nblocks = self
             .timeline
             .get_rel_size(src, Version::Lsn(self.lsn), self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
 
         // If the relation is empty, create an empty file
         if nblocks == 0 {
@@ -532,8 +543,7 @@ where
                     // TODO: investigate using get_vectored for the entire startblk..endblk range.
                     // But this code path is not on the critical path for most basebackups (?).
                     .get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?;
+                    .await?;
                 segment_data.extend_from_slice(&img[..]);
             }
 
@@ -567,8 +577,7 @@ where
             let img = self
                 .timeline
                 .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                .await
-                .map_err(|e| BasebackupError::Server(e.into()))?;
+                .await?;
 
             if img.len()
                 != dispatch_pgversion!(self.timeline.pg_version, pgv::bindings::SIZEOF_RELMAPFILE)
@@ -622,8 +631,7 @@ where
                 && self
                     .timeline
                     .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
-                    .await
-                    .map_err(|e| BasebackupError::Server(e.into()))?
+                    .await?
                     .is_empty()
             {
                 return Ok(());
@@ -674,8 +682,7 @@ where
         let img = self
             .timeline
             .get_twophase_file(xid, self.lsn, self.ctx)
-            .await
-            .map_err(|e| BasebackupError::Server(e.into()))?;
+            .await?;
 
         let mut buf = BytesMut::new();
         buf.extend_from_slice(&img[..]);
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 33ae8c4790..06be873160 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -456,8 +456,8 @@ impl PageServerConf {
             no_sync: no_sync.unwrap_or(false),
             enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
             validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
-            load_previous_heatmap: load_previous_heatmap.unwrap_or(false),
-            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(false),
+            load_previous_heatmap: load_previous_heatmap.unwrap_or(true),
+            generate_unarchival_heatmap: generate_unarchival_heatmap.unwrap_or(true),
         };
 
         // ------------------------------------------------------------
@@ -491,7 +491,9 @@ impl PageServerConf {
     #[cfg(test)]
     pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
         let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
-        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
+
+        let test_id = uuid::Uuid::new_v4();
+        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}_{test_id}"))
     }
 
     pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 12252739fd..0fb9a240d5 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -842,6 +842,12 @@ paths:
         required: false
         schema:
           type: integer
+      - name: recurse
+        description: When set, will recurse with the downloads into ancestor timelines
+        in: query
+        required: false
+        schema:
+          type: boolean
     post:
       description: |
         Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index b738d22740..3c0c23a56d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -37,7 +37,8 @@ use pageserver_api::models::{
     TenantShardSplitResponse, TenantSorting, TenantState, TenantWaitLsnRequest,
     TimelineArchivalConfigRequest, TimelineCreateRequest, TimelineCreateRequestMode,
     TimelineCreateRequestModeImportPgdata, TimelineGcRequest, TimelineInfo,
-    TimelinesInfoAndOffloaded, TopTenantShardItem, TopTenantShardsRequest, TopTenantShardsResponse,
+    TimelinePatchIndexPartRequest, TimelinesInfoAndOffloaded, TopTenantShardItem,
+    TopTenantShardsRequest, TopTenantShardsResponse,
 };
 use pageserver_api::shard::{ShardCount, TenantShardId};
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
@@ -63,6 +64,7 @@ use crate::tenant::mgr::{
     GetActiveTenantError, GetTenantError, TenantManager, TenantMapError, TenantMapInsertError,
     TenantSlot, TenantSlotError, TenantSlotUpsertError, TenantStateError, UpsertLocationError,
 };
+use crate::tenant::remote_timeline_client::index::GcCompactionState;
 use crate::tenant::remote_timeline_client::{
     download_index_part, list_remote_tenant_shards, list_remote_timelines,
 };
@@ -481,6 +483,7 @@ async fn build_timeline_info_common(
 
         state,
         is_archived: Some(is_archived),
+        rel_size_migration: Some(timeline.get_rel_size_v2_status()),
 
         walreceiver_status,
     };
@@ -857,6 +860,75 @@ async fn timeline_archival_config_handler(
     json_response(StatusCode::OK, ())
 }
 
+/// This API is used to patch the index part of a timeline. You must ensure such patches are safe to apply. Use this API as an emergency
+/// measure only.
+///
+/// Some examples of safe patches:
+/// - Increase the gc_cutoff and gc_compaction_cutoff to a larger value in case a bug failed to bump the cutoff and is causing read errors.
+/// - Force set the index part to use reldir v2 (migrating/migrated).
+///
+/// Some examples of unsafe patches:
+/// - Force set the index part from v2 to v1 (legacy). This will cause the code path to ignore anything written to the new keyspace and cause
+///   errors.
+/// - Decrease the gc_cutoff without validating the data really exists. It will cause read errors in the background.
+async fn timeline_patch_index_part_handler(
+    mut request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let request_data: TimelinePatchIndexPartRequest = json_request(&mut request).await?;
+    check_permission(&request, None)?; // require global permission for this request
+    let state = get_state(&request);
+
+    async {
+        let timeline =
+            active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+                .await?;
+
+        if let Some(rel_size_migration) = request_data.rel_size_migration {
+            timeline
+                .update_rel_size_v2_status(rel_size_migration)
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(gc_compaction_last_completed_lsn) =
+            request_data.gc_compaction_last_completed_lsn
+        {
+            timeline
+                .update_gc_compaction_state(GcCompactionState {
+                    last_completed_lsn: gc_compaction_last_completed_lsn,
+                })
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        if let Some(applied_gc_cutoff_lsn) = request_data.applied_gc_cutoff_lsn {
+            {
+                let guard = timeline.applied_gc_cutoff_lsn.lock_for_write();
+                guard.store_and_unlock(applied_gc_cutoff_lsn);
+            }
+        }
+
+        if request_data.force_index_update {
+            timeline
+                .remote_client
+                .force_schedule_index_upload()
+                .context("force schedule index upload")
+                .map_err(ApiError::InternalServerError)?;
+        }
+
+        Ok::<_, ApiError>(())
+    }
+    .instrument(info_span!("timeline_patch_index_part",
+                tenant_id = %tenant_shard_id.tenant_id,
+                shard_id = %tenant_shard_id.shard_slug(),
+                %timeline_id))
+    .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn timeline_detail_handler(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -1435,6 +1507,7 @@ async fn timeline_download_heatmap_layers_handler(
 
     let desired_concurrency =
         parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
+    let recurse = parse_query_param(&request, "recurse")?.unwrap_or(false);
 
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
@@ -1451,9 +1524,7 @@ async fn timeline_download_heatmap_layers_handler(
         .unwrap_or(DEFAULT_MAX_CONCURRENCY);
     let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
 
-    timeline
-        .start_heatmap_layers_download(concurrency, &ctx)
-        .await?;
+    timeline.start_heatmap_layers_download(concurrency, recurse, &ctx)?;
 
     json_response(StatusCode::ACCEPTED, ())
 }
@@ -3629,6 +3700,10 @@ pub fn make_router(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
             |r| api_handler(r, get_timestamp_of_lsn_handler),
         )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/patch_index_part",
+            |r| api_handler(r, timeline_patch_index_part_handler),
+        )
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/lsn_lease",
             |r| api_handler(r, lsn_lease_handler),
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index eb8a9b8e24..b5b4e5c91f 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -143,6 +143,29 @@ pub(crate) static LAYERS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static LAYERS_PER_READ_BATCH_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_per_read_batch_global",
+        "Layers visited to serve a single read batch (read amplification), regardless of number of reads.",
+        vec![
+            1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
+        ],
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static LAYERS_PER_READ_AMORTIZED_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_layers_per_read_amortized_global",
+        "Layers visited to serve a single read (read amplification). Amortized across a batch: \
+            all visited layers are divided by number of reads.",
+        vec![
+            1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0
+        ],
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static DELTAS_PER_READ_GLOBAL: Lazy<Histogram> = Lazy::new(|| {
     // We expect this to be low because of Postgres checkpoints. Let's see if that holds.
     register_histogram!(
@@ -4074,6 +4097,8 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
     // histograms
     [
         &LAYERS_PER_READ_GLOBAL,
+        &LAYERS_PER_READ_BATCH_GLOBAL,
+        &LAYERS_PER_READ_AMORTIZED_GLOBAL,
         &DELTAS_PER_READ_GLOBAL,
         &WAIT_LSN_TIME,
         &WAL_REDO_TIME,
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 8972515163..ba2ed9dc81 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -392,10 +392,6 @@ impl TimelineHandles {
             .await
             .map_err(|e| match e {
                 timeline::handle::GetError::TenantManager(e) => e,
-                timeline::handle::GetError::TimelineGateClosed => {
-                    trace!("timeline gate closed");
-                    GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
-                }
                 timeline::handle::GetError::PerTimelineStateShutDown => {
                     trace!("per-timeline state shut down");
                     GetActiveTimelineError::Timeline(GetTimelineError::ShuttingDown)
@@ -422,24 +418,33 @@ pub(crate) struct TenantManagerTypes;
 impl timeline::handle::Types for TenantManagerTypes {
     type TenantManagerError = GetActiveTimelineError;
     type TenantManager = TenantManagerWrapper;
-    type Timeline = Arc<Timeline>;
+    type Timeline = TenantManagerCacheItem;
 }
 
-impl timeline::handle::ArcTimeline<TenantManagerTypes> for Arc<Timeline> {
-    fn gate(&self) -> &utils::sync::gate::Gate {
-        &self.gate
-    }
+pub(crate) struct TenantManagerCacheItem {
+    pub(crate) timeline: Arc<Timeline>,
+    #[allow(dead_code)] // we store it to keep the gate open
+    pub(crate) gate_guard: GateGuard,
+}
 
+impl std::ops::Deref for TenantManagerCacheItem {
+    type Target = Arc<Timeline>;
+    fn deref(&self) -> &Self::Target {
+        &self.timeline
+    }
+}
+
+impl timeline::handle::Timeline<TenantManagerTypes> for TenantManagerCacheItem {
     fn shard_timeline_id(&self) -> timeline::handle::ShardTimelineId {
-        Timeline::shard_timeline_id(self)
+        Timeline::shard_timeline_id(&self.timeline)
     }
 
     fn per_timeline_state(&self) -> &timeline::handle::PerTimelineState<TenantManagerTypes> {
-        &self.handles
+        &self.timeline.handles
     }
 
     fn get_shard_identity(&self) -> &pageserver_api::shard::ShardIdentity {
-        Timeline::get_shard_identity(self)
+        Timeline::get_shard_identity(&self.timeline)
     }
 }
 
@@ -448,7 +453,7 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
         &self,
         timeline_id: TimelineId,
         shard_selector: ShardSelector,
-    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
+    ) -> Result<TenantManagerCacheItem, GetActiveTimelineError> {
         let tenant_id = self.tenant_id.get().expect("we set this in get()");
         let timeout = ACTIVE_TENANT_TIMEOUT;
         let wait_start = Instant::now();
@@ -491,7 +496,20 @@ impl timeline::handle::TenantManager<TenantManagerTypes> for TenantManagerWrappe
         let timeline = tenant_shard
             .get_timeline(timeline_id, true)
             .map_err(GetActiveTimelineError::Timeline)?;
-        Ok(timeline)
+
+        let gate_guard = match timeline.gate.enter() {
+            Ok(guard) => guard,
+            Err(_) => {
+                return Err(GetActiveTimelineError::Timeline(
+                    GetTimelineError::ShuttingDown,
+                ));
+            }
+        };
+
+        Ok(TenantManagerCacheItem {
+            timeline,
+            gate_guard,
+        })
     }
 }
 
@@ -2095,6 +2113,7 @@ impl PageServerHandler {
                 // TODO: passthrough the error site to the final error message?
                 BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
                 BasebackupError::Server(e) => QueryError::Other(e),
+                BasebackupError::Shutdown => QueryError::Shutdown,
             }
         }
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 787b1b895c..8bcc6d58ec 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -21,6 +21,7 @@ use pageserver_api::key::{
     slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
 };
 use pageserver_api::keyspace::SparseKeySpace;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
@@ -492,7 +493,9 @@ impl Timeline {
         // Otherwise, read the old reldir keyspace.
         // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
 
-        if self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Migrated | RelSizeMigration::Migrating =
+            self.get_rel_size_v2_status()
+        {
             // fetch directory listing (new)
             let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
             let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
@@ -544,7 +547,7 @@ impl Timeline {
                 forknum: *forknum,
             }));
 
-        if !self.get_rel_size_v2_enabled() {
+        if let RelSizeMigration::Legacy = self.get_rel_size_v2_status() {
             return Ok(rels_v1);
         }
 
@@ -599,28 +602,36 @@ impl Timeline {
         let n_blocks = self
             .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx)
             .await?;
-        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
-        for blkno in 0..n_blocks {
-            let block = self
-                .get_slru_page_at_lsn(kind, segno, blkno, lsn, ctx)
-                .await?;
-            segment.extend_from_slice(&block[..BLCKSZ as usize]);
-        }
-        Ok(segment.freeze())
-    }
 
-    /// Look up given SLRU page version.
-    pub(crate) async fn get_slru_page_at_lsn(
-        &self,
-        kind: SlruKind,
-        segno: u32,
-        blknum: BlockNumber,
-        lsn: Lsn,
-        ctx: &RequestContext,
-    ) -> Result<Bytes, PageReconstructError> {
-        assert!(self.tenant_shard_id.is_shard_zero());
-        let key = slru_block_to_key(kind, segno, blknum);
-        self.get(key, lsn, ctx).await
+        let keyspace = KeySpace::single(
+            slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, n_blocks),
+        );
+
+        let batches = keyspace.partition(
+            self.get_shard_identity(),
+            Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+        );
+
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+
+        let mut segment = BytesMut::with_capacity(n_blocks as usize * BLCKSZ as usize);
+        for batch in batches.parts {
+            let blocks = self
+                .get_vectored(batch, lsn, io_concurrency.clone(), ctx)
+                .await?;
+
+            for (_key, block) in blocks {
+                let block = block?;
+                segment.extend_from_slice(&block[..BLCKSZ as usize]);
+            }
+        }
+
+        Ok(segment.freeze())
     }
 
     /// Get size of an SLRU segment
@@ -829,19 +840,41 @@ impl Timeline {
             let nblocks = self
                 .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                 .await?;
-            for blknum in (0..nblocks).rev() {
-                let clog_page = self
-                    .get_slru_page_at_lsn(SlruKind::Clog, segno, blknum, probe_lsn, ctx)
+
+            let keyspace = KeySpace::single(
+                slru_block_to_key(SlruKind::Clog, segno, 0)
+                    ..slru_block_to_key(SlruKind::Clog, segno, nblocks),
+            );
+
+            let batches = keyspace.partition(
+                self.get_shard_identity(),
+                Timeline::MAX_GET_VECTORED_KEYS * BLCKSZ as u64,
+            );
+
+            let io_concurrency = IoConcurrency::spawn_from_conf(
+                self.conf,
+                self.gate
+                    .enter()
+                    .map_err(|_| PageReconstructError::Cancelled)?,
+            );
+
+            for batch in batches.parts.into_iter().rev() {
+                let blocks = self
+                    .get_vectored(batch, probe_lsn, io_concurrency.clone(), ctx)
                     .await?;
 
-                if clog_page.len() == BLCKSZ as usize + 8 {
-                    let mut timestamp_bytes = [0u8; 8];
-                    timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
-                    let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+                for (_key, clog_page) in blocks.into_iter().rev() {
+                    let clog_page = clog_page?;
 
-                    match f(timestamp) {
-                        ControlFlow::Break(b) => return Ok(b),
-                        ControlFlow::Continue(()) => (),
+                    if clog_page.len() == BLCKSZ as usize + 8 {
+                        let mut timestamp_bytes = [0u8; 8];
+                        timestamp_bytes.copy_from_slice(&clog_page[BLCKSZ as usize..]);
+                        let timestamp = TimestampTz::from_be_bytes(timestamp_bytes);
+
+                        match f(timestamp) {
+                            ControlFlow::Break(b) => return Ok(b),
+                            ControlFlow::Continue(()) => (),
+                        }
                     }
                 }
             }
@@ -1052,6 +1085,8 @@ impl Timeline {
     ) -> Result<u64, CalculateLogicalSizeError> {
         debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
 
+        fail::fail_point!("skip-logical-size-calculation", |_| { Ok(0) });
+
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn, ctx).await?;
         let dbdir = DbDirectory::des(&buf)?;
@@ -1718,6 +1753,35 @@ impl DatadirModification<'_> {
         Ok(())
     }
 
+    /// Returns `true` if the rel_size_v2 write path is enabled. If it is the first time that
+    /// we enable it, we also need to persist it in `index_part.json`.
+    pub fn maybe_enable_rel_size_v2(&mut self) -> anyhow::Result<bool> {
+        let status = self.tline.get_rel_size_v2_status();
+        let config = self.tline.get_rel_size_v2_enabled();
+        match (config, status) {
+            (false, RelSizeMigration::Legacy) => {
+                // tenant config didn't enable it and we didn't write any reldir_v2 key yet
+                Ok(false)
+            }
+            (false, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                Ok(true)
+            }
+            (true, RelSizeMigration::Legacy) => {
+                // The first time we enable it, we need to persist it in `index_part.json`
+                self.tline
+                    .update_rel_size_v2_status(RelSizeMigration::Migrating)?;
+                tracing::info!("enabled rel_size_v2");
+                Ok(true)
+            }
+            (true, RelSizeMigration::Migrating | RelSizeMigration::Migrated) => {
+                // index_part already persisted that the timeline has enabled rel_size_v2
+                // and we don't need to do anything
+                Ok(true)
+            }
+        }
+    }
+
     /// Store a relmapper file (pg_filenode.map) in the repository
     pub async fn put_relmap_file(
         &mut self,
@@ -1726,6 +1790,8 @@ impl DatadirModification<'_> {
         img: Bytes,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
+
         // Add it to the directory (if it doesn't exist already)
         let buf = self.get(DBDIR_KEY, ctx).await?;
         let mut dbdir = DbDirectory::des(&buf)?;
@@ -1746,7 +1812,7 @@ impl DatadirModification<'_> {
             })?;
             self.pending_directory_entries
                 .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
-            if self.tline.get_rel_size_v2_enabled() {
+            if v2_enabled {
                 self.pending_directory_entries
                     .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
             }
@@ -1898,12 +1964,12 @@ impl DatadirModification<'_> {
                 .context("deserialize db")?
         };
 
-        // Add the new relation to the rel directory entry, and write it back
-        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
-            return Err(RelationError::AlreadyExists);
-        }
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
 
-        if self.tline.get_rel_size_v2_enabled() {
+        if v2_enabled {
+            if rel_dir.rels.contains(&(rel.relnode, rel.forknum)) {
+                return Err(RelationError::AlreadyExists);
+            }
             let sparse_rel_dir_key =
                 rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
             // check if the rel_dir_key exists in v2
@@ -1938,6 +2004,10 @@ impl DatadirModification<'_> {
             self.pending_directory_entries
                 .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
         } else {
+            // Add the new relation to the rel directory entry, and write it back
+            if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
+                return Err(RelationError::AlreadyExists);
+            }
             if !dbdir_exists {
                 self.pending_directory_entries
                     .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
@@ -1951,6 +2021,7 @@ impl DatadirModification<'_> {
                 )),
             );
         }
+
         // Put size
         let size_key = rel_size_to_key(rel);
         let buf = nblocks.to_le_bytes();
@@ -2029,6 +2100,7 @@ impl DatadirModification<'_> {
         drop_relations: HashMap<(u32, u32), Vec<RelTag>>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
+        let v2_enabled = self.maybe_enable_rel_size_v2()?;
         for ((spc_node, db_node), rel_tags) in drop_relations {
             let dir_key = rel_dir_to_key(spc_node, db_node);
             let buf = self.get(dir_key, ctx).await?;
@@ -2041,7 +2113,7 @@ impl DatadirModification<'_> {
                         .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                     dirty = true;
                     true
-                } else if self.tline.get_rel_size_v2_enabled() {
+                } else if v2_enabled {
                     // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
                     // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
                     // logic).
@@ -2072,7 +2144,7 @@ impl DatadirModification<'_> {
                     // Remove entry from relation size cache
                     self.tline.remove_cached_rel_size(&rel_tag);
 
-                    // Delete size entry, as well as all blocks
+                    // Delete size entry, as well as all blocks; this is currently a no-op because we haven't implemented tombstones in storage.
                     self.delete(rel_key_range(rel_tag));
                 }
             }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ba1c814c4e..c78d15c9b5 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -31,8 +31,8 @@ use futures::StreamExt;
 use futures::stream::FuturesUnordered;
 use itertools::Itertools as _;
 use once_cell::sync::Lazy;
-use pageserver_api::models;
 pub use pageserver_api::models::TenantState;
+use pageserver_api::models::{self, RelSizeMigration};
 use pageserver_api::models::{
     CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
     WalRedoManagerStatus,
@@ -1123,6 +1123,7 @@ impl Tenant {
             CreateTimelineCause::Load,
             idempotency.clone(),
             index_part.gc_compaction.clone(),
+            index_part.rel_size_migration.clone(),
         )?;
         let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
         anyhow::ensure!(
@@ -1153,12 +1154,15 @@ impl Tenant {
             let mut tline_ending_at = Some((&timeline, timeline.get_last_record_lsn()));
             while let Some((tline, end_lsn)) = tline_ending_at {
                 let unarchival_heatmap = tline.generate_unarchival_heatmap(end_lsn).await;
-                if !tline.is_previous_heatmap_active() {
+                // Another unarchived timeline might have generated a heatmap for this ancestor.
+                // If the current branch point is greater than the previous one, use the heatmap
+                // we just generated - it should include more layers.
+                if !tline.should_keep_previous_heatmap(end_lsn) {
                     tline
                         .previous_heatmap
                         .store(Some(Arc::new(unarchival_heatmap)));
                 } else {
-                    tracing::info!("Previous heatmap still active. Dropping unarchival heatmap.")
+                    tracing::info!("Previous heatmap preferred. Dropping unarchival heatmap.")
                 }
 
                 match tline.ancestor_timeline() {
@@ -1943,6 +1947,7 @@ impl Tenant {
                 hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
                     heatmap: h,
                     read_at: hs.1,
+                    end_lsn: None,
                 })
             });
             part_downloads.spawn(
@@ -2446,6 +2451,7 @@ impl Tenant {
             create_guard,
             initdb_lsn,
             None,
+            None,
         )
         .await
     }
@@ -2501,6 +2507,7 @@ impl Tenant {
         initdb_lsn: Lsn,
         pg_version: u32,
         ctx: &RequestContext,
+        in_memory_layer_desc: Vec<timeline::InMemoryLayerTestDesc>,
         delta_layer_desc: Vec<timeline::DeltaLayerTestDesc>,
         image_layer_desc: Vec<(Lsn, Vec<(pageserver_api::key::Key, bytes::Bytes)>)>,
         end_lsn: Lsn,
@@ -2522,6 +2529,11 @@ impl Tenant {
                 .force_create_image_layer(lsn, images, Some(initdb_lsn), ctx)
                 .await?;
         }
+        for in_memory in in_memory_layer_desc {
+            tline
+                .force_create_in_memory_layer(in_memory, Some(initdb_lsn), ctx)
+                .await?;
+        }
         let layer_names = tline
             .layers
             .read()
@@ -2771,6 +2783,7 @@ impl Tenant {
                     timeline_create_guard,
                     initdb_lsn,
                     None,
+                    None,
                 )
                 .await
             }
@@ -4122,6 +4135,7 @@ impl Tenant {
         cause: CreateTimelineCause,
         create_idempotency: CreateTimelineIdempotency,
         gc_compaction_state: Option<GcCompactionState>,
+        rel_size_v2_status: Option<RelSizeMigration>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let state = match cause {
             CreateTimelineCause::Load => {
@@ -4154,6 +4168,7 @@ impl Tenant {
             self.attach_wal_lag_cooldown.clone(),
             create_idempotency,
             gc_compaction_state,
+            rel_size_v2_status,
             self.cancel.child_token(),
         );
 
@@ -4856,6 +4871,7 @@ impl Tenant {
                 timeline_create_guard,
                 start_lsn + 1,
                 Some(Arc::clone(src_timeline)),
+                Some(src_timeline.get_rel_size_v2_status()),
             )
             .await?;
 
@@ -5129,6 +5145,7 @@ impl Tenant {
                 timeline_create_guard,
                 pgdata_lsn,
                 None,
+                None,
             )
             .await?;
 
@@ -5207,13 +5224,14 @@ impl Tenant {
         create_guard: TimelineCreateGuard,
         start_lsn: Lsn,
         ancestor: Option<Arc<Timeline>>,
+        rel_size_v2_status: Option<RelSizeMigration>,
     ) -> anyhow::Result<UninitializedTimeline<'a>> {
         let tenant_shard_id = self.tenant_shard_id;
 
         let resources = self.build_timeline_resources(new_timeline_id);
         resources
             .remote_client
-            .init_upload_queue_for_empty_remote(new_metadata)?;
+            .init_upload_queue_for_empty_remote(new_metadata, rel_size_v2_status.clone())?;
 
         let timeline_struct = self
             .create_timeline_struct(
@@ -5225,6 +5243,7 @@ impl Tenant {
                 CreateTimelineCause::Load,
                 create_guard.idempotency.clone(),
                 None,
+                rel_size_v2_status,
             )
             .context("Failed to create timeline data structure")?;
 
@@ -5913,6 +5932,8 @@ mod tests {
     #[cfg(feature = "testing")]
     use timeline::GcInfo;
     #[cfg(feature = "testing")]
+    use timeline::InMemoryLayerTestDesc;
+    #[cfg(feature = "testing")]
     use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn};
     use timeline::{CompactOptions, DeltaLayerTestDesc};
     use utils::id::TenantId;
@@ -7925,6 +7946,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 Vec::new(), // delta layers
                 vec![(Lsn(0x20), vec![(base_key, test_img("data key 1"))])], // image layers
                 Lsn(0x20), // it's fine to not advance LSN to 0x30 while using 0x30 to get below because `get_vectored_impl` does not wait for LSN
@@ -8012,6 +8034,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 Vec::new(), // delta layers
                 vec![(
                     Lsn(0x20),
@@ -8227,6 +8250,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 // delta layers
                 vec![
                     DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8307,6 +8331,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 // delta layers
                 vec![
                     DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8380,6 +8405,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 // delta layers
                 vec![
                     DeltaLayerTestDesc::new_with_inferred_key_range(
@@ -8512,6 +8538,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 vec![
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -8705,6 +8732,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                     Lsn(0x10)..Lsn(0x40),
                     delta1,
@@ -8761,6 +8789,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 Vec::new(),
                 image_layers,
                 end_lsn,
@@ -8967,6 +8996,7 @@ mod tests {
                     Lsn(0x08),
                     DEFAULT_PG_VERSION,
                     &ctx,
+                    Vec::new(), // in-memory layers
                     vec![
                         DeltaLayerTestDesc::new_with_inferred_key_range(
                             Lsn(0x08)..Lsn(0x10),
@@ -8985,7 +9015,7 @@ mod tests {
                             delta3,
                         ),
                     ], // delta layers
-                    vec![], // image layers
+                    vec![],     // image layers
                     Lsn(0x50),
                 )
                 .await?
@@ -8996,6 +9026,7 @@ mod tests {
                     Lsn(0x10),
                     DEFAULT_PG_VERSION,
                     &ctx,
+                    Vec::new(), // in-memory layers
                     vec![
                         DeltaLayerTestDesc::new_with_inferred_key_range(
                             Lsn(0x10)..Lsn(0x48),
@@ -9546,6 +9577,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 vec![
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1),
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2),
@@ -9793,6 +9825,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                Vec::new(), // in-memory layers
                 vec![
                     // delta1 and delta 2 only contain a single key but multiple updates
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x30), delta1),
@@ -10028,6 +10061,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                vec![],                       // in-memory layers
                 vec![],                       // delta layers
                 vec![(Lsn(0x18), img_layer)], // image layers
                 Lsn(0x18),
@@ -10274,6 +10308,7 @@ mod tests {
                 baseline_image_layer_lsn,
                 DEFAULT_PG_VERSION,
                 &ctx,
+                vec![], // in-memory layers
                 vec![DeltaLayerTestDesc::new_with_inferred_key_range(
                     delta_layer_start_lsn..delta_layer_end_lsn,
                     delta_layer_spec,
@@ -10305,6 +10340,158 @@ mod tests {
         Ok(())
     }
 
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_vectored_read_with_image_layer_inside_inmem() -> anyhow::Result<()> {
+        let harness =
+            TenantHarness::create("test_vectored_read_with_image_layer_inside_inmem").await?;
+        let (tenant, ctx) = harness.load().await;
+
+        let will_init_keys = [2, 6];
+        fn get_key(id: u32) -> Key {
+            let mut key = Key::from_hex("110000000033333333444444445500000000").unwrap();
+            key.field6 = id;
+            key
+        }
+
+        let mut expected_key_values = HashMap::new();
+
+        let baseline_image_layer_lsn = Lsn(0x10);
+        let mut baseline_img_layer = Vec::new();
+        for i in 0..5 {
+            let key = get_key(i);
+            let value = format!("value {i}@{baseline_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            baseline_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let nested_image_layer_lsn = Lsn(0x50);
+        let mut nested_img_layer = Vec::new();
+        for i in 5..10 {
+            let key = get_key(i);
+            let value = format!("value {i}@{nested_image_layer_lsn}");
+
+            let removed = expected_key_values.insert(key, value.clone());
+            assert!(removed.is_none());
+
+            nested_img_layer.push((key, Bytes::from(value)));
+        }
+
+        let frozen_layer = {
+            let lsn_range = Lsn(0x40)..Lsn(0x60);
+            let mut data = Vec::new();
+            for i in 0..10 {
+                let key = get_key(i);
+                let key_in_nested = nested_img_layer
+                    .iter()
+                    .any(|(key_with_img, _)| *key_with_img == key);
+                let lsn = {
+                    if key_in_nested {
+                        Lsn(nested_image_layer_lsn.0 + 5)
+                    } else {
+                        lsn_range.start
+                    }
+                };
+
+                let will_init = will_init_keys.contains(&i);
+                if will_init {
+                    data.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init(""))));
+
+                    expected_key_values.insert(key, "".to_string());
+                } else {
+                    let delta = format!("@{lsn}");
+                    data.push((
+                        key,
+                        lsn,
+                        Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                    ));
+
+                    expected_key_values
+                        .get_mut(&key)
+                        .expect("An image exists for each key")
+                        .push_str(delta.as_str());
+                }
+            }
+
+            InMemoryLayerTestDesc {
+                lsn_range,
+                is_open: false,
+                data,
+            }
+        };
+
+        let (open_layer, last_record_lsn) = {
+            let start_lsn = Lsn(0x70);
+            let mut data = Vec::new();
+            let mut end_lsn = Lsn(0);
+            for i in 0..10 {
+                let key = get_key(i);
+                let lsn = Lsn(start_lsn.0 + i as u64);
+                let delta = format!("@{lsn}");
+                data.push((
+                    key,
+                    lsn,
+                    Value::WalRecord(NeonWalRecord::wal_append(&delta)),
+                ));
+
+                expected_key_values
+                    .get_mut(&key)
+                    .expect("An image exists for each key")
+                    .push_str(delta.as_str());
+
+                end_lsn = std::cmp::max(end_lsn, lsn);
+            }
+
+            (
+                InMemoryLayerTestDesc {
+                    lsn_range: start_lsn..Lsn::MAX,
+                    is_open: true,
+                    data,
+                },
+                end_lsn,
+            )
+        };
+
+        assert!(
+            nested_image_layer_lsn > frozen_layer.lsn_range.start
+                && nested_image_layer_lsn < frozen_layer.lsn_range.end
+        );
+
+        let tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                baseline_image_layer_lsn,
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![open_layer, frozen_layer], // in-memory layers
+                Vec::new(),                     // delta layers
+                vec![
+                    (baseline_image_layer_lsn, baseline_img_layer),
+                    (nested_image_layer_lsn, nested_img_layer),
+                ], // image layers
+                last_record_lsn,
+            )
+            .await?;
+
+        let keyspace = KeySpace::single(get_key(0)..get_key(10));
+        let results = tline
+            .get_vectored(keyspace, last_record_lsn, IoConcurrency::sequential(), &ctx)
+            .await
+            .expect("No vectored errors");
+        for (key, res) in results {
+            let value = res.expect("No key errors");
+            let expected_value = expected_key_values.remove(&key).expect("No unknown keys");
+            assert_eq!(value, Bytes::from(expected_value.clone()));
+
+            tracing::info!("key={key} value={expected_value}");
+        }
+
+        Ok(())
+    }
+
     fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering {
         (
             k1.is_delta,
@@ -10420,6 +10607,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                vec![], // in-memory layers
                 vec![
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1),
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2),
@@ -10804,6 +10992,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                vec![], // in-memory layers
                 vec![
                     // delta1/2/4 only contain a single key but multiple updates
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
@@ -11055,6 +11244,7 @@ mod tests {
                 Lsn(0x10),
                 DEFAULT_PG_VERSION,
                 &ctx,
+                vec![], // in-memory layers
                 vec![
                     // delta1/2/4 only contain a single key but multiple updates
                     DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x28), delta1),
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 59f5a6bd90..2b04e53f10 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -62,8 +62,7 @@ use utils::lsn::Lsn;
 
 use super::storage_layer::{LayerVisibilityHint, PersistentLayerDesc};
 use crate::context::RequestContext;
-use crate::keyspace::KeyPartitioning;
-use crate::tenant::storage_layer::InMemoryLayer;
+use crate::tenant::storage_layer::{InMemoryLayer, ReadableLayerWeak};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -167,7 +166,7 @@ impl Drop for BatchedUpdates<'_> {
 /// Return value of LayerMap::search
 #[derive(Eq, PartialEq, Debug, Hash)]
 pub struct SearchResult {
-    pub layer: Arc<PersistentLayerDesc>,
+    pub layer: ReadableLayerWeak,
     pub lsn_floor: Lsn,
 }
 
@@ -175,19 +174,37 @@ pub struct SearchResult {
 ///
 /// Contains a mapping from a layer description to a keyspace
 /// accumulator that contains all the keys which intersect the layer
-/// from the original search space. Keys that were not found are accumulated
-/// in a separate key space accumulator.
+/// from the original search space.
 #[derive(Debug)]
 pub struct RangeSearchResult {
     pub found: HashMap<SearchResult, KeySpaceAccum>,
-    pub not_found: KeySpaceAccum,
 }
 
 impl RangeSearchResult {
     fn new() -> Self {
         Self {
             found: HashMap::new(),
-            not_found: KeySpaceAccum::new(),
+        }
+    }
+
+    fn map_to_in_memory_layer(
+        in_memory_layer: Option<InMemoryLayerDesc>,
+        range: Range<Key>,
+    ) -> RangeSearchResult {
+        match in_memory_layer {
+            Some(inmem) => {
+                let search_result = SearchResult {
+                    lsn_floor: inmem.get_lsn_range().start,
+                    layer: ReadableLayerWeak::InMemoryLayer(inmem),
+                };
+
+                let mut accum = KeySpaceAccum::new();
+                accum.add_range(range);
+                RangeSearchResult {
+                    found: HashMap::from([(search_result, accum)]),
+                }
+            }
+            None => RangeSearchResult::new(),
         }
     }
 }
@@ -199,6 +216,7 @@ struct RangeSearchCollector<Iter>
 where
     Iter: Iterator<Item = (i128, Option<Arc<PersistentLayerDesc>>)>,
 {
+    in_memory_layer: Option<InMemoryLayerDesc>,
     delta_coverage: Peekable<Iter>,
     image_coverage: Peekable<Iter>,
     key_range: Range<Key>,
@@ -234,10 +252,12 @@ where
     fn new(
         key_range: Range<Key>,
         end_lsn: Lsn,
+        in_memory_layer: Option<InMemoryLayerDesc>,
         delta_coverage: Iter,
         image_coverage: Iter,
     ) -> Self {
         Self {
+            in_memory_layer,
             delta_coverage: delta_coverage.peekable(),
             image_coverage: image_coverage.peekable(),
             key_range,
@@ -266,8 +286,7 @@ where
                 return self.result;
             }
             Some(layer_type) => {
-                // Changes for the range exist. Record anything before the first
-                // coverage change as not found.
+                // Changes for the range exist.
                 let coverage_start = layer_type.next_change_at_key();
                 let range_before = self.key_range.start..coverage_start;
                 self.pad_range(range_before);
@@ -297,10 +316,22 @@ where
         self.result
     }
 
-    /// Mark a range as not found (i.e. no layers intersect it)
+    /// Map a range which does not intersect any persistent layers to
+    /// the in-memory layer candidate.
     fn pad_range(&mut self, key_range: Range<Key>) {
         if !key_range.is_empty() {
-            self.result.not_found.add_range(key_range);
+            if let Some(ref inmem) = self.in_memory_layer {
+                let search_result = SearchResult {
+                    layer: ReadableLayerWeak::InMemoryLayer(inmem.clone()),
+                    lsn_floor: inmem.get_lsn_range().start,
+                };
+
+                self.result
+                    .found
+                    .entry(search_result)
+                    .or_default()
+                    .add_range(key_range);
+            }
         }
     }
 
@@ -310,6 +341,7 @@ where
         let selected = LayerMap::select_layer(
             self.current_delta.clone(),
             self.current_image.clone(),
+            self.in_memory_layer.clone(),
             self.end_lsn,
         );
 
@@ -365,6 +397,24 @@ where
     }
 }
 
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct InMemoryLayerDesc {
+    handle: InMemoryLayerHandle,
+    lsn_range: Range<Lsn>,
+}
+
+impl InMemoryLayerDesc {
+    pub(crate) fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
+    }
+}
+
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+enum InMemoryLayerHandle {
+    Open,
+    Frozen(usize),
+}
+
 impl LayerMap {
     ///
     /// Find the latest layer (by lsn.end) that covers the given
@@ -394,69 +444,161 @@ impl LayerMap {
     /// layer result, or simplify the api to `get_latest_image` and
     /// `get_latest_delta`, and only call `get_latest_image` once.
     ///
-    /// NOTE: This only searches the 'historic' layers, *not* the
-    /// 'open' and 'frozen' layers!
-    ///
     pub fn search(&self, key: Key, end_lsn: Lsn) -> Option<SearchResult> {
-        let version = self.historic.get().unwrap().get_version(end_lsn.0 - 1)?;
+        let in_memory_layer = self.search_in_memory_layer(end_lsn);
+
+        let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
+            Some(version) => version,
+            None => {
+                return in_memory_layer.map(|desc| SearchResult {
+                    lsn_floor: desc.get_lsn_range().start,
+                    layer: ReadableLayerWeak::InMemoryLayer(desc),
+                });
+            }
+        };
+
         let latest_delta = version.delta_coverage.query(key.to_i128());
         let latest_image = version.image_coverage.query(key.to_i128());
 
-        Self::select_layer(latest_delta, latest_image, end_lsn)
+        Self::select_layer(latest_delta, latest_image, in_memory_layer, end_lsn)
     }
 
+    /// Select a layer from three potential candidates (in-memory, delta and image layer).
+    /// The candidates represent the first layer of each type which intersects a key range.
+    ///
+    /// Layer types have an implicit priority (image > delta > in-memory). For instance,
+    /// if we have the option of reading an LSN range from both an image and a delta, we
+    /// should read from the image.
     fn select_layer(
         delta_layer: Option<Arc<PersistentLayerDesc>>,
         image_layer: Option<Arc<PersistentLayerDesc>>,
+        in_memory_layer: Option<InMemoryLayerDesc>,
         end_lsn: Lsn,
     ) -> Option<SearchResult> {
         assert!(delta_layer.as_ref().is_none_or(|l| l.is_delta()));
         assert!(image_layer.as_ref().is_none_or(|l| !l.is_delta()));
 
-        match (delta_layer, image_layer) {
-            (None, None) => None,
-            (None, Some(image)) => {
+        match (delta_layer, image_layer, in_memory_layer) {
+            (None, None, None) => None,
+            (None, Some(image), None) => {
                 let lsn_floor = image.get_lsn_range().start;
                 Some(SearchResult {
-                    layer: image,
+                    layer: ReadableLayerWeak::PersistentLayer(image),
                     lsn_floor,
                 })
             }
-            (Some(delta), None) => {
+            (Some(delta), None, None) => {
                 let lsn_floor = delta.get_lsn_range().start;
                 Some(SearchResult {
-                    layer: delta,
+                    layer: ReadableLayerWeak::PersistentLayer(delta),
                     lsn_floor,
                 })
             }
-            (Some(delta), Some(image)) => {
+            (Some(delta), Some(image), None) => {
                 let img_lsn = image.get_lsn_range().start;
                 let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
                 let image_exact_match = img_lsn + 1 == end_lsn;
                 if image_is_newer || image_exact_match {
                     Some(SearchResult {
-                        layer: image,
+                        layer: ReadableLayerWeak::PersistentLayer(image),
+                        lsn_floor: img_lsn,
+                    })
+                } else {
+                    // If the delta overlaps with the image in the LSN dimension, do a partial
+                    // read up to the image layer.
+                    let lsn_floor =
+                        std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
+                    Some(SearchResult {
+                        layer: ReadableLayerWeak::PersistentLayer(delta),
+                        lsn_floor,
+                    })
+                }
+            }
+            (None, None, Some(inmem)) => {
+                let lsn_floor = inmem.get_lsn_range().start;
+                Some(SearchResult {
+                    layer: ReadableLayerWeak::InMemoryLayer(inmem),
+                    lsn_floor,
+                })
+            }
+            (None, Some(image), Some(inmem)) => {
+                // If the in-memory layer overlaps with the image in the LSN dimension, do a partial
+                // read up to the image layer.
+                let img_lsn = image.get_lsn_range().start;
+                let image_is_newer = image.get_lsn_range().end >= inmem.get_lsn_range().end;
+                let image_exact_match = img_lsn + 1 == end_lsn;
+                if image_is_newer || image_exact_match {
+                    Some(SearchResult {
+                        layer: ReadableLayerWeak::PersistentLayer(image),
                         lsn_floor: img_lsn,
                     })
                 } else {
                     let lsn_floor =
-                        std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
+                        std::cmp::max(inmem.get_lsn_range().start, image.get_lsn_range().start + 1);
                     Some(SearchResult {
-                        layer: delta,
+                        layer: ReadableLayerWeak::InMemoryLayer(inmem),
                         lsn_floor,
                     })
                 }
             }
+            (Some(delta), None, Some(inmem)) => {
+                // Overlaps between delta and in-memory layers are not a valid
+                // state, but we handle them here for completeness.
+                let delta_end = delta.get_lsn_range().end;
+                let delta_is_newer = delta_end >= inmem.get_lsn_range().end;
+                let delta_exact_match = delta_end == end_lsn;
+                if delta_is_newer || delta_exact_match {
+                    Some(SearchResult {
+                        lsn_floor: delta.get_lsn_range().start,
+                        layer: ReadableLayerWeak::PersistentLayer(delta),
+                    })
+                } else {
+                    // If the in-memory layer overlaps with the delta in the LSN dimension, do a partial
+                    // read up to the delta layer.
+                    let lsn_floor =
+                        std::cmp::max(inmem.get_lsn_range().start, delta.get_lsn_range().end);
+                    Some(SearchResult {
+                        layer: ReadableLayerWeak::InMemoryLayer(inmem),
+                        lsn_floor,
+                    })
+                }
+            }
+            (Some(delta), Some(image), Some(inmem)) => {
+                // Determine the preferred persistent layer without taking the in-memory layer
+                // into consideration.
+                let persistent_res =
+                    Self::select_layer(Some(delta.clone()), Some(image.clone()), None, end_lsn)
+                        .unwrap();
+                let persistent_l = match persistent_res.layer {
+                    ReadableLayerWeak::PersistentLayer(l) => l,
+                    ReadableLayerWeak::InMemoryLayer(_) => unreachable!(),
+                };
+
+                // Now handle the in-memory layer overlaps.
+                let inmem_res = if persistent_l.is_delta() {
+                    Self::select_layer(Some(persistent_l), None, Some(inmem.clone()), end_lsn)
+                        .unwrap()
+                } else {
+                    Self::select_layer(None, Some(persistent_l), Some(inmem.clone()), end_lsn)
+                        .unwrap()
+                };
+
+                Some(SearchResult {
+                    layer: inmem_res.layer,
+                    // Use the more restrictive LSN floor
+                    lsn_floor: std::cmp::max(persistent_res.lsn_floor, inmem_res.lsn_floor),
+                })
+            }
         }
     }
 
     pub fn range_search(&self, key_range: Range<Key>, end_lsn: Lsn) -> RangeSearchResult {
+        let in_memory_layer = self.search_in_memory_layer(end_lsn);
+
         let version = match self.historic.get().unwrap().get_version(end_lsn.0 - 1) {
             Some(version) => version,
             None => {
-                let mut result = RangeSearchResult::new();
-                result.not_found.add_range(key_range);
-                return result;
+                return RangeSearchResult::map_to_in_memory_layer(in_memory_layer, key_range);
             }
         };
 
@@ -464,7 +606,13 @@ impl LayerMap {
         let delta_changes = version.delta_coverage.range_overlaps(&raw_range);
         let image_changes = version.image_coverage.range_overlaps(&raw_range);
 
-        let collector = RangeSearchCollector::new(key_range, end_lsn, delta_changes, image_changes);
+        let collector = RangeSearchCollector::new(
+            key_range,
+            end_lsn,
+            in_memory_layer,
+            delta_changes,
+            image_changes,
+        );
         collector.collect()
     }
 
@@ -571,17 +719,36 @@ impl LayerMap {
     }
 
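-    /// Get a ref counted pointer for the first in memory layer that matches the provided predicate.
+    /// Get a descriptor for the open layer, or else the last frozen layer, that starts below `below`.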
-    pub fn find_in_memory_layer<Pred>(&self, mut pred: Pred) -> Option<Arc<InMemoryLayer>>
-    where
-        Pred: FnMut(&Arc<InMemoryLayer>) -> bool,
-    {
+    pub(crate) fn search_in_memory_layer(&self, below: Lsn) -> Option<InMemoryLayerDesc> {
+        let is_below = |l: &Arc<InMemoryLayer>| {
+            let start_lsn = l.get_lsn_range().start;
+            below > start_lsn
+        };
+
         if let Some(open) = &self.open_layer {
-            if pred(open) {
-                return Some(open.clone());
+            if is_below(open) {
+                return Some(InMemoryLayerDesc {
+                    handle: InMemoryLayerHandle::Open,
+                    lsn_range: open.get_lsn_range(),
+                });
             }
         }
 
-        self.frozen_layers.iter().rfind(|l| pred(l)).cloned()
+        self.frozen_layers
+            .iter()
+            .enumerate()
+            .rfind(|(_idx, l)| is_below(l))
+            .map(|(idx, l)| InMemoryLayerDesc {
+                handle: InMemoryLayerHandle::Frozen(idx),
+                lsn_range: l.get_lsn_range(),
+            })
+    }
+
+    pub(crate) fn in_memory_layer(&self, desc: &InMemoryLayerDesc) -> Arc<InMemoryLayer> {
+        match desc.handle {
+            InMemoryLayerHandle::Open => self.open_layer.as_ref().unwrap().clone(),
+            InMemoryLayerHandle::Frozen(idx) => self.frozen_layers[idx].clone(),
+        }
     }
 
     ///
@@ -737,136 +904,6 @@ impl LayerMap {
         max_stacked_deltas
     }
 
-    /// Count how many reimage-worthy layers we need to visit for given key-lsn pair.
-    ///
-    /// The `partition_range` argument is used as context for the reimage-worthiness decision.
-    ///
-    /// Used as a helper for correctness checks only. Performance not critical.
-    pub fn get_difficulty(&self, lsn: Lsn, key: Key, partition_range: &Range<Key>) -> usize {
-        match self.search(key, lsn) {
-            Some(search_result) => {
-                if search_result.layer.is_incremental() {
-                    (Self::is_reimage_worthy(&search_result.layer, partition_range) as usize)
-                        + self.get_difficulty(search_result.lsn_floor, key, partition_range)
-                } else {
-                    0
-                }
-            }
-            None => 0,
-        }
-    }
-
-    /// Used for correctness checking. Results are expected to be identical to
-    /// self.get_difficulty_map. Assumes self.search is correct.
-    pub fn get_difficulty_map_bruteforce(
-        &self,
-        lsn: Lsn,
-        partitioning: &KeyPartitioning,
-    ) -> Vec<usize> {
-        // Looking at the difficulty as a function of key, it could only increase
-        // when a delta layer starts or an image layer ends. Therefore it's sufficient
-        // to check the difficulties at:
-        // - the key.start for each non-empty part range
-        // - the key.start for each delta
-        // - the key.end for each image
-        let keys_iter: Box<dyn Iterator<Item = Key>> = {
-            let mut keys: Vec<Key> = self
-                .iter_historic_layers()
-                .map(|layer| {
-                    if layer.is_incremental() {
-                        layer.get_key_range().start
-                    } else {
-                        layer.get_key_range().end
-                    }
-                })
-                .collect();
-            keys.sort();
-            Box::new(keys.into_iter())
-        };
-        let mut keys_iter = keys_iter.peekable();
-
-        // Iter the partition and keys together and query all the necessary
-        // keys, computing the max difficulty for each part.
-        partitioning
-            .parts
-            .iter()
-            .map(|part| {
-                let mut difficulty = 0;
-                // Partition ranges are assumed to be sorted and disjoint
-                // TODO assert it
-                for range in &part.ranges {
-                    if !range.is_empty() {
-                        difficulty =
-                            std::cmp::max(difficulty, self.get_difficulty(lsn, range.start, range));
-                    }
-                    while let Some(key) = keys_iter.peek() {
-                        if key >= &range.end {
-                            break;
-                        }
-                        let key = keys_iter.next().unwrap();
-                        if key < range.start {
-                            continue;
-                        }
-                        difficulty =
-                            std::cmp::max(difficulty, self.get_difficulty(lsn, key, range));
-                    }
-                }
-                difficulty
-            })
-            .collect()
-    }
-
-    /// For each part of a keyspace partitioning, return the maximum number of layers
-    /// that would be needed for page reconstruction in that part at the given LSN.
-    ///
-    /// If `limit` is provided we don't try to count above that number.
-    ///
-    /// This method is used to decide where to create new image layers. Computing the
-    /// result for the entire partitioning at once allows this function to be more
-    /// efficient, and further optimization is possible by using iterators instead,
-    /// to allow early return.
-    ///
-    /// TODO actually use this method instead of count_deltas. Currently we only use
-    ///      it for benchmarks.
-    pub fn get_difficulty_map(
-        &self,
-        lsn: Lsn,
-        partitioning: &KeyPartitioning,
-        limit: Option<usize>,
-    ) -> Vec<usize> {
-        // TODO This is a naive implementation. Perf improvements to do:
-        // 1. Instead of calling self.image_coverage and self.count_deltas,
-        //    iterate the image and delta coverage only once.
-        partitioning
-            .parts
-            .iter()
-            .map(|part| {
-                let mut difficulty = 0;
-                for range in &part.ranges {
-                    if limit == Some(difficulty) {
-                        break;
-                    }
-                    for (img_range, last_img) in self.image_coverage(range, lsn) {
-                        if limit == Some(difficulty) {
-                            break;
-                        }
-                        let img_lsn = if let Some(last_img) = last_img {
-                            last_img.get_lsn_range().end
-                        } else {
-                            Lsn(0)
-                        };
-
-                        if img_lsn < lsn {
-                            let num_deltas = self.count_deltas(&img_range, &(img_lsn..lsn), limit);
-                            difficulty = std::cmp::max(difficulty, num_deltas);
-                        }
-                    }
-                }
-                difficulty
-            })
-            .collect()
-    }
-
     /// Return all L0 delta layers
     pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
         &self.l0_delta_layers
@@ -1069,6 +1106,10 @@ mod tests {
     use std::collections::HashMap;
     use std::path::PathBuf;
 
+    use crate::{
+        DEFAULT_PG_VERSION,
+        tenant::{harness::TenantHarness, storage_layer::LayerName},
+    };
     use pageserver_api::key::DBDIR_KEY;
     use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
     use utils::id::{TenantId, TimelineId};
@@ -1076,7 +1117,6 @@ mod tests {
 
     use super::*;
     use crate::tenant::IndexPart;
-    use crate::tenant::storage_layer::LayerName;
 
     #[derive(Clone)]
     struct LayerDesc {
@@ -1101,7 +1141,6 @@ mod tests {
     }
 
     fn assert_range_search_result_eq(lhs: RangeSearchResult, rhs: RangeSearchResult) {
-        assert_eq!(lhs.not_found.to_keyspace(), rhs.not_found.to_keyspace());
         let lhs: HashMap<SearchResult, KeySpace> = lhs
             .found
             .into_iter()
@@ -1127,17 +1166,12 @@ mod tests {
         let mut key = key_range.start;
         while key != key_range.end {
             let res = layer_map.search(key, end_lsn);
-            match res {
-                Some(res) => {
-                    range_search_result
-                        .found
-                        .entry(res)
-                        .or_default()
-                        .add_key(key);
-                }
-                None => {
-                    range_search_result.not_found.add_key(key);
-                }
+            if let Some(res) = res {
+                range_search_result
+                    .found
+                    .entry(res)
+                    .or_default()
+                    .add_key(key);
             }
 
             key = key.next();
@@ -1152,20 +1186,49 @@ mod tests {
         let range = Key::from_i128(100)..Key::from_i128(200);
 
         let res = layer_map.range_search(range.clone(), Lsn(100));
-        assert_eq!(
-            res.not_found.to_keyspace(),
-            KeySpace {
-                ranges: vec![range]
-            }
-        );
+        assert_range_search_result_eq(res, RangeSearchResult::new());
     }
 
-    #[test]
-    fn ranged_search() {
+    #[tokio::test]
+    async fn ranged_search() {
+        let harness = TenantHarness::create("ranged_search").await.unwrap();
+        let (tenant, ctx) = harness.load().await;
+        let timeline_id = TimelineId::generate();
+        // Create the timeline such that the in-memory layers can be written
+        // to the timeline directory.
+        tenant
+            .create_test_timeline(timeline_id, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await
+            .unwrap();
+
+        let gate = utils::sync::gate::Gate::default();
+        let add_in_memory_layer = async |layer_map: &mut LayerMap, lsn_range: Range<Lsn>| {
+            let layer = InMemoryLayer::create(
+                harness.conf,
+                timeline_id,
+                harness.tenant_shard_id,
+                lsn_range.start,
+                &gate,
+                &ctx,
+            )
+            .await
+            .unwrap();
+
+            layer.freeze(lsn_range.end).await;
+
+            layer_map.frozen_layers.push_back(Arc::new(layer));
+        };
+
+        let in_memory_layer_configurations = [
+            vec![],
+            // Overlaps with the top-most image
+            vec![Lsn(35)..Lsn(50)],
+        ];
+
         let layers = vec![
             LayerDesc {
                 key_range: Key::from_i128(15)..Key::from_i128(50),
-                lsn_range: Lsn(0)..Lsn(5),
+                lsn_range: Lsn(5)..Lsn(6),
                 is_delta: false,
             },
             LayerDesc {
@@ -1185,19 +1248,27 @@ mod tests {
             },
             LayerDesc {
                 key_range: Key::from_i128(35)..Key::from_i128(40),
-                lsn_range: Lsn(35)..Lsn(40),
+                lsn_range: Lsn(40)..Lsn(41),
                 is_delta: false,
             },
         ];
 
-        let layer_map = create_layer_map(layers.clone());
-        for start in 0..60 {
-            for end in (start + 1)..60 {
-                let range = Key::from_i128(start)..Key::from_i128(end);
-                let result = layer_map.range_search(range.clone(), Lsn(100));
-                let expected = brute_force_range_search(&layer_map, range, Lsn(100));
+        let mut layer_map = create_layer_map(layers.clone());
+        for in_memory_layers in in_memory_layer_configurations {
+            for in_mem_layer_range in in_memory_layers {
+                add_in_memory_layer(&mut layer_map, in_mem_layer_range).await;
+            }
 
-                assert_range_search_result_eq(result, expected);
+            for start in 0..60 {
+                for end in (start + 1)..60 {
+                    let range = Key::from_i128(start)..Key::from_i128(end);
+                    let result = layer_map.range_search(range.clone(), Lsn(100));
+                    let expected = brute_force_range_search(&layer_map, range, Lsn(100));
+
+                    eprintln!("{start}..{end}: {result:?}");
+
+                    assert_range_search_result_eq(result, expected);
+                }
             }
         }
     }
@@ -1490,12 +1561,348 @@ mod tests {
 
         // Sanity: the layer that holds latest data for the DBDIR key should always be visible
         // (just using this key as a key that will always exist for any layermap fixture)
-        let dbdir_layer = layer_map
-            .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
-            .unwrap();
+        let dbdir_layer = {
+            let readable_layer = layer_map
+                .search(DBDIR_KEY, index.metadata.disk_consistent_lsn())
+                .unwrap();
+
+            match readable_layer.layer {
+                ReadableLayerWeak::PersistentLayer(desc) => desc,
+                ReadableLayerWeak::InMemoryLayer(_) => unreachable!(""),
+            }
+        };
         assert!(matches!(
-            layer_visibilities.get(&dbdir_layer.layer).unwrap(),
+            layer_visibilities.get(&dbdir_layer).unwrap(),
             LayerVisibilityHint::Visible
         ));
     }
 }
+
+#[cfg(test)]
+mod select_layer_tests {
+    use super::*;
+
+    fn create_persistent_layer(
+        start_lsn: u64,
+        end_lsn: u64,
+        is_delta: bool,
+    ) -> Arc<PersistentLayerDesc> {
+        if !is_delta {
+            assert_eq!(end_lsn, start_lsn + 1);
+        }
+
+        Arc::new(PersistentLayerDesc::new_test(
+            Key::MIN..Key::MAX,
+            Lsn(start_lsn)..Lsn(end_lsn),
+            is_delta,
+        ))
+    }
+
+    fn create_inmem_layer(start_lsn: u64, end_lsn: u64) -> InMemoryLayerDesc {
+        InMemoryLayerDesc {
+            handle: InMemoryLayerHandle::Open,
+            lsn_range: Lsn(start_lsn)..Lsn(end_lsn),
+        }
+    }
+
+    #[test]
+    fn test_select_layer_empty() {
+        assert!(LayerMap::select_layer(None, None, None, Lsn(100)).is_none());
+    }
+
+    #[test]
+    fn test_select_layer_only_delta() {
+        let delta = create_persistent_layer(10, 20, true);
+        let result = LayerMap::select_layer(Some(delta.clone()), None, None, Lsn(100)).unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_only_image() {
+        let image = create_persistent_layer(10, 11, false);
+        let result = LayerMap::select_layer(None, Some(image.clone()), None, Lsn(100)).unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_only_inmem() {
+        let inmem = create_inmem_layer(10, 20);
+        let result = LayerMap::select_layer(None, None, Some(inmem.clone()), Lsn(100)).unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem));
+    }
+
+    #[test]
+    fn test_select_layer_image_inside_delta() {
+        let delta = create_persistent_layer(10, 20, true);
+        let image = create_persistent_layer(15, 16, false);
+
+        let result =
+            LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(100))
+                .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(16));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            None,
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(15));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_newer_image() {
+        let delta = create_persistent_layer(10, 20, true);
+        let image = create_persistent_layer(25, 26, false);
+
+        let result =
+            LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30))
+                .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(25));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+
+        let result =
+            LayerMap::select_layer(Some(delta.clone()), None, None, result.lsn_floor).unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_delta_with_older_image() {
+        let delta = create_persistent_layer(15, 25, true);
+        let image = create_persistent_layer(10, 11, false);
+
+        let result =
+            LayerMap::select_layer(Some(delta.clone()), Some(image.clone()), None, Lsn(30))
+                .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(15));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+
+        let result =
+            LayerMap::select_layer(None, Some(image.clone()), None, result.lsn_floor).unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_image_inside_inmem() {
+        let image = create_persistent_layer(15, 16, false);
+        let inmem = create_inmem_layer(10, 25);
+
+        let result =
+            LayerMap::select_layer(None, Some(image.clone()), Some(inmem.clone()), Lsn(30))
+                .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(16));
+        assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem));
+
+        let result = LayerMap::select_layer(
+            None,
+            Some(image.clone()),
+            Some(inmem.clone()),
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(15));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+
+        let result =
+            LayerMap::select_layer(None, None, Some(inmem.clone()), result.lsn_floor).unwrap();
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem));
+    }
+
+    #[test]
+    fn test_select_layer_delta_inside_inmem() {
+        let delta_top = create_persistent_layer(15, 20, true);
+        let delta_bottom = create_persistent_layer(10, 15, true);
+        let inmem = create_inmem_layer(15, 25);
+
+        let result =
+            LayerMap::select_layer(Some(delta_top.clone()), None, Some(inmem.clone()), Lsn(30))
+                .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(20));
+        assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem));
+
+        let result = LayerMap::select_layer(
+            Some(delta_top.clone()),
+            None,
+            Some(inmem.clone()),
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(15));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_top))
+        );
+
+        let result = LayerMap::select_layer(
+            Some(delta_bottom.clone()),
+            None,
+            Some(inmem.clone()),
+            result.lsn_floor,
+        )
+        .unwrap();
+        assert_eq!(result.lsn_floor, Lsn(10));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta_bottom))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_all_overlap_1() {
+        let inmem = create_inmem_layer(10, 30);
+        let delta = create_persistent_layer(15, 25, true);
+        let image = create_persistent_layer(20, 21, false);
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            Some(inmem.clone()),
+            Lsn(50),
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(25));
+        assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem));
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            Some(inmem.clone()),
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(21));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            Some(inmem.clone()),
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(20));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_all_overlap_2() {
+        let inmem = create_inmem_layer(20, 30);
+        let delta = create_persistent_layer(10, 40, true);
+        let image = create_persistent_layer(25, 26, false);
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            Some(inmem.clone()),
+            Lsn(50),
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(26));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            Some(inmem.clone()),
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(25));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+    }
+
+    #[test]
+    fn test_select_layer_all_overlap_3() {
+        let inmem = create_inmem_layer(30, 40);
+        let delta = create_persistent_layer(10, 30, true);
+        let image = create_persistent_layer(20, 21, false);
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            Some(inmem.clone()),
+            Lsn(50),
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(30));
+        assert!(matches!(result.layer, ReadableLayerWeak::InMemoryLayer(l) if l == inmem));
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            None,
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(21));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &delta))
+        );
+
+        let result = LayerMap::select_layer(
+            Some(delta.clone()),
+            Some(image.clone()),
+            None,
+            result.lsn_floor,
+        )
+        .unwrap();
+
+        assert_eq!(result.lsn_floor, Lsn(20));
+        assert!(
+            matches!(result.layer, ReadableLayerWeak::PersistentLayer(l) if Arc::ptr_eq(&l, &image))
+        );
+    }
+}
diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
index f8bec48886..b3dc8e56a3 100644
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -63,6 +63,8 @@ pub struct HistoricLayerCoverage<Value> {
     /// The latest state
     head: LayerCoverageTuple<Value>,
 
+    /// TODO: this could be an ordered vec using binary search.
+    /// We push into this map every time we add a layer, so we might see some benefit.
     /// All previous states
     historic: BTreeMap<u64, LayerCoverageTuple<Value>>,
 }
@@ -419,6 +421,10 @@ pub struct BufferedHistoricLayerCoverage<Value> {
     buffer: BTreeMap<LayerKey, Option<Value>>,
 
     /// All current layers. This is not used for search. Only to make rebuilds easier.
+    // TODO: This map is never cleared. Rebuilds could use the post-trim last entry of
+    // [`Self::historic_coverage`] instead of doubling memory usage.
+    // [`Self::len`]: can require rebuild and serve from latest historic
+    // [`Self::iter`]: already requires rebuild => can serve from latest historic
     layers: BTreeMap<LayerKey, Value>,
 }
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 4ba5844fea..891760b499 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -194,7 +194,7 @@ pub(crate) use download::{
 };
 use index::GcCompactionState;
 pub(crate) use index::LayerFileMetadata;
-use pageserver_api::models::TimelineArchivalState;
+use pageserver_api::models::{RelSizeMigration, TimelineArchivalState};
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use regex::Regex;
 use remote_storage::{
@@ -437,9 +437,13 @@ impl RemoteTimelineClient {
 
     /// Initialize the upload queue for the case where the remote storage is empty,
     /// i.e., it doesn't have an `IndexPart`.
+    ///
+    /// `rel_size_v2_status` needs to be carried over during branching, and that's why
+    /// it's passed in here.
     pub fn init_upload_queue_for_empty_remote(
         &self,
         local_metadata: &TimelineMetadata,
+        rel_size_v2_status: Option<RelSizeMigration>,
     ) -> anyhow::Result<()> {
         // Set the maximum number of inprogress tasks to the remote storage concurrency. There's
         // certainly no point in starting more upload tasks than this.
@@ -449,7 +453,9 @@ impl RemoteTimelineClient {
             .as_ref()
             .map_or(0, |r| r.concurrency_limit());
         let mut upload_queue = self.upload_queue.lock().unwrap();
-        upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        let initialized_queue =
+            upload_queue.initialize_empty_remote(local_metadata, inprogress_limit)?;
+        initialized_queue.dirty.rel_size_migration = rel_size_v2_status;
         self.update_remote_physical_size_gauge(None);
         info!("initialized upload queue as empty");
         Ok(())
@@ -900,7 +906,7 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    /// Launch an index-file upload operation in the background, setting `import_pgdata` field.
+    /// Launch an index-file upload operation in the background, setting `gc_compaction_state` field.
     pub(crate) fn schedule_index_upload_for_gc_compaction_state_update(
         self: &Arc<Self>,
         gc_compaction_state: GcCompactionState,
@@ -912,6 +918,21 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Record `rel_size_v2_status` in the in-memory index part (`dirty.rel_size_migration`); no index upload is scheduled here, so the next upload carries it.
+    pub(crate) fn schedule_index_upload_for_rel_size_v2_status_update(
+        self: &Arc<Self>,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        upload_queue.dirty.rel_size_migration = Some(rel_size_v2_status);
+        // TODO: allow this operation to bypass the validation check because we might upload the index part
+        // with no layers but the flag updated. For now, we just modify the index part in memory and the next
+        // upload will include the flag.
+        // self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
     ///
     /// Launch an index-file upload operation in the background, if necessary.
     ///
@@ -933,6 +954,14 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Only used in the `patch_index_part` HTTP API to force trigger an index upload.
+    pub fn force_schedule_index_upload(self: &Arc<Self>) -> Result<(), NotInitialized> {
+        let mut guard = self.upload_queue.lock().unwrap();
+        let upload_queue = guard.initialized_mut()?;
+        self.schedule_index_upload(upload_queue);
+        Ok(())
+    }
+
     /// Launch an index-file upload operation in the background (internal function)
     fn schedule_index_upload(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
         let disk_consistent_lsn = upload_queue.dirty.metadata.disk_consistent_lsn();
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index ceaed58bbd..16c38be907 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,6 +7,7 @@ use std::collections::HashMap;
 
 use chrono::NaiveDateTime;
 use pageserver_api::models::AuxFilePolicy;
+use pageserver_api::models::RelSizeMigration;
 use pageserver_api::shard::ShardIndex;
 use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
@@ -117,21 +118,6 @@ pub struct GcCompactionState {
     pub(crate) last_completed_lsn: Lsn,
 }
 
-#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub enum RelSizeMigration {
-    /// The tenant is using the old rel_size format.
-    /// Note that this enum is persisted as `Option<RelSizeMigration>` in the index part, so
-    /// `None` is the same as `Some(RelSizeMigration::Legacy)`.
-    Legacy,
-    /// The tenant is migrating to the new rel_size format. Both old and new rel_size format are
-    /// persisted in the index part. The read path will read both formats and merge them.
-    Migrating,
-    /// The tenant has migrated to the new rel_size format. Only the new rel_size format is persisted
-    /// in the index part, and the read path will not read the old format.
-    Migrated,
-}
-
 impl IndexPart {
     /// When adding or modifying any parts of `IndexPart`, increment the version so that it can be
     /// used to understand later versions.
diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs
index a13b9323ac..5f3a0932c4 100644
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -869,8 +869,7 @@ impl<'a> TenantDownloader<'a> {
                 let heatmap_timeline = heatmap.timelines.get(heatmap_timeline_index).unwrap();
 
                 let layers_in_heatmap = heatmap_timeline
-                    .layers
-                    .iter()
+                    .hot_layers()
                     .map(|l| (&l.name, l.metadata.generation))
                     .collect::<HashSet<_>>();
                 let layers_on_disk = timeline_state
@@ -1015,7 +1014,8 @@ impl<'a> TenantDownloader<'a> {
         // Accumulate updates to the state
         let mut touched = Vec::new();
 
-        for layer in timeline.layers {
+        let timeline_id = timeline.timeline_id;
+        for layer in timeline.into_hot_layers() {
             if self.secondary_state.cancel.is_cancelled() {
                 tracing::debug!("Cancelled -- dropping out of layer loop");
                 return (Err(UpdateError::Cancelled), touched);
@@ -1040,7 +1040,7 @@ impl<'a> TenantDownloader<'a> {
             }
 
             match self
-                .download_layer(tenant_shard_id, &timeline.timeline_id, layer, ctx)
+                .download_layer(tenant_shard_id, &timeline_id, layer, ctx)
                 .await
             {
                 Ok(Some(layer)) => touched.push(layer),
@@ -1148,7 +1148,7 @@ impl<'a> TenantDownloader<'a> {
         let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
         let timeline_id = timeline.timeline_id;
 
-        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.layers.len());
+        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());
 
         let (result, touched) = self
             .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
@@ -1316,11 +1316,11 @@ async fn init_timeline_state(
     // As we iterate through layers found on disk, we will look up their metadata from this map.
     // Layers not present in metadata will be discarded.
     let heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
-        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+        heatmap.hot_layers().map(|l| (&l.name, l)).collect();
 
     let last_heatmap_metadata: HashMap<&LayerName, &HeatMapLayer> =
         if let Some(last_heatmap) = last_heatmap {
-            last_heatmap.layers.iter().map(|l| (&l.name, l)).collect()
+            last_heatmap.hot_layers().map(|l| (&l.name, l)).collect()
         } else {
             HashMap::new()
         };
diff --git a/pageserver/src/tenant/secondary/heatmap.rs b/pageserver/src/tenant/secondary/heatmap.rs
index 4a938e9095..6dbb3f091f 100644
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -42,7 +42,7 @@ pub(crate) struct HeatMapTimeline {
     #[serde_as(as = "DisplayFromStr")]
     pub(crate) timeline_id: TimelineId,
 
-    pub(crate) layers: Vec<HeatMapLayer>,
+    layers: Vec<HeatMapLayer>,
 }
 
 #[serde_as]
@@ -53,8 +53,10 @@ pub(crate) struct HeatMapLayer {
 
     #[serde_as(as = "TimestampSeconds<i64>")]
     pub(crate) access_time: SystemTime,
-    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
-    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
+
+    #[serde(default)]
+    pub(crate) cold: bool, // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
+                           // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
 }
 
 impl HeatMapLayer {
@@ -62,11 +64,13 @@ impl HeatMapLayer {
         name: LayerName,
         metadata: LayerFileMetadata,
         access_time: SystemTime,
+        cold: bool,
     ) -> Self {
         Self {
             name,
             metadata,
             access_time,
+            cold,
         }
     }
 }
@@ -78,6 +82,18 @@ impl HeatMapTimeline {
             layers,
         }
     }
+
+    pub(crate) fn into_hot_layers(self) -> impl Iterator<Item = HeatMapLayer> {
+        self.layers.into_iter().filter(|l| !l.cold)
+    }
+
+    pub(crate) fn hot_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
+        self.layers.iter().filter(|l| !l.cold)
+    }
+
+    pub(crate) fn all_layers(&self) -> impl Iterator<Item = &HeatMapLayer> {
+        self.layers.iter()
+    }
 }
 
 pub(crate) struct HeatMapStats {
@@ -92,7 +108,7 @@ impl HeatMapTenant {
             layers: 0,
         };
         for timeline in &self.timelines {
-            for layer in &timeline.layers {
+            for layer in timeline.hot_layers() {
                 stats.layers += 1;
                 stats.bytes += layer.metadata.file_size;
             }
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 7f313f46a2..ece163b24a 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -40,6 +40,7 @@ use utils::sync::gate::GateGuard;
 
 use self::inmemory_layer::InMemoryLayerFileId;
 use super::PageReconstructError;
+use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
 use crate::config::PageServerConf;
 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -721,6 +722,12 @@ struct LayerToVisitId {
     lsn_floor: Lsn,
 }
 
+#[derive(Debug, PartialEq, Eq, Hash)]
+pub enum ReadableLayerWeak {
+    PersistentLayer(Arc<PersistentLayerDesc>),
+    InMemoryLayer(InMemoryLayerDesc),
+}
+
 /// Layer wrapper for the read path. Note that it is valid
 /// to use these layers even after external operations have
 /// been performed on them (compaction, freeze, etc.).
@@ -873,7 +880,7 @@ impl ReadableLayer {
             }
             ReadableLayer::InMemoryLayer(layer) => {
                 layer
-                    .get_values_reconstruct_data(keyspace, lsn_range.end, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
                     .await
             }
         }
diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
index ffdfe1dc27..46135b5330 100644
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -416,7 +416,7 @@ impl InMemoryLayer {
     pub(crate) async fn get_values_reconstruct_data(
         self: &Arc<InMemoryLayer>,
         keyspace: KeySpace,
-        end_lsn: Lsn,
+        lsn_range: Range<Lsn>,
         reconstruct_state: &mut ValuesReconstructState,
         ctx: &RequestContext,
     ) -> Result<(), GetVectoredError> {
@@ -433,8 +433,6 @@ impl InMemoryLayer {
         let mut reads: HashMap<Key, Vec<ValueRead>> = HashMap::new();
         let mut ios: HashMap<(Key, Lsn), OnDiskValueIo> = Default::default();
 
-        let lsn_range = self.start_lsn..end_lsn;
-
         for range in keyspace.ranges.iter() {
             for (key, vec_map) in inner
                 .index
diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index bde7fbc1f9..247092bf45 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -1563,10 +1563,10 @@ impl LayerInner {
 
         self.access_stats.record_residence_event();
 
-        self.status.as_ref().unwrap().send_replace(Status::Evicted);
-
         *self.last_evicted_at.lock().unwrap() = Some(std::time::Instant::now());
 
+        self.status.as_ref().unwrap().send_replace(Status::Evicted);
+
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs
index d43dfefdbc..a7f3c6b8c5 100644
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -49,6 +49,7 @@ async fn smoke_test() {
             Lsn(0x10),
             14,
             &ctx,
+            Default::default(), // in-memory layers
             Default::default(),
             image_layers,
             Lsn(0x100),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 662088fbde..4483ecfe94 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -46,7 +46,7 @@ use pageserver_api::keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPart
 use pageserver_api::models::{
     CompactKeyRange, CompactLsnRange, CompactionAlgorithm, CompactionAlgorithmSettings,
     DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
-    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, TimelineState,
+    InMemoryLayerInfo, LayerMapInfo, LsnLease, PageTraceEvent, RelSizeMigration, TimelineState,
 };
 use pageserver_api::reltag::{BlockNumber, RelTag};
 use pageserver_api::shard::{ShardIdentity, ShardIndex, ShardNumber, TenantShardId};
@@ -99,7 +99,8 @@ use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate,
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
 use crate::metrics::{
-    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
+    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
+    LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
 };
 use crate::page_service::TenantManagerTypes;
 use crate::pgdatadir_mapping::{
@@ -436,12 +437,16 @@ pub struct Timeline {
     /// May host a background Tokio task which downloads all the layers from the current
     /// heatmap on demand.
     heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
+
+    pub(crate) rel_size_v2_status: ArcSwapOption<RelSizeMigration>,
 }
 
 pub(crate) enum PreviousHeatmap {
     Active {
         heatmap: HeatMapTimeline,
         read_at: std::time::Instant,
+        // End LSN covered by the heatmap if known
+        end_lsn: Option<Lsn>,
     },
     Obsolete,
 }
@@ -1326,10 +1331,6 @@ impl Timeline {
         // (this is a requirement, not a bug). Skip updating the metric in these cases
         // to avoid infinite results.
         if !results.is_empty() {
-            // Record the total number of layers visited towards each key in the batch. While some
-            // layers may not intersect with a given read, and the cost of layer visits are
-            // amortized across the batch, each visited layer contributes directly to the observed
-            // latency for every read in the batch, which is what we care about.
             if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
                 static LOG_PACER: Lazy<Mutex<RateLimit>> =
                     Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
@@ -1344,9 +1345,23 @@ impl Timeline {
                 });
             }
 
+            // Records the number of layers visited in a few different ways:
+            //
+            // * LAYERS_PER_READ: all layers count towards every read in the batch, because each
+            //   layer directly affects its observed latency.
+            //
+            // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
+            //   layer visits and access cost.
+            //
+            // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
+            //   read amplification after batching.
+            let layers_visited = layers_visited as f64;
+            let avg_layers_visited = layers_visited / results.len() as f64;
+            LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
             for _ in &results {
-                self.metrics.layers_per_read.observe(layers_visited as f64);
-                LAYERS_PER_READ_GLOBAL.observe(layers_visited as f64);
+                self.metrics.layers_per_read.observe(layers_visited);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited);
+                LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
             }
         }
 
@@ -2366,6 +2381,9 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
     }
 
+    /// Returns `true` if the rel_size_v2 config is enabled. NOTE: the write path and read path
+    /// should look at `get_rel_size_v2_status()` to get the actual status of the timeline. It is
+    /// possible that the index part persists the state while the config doesn't get persisted.
     pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
@@ -2374,6 +2392,14 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
     }
 
+    /// Returns the stored rel_size_v2 status, or `RelSizeMigration::Legacy` when none is set.
+    pub(crate) fn get_rel_size_v2_status(&self) -> RelSizeMigration {
+        match self.rel_size_v2_status.load().as_deref() {
+            Some(status) => status.clone(),
+            None => RelSizeMigration::Legacy,
+        }
+    }
+
     fn get_compaction_upper_limit(&self) -> usize {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
@@ -2634,6 +2660,7 @@ impl Timeline {
         attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>,
         create_idempotency: crate::tenant::CreateTimelineIdempotency,
         gc_compaction_state: Option<GcCompactionState>,
+        rel_size_v2_status: Option<RelSizeMigration>,
         cancel: CancellationToken,
     ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
@@ -2792,6 +2819,8 @@ impl Timeline {
                 previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
 
                 heatmap_layers_downloader: Mutex::new(None),
+
+                rel_size_v2_status: ArcSwapOption::from_pointee(rel_size_v2_status),
             };
 
             result.repartition_threshold =
@@ -2868,6 +2897,16 @@ impl Timeline {
             .schedule_index_upload_for_gc_compaction_state_update(gc_compaction_state)
     }
 
+    pub(crate) fn update_rel_size_v2_status(
+        &self,
+        rel_size_v2_status: RelSizeMigration,
+    ) -> anyhow::Result<()> {
+        self.rel_size_v2_status
+            .store(Some(Arc::new(rel_size_v2_status.clone())));
+        self.remote_client
+            .schedule_index_upload_for_rel_size_v2_status_update(rel_size_v2_status)
+    }
+
     pub(crate) fn get_gc_compaction_state(&self) -> Option<GcCompactionState> {
         self.gc_compaction_state.load_full().as_ref().clone()
     }
@@ -3570,12 +3609,16 @@ impl Timeline {
         Ok(layer)
     }
 
-    pub(super) fn is_previous_heatmap_active(&self) -> bool {
-        self.previous_heatmap
-            .load()
-            .as_ref()
-            .map(|prev| matches!(**prev, PreviousHeatmap::Active { .. }))
-            .unwrap_or(false)
+    /// Decides whether the previous heatmap is still worth keeping. An active heatmap is kept
+    /// when its end LSN is unknown or strictly greater than `new_heatmap_end_lsn`; an obsolete
+    /// or missing one never is.
+    pub(super) fn should_keep_previous_heatmap(&self, new_heatmap_end_lsn: Lsn) -> bool {
+        let previous = self.previous_heatmap.load();
+        match previous.as_deref() {
+            Some(PreviousHeatmap::Active { end_lsn: None, .. }) => true,
+            Some(PreviousHeatmap::Active { end_lsn: Some(end), .. }) => *end > new_heatmap_end_lsn,
+            Some(PreviousHeatmap::Obsolete) | None => false,
+        }
+    }
 
     /// The timeline heatmap is a hint to secondary locations from the primary location,
@@ -3603,26 +3646,26 @@ impl Timeline {
         // heatamp.
         let previous_heatmap = self.previous_heatmap.load();
         let visible_non_resident = match previous_heatmap.as_deref() {
-            Some(PreviousHeatmap::Active { heatmap, read_at }) => {
-                Some(heatmap.layers.iter().filter_map(|hl| {
-                    let desc: PersistentLayerDesc = hl.name.clone().into();
-                    let layer = guard.try_get_from_key(&desc.key())?;
+            Some(PreviousHeatmap::Active {
+                heatmap, read_at, ..
+            }) => Some(heatmap.all_layers().filter_map(|hl| {
+                let desc: PersistentLayerDesc = hl.name.clone().into();
+                let layer = guard.try_get_from_key(&desc.key())?;
 
-                    if layer.visibility() == LayerVisibilityHint::Covered {
-                        return None;
-                    }
+                if layer.visibility() == LayerVisibilityHint::Covered {
+                    return None;
+                }
 
-                    if layer.is_likely_resident() {
-                        return None;
-                    }
+                if layer.is_likely_resident() {
+                    return None;
+                }
 
-                    if layer.last_evicted_at().happened_after(*read_at) {
-                        return None;
-                    }
+                if layer.last_evicted_at().happened_after(*read_at) {
+                    return None;
+                }
 
-                    Some((desc, hl.metadata.clone(), hl.access_time))
-                }))
-            }
+                Some((desc, hl.metadata.clone(), hl.access_time, hl.cold))
+            })),
             Some(PreviousHeatmap::Obsolete) => None,
             None => None,
         };
@@ -3637,6 +3680,7 @@ impl Timeline {
                         layer.layer_desc().clone(),
                         layer.metadata(),
                         last_activity_ts,
+                        false, // these layers are not cold
                     ))
                 }
                 LayerVisibilityHint::Covered => {
@@ -3663,12 +3707,14 @@ impl Timeline {
         // Sort layers in order of which to download first.  For a large set of layers to download, we
         // want to prioritize those layers which are most likely to still be in the resident many minutes
         // or hours later:
+        // - Cold layers go last for convenience when a human inspects the heatmap.
         // - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
         //   only exist for a few minutes before being compacted into L1s.
         // - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
         //   the layer is likely to be covered by an image layer during compaction.
-        layers.sort_by_key(|(desc, _meta, _atime)| {
+        layers.sort_by_key(|(desc, _meta, _atime, cold)| {
             std::cmp::Reverse((
+                !*cold, // negated: `Reverse` sorts descending, and cold layers must sort last
                 !LayerMap::is_l0(&desc.key_range, desc.is_delta),
                 desc.lsn_range.end,
             ))
@@ -3676,7 +3722,9 @@ impl Timeline {
 
         let layers = layers
             .into_iter()
-            .map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
+            .map(|(desc, meta, atime, cold)| {
+                HeatMapLayer::new(desc.layer_name(), meta, atime, cold)
+            })
             .collect();
 
         Some(HeatMapTimeline::new(self.timeline_id, layers))
@@ -3696,6 +3744,7 @@ impl Timeline {
                 name: vl.layer_desc().layer_name(),
                 metadata: vl.metadata(),
                 access_time: now,
+                cold: true,
             };
             heatmap_layers.push(hl);
         }
@@ -3709,6 +3758,7 @@ impl Timeline {
         PreviousHeatmap::Active {
             heatmap,
             read_at: Instant::now(),
+            end_lsn: Some(end_lsn),
         }
     }
 
@@ -3907,39 +3957,22 @@ impl Timeline {
                 let guard = timeline.layers.read().await;
                 let layers = guard.layer_map()?;
 
-                let in_memory_layer = layers.find_in_memory_layer(|l| {
-                    let start_lsn = l.get_lsn_range().start;
-                    cont_lsn > start_lsn
-                });
+                for range in unmapped_keyspace.ranges.iter() {
+                    let results = layers.range_search(range.clone(), cont_lsn);
 
-                match in_memory_layer {
-                    Some(l) => {
-                        let lsn_range = l.get_lsn_range().start..cont_lsn;
-                        fringe.update(
-                            ReadableLayer::InMemoryLayer(l),
-                            unmapped_keyspace.clone(),
-                            lsn_range,
-                        );
-                    }
-                    None => {
-                        for range in unmapped_keyspace.ranges.iter() {
-                            let results = layers.range_search(range.clone(), cont_lsn);
-
-                            results
-                                .found
-                                .into_iter()
-                                .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
-                                    (
-                                        ReadableLayer::PersistentLayer(guard.get_from_desc(&layer)),
-                                        keyspace_accum.to_keyspace(),
-                                        lsn_floor..cont_lsn,
-                                    )
-                                })
-                                .for_each(|(layer, keyspace, lsn_range)| {
-                                    fringe.update(layer, keyspace, lsn_range)
-                                });
-                        }
-                    }
+                    results
+                        .found
+                        .into_iter()
+                        .map(|(SearchResult { layer, lsn_floor }, keyspace_accum)| {
+                            (
+                                guard.upgrade(layer),
+                                keyspace_accum.to_keyspace(),
+                                lsn_floor..cont_lsn,
+                            )
+                        })
+                        .for_each(|(layer, keyspace, lsn_range)| {
+                            fringe.update(layer, keyspace, lsn_range)
+                        });
                 }
 
                 // It's safe to drop the layer map lock after planning the next round of reads.
@@ -5548,6 +5581,14 @@ pub struct DeltaLayerTestDesc {
     pub data: Vec<(Key, Lsn, Value)>,
 }
 
+#[cfg(test)]
+#[derive(Clone)]
+pub struct InMemoryLayerTestDesc {
+    pub lsn_range: Range<Lsn>,
+    pub data: Vec<(Key, Lsn, Value)>,
+    pub is_open: bool,
+}
+
 #[cfg(test)]
 impl DeltaLayerTestDesc {
     pub fn new(lsn_range: Range<Lsn>, key_range: Range<Key>, data: Vec<(Key, Lsn, Value)>) -> Self {
@@ -6560,6 +6601,92 @@ impl Timeline {
         Ok(())
     }
 
+    /// Test-only: force create an in-memory layer and place it into the layer map.
+    #[cfg(test)]
+    pub(super) async fn force_create_in_memory_layer(
+        self: &Arc<Timeline>,
+        mut in_memory: InMemoryLayerTestDesc,
+        check_start_lsn: Option<Lsn>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        use utils::bin_ser::BeSer;
+
+        // Validate LSNs: optional lower bound, last record LSN, and the layer's own range
+        if let Some(check_start_lsn) = check_start_lsn {
+            assert!(in_memory.lsn_range.start >= check_start_lsn);
+        }
+
+        let last_record_lsn = self.get_last_record_lsn();
+        let layer_end_lsn = if in_memory.is_open {
+            in_memory
+                .data
+                .iter()
+                .map(|(_key, lsn, _value)| lsn)
+                .max()
+                .cloned()
+        } else {
+            Some(in_memory.lsn_range.end)
+        };
+
+        if let Some(end) = layer_end_lsn {
+            assert!(
+                end <= last_record_lsn,
+                "advance last record lsn before inserting a layer, end_lsn={}, last_record_lsn={}",
+                end,
+                last_record_lsn,
+            );
+        }
+
+        in_memory.data.iter().for_each(|(_key, lsn, _value)| {
+            assert!(*lsn >= in_memory.lsn_range.start);
+            assert!(*lsn < in_memory.lsn_range.end);
+        });
+
+        // Build the batch: sort by (key, LSN) and pair each value with its serialized size
+        in_memory
+            .data
+            .sort_unstable_by(|(ka, la, _), (kb, lb, _)| (ka, la).cmp(&(kb, lb)));
+
+        let data = in_memory
+            .data
+            .into_iter()
+            .map(|(key, lsn, value)| {
+                let value_size = value.serialized_size().unwrap() as usize;
+                (key.to_compact(), lsn, value_size, value)
+            })
+            .collect::<Vec<_>>();
+
+        let batch = SerializedValueBatch::from_values(data);
+
+        // Create the in-memory layer, write the batch into it, and freeze it unless it is open
+        let layer = InMemoryLayer::create(
+            self.conf,
+            self.timeline_id,
+            self.tenant_shard_id,
+            in_memory.lsn_range.start,
+            &self.gate,
+            ctx,
+        )
+        .await
+        .unwrap();
+
+        layer.put_batch(batch, ctx).await.unwrap();
+        if !in_memory.is_open {
+            layer.freeze(in_memory.lsn_range.end).await;
+        }
+
+        info!("force created in-memory layer {:?}", in_memory.lsn_range);
+
+        // Link the layer to the layer map while holding the layers write lock
+        {
+            let mut guard = self.layers.write().await;
+            let layer_map = guard.open_mut().unwrap();
+            layer_map.force_insert_in_memory_layer(Arc::new(layer));
+        }
+
+        Ok(())
+    }
+
     /// Return all keys at the LSN in the image layers
     #[cfg(test)]
     pub(crate) async fn inspect_image_layers(
@@ -6919,6 +7046,7 @@ mod tests {
 
     use pageserver_api::key::Key;
     use pageserver_api::value::Value;
+    use std::iter::Iterator;
     use tracing::Instrument;
     use utils::id::TimelineId;
     use utils::lsn::Lsn;
@@ -6932,8 +7060,8 @@ mod tests {
     use crate::tenant::{PreviousHeatmap, Timeline};
 
     fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
-        assert_eq!(lhs.layers.len(), rhs.layers.len());
-        let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
+        assert_eq!(lhs.all_layers().count(), rhs.all_layers().count());
+        let lhs_rhs = lhs.all_layers().zip(rhs.all_layers());
         for (l, r) in lhs_rhs {
             assert_eq!(l.name, r.name);
             assert_eq!(l.metadata, r.metadata);
@@ -6992,6 +7120,7 @@ mod tests {
                 Lsn(0x10),
                 14,
                 &ctx,
+                Vec::new(), // in-memory layers
                 delta_layers,
                 image_layers,
                 Lsn(0x100),
@@ -7010,10 +7139,11 @@ mod tests {
         assert_eq!(heatmap.timeline_id, timeline.timeline_id);
 
         // L0 should come last
-        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
+        let heatmap_layers = heatmap.all_layers().collect::<Vec<_>>();
+        assert_eq!(heatmap_layers.last().unwrap().name, l0_delta.layer_name());
 
         let mut last_lsn = Lsn::MAX;
-        for layer in &heatmap.layers {
+        for layer in heatmap_layers {
             // Covered layer should be omitted
             assert!(layer.name != covered_delta.layer_name());
 
@@ -7046,6 +7176,7 @@ mod tests {
             .store(Some(Arc::new(PreviousHeatmap::Active {
                 heatmap: heatmap.clone(),
                 read_at: std::time::Instant::now(),
+                end_lsn: None,
             })));
 
         // Generate a new heatmap and assert that it contains the same layers as the old one.
@@ -7124,6 +7255,7 @@ mod tests {
                 Lsn(0x10),
                 14,
                 &ctx,
+                Vec::new(), // in-memory layers
                 delta_layers,
                 image_layers,
                 Lsn(0x100),
@@ -7140,7 +7272,7 @@ mod tests {
             .expect("Infallible while timeline is not shut down");
 
         // Both layers should be in the heatmap
-        assert!(!heatmap.layers.is_empty());
+        assert!(heatmap.all_layers().count() > 0);
 
         // Now simulate a migration.
         timeline
@@ -7148,6 +7280,7 @@ mod tests {
             .store(Some(Arc::new(PreviousHeatmap::Active {
                 heatmap: heatmap.clone(),
                 read_at: std::time::Instant::now(),
+                end_lsn: None,
             })));
 
         // Evict all the layers in the previous heatmap
@@ -7165,7 +7298,7 @@ mod tests {
             .await
             .expect("Infallible while timeline is not shut down");
 
-        assert!(post_eviction_heatmap.layers.is_empty());
+        assert_eq!(post_eviction_heatmap.all_layers().count(), 0);
         assert!(matches!(
             timeline.previous_heatmap.load().as_deref(),
             Some(PreviousHeatmap::Obsolete)
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index c835980a7d..42b36f7252 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -7,6 +7,7 @@
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
+use std::time::Instant;
 
 use super::layer_manager::LayerManager;
 use super::{
@@ -15,10 +16,11 @@ use super::{
     Timeline,
 };
 
-use anyhow::{Context, anyhow, bail};
+use anyhow::{Context, anyhow};
 use bytes::Bytes;
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::FutureExt;
 use itertools::Itertools;
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
@@ -234,6 +236,12 @@ impl GcCompactionQueue {
             // it enough in staging yet.
             return Ok(());
         }
+        if timeline.get_gc_compaction_watermark() == Lsn::INVALID {
+            // If the gc watermark is not set, we don't need to trigger auto compaction.
+            // This check is the same as in `gc_compaction_split_jobs` but we don't log
+            // here and we can also skip the computation of the trigger condition earlier.
+            return Ok(());
+        }
 
         let Ok(permit) = CONCURRENT_GC_COMPACTION_TASKS.clone().try_acquire_owned() else {
             // Only allow one compaction run at a time. TODO: As we do `try_acquire_owned`, we cannot ensure
@@ -321,7 +329,7 @@ impl GcCompactionQueue {
                 l1_size, l2_size, l2_lsn, gc_cutoff
             );
         } else {
-            info!(
+            debug!(
                 "did not trigger auto gc-compaction: l1_size={}, l2_size={}, l2_lsn={}, gc_cutoff={}",
                 l1_size, l2_size, l2_lsn, gc_cutoff
             );
@@ -357,8 +365,7 @@ impl GcCompactionQueue {
                 GcCompactJob::from_compact_options(options.clone()),
                 options.sub_compaction_max_job_size_mb,
             )
-            .await
-            .map_err(CompactionError::Other)?;
+            .await?;
         if jobs.is_empty() {
             info!("no jobs to run, skipping scheduled compaction task");
             self.notify_and_unblock(id);
@@ -437,6 +444,7 @@ impl GcCompactionQueue {
             ));
         };
         let has_pending_tasks;
+        let mut yield_for_l0 = false;
         let Some((id, item)) = ({
             let mut guard = self.inner.lock().unwrap();
             if let Some((id, item)) = guard.queued.pop_front() {
@@ -486,13 +494,23 @@ impl GcCompactionQueue {
                         let mut guard = self.inner.lock().unwrap();
                         guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
                     }
-                    let _ = timeline.compact_with_options(cancel, options, ctx).await?;
+                    let compaction_result =
+                        timeline.compact_with_options(cancel, options, ctx).await?;
                     self.notify_and_unblock(id);
+                    if compaction_result == CompactionOutcome::YieldForL0 {
+                        yield_for_l0 = true;
+                    }
                 }
             }
             GcCompactionQueueItem::SubCompactionJob(options) => {
                 // TODO: error handling, clear the queue if any task fails?
-                let _ = timeline.compact_with_options(cancel, options, ctx).await?;
+                let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
+                if compaction_result == CompactionOutcome::YieldForL0 {
+                    // We will permanently give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
+                    // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
+                    // we need to clean things up before returning from the function.
+                    yield_for_l0 = true;
+                }
             }
             GcCompactionQueueItem::Notify(id, l2_lsn) => {
                 self.notify_and_unblock(id);
@@ -521,7 +539,10 @@ impl GcCompactionQueue {
             let mut guard = self.inner.lock().unwrap();
             guard.running = None;
         }
-        Ok(if has_pending_tasks {
+        Ok(if yield_for_l0 {
+            tracing::info!("give up gc-compaction: yield for L0 compaction");
+            CompactionOutcome::YieldForL0
+        } else if has_pending_tasks {
             CompactionOutcome::Pending
         } else {
             CompactionOutcome::Done
@@ -719,17 +740,41 @@ struct CompactionStatisticsNumSize {
 
 #[derive(Debug, Serialize, Default)]
 pub struct CompactionStatistics {
+    /// Delta layer visited (maybe compressed, physical size)
     delta_layer_visited: CompactionStatisticsNumSize,
+    /// Image layer visited (maybe compressed, physical size)
     image_layer_visited: CompactionStatisticsNumSize,
+    /// Delta layer produced (maybe compressed, physical size)
     delta_layer_produced: CompactionStatisticsNumSize,
+    /// Image layer produced (maybe compressed, physical size)
     image_layer_produced: CompactionStatisticsNumSize,
-    num_delta_layer_discarded: usize,
-    num_image_layer_discarded: usize,
+    /// Delta layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
+    delta_layer_discarded: CompactionStatisticsNumSize,
+    /// Image layer discarded (maybe compressed, physical size of the layer being discarded instead of the original layer)
+    image_layer_discarded: CompactionStatisticsNumSize,
     num_unique_keys_visited: usize,
+    /// Delta visited (uncompressed, original size)
     wal_keys_visited: CompactionStatisticsNumSize,
+    /// Image visited (uncompressed, original size)
     image_keys_visited: CompactionStatisticsNumSize,
+    /// Delta produced (uncompressed, original size)
     wal_produced: CompactionStatisticsNumSize,
+    /// Image produced (uncompressed, original size)
     image_produced: CompactionStatisticsNumSize,
+
+    // Time spent in each phase
+    time_acquire_lock_secs: f64,
+    time_analyze_secs: f64,
+    time_download_layer_secs: f64,
+    time_main_loop_secs: f64,
+    time_final_phase_secs: f64,
+    time_total_secs: f64,
+
+    // Summary
+    /// Ratio of the key-value size before/after gc-compaction.
+    uncompressed_size_ratio: f64,
+    /// Ratio of the physical size before/after gc-compaction.
+    physical_size_ratio: f64,
 }
 
 impl CompactionStatistics {
@@ -779,11 +824,13 @@ impl CompactionStatistics {
         self.image_produced.num += 1;
         self.image_produced.size += val.len() as u64 + Self::estimated_size_of_key() as u64;
     }
-    fn discard_delta_layer(&mut self) {
-        self.num_delta_layer_discarded += 1;
+    fn discard_delta_layer(&mut self, original_size: u64) {
+        self.delta_layer_discarded.num += 1;
+        self.delta_layer_discarded.size += original_size;
     }
-    fn discard_image_layer(&mut self) {
-        self.num_image_layer_discarded += 1;
+    fn discard_image_layer(&mut self, original_size: u64) {
+        self.image_layer_discarded.num += 1;
+        self.image_layer_discarded.size += original_size;
     }
     fn produce_delta_layer(&mut self, size: u64) {
         self.delta_layer_produced.num += 1;
@@ -793,6 +840,19 @@ impl CompactionStatistics {
         self.image_layer_produced.num += 1;
         self.image_layer_produced.size += size;
     }
+    /// Computes the summary ratios; the `+ 1.0` in each denominator avoids a division by zero.
+    fn finalize(&mut self) {
+        let key_value_in = self.image_keys_visited.size + self.wal_keys_visited.size;
+        let key_value_out = self.image_produced.size + self.wal_produced.size;
+        self.uncompressed_size_ratio = key_value_in as f64 / (key_value_out as f64 + 1.0);
+        let physical_in = self.image_layer_visited.size + self.delta_layer_visited.size;
+        // Discarded layers also count as output so that the ratio stays accurate.
+        let physical_out = self.image_layer_produced.size
+            + self.delta_layer_produced.size
+            + self.image_layer_discarded.size
+            + self.delta_layer_discarded.size;
+        self.physical_size_ratio = physical_in as f64 / (physical_out as f64 + 1.0);
+    }
 }
 
 #[derive(Default, Debug, Clone, Copy, PartialEq, Eq)]
@@ -825,9 +885,7 @@ impl Timeline {
             .flags
             .contains(CompactFlags::EnhancedGcBottomMostCompaction)
         {
-            self.compact_with_gc(cancel, options, ctx)
-                .await
-                .map_err(CompactionError::Other)?;
+            self.compact_with_gc(cancel, options, ctx).await?;
             return Ok(CompactionOutcome::Done);
         }
 
@@ -2345,12 +2403,19 @@ impl Timeline {
     async fn check_compaction_space(
         self: &Arc<Self>,
         layer_selection: &[Layer],
-    ) -> anyhow::Result<()> {
-        let available_space = self.check_available_space().await?;
+    ) -> Result<(), CompactionError> {
+        let available_space = self
+            .check_available_space()
+            .await
+            .map_err(CompactionError::Other)?;
         let mut remote_layer_size = 0;
         let mut all_layer_size = 0;
         for layer in layer_selection {
-            let needs_download = layer.needs_download().await?;
+            let needs_download = layer
+                .needs_download()
+                .await
+                .context("failed to check if layer needs download")
+                .map_err(CompactionError::Other)?;
             if needs_download.is_some() {
                 remote_layer_size += layer.layer_desc().file_size;
             }
@@ -2359,14 +2424,14 @@ impl Timeline {
         let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */
         if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space
         {
-            return Err(anyhow!(
+            return Err(CompactionError::Other(anyhow!(
                 "not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}",
                 available_space,
                 allocated_space,
                 all_layer_size,
                 remote_layer_size,
                 all_layer_size + remote_layer_size
-            ));
+            )));
         }
         Ok(())
     }
@@ -2397,7 +2462,7 @@ impl Timeline {
         self: &Arc<Self>,
         job: GcCompactJob,
         sub_compaction_max_job_size_mb: Option<u64>,
-    ) -> anyhow::Result<Vec<GcCompactJob>> {
+    ) -> Result<Vec<GcCompactJob>, CompactionError> {
         let compact_below_lsn = if job.compact_lsn_range.end != Lsn::MAX {
             job.compact_lsn_range.end
         } else {
@@ -2548,7 +2613,7 @@ impl Timeline {
         cancel: &CancellationToken,
         options: CompactOptions,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<CompactionOutcome, CompactionError> {
         let sub_compaction = options.sub_compaction;
         let job = GcCompactJob::from_compact_options(options.clone());
         if sub_compaction {
@@ -2570,7 +2635,7 @@ impl Timeline {
             if jobs_len == 0 {
                 info!("no jobs to run, skipping gc bottom-most compaction");
             }
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
         }
         self.compact_with_gc_inner(cancel, job, ctx).await
     }
@@ -2580,19 +2645,24 @@ impl Timeline {
         cancel: &CancellationToken,
         job: GcCompactJob,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<CompactionOutcome, CompactionError> {
         // Block other compaction/GC tasks from running for now. GC-compaction could run along
         // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
         // Note that we already acquired the compaction lock when the outer `compact` function gets called.
 
+        let timer = Instant::now();
+        let begin_timer = timer;
+
         let gc_lock = async {
             tokio::select! {
                 guard = self.gc_lock.lock() => Ok(guard),
-                // TODO: refactor to CompactionError to correctly pass cancelled error
-                _ = cancel.cancelled() => Err(anyhow!("cancelled")),
+                _ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
             }
         };
 
+        let time_acquire_lock = timer.elapsed();
+        let timer = Instant::now();
+
         let gc_lock = crate::timed(
             gc_lock,
             "acquires gc lock",
@@ -2644,7 +2714,7 @@ impl Timeline {
                         tracing::warn!(
                             "no layers to compact with gc: gc_cutoff not generated yet, skipping gc bottom-most compaction"
                         );
-                        return Ok(());
+                        return Ok(CompactionOutcome::Skipped);
                     }
                     real_gc_cutoff
                 } else {
@@ -2682,7 +2752,7 @@ impl Timeline {
                     "no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}",
                     gc_cutoff
                 );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
             };
             // Next, if the user specifies compact_lsn_range.start, we need to filter some layers out. All the layers (strictly) below
             // the min_layer_lsn computed as below will be filtered out and the data will be accessed using the normal read path, as if
@@ -2703,7 +2773,7 @@ impl Timeline {
                     "no layers to compact with gc: no historic layers above compact_above_lsn, compact_above_lsn={}",
                     compact_lsn_range.end
                 );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
             };
             // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key
             // layers to compact.
@@ -2729,7 +2799,7 @@ impl Timeline {
                     "no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}",
                     gc_cutoff, compact_key_range.start, compact_key_range.end
                 );
-                return Ok(());
+                return Ok(CompactionOutcome::Done);
             }
             retain_lsns_below_horizon.sort();
             GcCompactionJobDescription {
@@ -2782,6 +2852,9 @@ impl Timeline {
             has_data_below,
         );
 
+        let time_analyze = timer.elapsed();
+        let timer = Instant::now();
+
         for layer in &job_desc.selected_layers {
             debug!("read layer: {}", layer.layer_desc().key());
         }
@@ -2810,10 +2883,10 @@ impl Timeline {
             .map(|layer| layer.layer_desc().layer_name())
             .collect_vec();
         if let Some(err) = check_valid_layermap(&layer_names) {
-            bail!(
+            return Err(CompactionError::Other(anyhow!(
                 "gc-compaction layer map check failed because {}, cannot proceed with compaction due to potential data loss",
                 err
-            );
+            )));
         }
         // The maximum LSN we are processing in this compaction loop
         let end_lsn = job_desc
@@ -2828,11 +2901,33 @@ impl Timeline {
         let mut total_downloaded_size = 0;
         let mut total_layer_size = 0;
         for layer in &job_desc.selected_layers {
-            if layer.needs_download().await?.is_some() {
+            if layer
+                .needs_download()
+                .await
+                .context("failed to check if layer needs download")
+                .map_err(CompactionError::Other)?
+                .is_some()
+            {
                 total_downloaded_size += layer.layer_desc().file_size;
             }
             total_layer_size += layer.layer_desc().file_size;
-            let resident_layer = layer.download_and_keep_resident(ctx).await?;
+            if cancel.is_cancelled() {
+                return Err(CompactionError::ShuttingDown);
+            }
+            let should_yield = self
+                .l0_compaction_trigger
+                .notified()
+                .now_or_never()
+                .is_some();
+            if should_yield {
+                tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
+            }
+            let resident_layer = layer
+                .download_and_keep_resident(ctx)
+                .await
+                .context("failed to download and keep resident layer")
+                .map_err(CompactionError::Other)?;
             downloaded_layers.push(resident_layer);
         }
         info!(
@@ -2843,19 +2938,36 @@ impl Timeline {
         );
         for resident_layer in &downloaded_layers {
             if resident_layer.layer_desc().is_delta() {
-                let layer = resident_layer.get_as_delta(ctx).await?;
+                let layer = resident_layer
+                    .get_as_delta(ctx)
+                    .await
+                    .context("failed to get delta layer")
+                    .map_err(CompactionError::Other)?;
                 delta_layers.push(layer);
             } else {
-                let layer = resident_layer.get_as_image(ctx).await?;
+                let layer = resident_layer
+                    .get_as_image(ctx)
+                    .await
+                    .context("failed to get image layer")
+                    .map_err(CompactionError::Other)?;
                 image_layers.push(layer);
             }
         }
-        let (dense_ks, sparse_ks) = self.collect_gc_compaction_keyspace().await?;
+        let (dense_ks, sparse_ks) = self
+            .collect_gc_compaction_keyspace()
+            .await
+            .context("failed to collect gc compaction keyspace")
+            .map_err(CompactionError::Other)?;
         let mut merge_iter = FilterIterator::create(
             MergeIterator::create(&delta_layers, &image_layers, ctx),
             dense_ks,
             sparse_ks,
-        )?;
+        )
+        .context("failed to create filter iterator")
+        .map_err(CompactionError::Other)?;
+
+        let time_download_layer = timer.elapsed();
+        let timer = Instant::now();
 
         // Step 2: Produce images+deltas.
         let mut accumulated_values = Vec::new();
@@ -2874,7 +2986,9 @@ impl Timeline {
                     self.get_compaction_target_size(),
                     ctx,
                 )
-                .await?,
+                .await
+                .context("failed to create image layer writer")
+                .map_err(CompactionError::Other)?,
             )
         } else {
             None
@@ -2887,7 +3001,9 @@ impl Timeline {
             lowest_retain_lsn..end_lsn,
             self.get_compaction_target_size(),
         )
-        .await?;
+        .await
+        .context("failed to create delta layer writer")
+        .map_err(CompactionError::Other)?;
 
         #[derive(Default)]
         struct RewritingLayers {
@@ -2927,9 +3043,28 @@ impl Timeline {
         // the key and LSN range are determined. However, to keep things simple here, we still
         // create this writer, and discard the writer in the end.
 
-        while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? {
+        let mut keys_processed = 0;
+
+        while let Some(((key, lsn, val), desc)) = merge_iter
+            .next_with_trace()
+            .await
+            .context("failed to get next key-value pair")
+            .map_err(CompactionError::Other)?
+        {
             if cancel.is_cancelled() {
-                return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error
+                return Err(CompactionError::ShuttingDown);
+            }
+            keys_processed += 1;
+            if keys_processed % 1000 == 0 {
+                let should_yield = self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some();
+                if should_yield {
+                    tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
+                    return Ok(CompactionOutcome::YieldForL0);
+                }
             }
             if self.shard_identity.is_key_disposable(&key) {
                 // If this shard does not need to store this key, simply skip it.
@@ -2960,7 +3095,9 @@ impl Timeline {
                                 desc.lsn_range.clone(),
                                 ctx,
                             )
-                            .await?,
+                            .await
+                            .context("failed to create delta layer writer")
+                            .map_err(CompactionError::Other)?,
                         );
                     }
                     rewriter.before.as_mut().unwrap()
@@ -2975,14 +3112,20 @@ impl Timeline {
                                 desc.lsn_range.clone(),
                                 ctx,
                             )
-                            .await?,
+                            .await
+                            .context("failed to create delta layer writer")
+                            .map_err(CompactionError::Other)?,
                         );
                     }
                     rewriter.after.as_mut().unwrap()
                 } else {
                     unreachable!()
                 };
-                rewriter.put_value(key, lsn, val, ctx).await?;
+                rewriter
+                    .put_value(key, lsn, val, ctx)
+                    .await
+                    .context("failed to put value")
+                    .map_err(CompactionError::Other)?;
                 continue;
             }
             match val {
@@ -3005,9 +3148,13 @@ impl Timeline {
                         &job_desc.retain_lsns_below_horizon,
                         COMPACTION_DELTA_THRESHOLD,
                         get_ancestor_image(self, *last_key, ctx, has_data_below, lowest_retain_lsn)
-                            .await?,
+                            .await
+                            .context("failed to get ancestor image")
+                            .map_err(CompactionError::Other)?,
                     )
-                    .await?;
+                    .await
+                    .context("failed to generate key retention")
+                    .map_err(CompactionError::Other)?;
                 retention
                     .pipe_to(
                         *last_key,
@@ -3016,7 +3163,9 @@ impl Timeline {
                         &mut stat,
                         ctx,
                     )
-                    .await?;
+                    .await
+                    .context("failed to pipe to delta layer writer")
+                    .map_err(CompactionError::Other)?;
                 accumulated_values.clear();
                 *last_key = key;
                 accumulated_values.push((key, lsn, val));
@@ -3034,9 +3183,14 @@ impl Timeline {
                 job_desc.gc_cutoff,
                 &job_desc.retain_lsns_below_horizon,
                 COMPACTION_DELTA_THRESHOLD,
-                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn).await?,
+                get_ancestor_image(self, last_key, ctx, has_data_below, lowest_retain_lsn)
+                    .await
+                    .context("failed to get ancestor image")
+                    .map_err(CompactionError::Other)?,
             )
-            .await?;
+            .await
+            .context("failed to generate key retention")
+            .map_err(CompactionError::Other)?;
         retention
             .pipe_to(
                 last_key,
@@ -3045,21 +3199,36 @@ impl Timeline {
                 &mut stat,
                 ctx,
             )
-            .await?;
+            .await
+            .context("failed to pipe to delta layer writer")
+            .map_err(CompactionError::Other)?;
         // end: move the above part to the loop body
 
+        let time_main_loop = timer.elapsed();
+        let timer = Instant::now();
+
         let mut rewrote_delta_layers = Vec::new();
         for (key, writers) in delta_layer_rewriters {
             if let Some(delta_writer_before) = writers.before {
                 let (desc, path) = delta_writer_before
                     .finish(job_desc.compaction_key_range.start, ctx)
-                    .await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                    .await
+                    .context("failed to finish delta layer writer")
+                    .map_err(CompactionError::Other)?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .context("failed to finish creating delta layer")
+                    .map_err(CompactionError::Other)?;
                 rewrote_delta_layers.push(layer);
             }
             if let Some(delta_writer_after) = writers.after {
-                let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?;
-                let layer = Layer::finish_creating(self.conf, self, desc, &path)?;
+                let (desc, path) = delta_writer_after
+                    .finish(key.key_range.end, ctx)
+                    .await
+                    .context("failed to finish delta layer writer")
+                    .map_err(CompactionError::Other)?;
+                let layer = Layer::finish_creating(self.conf, self, desc, &path)
+                    .context("failed to finish creating delta layer")
+                    .map_err(CompactionError::Other)?;
                 rewrote_delta_layers.push(layer);
             }
         }
@@ -3074,7 +3243,9 @@ impl Timeline {
                 let end_key = job_desc.compaction_key_range.end;
                 writer
                     .finish_with_discard_fn(self, ctx, end_key, discard)
-                    .await?
+                    .await
+                    .context("failed to finish image layer writer")
+                    .map_err(CompactionError::Other)?
             } else {
                 drop(writer);
                 Vec::new()
@@ -3086,7 +3257,9 @@ impl Timeline {
         let produced_delta_layers = if !dry_run {
             delta_layer_writer
                 .finish_with_discard_fn(self, ctx, discard)
-                .await?
+                .await
+                .context("failed to finish delta layer writer")
+                .map_err(CompactionError::Other)?
         } else {
             drop(delta_layer_writer);
             Vec::new()
@@ -3098,6 +3271,13 @@ impl Timeline {
         let mut keep_layers = HashSet::new();
         let produced_delta_layers_len = produced_delta_layers.len();
         let produced_image_layers_len = produced_image_layers.len();
+
+        let layer_selection_by_key = job_desc
+            .selected_layers
+            .iter()
+            .map(|l| (l.layer_desc().key(), l.layer_desc().clone()))
+            .collect::<HashMap<_, _>>();
+
         for action in produced_delta_layers {
             match action {
                 BatchWriterResult::Produced(layer) => {
@@ -3111,8 +3291,16 @@ impl Timeline {
                     if cfg!(debug_assertions) {
                         info!("discarded delta layer: {}", l);
                     }
+                    if let Some(layer_desc) = layer_selection_by_key.get(&l) {
+                        stat.discard_delta_layer(layer_desc.file_size());
+                    } else {
+                        tracing::warn!(
+                            "discarded delta layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
+                            l
+                        );
+                        stat.discard_delta_layer(0);
+                    }
                     keep_layers.insert(l);
-                    stat.discard_delta_layer();
                 }
             }
         }
@@ -3121,6 +3309,9 @@ impl Timeline {
                 "produced rewritten delta layer: {}",
                 layer.layer_desc().key()
             );
+            // For now, we include the rewritten delta layer size in "produce_delta_layer". We could
+            // make it a separate statistic in the future.
+            stat.produce_delta_layer(layer.layer_desc().file_size());
         }
         compact_to.extend(rewrote_delta_layers);
         for action in produced_image_layers {
@@ -3132,8 +3323,16 @@ impl Timeline {
                 }
                 BatchWriterResult::Discarded(l) => {
                     debug!("discarded image layer: {}", l);
+                    if let Some(layer_desc) = layer_selection_by_key.get(&l) {
+                        stat.discard_image_layer(layer_desc.file_size());
+                    } else {
+                        tracing::warn!(
+                            "discarded image layer not in layer_selection: {}, produced a layer outside of the compaction key range?",
+                            l
+                        );
+                        stat.discard_image_layer(0);
+                    }
                     keep_layers.insert(l);
-                    stat.discard_image_layer();
                 }
             }
         }
@@ -3166,7 +3365,9 @@ impl Timeline {
                     &layer.layer_desc().key_range,
                     &job_desc.compaction_key_range,
                 ) {
-                    bail!("violated constraint: image layer outside of compaction key range");
+                    return Err(CompactionError::Other(anyhow!(
+                        "violated constraint: image layer outside of compaction key range"
+                    )));
                 }
                 if !fully_contains(
                     &job_desc.compaction_key_range,
@@ -3179,13 +3380,25 @@ impl Timeline {
 
         layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key()));
 
+        let time_final_phase = timer.elapsed();
+
+        stat.time_final_phase_secs = time_final_phase.as_secs_f64();
+        stat.time_main_loop_secs = time_main_loop.as_secs_f64();
+        stat.time_acquire_lock_secs = time_acquire_lock.as_secs_f64();
+        stat.time_download_layer_secs = time_download_layer.as_secs_f64();
+        stat.time_analyze_secs = time_analyze.as_secs_f64();
+        stat.time_total_secs = begin_timer.elapsed().as_secs_f64();
+        stat.finalize();
+
         info!(
             "gc-compaction statistics: {}",
-            serde_json::to_string(&stat)?
+            serde_json::to_string(&stat)
+                .context("failed to serialize gc-compaction statistics")
+                .map_err(CompactionError::Other)?
         );
 
         if dry_run {
-            return Ok(());
+            return Ok(CompactionOutcome::Done);
         }
 
         info!(
@@ -3220,10 +3433,10 @@ impl Timeline {
         // the writer, so potentially, we will need a function like `ImageLayerBatchWriter::get_all_pending_layer_keys` to get all the keys that are
         // in the writer before finalizing the persistent layers. Now we would leave some dangling layers on the disk if the check fails.
         if let Some(err) = check_valid_layermap(&final_layers) {
-            bail!(
+            return Err(CompactionError::Other(anyhow!(
                 "gc-compaction layer map check failed after compaction because {}, compaction result not applied to the layer map due to potential data loss",
                 err
-            );
+            )));
         }
 
         // Between the sanity check and this compaction update, there could be new layers being flushed, but it should be fine because we only
@@ -3275,7 +3488,9 @@ impl Timeline {
         // find_gc_cutoffs will try accessing things below the cutoff. TODO: ideally, this should
         // be batched into `schedule_compaction_update`.
         let disk_consistent_lsn = self.disk_consistent_lsn.load();
-        self.schedule_uploads(disk_consistent_lsn, None)?;
+        self.schedule_uploads(disk_consistent_lsn, None)
+            .context("failed to schedule uploads")
+            .map_err(CompactionError::Other)?;
         // If a layer gets rewritten throughout gc-compaction, we need to keep that layer only in `compact_to` instead
         // of `compact_from`.
         let compact_from = {
@@ -3302,7 +3517,7 @@ impl Timeline {
 
         drop(gc_lock);
 
-        Ok(())
+        Ok(CompactionOutcome::Done)
     }
 }
 
diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs
index 7cdc69e55f..c9666bb4e1 100644
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -306,6 +306,7 @@ impl DeleteTimelineFlow {
                 CreateTimelineCause::Delete,
                 crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here
                 None, // doesn't matter what we put here
+                None, // doesn't matter what we put here
             )
             .context("create_timeline_struct")?;
 
diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 67fb89c433..809b350f38 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1,5 +1,4 @@
-//! An efficient way to keep the timeline gate open without preventing
-//! timeline shutdown for longer than a single call to a timeline method.
+//! A cache for [`crate::tenant::mgr`]+`Tenant::get_timeline`+`Timeline::gate.enter()`.
 //!
 //! # Motivation
 //!
@@ -19,27 +18,32 @@
 //! we hold the Timeline gate open while we're invoking the method on the
 //! Timeline object.
 //!
-//! However, we want to avoid the overhead of entering the gate for every
-//! method invocation.
-//!
-//! Further, for shard routing, we want to avoid calling the tenant manager to
-//! resolve the shard for every request. Instead, we want to cache the
-//! routing result so we can bypass the tenant manager for all subsequent requests
-//! that get routed to that shard.
+//! We want to avoid the overhead of doing, for each incoming request,
+//! - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//!   release the mgr rwlock before doing any request processing work
+//! - re-entering the Timeline gate for each Timeline method invocation.
 //!
 //! Regardless of how we accomplish the above, it should not
 //! prevent the Timeline from shutting down promptly.
 //!
+//!
 //! # Design
 //!
 //! ## Data Structures
 //!
-//! There are three user-facing data structures:
+//! There are two concepts expressed as associated types in the `Types` trait:
+//! - `TenantManager`: the thing that performs the expensive work. It produces
+//!   a `Timeline` object, which is the other associated type.
+//! - `Timeline`: the item that we cache for fast (TenantTimelineId,ShardSelector) lookup.
+//!
+//! There are three user-facing data structures exposed by this module:
 //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime.
 //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime.
-//! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`.
+//! - `Handle`: a smart pointer that derefs to the Types::Timeline.
 //! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
-//!   trying to ugprade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
+//!   trying to upgrade back to a `Handle`. If successful, a re-upgraded Handle will always
+//!   point to the same cached `Types::Timeline`. Upgrades never invoke the `TenantManager`.
 //!
 //! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
 //! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
@@ -64,11 +68,14 @@
 //!
 //! To dispatch a request, the page service connection calls `Cache::get`.
 //!
-//! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc<Timeline>`. We enter its gate _once_ and store it in the the
-//! `Arc<Mutex<HandleInner>>>`. A weak ref is stored in the `Cache`
+//! A cache miss means we call Types::TenantManager::resolve for shard routing,
+//! cloning the `Arc<Timeline>` out of it, and entering the gate. The result of
+//! resolve() is the object we want to cache and return `Handle`s to in subsequent `Cache::get` calls.
+//!
+//! We wrap the object returned from resolve() in an `Arc` and store that inside the
+//! `Arc<Mutex<HandleInner>>>`. A weak ref to the HandleInner is stored in the `Cache`
 //! and a strong ref in the `PerTimelineState`.
-//! A strong ref is returned wrapped in a `Handle`.
+//! Another strong ref is returned wrapped in a `Handle`.
 //!
 //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
 //! and find the weak ref in the cache.
@@ -78,51 +85,51 @@
 //! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
 //! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
 //! and the request handler dispatches the request to the right `<Handle as Deref<Target = Timeline>>::$request_method`.
-//! It then drops the `Handle`, which drops the `Arc<HandleInner>`.
+//! It then drops the `Handle`, and thus the `Arc<Mutex<HandleInner>>` inside it.
 //!
 //! # Performance
 //!
 //! Remember from the introductory section:
 //!
-//! > However, we want to avoid the overhead of entering the gate for every
-//! > method invocation.
+//! > We want to avoid the overhead of doing, for each incoming request,
+//! > - tenant manager lookup (global rwlock + btreemap lookup for shard routing)
+//! > - cloning the `Arc<Timeline>` out of the tenant manager so we can
+//! >   release the mgr rwlock before doing any request processing work
+//! > - re-entering the Timeline gate for each Timeline method invocation.
 //!
-//! Why do we want to avoid that?
-//! Because the gate is a shared location in memory and entering it involves
-//! bumping refcounts, which leads to cache contention if done frequently
-//! from multiple cores in parallel.
+//! All of these boil down to some state that is either globally shared among all shards
+//! or state shared among all tasks that serve a particular timeline.
+//! It is either protected by RwLock or manipulated via atomics.
+//! Even atomics are costly when shared across multiple cores.
+//! So, we want to avoid any permanent need for coordination between page_service tasks.
 //!
-//! So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
-//! That `Arc` is private to the `HandleInner` and hence to the connection.
+//! The solution is to add indirection: we wrap the Types::Timeline object that is
+//! returned by Types::TenantManager into an Arc that is private to the `HandleInner`
+//! and hence to the single Cache / page_service connection.
 //! (Review the "Data Structures" section if that is unclear to you.)
 //!
-//! A `WeakHandle` is a weak ref to the `HandleInner`.
-//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
-//! further acquire an additional strong ref to the `Arc<GateGuard>` inside it.
-//! Again, this manipulation of ref counts is is cheap because `Arc` is private to the connection.
 //!
-//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc<GateGuard>`.
-//! Again, this is cheap because the `Arc` is private to the connection.
+//! When upgrading a `WeakHandle`, we upgrade its weak to a strong ref (of the `Mutex<HandleInner>`),
+//! lock the mutex, take out a clone of the `Arc<Types::Timeline>`, and drop the Mutex.
+//! The Mutex is not contended because it is private to the connection.
+//! And again, the `Arc<Types::Timeline>` clone is cheap because that wrapper
+//! Arc's refcounts are private to the connection.
+//!
+//! Downgrading drops these two Arcs, which again, manipulates refcounts that are private to the connection.
 //!
-//! In addition to the GateGuard, we need to provide `Deref<Target=Timeline>` impl.
-//! For this, both `Handle` need infallible access to an `Arc<Timeline>`.
-//! We could clone the `Arc<Timeline>` when upgrading a `WeakHandle`, but that would cause contention
-//! on the shared memory location that trakcs the refcount of the `Arc<Timeline>`.
-//! Instead, we wrap the `Arc<Timeline>` into another `Arc`.
-//! so that we can clone it cheaply when upgrading a `WeakHandle`.
 //!
 //! # Shutdown
 //!
 //! The attentive reader may have noticed the following reference cycle around the `Arc<Timeline>`:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> Timeline
 //! ```
 //!
 //! Further, there is this cycle:
 //!
 //! ```text
-//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Types::Timeline --strong--> GateGuard --keepalive--> Timeline
 //! ```
 //!
 //! The former cycle is a memory leak if not broken.
@@ -135,9 +142,12 @@
 //! - Timeline shutdown (=> `PerTimelineState::shutdown`)
 //! - Connection shutdown (=> dropping the `Cache`).
 //!
-//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
-//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
-//! `Arc<GateGuard>`.
+//! Both transition the `HandleInner` from [`HandleInner::Open`] to
+//! [`HandleInner::ShutDown`], which drops the only long-lived
+//! `Arc<Types::Timeline>`. Once the last short-lived `Arc<Types::Timeline>`
+//! is dropped, the `Types::Timeline` gets dropped and thereby
+//! the `GateGuard` and the `Arc<Timeline>` that it stores,
+//! thereby breaking both cycles.
 //!
 //! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
 //! thereby breaking the cycle.
@@ -216,7 +226,7 @@ use crate::tenant::mgr::ShardSelector;
 pub(crate) trait Types: Sized + std::fmt::Debug {
     type TenantManagerError: Sized + std::fmt::Debug;
     type TenantManager: TenantManager<Self> + Sized;
-    type Timeline: ArcTimeline<Self> + Sized;
+    type Timeline: Timeline<Self> + Sized;
 }
 
 /// Uniquely identifies a [`Cache`] instance over the lifetime of the process.
@@ -261,20 +271,15 @@ pub(crate) struct ShardTimelineId {
 
 /// See module-level comment.
 pub(crate) struct Handle<T: Types> {
-    timeline: Arc<T::Timeline>,
-    #[allow(dead_code)] // the field exists to keep the gate open
-    gate_guard: Arc<utils::sync::gate::GateGuard>,
     inner: Arc<Mutex<HandleInner<T>>>,
+    open: Arc<T::Timeline>,
 }
 pub(crate) struct WeakHandle<T: Types> {
     inner: Weak<Mutex<HandleInner<T>>>,
 }
+
 enum HandleInner<T: Types> {
-    KeepingTimelineGateOpen {
-        #[allow(dead_code)]
-        gate_guard: Arc<utils::sync::gate::GateGuard>,
-        timeline: Arc<T::Timeline>,
-    },
+    Open(Arc<T::Timeline>),
     ShutDown,
 }
 
@@ -307,8 +312,7 @@ pub(crate) trait TenantManager<T: Types> {
 }
 
 /// Abstract view of an [`Arc<Timeline>`], for testability.
-pub(crate) trait ArcTimeline<T: Types>: Clone {
-    fn gate(&self) -> &utils::sync::gate::Gate;
+pub(crate) trait Timeline<T: Types> {
     fn shard_timeline_id(&self) -> ShardTimelineId;
     fn get_shard_identity(&self) -> &ShardIdentity;
     fn per_timeline_state(&self) -> &PerTimelineState<T>;
@@ -318,7 +322,6 @@ pub(crate) trait ArcTimeline<T: Types>: Clone {
 #[derive(Debug)]
 pub(crate) enum GetError<T: Types> {
     TenantManager(T::TenantManagerError),
-    TimelineGateClosed,
     PerTimelineStateShutDown,
 }
 
@@ -434,21 +437,9 @@ impl<T: Types> Cache<T> {
                 }
 
                 trace!("creating new HandleInner");
-                let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen {
-                    gate_guard: Arc::new(
-                        // this enter() is expensive in production code because
-                        // it hits the global Arc<Timeline>::gate refcounts
-                        match timeline.gate().enter() {
-                            Ok(guard) => guard,
-                            Err(_) => {
-                                return Err(GetError::TimelineGateClosed);
-                            }
-                        },
-                    ),
-                    // this clone is expensive in production code because
-                    // it hits the global Arc<Timeline>::clone refcounts
-                    timeline: Arc::new(timeline.clone()),
-                }));
+                let timeline = Arc::new(timeline);
+                let handle_inner_arc =
+                    Arc::new(Mutex::new(HandleInner::Open(Arc::clone(&timeline))));
                 let handle_weak = WeakHandle {
                     inner: Arc::downgrade(&handle_inner_arc),
                 };
@@ -503,18 +494,10 @@ impl<T: Types> WeakHandle<T> {
         };
         let lock_guard = inner.lock().expect("poisoned");
         match &*lock_guard {
-            HandleInner::KeepingTimelineGateOpen {
-                timeline,
-                gate_guard,
-            } => {
-                let gate_guard = Arc::clone(gate_guard);
-                let timeline = Arc::clone(timeline);
+            HandleInner::Open(open) => {
+                let open = Arc::clone(open);
                 drop(lock_guard);
-                Ok(Handle {
-                    timeline,
-                    gate_guard,
-                    inner,
-                })
+                Ok(Handle { open, inner })
             }
             HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown),
         }
@@ -528,7 +511,7 @@ impl<T: Types> WeakHandle<T> {
 impl<T: Types> std::ops::Deref for Handle<T> {
     type Target = T::Timeline;
     fn deref(&self) -> &Self::Target {
-        &self.timeline
+        &self.open
     }
 }
 
@@ -545,7 +528,7 @@ impl<T: Types> PerTimelineState<T> {
     /// to the [`Types::Timeline`] that embeds this per-timeline state.
     /// Even if [`TenantManager::resolve`] would still resolve to it.
     ///
-    /// Already-alive [`Handle`]s for will remain open, usable, and keeping the [`ArcTimeline`] alive.
+    /// Already-alive [`Handle`]s will remain open and usable, and keep the [`Types::Timeline`] alive.
     /// That's ok because they're short-lived. See module-level comment for details.
     #[instrument(level = "trace", skip_all)]
     pub(super) fn shutdown(&self) {
@@ -611,7 +594,7 @@ impl<T: Types> Drop for Cache<T> {
 impl<T: Types> HandleInner<T> {
     fn shutdown(&mut self) -> Option<Arc<T::Timeline>> {
         match std::mem::replace(self, HandleInner::ShutDown) {
-            HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline),
+            HandleInner::Open(timeline) => Some(timeline),
             HandleInner::ShutDown => {
                 // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown
                 // may do it concurrently, but locking rules disallow holding per-timeline-state lock and
@@ -631,6 +614,7 @@ mod tests {
     use pageserver_api::reltag::RelTag;
     use pageserver_api::shard::ShardStripeSize;
     use utils::shard::ShardCount;
+    use utils::sync::gate::GateGuard;
 
     use super::*;
 
@@ -641,7 +625,7 @@ mod tests {
     impl Types for TestTypes {
         type TenantManagerError = anyhow::Error;
         type TenantManager = StubManager;
-        type Timeline = Arc<StubTimeline>;
+        type Timeline = Entered;
     }
 
     struct StubManager {
@@ -656,17 +640,19 @@ mod tests {
         myself: Weak<StubTimeline>,
     }
 
+    struct Entered {
+        timeline: Arc<StubTimeline>,
+        #[allow(dead_code)] // it's stored here to keep the gate open
+        gate_guard: Arc<GateGuard>,
+    }
+
     impl StubTimeline {
         fn getpage(&self) {
             // do nothing
         }
     }
 
-    impl ArcTimeline<TestTypes> for Arc<StubTimeline> {
-        fn gate(&self) -> &utils::sync::gate::Gate {
-            &self.gate
-        }
-
+    impl Timeline<TestTypes> for Entered {
         fn shard_timeline_id(&self) -> ShardTimelineId {
             ShardTimelineId {
                 shard_index: self.shard.shard_index(),
@@ -688,20 +674,34 @@ mod tests {
             &self,
             timeline_id: TimelineId,
             shard_selector: ShardSelector,
-        ) -> anyhow::Result<Arc<StubTimeline>> {
+        ) -> anyhow::Result<Entered> {
             for timeline in &self.shards {
                 if timeline.id == timeline_id {
+                    let enter_gate = || {
+                        let gate_guard = timeline.gate.enter()?;
+                        let gate_guard = Arc::new(gate_guard);
+                        anyhow::Ok(gate_guard)
+                    };
                     match &shard_selector {
                         ShardSelector::Zero if timeline.shard.is_shard_zero() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                         }
                         ShardSelector::Zero => continue,
                         ShardSelector::Page(key) if timeline.shard.is_key_local(key) => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                         }
                         ShardSelector::Page(_) => continue,
                         ShardSelector::Known(idx) if idx == &timeline.shard.shard_index() => {
-                            return Ok(Arc::clone(timeline));
+                            return Ok(Entered {
+                                timeline: Arc::clone(timeline),
+                                gate_guard: enter_gate()?,
+                            });
                         }
                         ShardSelector::Known(_) => continue,
                     }
@@ -711,6 +711,13 @@ mod tests {
         }
     }
 
+    impl std::ops::Deref for Entered {
+        type Target = StubTimeline;
+        fn deref(&self) -> &Self::Target {
+            &self.timeline
+        }
+    }
+
     #[tokio::test(start_paused = true)]
     async fn test_timeline_shutdown() {
         crate::tenant::harness::setup_logging();
@@ -1038,7 +1045,6 @@ mod tests {
         let key = DBDIR_KEY;
 
         // Simulate 10 connections that's opened, used, and closed
-        let mut used_handles = vec![];
         for _ in 0..10 {
             let mut cache = Cache::<TestTypes>::default();
             let handle = {
@@ -1050,7 +1056,6 @@ mod tests {
                 handle
             };
             handle.getpage();
-            used_handles.push(Arc::downgrade(&handle.timeline));
         }
 
         // No handles exist, thus gates are closed and don't require shutdown.
diff --git a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
index 184c830464..11df232a10 100644
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -32,6 +32,7 @@ impl HeatmapLayersDownloader {
     fn new(
         timeline: Arc<Timeline>,
         concurrency: usize,
+        recurse: bool,
         ctx: RequestContext,
     ) -> Result<HeatmapLayersDownloader, ApiError> {
         let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;
@@ -60,11 +61,11 @@ impl HeatmapLayersDownloader {
 
                 tracing::info!(
                     resident_size=%timeline.resident_physical_size(),
-                    heatmap_layers=%heatmap.layers.len(),
+                    heatmap_layers=%heatmap.all_layers().count(),
                     "Starting heatmap layers download"
                 );
 
-                let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
+                let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
                     |layer| {
                         let ctx = ctx.attached_child();
                         let tl = timeline.clone();
@@ -98,6 +99,20 @@ impl HeatmapLayersDownloader {
                     },
                     _ = cancel.cancelled() => {
                         tracing::info!("Heatmap layers download cancelled");
+                        return;
+                    }
+                }
+
+                if recurse {
+                    if let Some(ancestor) = timeline.ancestor_timeline() {
+                        let ctx = ctx.attached_child();
+                        let res =
+                            ancestor.start_heatmap_layers_download(concurrency, recurse, &ctx);
+                        if let Err(err) = res {
+                            tracing::info!(
+                                "Failed to start heatmap layers download for ancestor: {err}"
+                            );
+                        }
                     }
                 }
             }
@@ -140,14 +155,20 @@ impl HeatmapLayersDownloader {
 }
 
 impl Timeline {
-    pub(crate) async fn start_heatmap_layers_download(
+    pub(crate) fn start_heatmap_layers_download(
         self: &Arc<Self>,
         concurrency: usize,
+        recurse: bool,
         ctx: &RequestContext,
     ) -> Result<(), ApiError> {
         let mut locked = self.heatmap_layers_downloader.lock().unwrap();
         if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
-            let dl = HeatmapLayersDownloader::new(self.clone(), concurrency, ctx.attached_child())?;
+            let dl = HeatmapLayersDownloader::new(
+                self.clone(),
+                concurrency,
+                recurse,
+                ctx.attached_child(),
+            )?;
             *locked = Some(dl);
             Ok(())
         } else {
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index e552ea83de..1b489028dc 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -8,14 +8,14 @@ use tracing::trace;
 use utils::id::TimelineId;
 use utils::lsn::{AtomicLsn, Lsn};
 
-use super::TimelineWriterState;
+use super::{ReadableLayer, TimelineWriterState};
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::metrics::TimelineMetrics;
 use crate::tenant::layer_map::{BatchedUpdates, LayerMap};
 use crate::tenant::storage_layer::{
     AsLayerDesc, InMemoryLayer, Layer, LayerVisibilityHint, PersistentLayerDesc,
-    PersistentLayerKey, ResidentLayer,
+    PersistentLayerKey, ReadableLayerWeak, ResidentLayer,
 };
 
 /// Provides semantic APIs to manipulate the layer map.
@@ -37,6 +37,21 @@ impl Default for LayerManager {
 }
 
 impl LayerManager {
+    pub(crate) fn upgrade(&self, weak: ReadableLayerWeak) -> ReadableLayer {
+        match weak {
+            ReadableLayerWeak::PersistentLayer(desc) => {
+                ReadableLayer::PersistentLayer(self.get_from_desc(&desc))
+            }
+            ReadableLayerWeak::InMemoryLayer(desc) => {
+                let inmem = self
+                    .layer_map()
+                    .expect("no concurrent shutdown")
+                    .in_memory_layer(&desc);
+                ReadableLayer::InMemoryLayer(inmem)
+            }
+        }
+    }
+
     pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer {
         // The assumption for the `expect()` is that all code maintains the following invariant:
         // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
@@ -470,6 +485,25 @@ impl OpenLayerManager {
         mapping.remove(layer);
         layer.delete_on_drop();
     }
+
+    #[cfg(test)]
+    pub(crate) fn force_insert_in_memory_layer(&mut self, layer: Arc<InMemoryLayer>) {
+        use pageserver_api::models::InMemoryLayerInfo;
+
+        match layer.info() {
+            InMemoryLayerInfo::Open { .. } => {
+                assert!(self.layer_map.open_layer.is_none());
+                self.layer_map.open_layer = Some(layer);
+            }
+            InMemoryLayerInfo::Frozen { lsn_start, .. } => {
+                if let Some(last) = self.layer_map.frozen_layers.back() {
+                    assert!(last.get_lsn_range().end <= lsn_start);
+                }
+
+                self.layer_map.frozen_layers.push_back(layer);
+            }
+        }
+    }
 }
 
 pub(crate) struct LayerFileManager<T>(HashMap<PersistentLayerKey, T>);
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index f6a577abfc..9f0a877b07 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -1369,6 +1369,10 @@ neon_get_lfc_stats(PG_FUNCTION_ARGS)
 			if (lfc_ctl)
 				value = lfc_ctl->limit;
 			break;
+		case 8:
+			key = "file_cache_chunk_size_pages";
+			value = BLOCKS_PER_CHUNK;
+			break;
 		default:
 			SRF_RETURN_DONE(funcctx);
 	}
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index fe463fd4a6..0414661a5f 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -1026,6 +1026,19 @@ prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blocknum, n
 			if (!neon_prefetch_response_usable(&lsns[i], slot))
 				continue;
 
+			/*
+			 * Only GetPage responses carry a page: skip error responses, panic on anything else.
+			 */
+			if (slot->response->tag != T_NeonGetPageResponse)
+			{
+				if (slot->response->tag != T_NeonErrorResponse)
+				{
+					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
+											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
+				}
+				continue;
+			}
 			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
 			prefetch_set_unused(ring_index);
 			BITMAP_SET(mask, i);
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 356895aa82..7ec4ec99fc 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -83,6 +83,7 @@ static void AssertEventsOkForState(uint32 events, Safekeeper *sk);
 static char *FormatEvents(WalProposer *wp, uint32 events);
 static void UpdateDonorShmem(WalProposer *wp);
 static char *MembershipConfigurationToString(MembershipConfiguration *mconf);
+static void MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst);
 static void MembershipConfigurationFree(MembershipConfiguration *mconf);
 
 WalProposer *
@@ -97,7 +98,32 @@ WalProposerCreate(WalProposerConfig *config, walproposer_api api)
 	wp->config = config;
 	wp->api = api;
 
-	for (host = wp->config->safekeepers_list; host != NULL && *host != '\0'; host = sep)
+	wp_log(LOG, "neon.safekeepers=%s", wp->config->safekeepers_list);
+
+	/* An optional "g#<generation>:" prefix carries the membership generation. */
+	if (strncmp(wp->config->safekeepers_list, "g#", 2) == 0)
+	{
+		char	   *endptr;
+
+		errno = 0;
+		wp->safekeepers_generation = strtoul(wp->config->safekeepers_list + 2, &endptr, 10);
+		if (errno != 0)
+		{
+			wp_log(FATAL, "failed to parse neon.safekeepers generation number: %m");
+		}
+		/* Without the ':' separator, endptr + 1 could point past the string. */
+		if (*endptr != ':')
+			wp_log(FATAL, "neon.safekeepers: expected ':' after generation number");
+		/* Skip past : to the first hostname. */
+		host = endptr + 1;
+	}
+	else
+	{
+		host = wp->config->safekeepers_list;
+	}
+	wp_log(LOG, "safekeepers_generation=%u", wp->safekeepers_generation);
+
+	for (; host != NULL && *host != '\0'; host = sep)
 	{
 		port = strchr(host, ':');
 		if (port == NULL)
@@ -183,6 +209,12 @@ WalProposerFree(WalProposer *wp)
 	pfree(wp);
 }
 
+static bool
+WalProposerGenerationsEnabled(WalProposer *wp)
+{
+	return wp->safekeepers_generation != 0;
+}
+
 /*
  * Create new AppendRequest message and start sending it. This function is
  * called from walsender every time the new WAL is available.
@@ -600,10 +632,14 @@ static void
 SendStartWALPush(Safekeeper *sk)
 {
 	WalProposer *wp = sk->wp;
+
+	/* Forbid implicit timeline creation if generations are enabled. */
+	char	   *allow_timeline_creation = WalProposerGenerationsEnabled(wp) ? "false" : "true";
 #define CMD_LEN 512
 	char		cmd[CMD_LEN];
 
-	snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d')", wp->config->proto_version);
+
+	snprintf(cmd, CMD_LEN, "START_WAL_PUSH (proto_version '%d', allow_timeline_creation '%s')", wp->config->proto_version, allow_timeline_creation);
 	if (!wp->api.conn_send_query(sk, cmd))
 	{
 		wp_log(WARNING, "failed to send '%s' query to safekeeper %s:%s: %s",
@@ -705,6 +741,18 @@ RecvAcceptorGreeting(Safekeeper *sk)
 		   sk->host, sk->port, sk->greetResponse.nodeId, mconf_toml, sk->greetResponse.term);
 	pfree(mconf_toml);
 
+	/*
+	 * Adopt mconf of safekeepers if it is higher. TODO: mconf change should
+	 * restart wp if it started voting.
+	 */
+	if (sk->greetResponse.mconf.generation > wp->mconf.generation)
+	{
+		MembershipConfigurationFree(&wp->mconf);
+		MembershipConfigurationCopy(&sk->greetResponse.mconf, &wp->mconf);
+		/* full conf was just logged above */
+		wp_log(LOG, "changed mconf to generation %u", wp->mconf.generation);
+	}
+
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
 
@@ -1896,7 +1944,8 @@ PAMessageSerialize(WalProposer *wp, ProposerAcceptorMessage *msg, StringInfo buf
 						pq_sendint64_le(buf, m->termHistory->entries[i].term);
 						pq_sendint64_le(buf, m->termHistory->entries[i].lsn);
 					}
-					/* 
+
+					/*
 					 * Removed timeline_start_lsn. Still send it as a valid
 					 * value until safekeepers taking it from term history are
 					 * deployed.
@@ -2162,7 +2211,7 @@ AsyncReadMessage(Safekeeper *sk, AcceptorProposerMessage *anymsg)
 		}
 	}
 	wp_log(FATAL, "unsupported proto_version %d", wp->config->proto_version);
-	return false; /* keep the compiler quiet */
+	return false;				/* keep the compiler quiet */
 }
 
 /*
@@ -2570,6 +2619,18 @@ MembershipConfigurationToString(MembershipConfiguration *mconf)
 	return s.data;
 }
 
+static void
+MembershipConfigurationCopy(MembershipConfiguration *src, MembershipConfiguration *dst)
+{
+	dst->generation = src->generation;
+	dst->members.len = src->members.len;
+	dst->members.m = palloc0(sizeof(SafekeeperId) * dst->members.len);
+	memcpy(dst->members.m, src->members.m, sizeof(SafekeeperId) * dst->members.len);
+	dst->new_members.len = src->new_members.len;
+	dst->new_members.m = palloc0(sizeof(SafekeeperId) * dst->new_members.len);
+	memcpy(dst->new_members.m, src->new_members.m, sizeof(SafekeeperId) * dst->new_members.len);
+}
+
 static void
 MembershipConfigurationFree(MembershipConfiguration *mconf)
 {
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index eee55f924f..8d1ae26cac 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -160,7 +160,10 @@ typedef struct MemberSet
 	SafekeeperId *m;			/* ids themselves */
 } MemberSet;
 
-/* Timeline safekeeper membership configuration. */
+/*
+ * Timeline safekeeper membership configuration as sent in the
+ * protocol.
+ */
 typedef struct MembershipConfiguration
 {
 	Generation	generation;
@@ -761,8 +764,22 @@ typedef struct WalProposer
 	/* (n_safekeepers / 2) + 1 */
 	int			quorum;
 
+	/*
+	 * Generation of the membership conf of which safekeeper[] are presumably
+	 * members. To make cplane life a bit easier and to have more control in
+	 * tests over which sks walproposer connects to, the neon.safekeepers GUC
+	 * doesn't provide the full mconf, only the list of endpoints to connect to.
+	 * We still would like to know generation associated with it because 1) we
+	 * need some handle to enforce using generations in walproposer, and
+	 * non-zero value of this serves the purpose; 2) currently we don't do
+	 * that, but in theory walproposer can update list of safekeepers to
+	 * connect to upon receiving mconf from safekeepers, and generation number
+	 * must be checked to see which list is newer.
+	 */
+	Generation	safekeepers_generation;
 	/* Number of occupied slots in safekeepers[] */
 	int			n_safekeepers;
+	/* Safekeepers walproposer is connecting to. */
 	Safekeeper	safekeeper[MAX_SAFEKEEPERS];
 
 	/* WAL has been generated up to this point */
diff --git a/pgxn/neon_walredo/inmem_smgr.c b/pgxn/neon_walredo/inmem_smgr.c
index ff2846a9e7..75b9ab4464 100644
--- a/pgxn/neon_walredo/inmem_smgr.c
+++ b/pgxn/neon_walredo/inmem_smgr.c
@@ -32,8 +32,8 @@
 
 #include "inmem_smgr.h"
 
-/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, but we can update up to 3 forks for each block */
-#define MAX_PAGES 100
+/* Size of the in-memory smgr: XLR_MAX_BLOCK_ID is 32, so assume that 64 will be enough */
+#define MAX_PAGES 64
 
 /* If more than WARN_PAGES are used, print a warning in the log */
 #define WARN_PAGES 32
@@ -174,10 +174,7 @@ static void
 inmem_zeroextend(SMgrRelation reln, ForkNumber forknum,
 				 BlockNumber blocknum, int nblocks, bool skipFsync)
 {
-	char buffer[BLCKSZ] = {0};
-
-	for (int i = 0; i < nblocks; i++)
-		inmem_extend(reln, forknum, blocknum + i, buffer, skipFsync);
+	/* Do nothing: inmem_read will return zero page in any case */
 }
 #endif
 
diff --git a/poetry.lock b/poetry.lock
index ba3b0535e4..03aa543b06 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1414,14 +1414,14 @@ files = [
 
 [[package]]
 name = "jinja2"
-version = "3.1.5"
+version = "3.1.6"
 description = "A very fast and expressive template engine."
 optional = false
 python-versions = ">=3.7"
 groups = ["main"]
 files = [
-    {file = "jinja2-3.1.5-py3-none-any.whl", hash = "sha256:aba0f4dc9ed8013c424088f68a5c226f7d6097ed89b246d7749c2ec4175c6adb"},
-    {file = "jinja2-3.1.5.tar.gz", hash = "sha256:8fefff8dc3034e27bb80d67c671eb8a9bc424c0ef4c0826edbff304cceff43bb"},
+    {file = "jinja2-3.1.6-py3-none-any.whl", hash = "sha256:85ece4451f492d0c13c5dd7c13a64681a86afae63a5f347908daf103ce6d2f67"},
+    {file = "jinja2-3.1.6.tar.gz", hash = "sha256:0137fb05990d35f1275a587e9aee6d56da821fc83491a0fb838183be43f66d6d"},
 ]
 
 [package.dependencies]
@@ -3820,4 +3820,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "9711c5479c867fa614ce3d352f1bbc63dba1cb2376d347f96fbeda6f512ee308"
+content-hash = "010ffce959bb256880ab5a267048c182e4612b3151f9a94e3bf5d3a7807962fe"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 5964b76ecf..b6e3f03a81 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -53,7 +53,7 @@ measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
 once_cell.workspace = true
 opentelemetry = { workspace = true, features = ["trace"] }
-papaya = "0.1.8"
+papaya = "0.2.0"
 parking_lot.workspace = true
 parquet.workspace = true
 parquet_derive.workspace = true
diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs
index 9c3a3772cd..7a6dceb194 100644
--- a/proxy/src/auth/backend/local.rs
+++ b/proxy/src/auth/backend/local.rs
@@ -35,6 +35,7 @@ impl LocalBackend {
                     endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"),
                     project_id: ProjectIdTag::get_interner().get_or_intern("local"),
                     branch_id: BranchIdTag::get_interner().get_or_intern("local"),
+                    compute_id: "local".into(),
                     cold_start_info: ColdStartInfo::WarmCached,
                 },
             },
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 5447a4a4c0..dfa6015b10 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,3 +1,4 @@
+use std::fmt::Debug;
 use std::io;
 use std::net::SocketAddr;
 use std::time::Duration;
@@ -10,7 +11,7 @@ use postgres_protocol::message::backend::NoticeResponseBody;
 use pq_proto::StartupMessageParams;
 use rustls::pki_types::InvalidDnsNameError;
 use thiserror::Error;
-use tokio::net::TcpStream;
+use tokio::net::{TcpStream, lookup_host};
 use tracing::{debug, error, info, warn};
 
 use crate::auth::backend::ComputeUserInfo;
@@ -180,21 +181,19 @@ impl ConnCfg {
         use postgres_client::config::Host;
 
         // wrap TcpStream::connect with timeout
-        let connect_with_timeout = |host, port| {
-            tokio::time::timeout(timeout, TcpStream::connect((host, port))).map(
-                move |res| match res {
-                    Ok(tcpstream_connect_res) => tcpstream_connect_res,
-                    Err(_) => Err(io::Error::new(
-                        io::ErrorKind::TimedOut,
-                        format!("exceeded connection timeout {timeout:?}"),
-                    )),
-                },
-            )
+        let connect_with_timeout = |addrs| {
+            tokio::time::timeout(timeout, TcpStream::connect(addrs)).map(move |res| match res {
+                Ok(tcpstream_connect_res) => tcpstream_connect_res,
+                Err(_) => Err(io::Error::new(
+                    io::ErrorKind::TimedOut,
+                    format!("exceeded connection timeout {timeout:?}"),
+                )),
+            })
         };
 
-        let connect_once = |host, port| {
-            debug!("trying to connect to compute node at {host}:{port}");
-            connect_with_timeout(host, port).and_then(|stream| async {
+        let connect_once = |addrs| {
+            debug!("trying to connect to compute node at {addrs:?}");
+            connect_with_timeout(addrs).and_then(|stream| async {
                 let socket_addr = stream.peer_addr()?;
                 let socket = socket2::SockRef::from(&stream);
                 // Disable Nagle's algorithm to not introduce latency between
@@ -216,7 +215,12 @@ impl ConnCfg {
             Host::Tcp(host) => host.as_str(),
         };
 
-        match connect_once(host, port).await {
+        let addrs = match self.0.get_host_addr() {
+            Some(addr) => vec![SocketAddr::new(addr, port)],
+            None => lookup_host((host, port)).await?.collect(),
+        };
+
+        match connect_once(&*addrs).await {
             Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)),
             Err(err) => {
                 warn!("couldn't connect to compute node at {host}:{port}: {err}");
@@ -277,13 +281,15 @@ impl ConnCfg {
         } = connection;
 
         tracing::Span::current().record("pid", tracing::field::display(process_id));
+        tracing::Span::current().record("compute_id", tracing::field::display(&aux.compute_id));
         let stream = stream.into_inner();
 
         // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?)
         info!(
             cold_start_info = ctx.cold_start_info().as_str(),
-            "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
-            self.0.get_ssl_mode()
+            "connected to compute node at {host} ({socket_addr}) sslmode={:?}, latency={}",
+            self.0.get_ssl_mode(),
+            ctx.get_proxy_latency(),
         );
 
         // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw.
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index f87f4e9ef8..e10a04b4f1 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -17,7 +17,8 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::error::ErrorKind;
 use crate::intern::{BranchIdInt, ProjectIdInt};
 use crate::metrics::{
-    ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting,
+    ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol,
+    Waiting,
 };
 use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
 use crate::types::{DbName, EndpointId, RoleName};
@@ -346,6 +347,14 @@ impl RequestContext {
         }
     }
 
+    pub(crate) fn get_proxy_latency(&self) -> LatencyAccumulated {
+        self.0
+            .try_lock()
+            .expect("should not deadlock")
+            .latency_timer
+            .accumulated()
+    }
+
     pub(crate) fn success(&self) {
         self.0
             .try_lock()
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
index 977fcf4727..2765aaa462 100644
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -1,5 +1,7 @@
 //! Production console backend.
 
+use std::net::IpAddr;
+use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -274,11 +276,27 @@ impl NeonControlPlaneClient {
                 Some(x) => x,
             };
 
+            let host_addr = IpAddr::from_str(host).ok();
+
+            let ssl_mode = match &body.server_name {
+                Some(_) => SslMode::Require,
+                None => SslMode::Disable,
+            };
+            let host_name = match body.server_name {
+                Some(host) => host,
+                None => host.to_owned(),
+            };
+
             // Don't set anything but host and port! This config will be cached.
             // We'll set username and such later using the startup message.
             // TODO: add more type safety (in progress).
-            let mut config = compute::ConnCfg::new(host.to_owned(), port);
-            config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes.
+            let mut config = compute::ConnCfg::new(host_name, port);
+
+            if let Some(addr) = host_addr {
+                config.set_host_addr(addr);
+            }
+
+            config.ssl_mode(ssl_mode);
 
             let node = NodeInfo {
                 config,
diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs
index 7da5464aa5..ee722e839e 100644
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -1,5 +1,6 @@
 //! Mock console backend which relies on a user-provided postgres instance.
 
+use std::net::{IpAddr, Ipv4Addr};
 use std::str::FromStr;
 use std::sync::Arc;
 
@@ -167,10 +168,22 @@ impl MockControlPlane {
     }
 
     async fn do_wake_compute(&self) -> Result<NodeInfo, WakeComputeError> {
-        let mut config = compute::ConnCfg::new(
-            self.endpoint.host_str().unwrap_or("localhost").to_owned(),
-            self.endpoint.port().unwrap_or(5432),
-        );
+        let port = self.endpoint.port().unwrap_or(5432);
+        let mut config = match self.endpoint.host_str() {
+            None => {
+                let mut config = compute::ConnCfg::new("localhost".to_string(), port);
+                config.set_host_addr(IpAddr::V4(Ipv4Addr::LOCALHOST));
+                config
+            }
+            Some(host) => {
+                let mut config = compute::ConnCfg::new(host.to_string(), port);
+                if let Ok(addr) = IpAddr::from_str(host) {
+                    config.set_host_addr(addr);
+                }
+                config
+            }
+        };
+
         config.ssl_mode(postgres_client::config::SslMode::Disable);
 
         let node = NodeInfo {
@@ -179,6 +192,7 @@ impl MockControlPlane {
                 endpoint_id: (&EndpointId::from("endpoint")).into(),
                 project_id: (&ProjectId::from("project")).into(),
                 branch_id: (&BranchId::from("branch")).into(),
+                compute_id: "compute".into(),
                 cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
             },
         };
diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs
index 8d6b2e96f5..ec4554eab5 100644
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -2,6 +2,7 @@ use std::fmt::{self, Display};
 
 use measured::FixedCardinalityLabel;
 use serde::{Deserialize, Serialize};
+use smol_str::SmolStr;
 
 use crate::auth::IpPattern;
 use crate::intern::{AccountIdInt, BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
@@ -239,6 +240,7 @@ pub(crate) struct GetEndpointAccessControl {
 #[derive(Debug, Deserialize)]
 pub(crate) struct WakeCompute {
     pub(crate) address: Box<str>,
+    pub(crate) server_name: Option<String>,
     pub(crate) aux: MetricsAuxInfo,
 }
 
@@ -312,6 +314,9 @@ pub(crate) struct MetricsAuxInfo {
     pub(crate) endpoint_id: EndpointIdInt,
     pub(crate) project_id: ProjectIdInt,
     pub(crate) branch_id: BranchIdInt,
+    // note: we don't use interned strings for compute IDs.
+    // they churn too quickly and we have no way to clean up interned strings.
+    pub(crate) compute_id: SmolStr,
     #[serde(default)]
     pub(crate) cold_start_info: ColdStartInfo,
 }
@@ -378,6 +383,7 @@ mod tests {
             "endpoint_id": "endpoint",
             "project_id": "project",
             "branch_id": "branch",
+            "compute_id": "compute",
             "cold_start_info": "unknown",
         })
     }
diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
index 3c34918d84..6f9845fd6e 100644
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -1,9 +1,11 @@
 use std::cell::{Cell, RefCell};
 use std::collections::HashMap;
 use std::hash::BuildHasher;
-use std::{env, io};
+use std::sync::atomic::{AtomicU32, Ordering};
+use std::{array, env, fmt, io};
 
 use chrono::{DateTime, Utc};
+use indexmap::IndexSet;
 use opentelemetry::trace::TraceContextExt;
 use scopeguard::defer;
 use serde::ser::{SerializeMap, Serializer};
@@ -17,6 +19,7 @@ use tracing_subscriber::fmt::{FormatEvent, FormatFields};
 use tracing_subscriber::layer::{Context, Layer};
 use tracing_subscriber::prelude::*;
 use tracing_subscriber::registry::{LookupSpan, SpanRef};
+use try_lock::TryLock;
 
 /// Initialize logging and OpenTelemetry tracing and exporter.
 ///
@@ -46,13 +49,13 @@ pub async fn init() -> anyhow::Result<LoggingGuard> {
     let otlp_layer = tracing_utils::init_tracing("proxy").await;
 
     let json_log_layer = if logfmt == LogFormat::Json {
-        Some(JsonLoggingLayer {
-            clock: RealClock,
-            skipped_field_indices: papaya::HashMap::default(),
-            writer: StderrWriter {
+        Some(JsonLoggingLayer::new(
+            RealClock,
+            StderrWriter {
                 stderr: std::io::stderr(),
             },
-        })
+            ["request_id", "session_id", "conn_id"],
+        ))
     } else {
         None
     };
@@ -191,13 +194,39 @@ thread_local! {
 }
 
 /// Implements tracing layer to handle events specific to logging.
-struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
+struct JsonLoggingLayer<C: Clock, W: MakeWriter, const F: usize> {
     clock: C,
     skipped_field_indices: papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
+    callsite_ids: papaya::HashMap<callsite::Identifier, CallsiteId>,
     writer: W,
+    // We use a const generic and arrays to bypass one heap allocation.
+    extract_fields: IndexSet<&'static str>,
+    _marker: std::marker::PhantomData<[&'static str; F]>,
 }
 
-impl<S, C: Clock + 'static, W: MakeWriter + 'static> Layer<S> for JsonLoggingLayer<C, W>
+impl<C: Clock, W: MakeWriter, const F: usize> JsonLoggingLayer<C, W, F> {
+    fn new(clock: C, writer: W, extract_fields: [&'static str; F]) -> Self {
+        JsonLoggingLayer {
+            clock,
+            skipped_field_indices: papaya::HashMap::default(),
+            callsite_ids: papaya::HashMap::default(),
+            writer,
+            extract_fields: IndexSet::from_iter(extract_fields),
+            _marker: std::marker::PhantomData,
+        }
+    }
+
+    #[inline]
+    fn callsite_id(&self, cs: callsite::Identifier) -> CallsiteId {
+        *self
+            .callsite_ids
+            .pin()
+            .get_or_insert_with(cs, CallsiteId::next)
+    }
+}
+
+impl<S, C: Clock + 'static, W: MakeWriter + 'static, const F: usize> Layer<S>
+    for JsonLoggingLayer<C, W, F>
 where
     S: Subscriber + for<'a> LookupSpan<'a>,
 {
@@ -211,7 +240,14 @@ where
         let res: io::Result<()> = REENTRANCY_GUARD.with(move |entered| {
             if entered.get() {
                 let mut formatter = EventFormatter::new();
-                formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
+                formatter.format::<S, F>(
+                    now,
+                    event,
+                    &ctx,
+                    &self.skipped_field_indices,
+                    &self.callsite_ids,
+                    &self.extract_fields,
+                )?;
                 self.writer.make_writer().write_all(formatter.buffer())
             } else {
                 entered.set(true);
@@ -219,7 +255,14 @@ where
 
                 EVENT_FORMATTER.with_borrow_mut(move |formatter| {
                     formatter.reset();
-                    formatter.format(now, event, &ctx, &self.skipped_field_indices)?;
+                    formatter.format::<S, F>(
+                        now,
+                        event,
+                        &ctx,
+                        &self.skipped_field_indices,
+                        &self.callsite_ids,
+                        &self.extract_fields,
+                    )?;
                     self.writer.make_writer().write_all(formatter.buffer())
                 })
             }
@@ -246,10 +289,13 @@ where
         let span = ctx.span(id).expect("span must exist");
         let fields = SpanFields::default();
         fields.record_fields(attrs);
+
         // This could deadlock when there's a panic somewhere in the tracing
         // event handling and a read or write guard is still held. This includes
         // the OTel subscriber.
-        span.extensions_mut().insert(fields);
+        let mut exts = span.extensions_mut();
+
+        exts.insert(fields);
     }
 
     fn on_record(&self, id: &span::Id, values: &span::Record<'_>, ctx: Context<'_, S>) {
@@ -265,6 +311,7 @@ where
     /// wins.
     fn register_callsite(&self, metadata: &'static Metadata<'static>) -> Interest {
         if !metadata.is_event() {
+            self.callsite_id(metadata.callsite());
             // Must not be never because we wouldn't get trace and span data.
             return Interest::always();
         }
@@ -297,6 +344,26 @@ where
     }
 }
 
+#[derive(Copy, Clone, Debug, Default)]
+#[repr(transparent)]
+struct CallsiteId(u32);
+
+impl CallsiteId {
+    #[inline]
+    fn next() -> Self {
+        // Start at 1 to reserve 0 for default.
+        static COUNTER: AtomicU32 = AtomicU32::new(1);
+        CallsiteId(COUNTER.fetch_add(1, Ordering::Relaxed))
+    }
+}
+
+impl fmt::Display for CallsiteId {
+    #[inline]
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 /// Stores span field values recorded during the spans lifetime.
 #[derive(Default)]
 struct SpanFields {
@@ -448,12 +515,14 @@ impl EventFormatter {
         self.logline_buffer.clear();
     }
 
-    fn format<S>(
+    fn format<S, const F: usize>(
         &mut self,
         now: DateTime<Utc>,
         event: &Event<'_>,
         ctx: &Context<'_, S>,
         skipped_field_indices: &papaya::HashMap<callsite::Identifier, SkippedFieldIndices>,
+        callsite_ids: &papaya::HashMap<callsite::Identifier, CallsiteId>,
+        extract_fields: &IndexSet<&'static str>,
     ) -> io::Result<()>
     where
         S: Subscriber + for<'a> LookupSpan<'a>,
@@ -485,6 +554,7 @@ impl EventFormatter {
             event.record(&mut message_extractor);
             let mut serializer = message_extractor.into_serializer()?;
 
+            // Direct message fields.
             let mut fields_present = FieldsPresent(false, skipped_field_indices);
             event.record(&mut fields_present);
             if fields_present.0 {
@@ -494,7 +564,16 @@ impl EventFormatter {
                 )?;
             }
 
+            let spans = SerializableSpans {
+                ctx,
+                callsite_ids,
+                extract: ExtractedSpanFields::<'_, F>::new(extract_fields),
+            };
+            serializer.serialize_entry("spans", &spans)?;
+
+            // TODO: thread-local cache?
             let pid = std::process::id();
+            // Skip adding pid 1 to reduce noise for services running in containers.
             if pid != 1 {
                 serializer.serialize_entry("process_id", &pid)?;
             }
@@ -514,6 +593,7 @@ impl EventFormatter {
 
             serializer.serialize_entry("target", meta.target())?;
 
+            // Skip adding module if it's the same as target.
             if let Some(module) = meta.module_path() {
                 if module != meta.target() {
                     serializer.serialize_entry("module", module)?;
@@ -540,7 +620,10 @@ impl EventFormatter {
                 }
             }
 
-            serializer.serialize_entry("spans", &SerializableSpanStack(ctx))?;
+            if spans.extract.has_values() {
+                // TODO: add fields from event, too?
+                serializer.serialize_entry("extract", &spans.extract)?;
+            }
 
             serializer.end()
         };
@@ -818,15 +901,20 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
     }
 }
 
-/// Serializes the span stack from root to leaf (parent of event) enumerated
-/// inside an object where the keys are just the number padded with zeroes
-/// to retain sorting order.
-// The object is necessary because Loki cannot flatten arrays.
-struct SerializableSpanStack<'a, 'b, Span>(&'b Context<'a, Span>)
+/// Serializes the span stack from root to leaf (parent of event) as an object
+/// with the span names as keys. To prevent collisions we append a numeric
+/// callsite ID to the name. Also collects any span fields we're interested in;
+/// the last one wins.
+struct SerializableSpans<'a, 'ctx, Span, const F: usize>
 where
-    Span: Subscriber + for<'lookup> LookupSpan<'lookup>;
+    Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
+{
+    ctx: &'a Context<'ctx, Span>,
+    callsite_ids: &'a papaya::HashMap<callsite::Identifier, CallsiteId>,
+    extract: ExtractedSpanFields<'a, F>,
+}
 
-impl<Span> serde::ser::Serialize for SerializableSpanStack<'_, '_, Span>
+impl<Span, const F: usize> serde::ser::Serialize for SerializableSpans<'_, '_, Span, F>
 where
     Span: Subscriber + for<'lookup> LookupSpan<'lookup>,
 {
@@ -836,9 +924,24 @@ where
     {
         let mut serializer = serializer.serialize_map(None)?;
 
-        if let Some(leaf_span) = self.0.lookup_current() {
-            for (i, span) in leaf_span.scope().from_root().enumerate() {
-                serializer.serialize_entry(&format_args!("{i:02}"), &SerializableSpan(&span))?;
+        if let Some(leaf_span) = self.ctx.lookup_current() {
+            for span in leaf_span.scope().from_root() {
+                // Append a numeric callsite ID to the span name to keep the name unique
+                // in the JSON object.
+                let cid = self
+                    .callsite_ids
+                    .pin()
+                    .get(&span.metadata().callsite())
+                    .copied()
+                    .unwrap_or_default();
+
+                // Loki turns the # into an underscore during field name concatenation.
+                serializer.serialize_key(&format_args!("{}#{}", span.metadata().name(), &cid))?;
+
+                serializer.serialize_value(&SerializableSpanFields {
+                    span: &span,
+                    extract: &self.extract,
+                })?;
             }
         }
 
@@ -846,28 +949,79 @@ where
     }
 }
 
-/// Serializes a single span. Include the span ID, name and its fields as
-/// recorded up to this point.
-struct SerializableSpan<'a, 'b, Span>(&'b SpanRef<'a, Span>)
-where
-    Span: for<'lookup> LookupSpan<'lookup>;
-
-impl<Span> serde::ser::Serialize for SerializableSpan<'_, '_, Span>
+/// Serializes the span fields as an object.
+struct SerializableSpanFields<'a, 'span, Span, const F: usize>
 where
     Span: for<'lookup> LookupSpan<'lookup>,
 {
-    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
+    span: &'a SpanRef<'span, Span>,
+    extract: &'a ExtractedSpanFields<'a, F>,
+}
+
+impl<Span, const F: usize> serde::ser::Serialize for SerializableSpanFields<'_, '_, Span, F>
+where
+    Span: for<'lookup> LookupSpan<'lookup>,
+{
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
     where
-        Ser: serde::ser::Serializer,
+        S: serde::ser::Serializer,
     {
         let mut serializer = serializer.serialize_map(None)?;
-        // TODO: the span ID is probably only useful for debugging tracing.
-        serializer.serialize_entry("span_id", &format_args!("{:016x}", self.0.id().into_u64()))?;
-        serializer.serialize_entry("span_name", self.0.metadata().name())?;
 
-        let ext = self.0.extensions();
+        let ext = self.span.extensions();
         if let Some(data) = ext.get::<SpanFields>() {
-            for (key, value) in &data.fields.pin() {
+            for (name, value) in &data.fields.pin() {
+                serializer.serialize_entry(name, value)?;
+                // TODO: replace clone with reference, if possible.
+                self.extract.set(name, value.clone());
+            }
+        }
+
+        serializer.end()
+    }
+}
+
+struct ExtractedSpanFields<'a, const F: usize> {
+    names: &'a IndexSet<&'static str>,
+    // TODO: replace TryLock with something thread-local with interior mutability.
+    //       The serde API doesn't let us use `mut`.
+    values: TryLock<([Option<serde_json::Value>; F], bool)>,
+}
+
+impl<'a, const F: usize> ExtractedSpanFields<'a, F> {
+    fn new(names: &'a IndexSet<&'static str>) -> Self {
+        ExtractedSpanFields {
+            names,
+            values: TryLock::new((array::from_fn(|_| Option::default()), false)),
+        }
+    }
+
+    #[inline]
+    fn set(&self, name: &'static str, value: serde_json::Value) {
+        if let Some((index, _)) = self.names.get_full(name) {
+            let mut fields = self.values.try_lock().expect("thread-local use");
+            fields.0[index] = Some(value);
+            fields.1 = true;
+        }
+    }
+
+    #[inline]
+    fn has_values(&self) -> bool {
+        self.values.try_lock().expect("thread-local use").1
+    }
+}
+
+impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::ser::Serializer,
+    {
+        let mut serializer = serializer.serialize_map(None)?;
+
+        let values = self.values.try_lock().expect("thread-local use");
+        for (i, value) in values.0.iter().enumerate() {
+            if let Some(value) = value {
+                let key = self.names[i];
                 serializer.serialize_entry(key, value)?;
             }
         }
@@ -879,6 +1033,7 @@ where
 #[cfg(test)]
 #[allow(clippy::unwrap_used)]
 mod tests {
+    use std::marker::PhantomData;
     use std::sync::{Arc, Mutex, MutexGuard};
 
     use assert_json_diff::assert_json_eq;
@@ -927,14 +1082,17 @@ mod tests {
         let log_layer = JsonLoggingLayer {
             clock: clock.clone(),
             skipped_field_indices: papaya::HashMap::default(),
+            callsite_ids: papaya::HashMap::default(),
             writer: buffer.clone(),
+            extract_fields: IndexSet::from_iter(["x"]),
+            _marker: PhantomData::<[&'static str; 1]>,
         };
 
         let registry = tracing_subscriber::Registry::default().with(log_layer);
 
         tracing::subscriber::with_default(registry, || {
-            info_span!("span1", x = 40, x = 41, x = 42).in_scope(|| {
-                info_span!("span2").in_scope(|| {
+            info_span!("some_span", x = 24).in_scope(|| {
+                info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| {
                     tracing::error!(
                         a = 1,
                         a = 2,
@@ -960,16 +1118,16 @@ mod tests {
                     "a": 3,
                 },
                 "spans": {
-                    "00":{
-                        "span_id": "0000000000000001",
-                        "span_name": "span1",
-                        "x": 42,
+                    "some_span#1":{
+                        "x": 24,
                     },
-                    "01": {
-                        "span_id": "0000000000000002",
-                        "span_name": "span2",
+                    "some_span#2": {
+                        "x": 42,
                     }
                 },
+                "extract": {
+                    "x": 42,
+                },
                 "src": actual.as_object().unwrap().get("src").unwrap().as_str().unwrap(),
                 "target": "proxy::logging::tests",
                 "process_id": actual.as_object().unwrap().get("process_id").unwrap().as_number().unwrap(),
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index db1f096de1..b6a2a059ea 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -394,21 +394,31 @@ pub enum RedisMsgKind {
     HDel,
 }
 
-#[derive(Default)]
-struct Accumulated {
+#[derive(Default, Clone)]
+pub struct LatencyAccumulated {
     cplane: time::Duration,
     client: time::Duration,
     compute: time::Duration,
     retry: time::Duration,
 }
 
+impl std::fmt::Display for LatencyAccumulated {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(
+            f,
+            "client: {:?}, cplane: {:?}, compute: {:?}, retry: {:?}",
+            self.client, self.cplane, self.compute, self.retry
+        )
+    }
+}
+
 pub struct LatencyTimer {
     // time since the stopwatch was started
     start: time::Instant,
     // time since the stopwatch was stopped
     stop: Option<time::Instant>,
     // accumulated time on the stopwatch
-    accumulated: Accumulated,
+    accumulated: LatencyAccumulated,
     // label data
     protocol: Protocol,
     cold_start_info: ColdStartInfo,
@@ -422,7 +432,7 @@ impl LatencyTimer {
         Self {
             start: time::Instant::now(),
             stop: None,
-            accumulated: Accumulated::default(),
+            accumulated: LatencyAccumulated::default(),
             protocol,
             cold_start_info: ColdStartInfo::Unknown,
             // assume failed unless otherwise specified
@@ -435,7 +445,7 @@ impl LatencyTimer {
         Self {
             start: time::Instant::now(),
             stop: None,
-            accumulated: Accumulated::default(),
+            accumulated: LatencyAccumulated::default(),
             protocol,
             cold_start_info: ColdStartInfo::Unknown,
             // assume failed unless otherwise specified
@@ -465,6 +475,10 @@ impl LatencyTimer {
         // success
         self.outcome = ConnectOutcome::Success;
     }
+
+    pub fn accumulated(&self) -> LatencyAccumulated {
+        self.accumulated.clone()
+    }
 }
 
 #[derive(FixedCardinalityLabel, Clone, Copy, Debug)]
@@ -511,7 +525,7 @@ impl Drop for LatencyTimer {
             duration.saturating_sub(accumulated_total).as_secs_f64(),
         );
 
-        // Exclude client cplane, compue communication from the accumulated time.
+        // Exclude client, cplane, compute communication from the accumulated time.
         let accumulated_total =
             self.accumulated.client + self.accumulated.cplane + self.accumulated.compute;
         metric.observe(
@@ -524,7 +538,7 @@ impl Drop for LatencyTimer {
             duration.saturating_sub(accumulated_total).as_secs_f64(),
         );
 
-        // Exclude client cplane, compue, retry communication from the accumulated time.
+        // Exclude client, cplane, compute, retry communication from the accumulated time.
         let accumulated_total = self.accumulated.client
             + self.accumulated.cplane
             + self.accumulated.compute
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index b8b39fa121..e013fbbe2e 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -81,7 +81,10 @@ impl ConnectMechanism for TcpMechanism<'_> {
     type ConnectError = compute::ConnectionError;
     type Error = compute::ConnectionError;
 
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        pid = tracing::field::Empty,
+        compute_id = tracing::field::Empty
+    ))]
     async fn connect_once(
         &self,
         ctx: &RequestContext,
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs
index 171f539b1e..e0b7539538 100644
--- a/proxy/src/proxy/tests/mod.rs
+++ b/proxy/src/proxy/tests/mod.rs
@@ -555,6 +555,7 @@ fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeIn
             endpoint_id: (&EndpointId::from("endpoint")).into(),
             project_id: (&ProjectId::from("project")).into(),
             branch_id: (&BranchId::from("branch")).into(),
+            compute_id: "compute".into(),
             cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
         },
     };
diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs
index 72029102e0..b55661cec8 100644
--- a/proxy/src/serverless/backend.rs
+++ b/proxy/src/serverless/backend.rs
@@ -1,4 +1,5 @@
 use std::io;
+use std::net::{IpAddr, SocketAddr};
 use std::sync::Arc;
 use std::time::Duration;
 
@@ -6,11 +7,15 @@ use async_trait::async_trait;
 use ed25519_dalek::SigningKey;
 use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer};
 use jose_jwk::jose_b64;
+use postgres_client::config::SslMode;
 use rand::rngs::OsRng;
+use rustls::pki_types::{DnsName, ServerName};
 use tokio::net::{TcpStream, lookup_host};
+use tokio_rustls::TlsConnector;
 use tracing::field::display;
 use tracing::{debug, info};
 
+use super::AsyncRW;
 use super::conn_pool::poll_client;
 use super::conn_pool_lib::{Client, ConnInfo, EndpointConnPool, GlobalConnPool};
 use super::http_conn_pool::{self, HttpConnPool, Send, poll_http2_client};
@@ -190,7 +195,11 @@ impl PoolingBackend {
     // Wake up the destination if needed. Code here is a bit involved because
     // we reuse the code from the usual proxy and we need to prepare few structures
     // that this code expects.
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        pid = tracing::field::Empty,
+        compute_id = tracing::field::Empty,
+        conn_id = tracing::field::Empty,
+    ))]
     pub(crate) async fn connect_to_compute(
         &self,
         ctx: &RequestContext,
@@ -229,7 +238,10 @@ impl PoolingBackend {
     }
 
     // Wake up the destination if needed
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        compute_id = tracing::field::Empty,
+        conn_id = tracing::field::Empty,
+    ))]
     pub(crate) async fn connect_to_local_proxy(
         &self,
         ctx: &RequestContext,
@@ -276,7 +288,10 @@ impl PoolingBackend {
     /// # Panics
     ///
     /// Panics if called with a non-local_proxy backend.
-    #[tracing::instrument(fields(pid = tracing::field::Empty), skip_all)]
+    #[tracing::instrument(skip_all, fields(
+        pid = tracing::field::Empty,
+        conn_id = tracing::field::Empty,
+    ))]
     pub(crate) async fn connect_to_local_postgres(
         &self,
         ctx: &RequestContext,
@@ -552,6 +567,10 @@ impl ConnectMechanism for TokioMechanism {
         let (client, connection) = permit.release_result(res)?;
 
         tracing::Span::current().record("pid", tracing::field::display(client.get_process_id()));
+        tracing::Span::current().record(
+            "compute_id",
+            tracing::field::display(&node_info.aux.compute_id),
+        );
         Ok(poll_client(
             self.pool.clone(),
             ctx,
@@ -587,16 +606,28 @@ impl ConnectMechanism for HyperMechanism {
         node_info: &CachedNodeInfo,
         config: &ComputeConfig,
     ) -> Result<Self::Connection, Self::ConnectError> {
+        let host_addr = node_info.config.get_host_addr();
         let host = node_info.config.get_host();
         let permit = self.locks.get_permit(&host).await?;
 
         let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
 
+        let tls = if node_info.config.get_ssl_mode() == SslMode::Disable {
+            None
+        } else {
+            Some(&config.tls)
+        };
+
         let port = node_info.config.get_port();
-        let res = connect_http2(&host, port, config.timeout).await;
+        let res = connect_http2(host_addr, &host, port, config.timeout, tls).await;
         drop(pause);
         let (client, connection) = permit.release_result(res)?;
 
+        tracing::Span::current().record(
+            "compute_id",
+            tracing::field::display(&node_info.aux.compute_id),
+        );
+
         Ok(poll_http2_client(
             self.pool.clone(),
             ctx,
@@ -612,18 +643,22 @@ impl ConnectMechanism for HyperMechanism {
 }
 
 async fn connect_http2(
+    host_addr: Option<IpAddr>,
     host: &str,
     port: u16,
     timeout: Duration,
+    tls: Option<&Arc<rustls::ClientConfig>>,
 ) -> Result<(http_conn_pool::Send, http_conn_pool::Connect), LocalProxyConnError> {
-    // assumption: host is an ip address so this should not actually perform any requests.
-    // todo: add that assumption as a guarantee in the control-plane API.
-    let mut addrs = lookup_host((host, port))
-        .await
-        .map_err(LocalProxyConnError::Io)?;
-
+    let addrs = match host_addr {
+        Some(addr) => vec![SocketAddr::new(addr, port)],
+        None => lookup_host((host, port))
+            .await
+            .map_err(LocalProxyConnError::Io)?
+            .collect(),
+    };
     let mut last_err = None;
 
+    let mut addrs = addrs.into_iter();
     let stream = loop {
         let Some(addr) = addrs.next() else {
             return Err(last_err.unwrap_or_else(|| {
@@ -651,6 +686,20 @@ async fn connect_http2(
         }
     };
 
+    let stream = if let Some(tls) = tls {
+        let host = DnsName::try_from(host)
+            .map_err(io::Error::other)
+            .map_err(LocalProxyConnError::Io)?
+            .to_owned();
+        let stream = TlsConnector::from(tls.clone())
+            .connect(ServerName::DnsName(host), stream)
+            .await
+            .map_err(LocalProxyConnError::Io)?;
+        Box::pin(stream) as AsyncRW
+    } else {
+        Box::pin(stream) as AsyncRW
+    };
+
     let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new())
         .timer(TokioTimer::new())
         .keep_alive_interval(Duration::from_secs(20))
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 6a9089fc2a..516d474a11 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -221,6 +221,7 @@ mod tests {
                 endpoint_id: (&EndpointId::from("endpoint")).into(),
                 project_id: (&ProjectId::from("project")).into(),
                 branch_id: (&BranchId::from("branch")).into(),
+                compute_id: "compute".into(),
                 cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm,
             },
             conn_id: uuid::Uuid::new_v4(),
diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs
index 338a79b4b3..bca2d4c165 100644
--- a/proxy/src/serverless/http_conn_pool.rs
+++ b/proxy/src/serverless/http_conn_pool.rs
@@ -6,9 +6,9 @@ use hyper::client::conn::http2;
 use hyper_util::rt::{TokioExecutor, TokioIo};
 use parking_lot::RwLock;
 use smol_str::ToSmolStr;
-use tokio::net::TcpStream;
 use tracing::{Instrument, debug, error, info, info_span};
 
+use super::AsyncRW;
 use super::backend::HttpConnError;
 use super::conn_pool_lib::{
     ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, ConnPoolEntry,
@@ -22,8 +22,7 @@ use crate::types::EndpointCacheKey;
 use crate::usage_metrics::{Ids, MetricCounter, TrafficDirection, USAGE_METRICS};
 
 pub(crate) type Send = http2::SendRequest<hyper::body::Incoming>;
-pub(crate) type Connect =
-    http2::Connection<TokioIo<TcpStream>, hyper::body::Incoming, TokioExecutor>;
+pub(crate) type Connect = http2::Connection<TokioIo<AsyncRW>, hyper::body::Incoming, TokioExecutor>;
 
 #[derive(Clone)]
 pub(crate) struct ClientDataHttp();
diff --git a/proxy/src/tls/client_config.rs b/proxy/src/tls/client_config.rs
index a2d695aae1..ce873e678e 100644
--- a/proxy/src/tls/client_config.rs
+++ b/proxy/src/tls/client_config.rs
@@ -1,17 +1,49 @@
+use std::env;
+use std::io::Cursor;
+use std::path::PathBuf;
 use std::sync::Arc;
 
-use anyhow::bail;
+use anyhow::{Context, bail};
 use rustls::crypto::ring;
 
-pub(crate) fn load_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
+/// We use an internal certificate authority when establishing a TLS connection with compute.
+fn load_internal_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> {
+    let Some(ca_file) = env::var_os("NEON_INTERNAL_CA_FILE") else {
+        return Ok(());
+    };
+    let ca_file = PathBuf::from(ca_file);
+
+    let ca = std::fs::read(&ca_file)
+        .with_context(|| format!("could not read CA from {}", ca_file.display()))?;
+
+    for cert in rustls_pemfile::certs(&mut Cursor::new(&*ca)) {
+        store
+            .add(cert.context("could not parse internal CA certificate")?)
+            .context("could not parse internal CA certificate")?;
+    }
+
+    Ok(())
+}
+
+/// For console redirect proxy, we need to establish a connection to compute via pg-sni-router.
+/// pg-sni-router needs TLS and uses a Let's Encrypt signed certificate, so we
+/// load certificates from our native store.
+fn load_native_certs(store: &mut rustls::RootCertStore) -> anyhow::Result<()> {
     let der_certs = rustls_native_certs::load_native_certs();
 
     if !der_certs.errors.is_empty() {
         bail!("could not parse certificates: {:?}", der_certs.errors);
     }
 
-    let mut store = rustls::RootCertStore::empty();
     store.add_parsable_certificates(der_certs.certs);
+
+    Ok(())
+}
+
+fn load_compute_certs() -> anyhow::Result<Arc<rustls::RootCertStore>> {
+    let mut store = rustls::RootCertStore::empty();
+    load_native_certs(&mut store)?;
+    load_internal_certs(&mut store)?;
     Ok(Arc::new(store))
 }
 
@@ -22,7 +54,7 @@ pub fn compute_client_config_with_root_certs() -> anyhow::Result<rustls::ClientC
         rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider()))
             .with_safe_default_protocol_versions()
             .expect("ring should support the default protocol versions")
-            .with_root_certificates(load_certs()?)
+            .with_root_certificates(load_compute_certs()?)
             .with_no_client_auth(),
     )
 }
diff --git a/pyproject.toml b/pyproject.toml
index c6e5073bcd..e7f5c62bd0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -13,7 +13,7 @@ requests = "^2.32.3"
 pytest-xdist = "^3.3.1"
 asyncpg = "^0.30.0"
 aiopg = "^1.4.0"
-Jinja2 = "^3.1.5"
+Jinja2 = "^3.1.6"
 types-requests = "^2.31.0.0"
 types-psycopg2 = "^2.9.21.20241019"
 boto3 = "^1.34.11"
diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs
index e196f91d3c..c71f23a010 100644
--- a/safekeeper/src/send_interpreted_wal.rs
+++ b/safekeeper/src/send_interpreted_wal.rs
@@ -184,6 +184,16 @@ impl InterpretedWalReaderState {
                         to: *current_position,
                     }
                 } else {
+                    // Edge case: The new shard is at the same current position as
+                    // the reader. Note that the current position is WAL record aligned,
+                    // so the reader might have done some partial reads and updated the
+                    // batch start. If that's the case, adjust the batch start to match
+                    // starting position of the new shard. It can lead to some shards
+                    // seeing overlaps, but in that case the actual record LSNs are checked
+                    // which should be fine based on the filtering logic.
+                    if let Some(start) = current_batch_wal_start {
+                        *start = std::cmp::min(*start, new_shard_start_pos);
+                    }
                     CurrentPositionUpdate::NotReset(*current_position)
                 }
             }
@@ -287,7 +297,13 @@ impl InterpretedWalReader {
                 reader
                     .run_impl(start_pos)
                     .await
-                    .inspect_err(|err| critical!("failed to read WAL record: {err:?}"))
+                    .inspect_err(|err| match err {
+                        // TODO: we may want to differentiate these errors further.
+                        InterpretedWalReaderError::Decode(_) => {
+                            critical!("failed to decode WAL record: {err:?}");
+                        }
+                        err => error!("failed to read WAL record: {err}"),
+                    })
             }
             .instrument(info_span!("interpreted wal reader")),
         );
@@ -347,10 +363,12 @@ impl InterpretedWalReader {
             metric.dec();
         }
 
-        if let Err(err) = self.run_impl(start_pos).await {
-            critical!("failed to read WAL record: {err:?}");
-        } else {
-            info!("interpreted wal reader exiting");
+        match self.run_impl(start_pos).await {
+            Err(err @ InterpretedWalReaderError::Decode(_)) => {
+                critical!("failed to decode WAL record: {err:?}");
+            }
+            Err(err) => error!("failed to read WAL record: {err}"),
+            Ok(()) => info!("interpreted wal reader exiting"),
         }
 
         Err(CopyStreamHandlerEnd::Other(anyhow!(
@@ -412,7 +430,10 @@ impl InterpretedWalReader {
                         .with_context(|| "Failed to interpret WAL")?;
 
                         for (shard, record) in interpreted {
-                            if record.is_empty() {
+                            // Shard zero needs to track the start LSN of the latest record
+                            // in addition to the LSN of the next record to ingest. The former
+                            // is included in basebackup persisted by the compute in WAL.
+                            if !shard.is_shard_zero() && record.is_empty() {
                                 continue;
                             }
 
@@ -722,7 +743,7 @@ mod tests {
             .unwrap();
 
         let resident_tli = tli.wal_residence_guard().await.unwrap();
-        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
+        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None)
             .await
             .unwrap();
         let end_pos = end_watch.get();
@@ -865,10 +886,16 @@ mod tests {
 
         let resident_tli = tli.wal_residence_guard().await.unwrap();
         let mut next_record_lsns = Vec::default();
-        let end_watch =
-            Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
-                .await
-                .unwrap();
+        let end_watch = Env::write_wal(
+            tli,
+            start_lsn,
+            SIZE,
+            MSG_COUNT,
+            c"neon-file:",
+            Some(&mut next_record_lsns),
+        )
+        .await
+        .unwrap();
         let end_pos = end_watch.get();
 
         let streaming_wal_reader = StreamingWalReader::new(
@@ -1009,10 +1036,16 @@ mod tests {
             .unwrap();
 
         let resident_tli = tli.wal_residence_guard().await.unwrap();
-        let end_watch =
-            Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, Some(&mut next_record_lsns))
-                .await
-                .unwrap();
+        let end_watch = Env::write_wal(
+            tli,
+            start_lsn,
+            SIZE,
+            MSG_COUNT,
+            c"neon-file:",
+            Some(&mut next_record_lsns),
+        )
+        .await
+        .unwrap();
 
         assert!(next_record_lsns.len() > 3);
         let shard_0_start_lsn = next_record_lsns[3];
@@ -1106,4 +1139,88 @@ mod tests {
             }
         }
     }
+
+    #[tokio::test]
+    async fn test_shard_zero_does_not_skip_empty_records() {
+        let _ = env_logger::builder().is_test(true).try_init();
+
+        const SIZE: usize = 8 * 1024;
+        const MSG_COUNT: usize = 10;
+        const PG_VERSION: u32 = 17;
+
+        let start_lsn = Lsn::from_str("0/149FD18").unwrap();
+        let env = Env::new(true).unwrap();
+        let tli = env
+            .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn)
+            .await
+            .unwrap();
+
+        let resident_tli = tli.wal_residence_guard().await.unwrap();
+        let mut next_record_lsns = Vec::new();
+        let end_watch = Env::write_wal(
+            tli,
+            start_lsn,
+            SIZE,
+            MSG_COUNT,
+            // This is a logical message prefix that is not persisted to key value storage.
+            // We use it in order to validate that shard zero receives empty interpreted records.
+            c"test:",
+            Some(&mut next_record_lsns),
+        )
+        .await
+        .unwrap();
+        let end_pos = end_watch.get();
+
+        let streaming_wal_reader = StreamingWalReader::new(
+            resident_tli,
+            None,
+            start_lsn,
+            end_pos,
+            end_watch,
+            MAX_SEND_SIZE,
+        );
+
+        let shard = ShardIdentity::unsharded();
+        let (records_tx, mut records_rx) = tokio::sync::mpsc::channel::<Batch>(MSG_COUNT * 2);
+
+        let handle = InterpretedWalReader::spawn(
+            streaming_wal_reader,
+            start_lsn,
+            records_tx,
+            shard,
+            PG_VERSION,
+            &Some("pageserver".to_string()),
+        );
+
+        let mut interpreted_records = Vec::new();
+        while let Some(batch) = records_rx.recv().await {
+            interpreted_records.push(batch.records);
+            if batch.wal_end_lsn == batch.available_wal_end_lsn {
+                break;
+            }
+        }
+
+        let received_next_record_lsns = interpreted_records
+            .into_iter()
+            .flat_map(|b| b.records)
+            .map(|rec| rec.next_record_lsn)
+            .collect::<Vec<_>>();
+
+        // By default this also includes the start LSN. Trim it since it shouldn't be received.
+        let next_record_lsns = next_record_lsns.into_iter().skip(1).collect::<Vec<_>>();
+
+        assert_eq!(received_next_record_lsns, next_record_lsns);
+
+        handle.abort();
+        let mut done = false;
+        for _ in 0..5 {
+            if handle.current_position().is_none() {
+                done = true;
+                break;
+            }
+            tokio::time::sleep(Duration::from_millis(1)).await;
+        }
+
+        assert!(done);
+    }
 }
diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs
index e6f74185c1..618e2b59d2 100644
--- a/safekeeper/src/test_utils.rs
+++ b/safekeeper/src/test_utils.rs
@@ -1,3 +1,4 @@
+use std::ffi::CStr;
 use std::sync::Arc;
 
 use camino_tempfile::Utf8TempDir;
@@ -124,6 +125,7 @@ impl Env {
         start_lsn: Lsn,
         msg_size: usize,
         msg_count: usize,
+        prefix: &CStr,
         mut next_record_lsns: Option<&mut Vec<Lsn>>,
     ) -> anyhow::Result<EndWatch> {
         let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE);
@@ -133,7 +135,6 @@ impl Env {
 
         WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0));
 
-        let prefix = c"neon-file:";
         let prefixlen = prefix.to_bytes_with_nul().len();
         assert!(msg_size >= prefixlen);
         let message = vec![0; msg_size - prefixlen];
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 930f66a207..d3c841ec09 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -415,6 +415,9 @@ impl From<TimelineError> for ApiError {
     }
 }
 
+/// Remote deletion runs in a background task; this receiver is how it reports its result back.
+type RemoteDeletionReceiver = tokio::sync::watch::Receiver<Option<anyhow::Result<()>>>;
+
 /// Timeline struct manages lifecycle (creation, deletion, restore) of a safekeeper timeline.
 /// It also holds SharedState and provides mutually exclusive access to it.
 pub struct Timeline {
@@ -446,6 +449,8 @@ pub struct Timeline {
     manager_ctl: ManagerCtl,
     conf: Arc<SafeKeeperConf>,
 
+    remote_deletion: std::sync::Mutex<Option<RemoteDeletionReceiver>>,
+
     /// Hold this gate from code that depends on the Timeline's non-shut-down state.  While holding
     /// this gate, you must respect [`Timeline::cancel`]
     pub(crate) gate: Gate,
@@ -494,6 +499,7 @@ impl Timeline {
             walreceivers,
             gate: Default::default(),
             cancel: CancellationToken::default(),
+            remote_deletion: std::sync::Mutex::new(None),
             manager_ctl: ManagerCtl::new(),
             conf,
             broker_active: AtomicBool::new(false),
@@ -598,15 +604,95 @@ impl Timeline {
         shared_state.sk.close_wal_store();
 
         if !only_local && self.conf.is_wal_backup_enabled() {
-            // Note: we concurrently delete remote storage data from multiple
-            // safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
-            // do some retries anyway.
-            wal_backup::delete_timeline(&self.ttid).await?;
+            self.remote_delete().await?;
         }
         let dir_existed = delete_dir(&self.timeline_dir).await?;
         Ok(dir_existed)
     }
 
+    /// Delete timeline content from remote storage.  If the returned future is dropped,
+    /// deletion will continue in the background.
+    ///
+    /// This function ordinarily spawns a task and stashes a result receiver into [`Self::remote_deletion`].  If
+    /// deletion is already happening, it may simply wait for an existing task's result.
+    ///
+    /// Note: we concurrently delete remote storage data from multiple
+    /// safekeepers. That's ok, s3 replies 200 if object doesn't exist and we
+    /// do some retries anyway.
+    async fn remote_delete(&self) -> Result<()> {
+        // We will start a background task to do the deletion, so that it proceeds even if our
+        // API request is dropped.  Future requests will see the existing deletion task and wait
+        // for it to complete.
+        let mut result_rx = {
+            let mut remote_deletion_state = self.remote_deletion.lock().unwrap();
+            let result_rx = if let Some(result_rx) = remote_deletion_state.as_ref() {
+                if let Some(result) = result_rx.borrow().as_ref() {
+                    if let Err(e) = result {
+                        // A previous remote deletion failed: we will start a new one
+                        tracing::error!("remote deletion failed, will retry ({e})");
+                        None
+                    } else {
+                        // A previous remote deletion call already succeeded
+                        return Ok(());
+                    }
+                } else {
+                    // Remote deletion is still in flight
+                    Some(result_rx.clone())
+                }
+            } else {
+                // Remote deletion was not attempted yet, start it now.
+                None
+            };
+
+            match result_rx {
+                Some(result_rx) => result_rx,
+                None => self.start_remote_delete(&mut remote_deletion_state),
+            }
+        };
+
+        // Wait for a result
+        let Ok(result) = result_rx.wait_for(|v| v.is_some()).await else {
+            // Unexpected: sender should always send a result before dropping the channel, even if it has an error
+            return Err(anyhow::anyhow!(
+                "remote deletion task future was dropped without sending a result"
+            ));
+        };
+
+        result
+            .as_ref()
+            .expect("We did a wait_for on this being Some above")
+            .as_ref()
+            .map(|_| ())
+            .map_err(|e| anyhow::anyhow!("remote deletion failed: {e}"))
+    }
+
+    /// Spawn background task to do remote deletion, return a receiver for its outcome
+    fn start_remote_delete(
+        &self,
+        guard: &mut std::sync::MutexGuard<Option<RemoteDeletionReceiver>>,
+    ) -> RemoteDeletionReceiver {
+        tracing::info!("starting remote deletion");
+        let (result_tx, result_rx) = tokio::sync::watch::channel(None);
+        let ttid = self.ttid;
+        tokio::task::spawn(
+            async move {
+                let r = wal_backup::delete_timeline(&ttid).await;
+                if let Err(e) = &r {
+                    // Log error here in case nobody ever listens for our result (e.g. dropped API request)
+                    tracing::error!("remote deletion failed: {e}");
+                }
+
+                // Ignore send results: it's legal for the Timeline to give up waiting for us.
+                let _ = result_tx.send(Some(r));
+            }
+            .instrument(info_span!("remote_delete", timeline = %self.ttid)),
+        );
+
+        **guard = Some(result_rx.clone());
+
+        result_rx
+    }
+
     /// Returns if timeline is cancelled.
     pub fn is_cancelled(&self) -> bool {
         self.cancel.is_cancelled()
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 6176e64698..56f4a2faf9 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -21,9 +21,9 @@ use tokio::sync::{OnceCell, watch};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
-use utils::backoff;
 use utils::id::{NodeId, TenantTimelineId};
 use utils::lsn::Lsn;
+use utils::{backoff, pausable_failpoint};
 
 use crate::metrics::{BACKED_UP_SEGMENTS, BACKUP_ERRORS, WAL_BACKUP_TASKS};
 use crate::timeline::WalResidentTimeline;
@@ -564,6 +564,12 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
     // We don't currently have http requests timeout cancellation, but if/once
     // we have listing should get streaming interface to make progress.
 
+    pausable_failpoint!("sk-delete-timeline-remote-pause");
+
+    fail::fail_point!("sk-delete-timeline-remote", |_| {
+        Err(anyhow::anyhow!("failpoint: sk-delete-timeline-remote"))
+    });
+
     let cancel = CancellationToken::new(); // not really used
     backoff::retry(
         || async {
diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs
index cc9d4e6e3b..aab82fedb5 100644
--- a/safekeeper/src/wal_reader_stream.rs
+++ b/safekeeper/src/wal_reader_stream.rs
@@ -246,7 +246,7 @@ mod tests {
             .unwrap();
 
         let resident_tli = tli.wal_residence_guard().await.unwrap();
-        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, None)
+        let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT, c"neon-file:", None)
             .await
             .unwrap();
         let end_pos = end_watch.get();
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml
index b63ba154da..6b657b5ea0 100644
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -21,6 +21,7 @@ clap.workspace = true
 cron.workspace = true
 fail.workspace = true
 futures.workspace = true
+governor.workspace = true
 hex.workspace = true
 hyper0.workspace = true
 humantime.workspace = true
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 5b5ae80eaf..3e448d7013 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1,5 +1,5 @@
 use std::str::FromStr;
-use std::sync::Arc;
+use std::sync::{Arc, LazyLock};
 use std::time::{Duration, Instant};
 
 use anyhow::Context;
@@ -33,6 +33,7 @@ use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
 use pageserver_client::{BlockUnblock, mgmt_api};
 use routerify::Middleware;
 use tokio_util::sync::CancellationToken;
+use tracing::warn;
 use utils::auth::{Scope, SwappableJwtAuth};
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -49,6 +50,7 @@ use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIME
 pub struct HttpState {
     service: Arc<crate::service::Service>,
     auth: Option<Arc<SwappableJwtAuth>>,
+    rate_limiter: governor::DefaultKeyedRateLimiter<TenantId>,
     neon_metrics: NeonMetrics,
     allowlist_routes: &'static [&'static str],
 }
@@ -59,9 +61,11 @@ impl HttpState {
         auth: Option<Arc<SwappableJwtAuth>>,
         build_info: BuildInfo,
     ) -> Self {
+        let quota = governor::Quota::per_second(service.get_config().tenant_rate_limit);
         Self {
             service,
             auth,
+            rate_limiter: governor::RateLimiter::keyed(quota),
             neon_metrics: NeonMetrics::new(build_info),
             allowlist_routes: &[
                 "/status",
@@ -82,6 +86,40 @@ fn get_state(request: &Request<Body>) -> &HttpState {
         .as_ref()
 }
 
+/// Rate limits tenant requests.
+///
+/// TODO: this should be a request middleware, but requires us to extract the tenant ID from
+/// different URLs in a systematic way.
+///
+/// TODO: consider returning a 429 response if these start piling up.
+async fn maybe_rate_limit(request: &Request<Body>, tenant_id: TenantId) {
+    // Check if the tenant should be rate-limited.
+    let rate_limiter = &get_state(request).rate_limiter;
+    if rate_limiter.check_key(&tenant_id).is_ok() {
+        return;
+    }
+
+    // Measure the rate limiting delay.
+    let _timer = METRICS_REGISTRY
+        .metrics_group
+        .storage_controller_http_request_rate_limited
+        .start_timer();
+
+    // Log each rate-limited tenant at most once every 10 seconds.
+    static LOG_RATE_LIMITER: LazyLock<governor::DefaultKeyedRateLimiter<TenantId>> =
+        LazyLock::new(|| {
+            let quota = governor::Quota::with_period(Duration::from_secs(10)).unwrap();
+            governor::RateLimiter::keyed(quota)
+        });
+
+    if LOG_RATE_LIMITER.check_key(&tenant_id).is_ok() {
+        warn!("tenant {tenant_id} is rate limited")
+    }
+
+    // Wait for quota.
+    rate_limiter.until_key_ready(&tenant_id).await;
+}
+
 /// Pageserver calls into this on startup, to learn which tenants it should attach
 async fn handle_re_attach(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::GenerationsApi)?;
@@ -247,6 +285,7 @@ async fn handle_tenant_config_get(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -264,6 +303,7 @@ async fn handle_tenant_time_travel_remote_storage(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -311,6 +351,7 @@ async fn handle_tenant_secondary_download(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
+    maybe_rate_limit(&req, tenant_id).await;
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -329,6 +370,7 @@ async fn handle_tenant_delete(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -356,6 +398,7 @@ async fn handle_tenant_timeline_create(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -381,6 +424,7 @@ async fn handle_tenant_timeline_delete(
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
 
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -457,6 +501,7 @@ async fn handle_tenant_timeline_archival_config(
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
 
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -482,6 +527,7 @@ async fn handle_tenant_timeline_detach_ancestor(
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
 
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -504,6 +550,7 @@ async fn handle_tenant_timeline_block_unblock_gc(
 ) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
 
@@ -521,12 +568,14 @@ async fn handle_tenant_timeline_download_heatmap_layers(
     let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
 
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_shard_id.tenant_id).await;
 
     let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
     let concurrency: Option<usize> = parse_query_param(&req, "concurrency")?;
+    let recurse = parse_query_param(&req, "recurse")?.unwrap_or(false);
 
     service
-        .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency)
+        .tenant_timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency, recurse)
         .await?;
 
     json_response(StatusCode::OK, ())
@@ -547,8 +596,9 @@ async fn handle_tenant_timeline_passthrough(
     service: Arc<Service>,
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let tenant_or_shard_id: TenantShardId = parse_request_param(&req, "tenant_id")?;
     check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_or_shard_id.tenant_id).await;
 
     let req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -562,15 +612,28 @@ async fn handle_tenant_timeline_passthrough(
         return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
     };
 
-    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
+    tracing::info!(
+        "Proxying request for tenant {} ({})",
+        tenant_or_shard_id.tenant_id,
+        path
+    );
 
     // Find the node that holds shard zero
-    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?;
+    let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() {
+        service
+            .tenant_shard0_node(tenant_or_shard_id.tenant_id)
+            .await?
+    } else {
+        (
+            service.tenant_shard_node(tenant_or_shard_id).await?,
+            tenant_or_shard_id,
+        )
+    };
 
     // Callers will always pass an unsharded tenant ID.  Before proxying, we must
     // rewrite this to a shard-aware shard zero ID.
     let path = format!("{}", path);
-    let tenant_str = tenant_id.to_string();
+    let tenant_str = tenant_or_shard_id.tenant_id.to_string();
     let tenant_shard_str = format!("{}", tenant_shard_id);
     let path = path.replace(&tenant_str, &tenant_shard_str);
 
@@ -610,7 +673,7 @@ async fn handle_tenant_timeline_passthrough(
     // Transform 404 into 503 if we raced with a migration
     if resp.status() == reqwest::StatusCode::NOT_FOUND {
         // Look up node again: if we migrated it will be different
-        let (new_node, _tenant_shard_id) = service.tenant_shard0_node(tenant_id).await?;
+        let new_node = service.tenant_shard_node(tenant_shard_id).await?;
         if new_node.get_id() != node.get_id() {
             // Rather than retry here, send the client a 503 to prompt a retry: this matches
             // the pageserver's use of 503, and all clients calling this API should retry on 503.
@@ -640,6 +703,7 @@ async fn handle_tenant_locate(
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
 
     check_permissions(&req, Scope::Admin)?;
+    // NB: don't rate limit: admin operation.
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -655,9 +719,9 @@ async fn handle_tenant_describe(
     service: Arc<Service>,
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Scrubber)?;
-
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::Scrubber)?;
+    // NB: don't rate limit: scrubber operation.
 
     match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -992,6 +1056,7 @@ async fn handle_tenant_shard_split(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
+    // NB: don't rate limit: admin operation.
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -1014,6 +1079,7 @@ async fn handle_tenant_shard_migrate(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
+    // NB: don't rate limit: admin operation.
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -1037,6 +1103,7 @@ async fn handle_tenant_shard_migrate_secondary(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
+    // NB: don't rate limit: admin operation.
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -1060,6 +1127,7 @@ async fn handle_tenant_shard_cancel_reconcile(
     req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
+    // NB: don't rate limit: admin operation.
 
     let req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -1079,6 +1147,7 @@ async fn handle_tenant_shard_cancel_reconcile(
 
 async fn handle_tenant_update_policy(req: Request<Body>) -> Result<Response<Body>, ApiError> {
     check_permissions(&req, Scope::Admin)?;
+    // NB: don't rate limit: admin operation.
 
     let mut req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -1134,9 +1203,9 @@ async fn handle_step_down(req: Request<Body>) -> Result<Response<Body>, ApiError
 }
 
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::PageServerApi)?;
-
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     let req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
@@ -1151,9 +1220,9 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
 }
 
 async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::PageServerApi)?;
-
     let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
 
     let req = match maybe_forward(req).await {
         ForwardOutcome::Forwarded(res) => {
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 380ffeb9b7..6ef17c0007 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -1,3 +1,4 @@
+use std::num::NonZeroU32;
 use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;
@@ -98,6 +99,10 @@ struct Cli {
     #[arg(long)]
     priority_reconciler_concurrency: Option<usize>,
 
+    /// Tenant API rate limit, as requests per second per tenant.
+    #[arg(long, default_value = "10")]
+    tenant_rate_limit: NonZeroU32,
+
     /// How long to wait for the initial database connection to be available.
     #[arg(long, default_value = "5s")]
     db_connect_timeout: humantime::Duration,
@@ -339,6 +344,7 @@ async fn async_main() -> anyhow::Result<()> {
         priority_reconciler_concurrency: args
             .priority_reconciler_concurrency
             .unwrap_or(PRIORITY_RECONCILER_CONCURRENCY_DEFAULT),
+        tenant_rate_limit: args.tenant_rate_limit,
         split_threshold: args.split_threshold,
         neon_local_repo_dir: args.neon_local_repo_dir,
         max_secondary_lag_bytes: args.max_secondary_lag_bytes,
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index f490edb68f..ea390df726 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -76,6 +76,10 @@ pub(crate) struct StorageControllerMetricGroup {
     pub(crate) storage_controller_http_request_latency:
         measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,
 
+    /// HTTP rate limiting latency across all tenants and endpoints
+    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 10.0))]
+    pub(crate) storage_controller_http_request_rate_limited: measured::Histogram<10>,
+
     /// Count of HTTP requests to the pageserver that resulted in an error,
     /// broken down by the pageserver node id, request name and method
     pub(crate) storage_controller_pageserver_request_error:
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index e9c54414a3..d6127c355a 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -281,13 +281,19 @@ impl PageserverClient {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         concurrency: Option<usize>,
+        recurse: bool,
     ) -> Result<()> {
         measured_request!(
             "download_heatmap_layers",
             crate::metrics::Method::Post,
             &self.node_id_label,
             self.inner
-                .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency)
+                .timeline_download_heatmap_layers(
+                    tenant_shard_id,
+                    timeline_id,
+                    concurrency,
+                    recurse
+                )
                 .await
         )
     }
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d53b3d6598..8fc7f7a0c5 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5,6 +5,7 @@ use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::collections::{BTreeMap, HashMap, HashSet};
 use std::error::Error;
+use std::num::NonZeroU32;
 use std::ops::Deref;
 use std::path::PathBuf;
 use std::str::FromStr;
@@ -365,6 +366,10 @@ pub struct Config {
     /// How many high-priority Reconcilers may be spawned concurrently
     pub priority_reconciler_concurrency: usize,
 
+    /// How many API requests per second to allow per tenant, across all
+    /// tenant-scoped API endpoints. Further API requests queue until ready.
+    pub tenant_rate_limit: NonZeroU32,
+
     /// How large must a shard grow in bytes before we split it?
     /// None disables auto-splitting.
     pub split_threshold: Option<u64>,
@@ -3781,6 +3786,7 @@ impl Service {
         tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         concurrency: Option<usize>,
+        recurse: bool,
     ) -> Result<(), ApiError> {
         let _tenant_lock = trace_shared_lock(
             &self.tenant_op_locks,
@@ -3818,7 +3824,12 @@ impl Service {
             targets,
             |tenant_shard_id, client| async move {
                 client
-                    .timeline_download_heatmap_layers(tenant_shard_id, timeline_id, concurrency)
+                    .timeline_download_heatmap_layers(
+                        tenant_shard_id,
+                        timeline_id,
+                        concurrency,
+                        recurse,
+                    )
                     .await
             },
             1,
@@ -4165,16 +4176,14 @@ impl Service {
         }).await?
     }
 
-    /// When you need to send an HTTP request to the pageserver that holds shard0 of a tenant, this
-    /// function looks up and returns node. If the tenant isn't found, returns Err(ApiError::NotFound)
+    /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0.
     pub(crate) async fn tenant_shard0_node(
         &self,
         tenant_id: TenantId,
     ) -> Result<(Node, TenantShardId), ApiError> {
-        // Look up in-memory state and maybe use the node from there.
-        {
+        let tenant_shard_id = {
             let locked = self.inner.read().unwrap();
-            let Some((tenant_shard_id, shard)) = locked
+            let Some((tenant_shard_id, _shard)) = locked
                 .tenants
                 .range(TenantShardId::tenant_range(tenant_id))
                 .next()
@@ -4184,6 +4193,29 @@ impl Service {
                 ));
             };
 
+            *tenant_shard_id
+        };
+
+        self.tenant_shard_node(tenant_shard_id)
+            .await
+            .map(|node| (node, tenant_shard_id))
+    }
+
+    /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this
+    /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound)
+    pub(crate) async fn tenant_shard_node(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<Node, ApiError> {
+        // Look up in-memory state and maybe use the node from there.
+        {
+            let locked = self.inner.read().unwrap();
+            let Some(shard) = locked.tenants.get(&tenant_shard_id) else {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant shard {tenant_shard_id} not found").into(),
+                ));
+            };
+
             let Some(intent_node_id) = shard.intent.get_attached() else {
                 tracing::warn!(
                     tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
@@ -4204,7 +4236,7 @@ impl Service {
                         "Shard refers to nonexistent node"
                     )));
                 };
-                return Ok((node.clone(), *tenant_shard_id));
+                return Ok(node.clone());
             }
         };
 
@@ -4212,29 +4244,34 @@ impl Service {
         // generation state: this will reflect the progress of any ongoing migration.
         // Note that it is not guaranteed to _stay_ here, our caller must still handle
         // the case where they call through to the pageserver and get a 404.
-        let db_result = self.persistence.tenant_generations(tenant_id).await?;
+        let db_result = self
+            .persistence
+            .tenant_generations(tenant_shard_id.tenant_id)
+            .await?;
         let Some(ShardGenerationState {
-            tenant_shard_id,
+            tenant_shard_id: _,
             generation: _,
             generation_pageserver: Some(node_id),
-        }) = db_result.first()
+        }) = db_result
+            .into_iter()
+            .find(|s| s.tenant_shard_id == tenant_shard_id)
         else {
             // This can happen if we raced with a tenant deletion or a shard split.  On a retry
             // the caller will either succeed (shard split case), get a proper 404 (deletion case),
             // or a conflict response (case where tenant was detached in background)
             return Err(ApiError::ResourceUnavailable(
-                "Shard {} not found in database, or is not attached".into(),
+                format!("Shard {tenant_shard_id} not found in database, or is not attached").into(),
             ));
         };
         let locked = self.inner.read().unwrap();
-        let Some(node) = locked.nodes.get(node_id) else {
+        let Some(node) = locked.nodes.get(&node_id) else {
             // This should never happen
             return Err(ApiError::InternalServerError(anyhow::anyhow!(
                 "Shard refers to nonexistent node"
             )));
         };
 
-        Ok((node.clone(), *tenant_shard_id))
+        Ok(node.clone())
     }
 
     pub(crate) fn tenant_locate(
diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs
index 2ff68d7037..a0419e0205 100644
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -46,48 +46,51 @@ impl ChaosInjector {
         }
     }
 
+    /// Builds the sleep future derived from the configured `chaos_exit_crontab`.
+    /// Returns `None` when no crontab is configured, or when processing the cron
+    /// schedule fails (the error is logged).
+    fn get_cron_interval_sleep_future(&self) -> Option<tokio::time::Sleep> {
+        let chaos_exit_crontab = self.chaos_exit_crontab.as_ref()?;
+        match cron_to_next_duration(chaos_exit_crontab) {
+            Ok(interval_exit) => Some(interval_exit),
+            Err(e) => {
+                tracing::error!("Error processing the cron schedule: {e}");
+                None
+            }
+        }
+    }
+
     pub async fn run(&mut self, cancel: CancellationToken) {
         let mut interval = tokio::time::interval(self.interval);
-        let cron_interval = {
-            if let Some(ref chaos_exit_crontab) = self.chaos_exit_crontab {
-                match cron_to_next_duration(chaos_exit_crontab) {
-                    Ok(interval_exit) => Some(interval_exit),
-                    Err(e) => {
-                        tracing::error!("Error processing the cron schedule: {e}");
-                        None
-                    }
-                }
-            } else {
-                None
-            }
-        };
+        #[derive(Debug)]
         enum ChaosEvent {
             ShuffleTenant,
             ForceKill,
         }
-        let chaos_type = tokio::select! {
-            _ = interval.tick() => {
-                ChaosEvent::ShuffleTenant
-            }
-            Some(_) = maybe_sleep(cron_interval) => {
-                ChaosEvent::ForceKill
-            }
-            _ = cancel.cancelled() => {
-                tracing::info!("Shutting down");
-                return;
-            }
-        };
-
-        match chaos_type {
-            ChaosEvent::ShuffleTenant => {
-                self.inject_chaos().await;
-            }
-            ChaosEvent::ForceKill => {
-                self.force_kill().await;
+        loop {
+            let cron_interval = self.get_cron_interval_sleep_future();
+            let chaos_type = tokio::select! {
+                _ = interval.tick() => {
+                    ChaosEvent::ShuffleTenant
+                }
+                Some(_) = maybe_sleep(cron_interval) => {
+                    ChaosEvent::ForceKill
+                }
+                _ = cancel.cancelled() => {
+                    tracing::info!("Shutting down");
+                    return;
+                }
+            };
+            tracing::info!("Chaos iteration: {chaos_type:?}...");
+            match chaos_type {
+                ChaosEvent::ShuffleTenant => {
+                    self.inject_chaos().await;
+                }
+                ChaosEvent::ForceKill => {
+                    self.force_kill().await;
+                }
             }
         }
-
-        tracing::info!("Chaos iteration...");
     }
 
     /// If a shard has a secondary and attached location, then re-assign the secondary to be
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 97a5a36814..6e53987e7c 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -525,12 +525,14 @@ class NeonLocalCli(AbstractNeonCli):
     def endpoint_start(
         self,
         endpoint_id: str,
+        safekeepers_generation: int | None = None,
         safekeepers: list[int] | None = None,
         remote_ext_config: str | None = None,
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
         create_test_user: bool = False,
         basebackup_request_tries: int | None = None,
+        timeout: str | None = None,
         env: dict[str, str] | None = None,
     ) -> subprocess.CompletedProcess[str]:
         args = [
@@ -543,6 +545,8 @@ class NeonLocalCli(AbstractNeonCli):
         if remote_ext_config is not None:
             args.extend(["--remote-ext-config", remote_ext_config])
 
+        if safekeepers_generation is not None:
+            args.extend(["--safekeepers-generation", str(safekeepers_generation)])
         if safekeepers is not None:
             args.extend(["--safekeepers", (",".join(map(str, safekeepers)))])
         if endpoint_id is not None:
@@ -553,6 +557,8 @@ class NeonLocalCli(AbstractNeonCli):
             args.extend(["--allow-multiple"])
         if create_test_user:
             args.extend(["--create-test-user"])
+        if timeout is not None:
+            args.extend(["--start-timeout", str(timeout)])
 
         res = self.raw_cli(args, extra_env_vars)
         res.check_returncode()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index cd197d8e77..0065a8a3fa 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -253,10 +253,15 @@ class PgProtocol:
         # enough for our tests, but if you need a longer, you can
         # change it by calling "SET statement_timeout" after
         # connecting.
+        # pooler does not support statement_timeout
+        # Check if the hostname contains the string 'pooler'
+        hostname = result.get("host", "")
+        log.info(f"Hostname: {hostname}")
         options = result.get("options", "")
-        if "statement_timeout" not in options:
+        if "statement_timeout" not in options and "pooler" not in hostname:
             options = f"-cstatement_timeout=120s {options}"
         result["options"] = options
+
         return result
 
     # autocommit=True here by default because that's what we need most of the time
@@ -1176,15 +1181,6 @@ class NeonEnv:
                 "max_batch_size": 32,
             }
 
-            if config.test_may_use_compatibility_snapshot_binaries:
-                log.info(
-                    "Skipping prev heatmap settings to avoid forward-compatibility related test failures"
-                )
-            else:
-                # Look for gaps in WAL received from safekeepeers
-                ps_cfg["load_previous_heatmap"] = True
-                ps_cfg["generate_unarchival_heatmap"] = True
-
             get_vectored_concurrent_io = self.pageserver_get_vectored_concurrent_io
             if get_vectored_concurrent_io is not None:
                 ps_cfg["get_vectored_concurrent_io"] = {
@@ -1199,6 +1195,9 @@ class NeonEnv:
                     config.pageserver_default_tenant_config_compaction_algorithm
                 )
 
+            tenant_config = ps_cfg.setdefault("tenant_config", {})
+            tenant_config["rel_size_v2_enabled"] = True  # Enable relsize_v2 by default in tests
+
             if self.pageserver_remote_storage is not None:
                 ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
                     self.pageserver_remote_storage
@@ -2479,12 +2478,21 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         return [TenantShardId.parse(tid) for tid in response.json()["updated"]]
 
-    def download_heatmap_layers(self, tenant_shard_id: TenantShardId, timeline_id: TimelineId):
+    def download_heatmap_layers(
+        self, tenant_shard_id: TenantShardId, timeline_id: TimelineId, recurse: bool | None = None
+    ):
+        url = (
+            f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers"
+        )
+        if recurse is not None:
+            url = url + f"?recurse={str(recurse).lower()}"
+
         response = self.request(
             "POST",
-            f"{self.api}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
+            url,
             headers=self.headers(TokenScope.ADMIN),
         )
+
         response.raise_for_status()
 
     def __enter__(self) -> Self:
@@ -3602,6 +3610,7 @@ class NeonProxy(PgProtocol):
                             "project_id": "test_project_id",
                             "endpoint_id": "test_endpoint_id",
                             "branch_id": "test_branch_id",
+                            "compute_id": "test_compute_id",
                         },
                     }
                 },
@@ -3827,6 +3836,7 @@ def static_auth_broker(
         {
             "address": local_proxy_addr,
             "aux": {
+                "compute_id": "compute-foo-bar-1234-5678",
                 "endpoint_id": "ep-foo-bar-1234",
                 "branch_id": "br-foo-bar",
                 "project_id": "foo-bar",
@@ -3997,10 +4007,12 @@ class Endpoint(PgProtocol, LogUtils):
         self,
         remote_ext_config: str | None = None,
         pageserver_id: int | None = None,
+        safekeeper_generation: int | None = None,
         safekeepers: list[int] | None = None,
         allow_multiple: bool = False,
         create_test_user: bool = False,
         basebackup_request_tries: int | None = None,
+        timeout: str | None = None,
         env: dict[str, str] | None = None,
     ) -> Self:
         """
@@ -4010,19 +4022,21 @@ class Endpoint(PgProtocol, LogUtils):
 
         assert self.endpoint_id is not None
 
-        # If `safekeepers` is not None, they are remember them as active and use
-        # in the following commands.
+        # If `safekeepers` is not None, remember them as active and use in the
+        # following commands.
         if safekeepers is not None:
             self.active_safekeepers = safekeepers
 
         self.env.neon_cli.endpoint_start(
             self.endpoint_id,
+            safekeepers_generation=safekeeper_generation,
             safekeepers=self.active_safekeepers,
             remote_ext_config=remote_ext_config,
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
             create_test_user=create_test_user,
             basebackup_request_tries=basebackup_request_tries,
+            timeout=timeout,
             env=env,
         )
         self._running.release(1)
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 4fce558840..abddfa2768 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -124,6 +124,8 @@ DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
     # controller's attempts to notify the endpoint).
     ".*reconciler.*neon_local notification hook failed.*",
     ".*reconciler.*neon_local error.*",
+    # Tenant rate limits may fire in tests that submit lots of API requests.
+    ".*tenant \\S+ is rate limited.*",
 ]
 
 
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 364aff325d..0efe0b9575 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -375,6 +375,19 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
         self.verbose_error(res)
 
+    def timeline_patch_index_part(
+        self,
+        tenant_id: TenantId | TenantShardId,
+        timeline_id: TimelineId,
+        data: dict[str, Any],
+    ):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/patch_index_part",
+            json=data,
+        )
+        self.verbose_error(res)
+        return res.json()
+
     def tenant_location_conf(
         self,
         tenant_id: TenantId | TenantShardId,
diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py
index 4df2b2df2b..cac84c07e7 100644
--- a/test_runner/fixtures/remote_storage.py
+++ b/test_runner/fixtures/remote_storage.py
@@ -282,6 +282,17 @@ class S3Storage:
     def timeline_path(self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId) -> str:
         return f"{self.tenant_path(tenant_id)}/timelines/{timeline_id}"
 
+    def safekeeper_tenants_path(self) -> str:
+        return f"{self.prefix_in_bucket}"
+
+    def safekeeper_tenant_path(self, tenant_id: TenantShardId | TenantId) -> str:
+        return f"{self.safekeeper_tenants_path()}/{tenant_id}"
+
+    def safekeeper_timeline_path(
+        self, tenant_id: TenantShardId | TenantId, timeline_id: TimelineId
+    ) -> str:
+        return f"{self.safekeeper_tenant_path(tenant_id)}/{timeline_id}"
+
     def get_latest_generation_key(self, prefix: str, suffix: str, keys: list[str]) -> str:
         """
         Gets the latest generation key from a list of keys.
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 7038d87aba..e409151b76 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -229,13 +229,14 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
 
     # only_local doesn't remove segments in the remote storage.
     def timeline_delete(
-        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False
+        self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False, **kwargs
     ) -> dict[Any, Any]:
         res = self.delete(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
             params={
                 "only_local": str(only_local).lower(),
             },
+            **kwargs,
         )
         res.raise_for_status()
         res_json = res.json()
diff --git a/test_runner/fixtures/safekeeper_utils.py b/test_runner/fixtures/safekeeper_utils.py
new file mode 100644
index 0000000000..158baf7bb6
--- /dev/null
+++ b/test_runner/fixtures/safekeeper_utils.py
@@ -0,0 +1,93 @@
+from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import Endpoint, NeonPageserver, Safekeeper
+from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
+from fixtures.utils import get_dir_size
+
+
+def is_segment_offloaded(
+    sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn
+):
+    """Return whether the backup_lsn reported by `sk` for the timeline is at or past `seg_end`."""
+    http_cli = sk.http_client()
+    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(f"sk status is {tli_status}")
+    return tli_status.backup_lsn >= seg_end
+
+
+def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
+    """Return whether the flush_lsn reported by `sk` for the timeline is at or past `lsn`."""
+    http_cli = sk.http_client()
+    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
+    log.info(f"sk status is {tli_status}")
+    return tli_status.flush_lsn >= lsn
+
+
+def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb):
+    """Return whether the timeline directory on `sk` occupies at most `target_size_mb` MiB."""
+    http_cli = sk.http_client()
+    # The timeline status is fetched only to enrich the log line below.
+    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
+    sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id))
+    sk_wal_size_mb = sk_wal_size / 1024 / 1024
+    log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}")
+    return sk_wal_size_mb <= target_size_mb
+
+
+def wait_lsn_force_checkpoint(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    endpoint: Endpoint,
+    ps: NeonPageserver,
+    pageserver_conn_options=None,
+):
+    """
+    Read the endpoint's current WAL flush LSN, then wait for it on the pageserver,
+    force a checkpoint and wait for the upload (see wait_lsn_force_checkpoint_at).
+    """
+    pageserver_conn_options = pageserver_conn_options or {}
+    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
+    log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver")
+
+    wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
+
+
+def wait_lsn_force_checkpoint_at_sk(
+    safekeeper: Safekeeper,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ps: NeonPageserver,
+    pageserver_conn_options=None,
+):
+    """
+    Same as wait_lsn_force_checkpoint_at, using the flush LSN reported by `safekeeper`.
+    """
+    sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id)
+    wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
+
+
+def wait_lsn_force_checkpoint_at(
+    lsn: Lsn,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    ps: NeonPageserver,
+    pageserver_conn_options=None,
+):
+    """
+    Wait until pageserver receives given lsn, force checkpoint and wait for
+    upload, i.e. remote_consistent_lsn advancement.
+
+    `pageserver_conn_options["password"]`, if present, is used as the HTTP auth token.
+    """
+    pageserver_conn_options = pageserver_conn_options or {}
+    auth_token = pageserver_conn_options.get("password")
+    ps_http = ps.http_client(auth_token=auth_token)
+
+    # wait for the pageserver to catch up
+    wait_for_last_record_lsn(ps_http, tenant_id, timeline_id, lsn)
+
+    # force checkpoint to advance remote_consistent_lsn
+    ps_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    # ensure that remote_consistent_lsn is advanced
+    wait_for_upload(ps_http, tenant_id, timeline_id, lsn)
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index 84d62fb877..d1b2a5a400 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -337,6 +337,8 @@ def allure_add_grafana_link(host: str, timeline_id: TimelineId, start_ms: int, e
     """
     # We expect host to be in format like ep-holy-mouse-w2u462gi.us-east-2.aws.neon.build
     endpoint_id, region_id, _ = host.split(".", 2)
+    # Remove "-pooler" suffix if present
+    endpoint_id = endpoint_id.removesuffix("-pooler")
 
     params = {
         "orgId": 1,
diff --git a/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql
new file mode 100644
index 0000000000..69e6366a53
--- /dev/null
+++ b/test_runner/performance/large_synthetic_oltp/insert_webhooks.sql
@@ -0,0 +1,47 @@
+\set event_type random(1,10)
+\set service_key random(1, 3)
+
+INSERT INTO webhook.incoming_webhooks (
+    created_at, 
+    delivery_id, 
+    upstream_emitted_at, 
+    service_key, 
+    event_id, 
+    source, 
+    body, 
+    json, 
+    additional_data, 
+    is_body_encrypted, 
+    event_type
+) VALUES (
+    now(),
+    gen_random_uuid(),
+    now() - interval '10 minutes',
+    CASE :service_key::int
+        WHEN 1 THEN 'shopify'
+        WHEN 2 THEN 'stripe'
+        WHEN 3 THEN 'github'
+    END,
+    'evt_' || gen_random_uuid(),  -- Ensures uniqueness
+    CASE :service_key::int
+        WHEN 1 THEN 'Shopify'
+        WHEN 2 THEN 'Stripe'
+        WHEN 3 THEN 'GitHub'
+    END,
+    '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}',
+    '{"order_id": 987654, "customer": {"name": "John Doe", "email": "john.doe@example.com"}, "items": [{"product_id": 12345, "quantity": 2}, {"product_id": 67890, "quantity": 1}], "total": 199.99}'::jsonb,
+    '{"metadata": {"user_agent": "Mozilla/5.0", "ip_address": "203.0.113.42"}}'::jsonb,
+    false,
+    CASE :event_type::int
+        WHEN 1 THEN 'ORDER_PLACED'
+        WHEN 2 THEN 'ORDER_CANCELLED'
+        WHEN 3 THEN 'PAYMENT_SUCCESSFUL'
+        WHEN 4 THEN 'PAYMENT_FAILED'
+        WHEN 5 THEN 'CUSTOMER_CREATED'
+        WHEN 6 THEN 'CUSTOMER_UPDATED'
+        WHEN 7 THEN 'PRODUCT_UPDATED'
+        WHEN 8 THEN 'INVENTORY_LOW'
+        WHEN 9 THEN 'SHIPPING_DISPATCHED'
+        WHEN 10 THEN 'REFUND_ISSUED'
+    END
+);
\ No newline at end of file
diff --git a/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql
new file mode 100644
index 0000000000..b2f173f011
--- /dev/null
+++ b/test_runner/performance/large_synthetic_oltp/select_any_webhook_with_skew.sql
@@ -0,0 +1,15 @@
+-- Zipfian distributions model real-world access patterns where:
+--	A few values (popular IDs) are accessed frequently.
+--	Many values are accessed rarely.
+-- This is useful for simulating realistic workloads, like webhook processing where recent events are more frequently accessed.
+
+\set alpha 1.2  
+\set min_id 1
+\set max_id 135000000
+
+\set zipf_random_id random_zipfian(:min_id, :max_id, :alpha)
+
+SELECT * 
+FROM webhook.incoming_webhooks
+WHERE id = (:zipf_random_id)::bigint
+LIMIT 1;
\ No newline at end of file
diff --git a/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql
new file mode 100644
index 0000000000..78a843bf0f
--- /dev/null
+++ b/test_runner/performance/large_synthetic_oltp/select_recent_webhook.sql
@@ -0,0 +1,9 @@
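+-- select one of the most recent webhook records (created in the branch timeline during the bench run); NOTE(review): the id floor 1350000001 used below is ~10x the max_id (135000000) in select_any_webhook_with_skew.sql - confirm which value is intended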
+SELECT * 
+FROM webhook.incoming_webhooks
+WHERE id = (
+    SELECT (floor(random() * (
+        (SELECT last_value FROM webhook.incoming_webhooks_id_seq) - 1350000001 + 1
+    ) + 1350000001))::bigint
+)
+LIMIT 1;
\ No newline at end of file
diff --git a/test_runner/performance/test_perf_many_relations.py b/test_runner/performance/test_perf_many_relations.py
index 2570c55f6c..e2f0a79018 100644
--- a/test_runner/performance/test_perf_many_relations.py
+++ b/test_runner/performance/test_perf_many_relations.py
@@ -83,6 +83,13 @@ def test_perf_simple_many_relations_reldir_v2(
         ],
     )
 
+    assert (
+        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
+            "rel_size_migration"
+        ]
+        != "legacy"
+    )
+
     n = 100000
     step = 5000
     # Create many relations
diff --git a/test_runner/performance/test_perf_oltp_large_tenant.py b/test_runner/performance/test_perf_oltp_large_tenant.py
new file mode 100644
index 0000000000..ae00dbb3b5
--- /dev/null
+++ b/test_runner/performance/test_perf_oltp_large_tenant.py
@@ -0,0 +1,99 @@
+from __future__ import annotations
+
+import os
+import timeit
+from pathlib import Path
+
+import pytest
+from fixtures.benchmark_fixture import PgBenchRunResult
+from fixtures.compare_fixtures import PgCompare
+
+from performance.test_perf_pgbench import get_durations_matrix, utc_now_timestamp
+
+
+def get_custom_scripts(
+    default: str = "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4 select_recent_webhook.sql@4",
+) -> list[str]:
+    """
+    Return the custom-script parametrization: one string per benchmark run.
+
+    Each run is parametrized with the custom scripts to run and their weights, taken from
+    the TEST_PGBENCH_CUSTOM_SCRIPTS env variable (or `default` when it is unset).
+    Delimit the custom scripts for one run by spaces and for different runs by commas, e.g.:
+    "insert_webhooks.sql@2 select_any_webhook_with_skew.sql@4,insert_webhooks.sql@8 select_any_webhook_with_skew.sql@2"
+    Databases/branches are pre-created and passed through BENCHMARK_CONNSTR env variable.
+    """
+    scripts = os.getenv("TEST_PGBENCH_CUSTOM_SCRIPTS", default=default)
+    return scripts.split(",")
+
+
+def run_test_pgbench(env: PgCompare, custom_scripts: str, duration: int):
+    """
+    Build the pgbench command line for the space-delimited `custom_scripts` (each passed via
+    `-f`, relative to the current working directory) with `-T{duration}`, then run it.
+    """
+    password = env.pg.default_options.get("password", None)
+    options = env.pg.default_options.get("options", "")
+    # drop password from the connection string by passing password=None and set password separately
+    connstr = env.pg.connstr(password=None, options=options)
+    # if connstr does not contain pooler we can set statement_timeout to 0
+    if "pooler" not in connstr:
+        options = "-cstatement_timeout=0 " + options
+        connstr = env.pg.connstr(password=None, options=options)
+
+    script_args = [
+        "pgbench",
+        "-n",  # no explicit vacuum before the test - we want to rely on auto-vacuum
+        "-M",
+        "prepared",
+        "--client=500",
+        "--jobs=100",
+        f"-T{duration}",
+        "-P60",  # progress every minute
+        "--progress-timestamp",
+    ]
+    for script in custom_scripts.split():
+        script_args.extend(["-f", f"test_runner/performance/large_synthetic_oltp/{script}"])
+    script_args.append(connstr)
+
+    run_pgbench(
+        env,
+        "custom-scripts",
+        script_args,
+        password=password,
+    )
+
+
+def run_pgbench(env: PgCompare, prefix: str, cmdline: list[str], password: str | None):
+    """
+    Execute `cmdline` via `env.pg_bin`, passing `password` (if any) through PGPASSWORD,
+    then parse the captured stdout and record the pgbench result under `prefix`.
+    """
+    environ: dict[str, str] = {}
+    if password is not None:
+        environ["PGPASSWORD"] = password
+
+    run_start_timestamp = utc_now_timestamp()
+    t0 = timeit.default_timer()
+    out = env.pg_bin.run_capture(cmdline, env=environ)
+    run_duration = timeit.default_timer() - t0
+    run_end_timestamp = utc_now_timestamp()
+    env.flush()
+
+    stdout = Path(f"{out}.stdout").read_text()
+
+    res = PgBenchRunResult.parse_from_stdout(
+        stdout=stdout,
+        run_duration=run_duration,
+        run_start_timestamp=run_start_timestamp,
+        run_end_timestamp=run_end_timestamp,
+    )
+    env.zenbenchmark.record_pg_bench_result(prefix, res)
+
+
+@pytest.mark.parametrize("custom_scripts", get_custom_scripts())
+@pytest.mark.parametrize("duration", get_durations_matrix())
+@pytest.mark.remote_cluster
+def test_perf_oltp_large_tenant(remote_compare: PgCompare, custom_scripts: str, duration: int):
+    run_test_pgbench(remote_compare, custom_scripts, duration)
+    # todo: run re-index, analyze, vacuum, etc. after the test and measure and report its duration
diff --git a/test_runner/regress/data/test_signed_char.out b/test_runner/regress/data/test_signed_char.out
new file mode 100644
index 0000000000..a68876e383
--- /dev/null
+++ b/test_runner/regress/data/test_signed_char.out
@@ -0,0 +1 @@
+0000000094010815f81f042000000000b89f8000909f5000689f5000489f4000309f3000189f3000009f3000e89e3000d09e3000b89e3000a09e3000889e3000709e3000309e8000189e3000009e3000e89d3000d09d3000b89d3000a09d3000889d3000709d3000589d3000409d3000289d3000109d3000f89c3000e09c3000c89c3000b09c3000989c3000809c3000689c3000509c3000389c3000209c3000089c3000f09b3000d89b3000c09b3000a89b3000909b3000789b3000609b3000489b3000309b3000189b3000009b3000e89a3000d09a3000b89a3000a09a3000889a3000489a8000309a3000189a3000009a3000e8993000d0993000b8993000a09930008899300070993000589930004099300000998000e8983000d0983000b8983000a0983000889830007098300058983000409830002898300010983000f8973000b8978000a09730008897300070973000589730004097300028973000e8968000a89680006896800028968000e8958000a895800090953000509580003895300020953000089530000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000
00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000801000010018004c198900000000000000000029000000008010000100180049787f000000000000000000290000000080100001001800727c700000000000000000002900000000801000280040002076620000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010000100180020726200000000000000000029000000008010002800400076623900000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000623938000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003938370000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010002800400038373600000000000000000001002700010101010101010101010101010101010101010101010101010101010101010101010101010101000080100028004000373635000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000280040003635340000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100008010000100180020303400000000000000000028000000008010000100180020393300000000000000000027000000008010000100180020383300000000000000000026000000008010000100180
0203733000000000000000000250000000080100001001800203633000000000000000000240000000080100001001800203533000000000000000000230000000080100028004000353433000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002034330000000000000000002200000000801000010018002033330000000000000000002100000000801000010018002032330000000000000000002000000000801000010018002031330000000000000000001f00000000801000010018002030330000000000000000001e00000000801000010018002039320000000000000000001d00000000801000010018002038320000000000000000001c00000000801000010018002037320000000000000000001b00000000801000010018002036320000000000000000001a0000000080100001001800203532000000000000000000190000000080100001001800203432000000000000000000180000000080100028004000343332000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002033320000000000000000001700000000801000010018002032320000000000000000001600000000801000010018002031320000000000000000001500000000801000010018002030320000000000000000001400000000801000010018002039310000000000000000001300000000801000010018002038310000000000000000001200000000801000010018002037310000000000000000001100000000801000010018002036310000000000000000001000000000801000010018002035310000000000000000000f00000000801000010018002034310000000000000000000e00000000801000010018002033310000000000000000000d0000000080100028004000333231000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018002032310000000000000000000c00000000801000010018002031310000000000000000000b00000000801000010018002030310000000000000000000a00000000801000010018002039200000000000000000000900000000801000010018002038200000000000000000000800000000801000010018002037200000000000000000000700000000801000010018002036200000000000000000000600000000801000010018002035200000000000000000000500000000801000010018003034200000000000000000002800000
000801000010018002034200000000000000000000400000000801000010018003933200000000000000000002700000000801000010018003833200000000000000000002600000000801000010018003733200000000000000000002500000000801000010018003633200000000000000000002400000000801000010018003533200000000000000000002300000000801000010018003433200000000000000000002200000000801000010018003333200000000000000000002100000000801000010018003233200000000000000000002000000000801000010018003133200000000000000000001f00000000801000010018003033200000000000000000001e00000000801000010018002033200000000000000000000300000000801000010018003932200000000000000000001d00000000801000010018003832200000000000000000001c00000000801000010018003732200000000000000000001b00000000801000010018003632200000000000000000001a00000000801000010018003532200000000000000000001900000000801000010018003432200000000000000000001800000000801000010018003332200000000000000000001700000000801000010018003232200000000000000000001600000000801000010018003132200000000000000000001500000000801000010018003032200000000000000000001400000000801000010018002032200000000000000000000200000000801000010018003931200000000000000000001300000000801000010018003831200000000000000000001200000000801000010018003731200000000000000000001100000000801000010018003631200000000000000000001000000000801000010018003531200000000000000000000f00000000801000010018003431200000000000000000000e00000000801000010018003331200000000000000000000d0000000080100028004000323120000000000000000000010027000101010101010101010101010101010101010101010101010101010101010101010101010101010000801000010018003131200000000000000000000b00000000801000010018003031200000000000000000000a00000000801000010018002031200000000000000000000100000000801000010018006220200000000000000000002900000000801000010018003920200000000000000000000900000000801000010018003820200000000000000000000800000000801000010018003720200000000000000000000700000000801000010018003620200000000000000000000600000000801000010018003520200000000000000000000500000
00080100002002000342020000000000000000000040001002400000000000000008010000b00280033202000000000000000000003000a001b010101010101010101000000000000008010000b00280032202000000000000000000002000a001201010101010101010100000000000000801000280040003120200000000000000000000100270001010101010101010101010101010101010101010101010101010101010101010101010101010100ffffffff00000200
\ No newline at end of file
diff --git a/test_runner/regress/test_compute_catalog.py b/test_runner/regress/test_compute_catalog.py
index 3a08671bbf..ce655d22b5 100644
--- a/test_runner/regress/test_compute_catalog.py
+++ b/test_runner/regress/test_compute_catalog.py
@@ -5,34 +5,59 @@ import logging
 import requests
 from fixtures.neon_fixtures import NeonEnv, logical_replication_sync
 
+TEST_ROLE_NAMES = [
+    {"name": "neondb_owner"},
+    {"name": "role with spaces"},
+    {"name": "role with%20spaces "},
+    {"name": "role with whitespaces	"},
+    {"name": "injective role with spaces'; SELECT pg_sleep(1000);"},
+    {"name": "role with #pound-sign and &ampersands=true"},
+    {"name": "role with emoji 🌍"},
+    {"name": "role \";with ';injections $$ $x$ $ %I !/\\&#@"},
+    {"name": '"role in double quotes"'},
+    {"name": "'role in single quotes'"},
+]
+
 TEST_DB_NAMES = [
     {
         "name": "neondb",
-        "owner": "cloud_admin",
+        "owner": "neondb_owner",
     },
     {
         "name": "db with spaces",
-        "owner": "cloud_admin",
+        "owner": "role with spaces",
     },
     {
         "name": "db with%20spaces ",
-        "owner": "cloud_admin",
+        "owner": "role with%20spaces ",
     },
     {
         "name": "db with whitespaces	",
-        "owner": "cloud_admin",
+        "owner": "role with whitespaces	",
     },
     {
-        "name": "injective db with spaces'; SELECT pg_sleep(10);",
-        "owner": "cloud_admin",
+        "name": "injective db with spaces'; SELECT pg_sleep(1000);",
+        "owner": "injective role with spaces'; SELECT pg_sleep(1000);",
     },
     {
         "name": "db with #pound-sign and &ampersands=true",
-        "owner": "cloud_admin",
+        "owner": "role with #pound-sign and &ampersands=true",
     },
     {
         "name": "db with emoji 🌍",
-        "owner": "cloud_admin",
+        "owner": "role with emoji 🌍",
+    },
+    {
+        "name": "db \";with ';injections $$ $x$ $ %I !/\\&#@",
+        "owner": "role \";with ';injections $$ $x$ $ %I !/\\&#@",
+    },
+    {
+        "name": '"db in double quotes"',
+        "owner": '"role in double quotes"',
+    },
+    {
+        "name": "'db in single quotes'",
+        "owner": "'role in single quotes'",
     },
 ]
 
@@ -52,6 +77,7 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
         **{
             "skip_pg_catalog_updates": False,
             "cluster": {
+                "roles": TEST_ROLE_NAMES,
                 "databases": TEST_DB_NAMES,
             },
         }
@@ -99,10 +125,10 @@ def test_compute_catalog(neon_simple_env: NeonEnv):
         ), f"Expected 404 status code, but got {e.response.status_code}"
 
 
-def test_compute_create_databases(neon_simple_env: NeonEnv):
+def test_compute_create_drop_dbs_and_roles(neon_simple_env: NeonEnv):
     """
-    Test that compute_ctl can create and work with databases with special
-    characters (whitespaces, %, tabs, etc.) in the name.
+    Test that compute_ctl can create and work with databases and roles
+    with special characters (whitespaces, %, tabs, etc.) in the name.
     """
     env = neon_simple_env
 
@@ -116,6 +142,7 @@ def test_compute_create_databases(neon_simple_env: NeonEnv):
         **{
             "skip_pg_catalog_updates": False,
             "cluster": {
+                "roles": TEST_ROLE_NAMES,
                 "databases": TEST_DB_NAMES,
             },
         }
@@ -139,6 +166,43 @@ def test_compute_create_databases(neon_simple_env: NeonEnv):
             assert len(curr_db) == 1
             assert curr_db[0] == db["name"]
 
+    for role in TEST_ROLE_NAMES:
+        with endpoint.cursor() as cursor:
+            cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],))
+            catalog_role = cursor.fetchone()
+            assert catalog_role is not None
+            assert catalog_role[0] == role["name"]
+
+    delta_operations = []
+    for db in TEST_DB_NAMES:
+        delta_operations.append({"action": "delete_db", "name": db["name"]})
+    for role in TEST_ROLE_NAMES:
+        delta_operations.append({"action": "delete_role", "name": role["name"]})
+
+    endpoint.respec_deep(
+        **{
+            "skip_pg_catalog_updates": False,
+            "cluster": {
+                "roles": [],
+                "databases": [],
+            },
+            "delta_operations": delta_operations,
+        }
+    )
+    endpoint.reconfigure()
+
+    for db in TEST_DB_NAMES:
+        with endpoint.cursor() as cursor:
+            cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (db["name"],))
+            catalog_db = cursor.fetchone()
+            assert catalog_db is None
+
+    for role in TEST_ROLE_NAMES:
+        with endpoint.cursor() as cursor:
+            cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = %s", (role["name"],))
+            catalog_role = cursor.fetchone()
+            assert catalog_role is None
+
 
 def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     """
@@ -150,17 +214,19 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     # stuff into the spec.json file.
     endpoint = env.endpoints.create_start("main")
 
+    SUB_DB_NAME = "';subscriber_db $$ $x$ $;"
+    PUB_DB_NAME = "publisher_db"
     TEST_DB_NAMES = [
         {
             "name": "neondb",
             "owner": "cloud_admin",
         },
         {
-            "name": "subscriber_db",
+            "name": SUB_DB_NAME,
             "owner": "cloud_admin",
         },
         {
-            "name": "publisher_db",
+            "name": PUB_DB_NAME,
             "owner": "cloud_admin",
         },
     ]
@@ -177,47 +243,47 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
     )
     endpoint.reconfigure()
 
-    # connect to the publisher_db and create a publication
-    with endpoint.cursor(dbname="publisher_db") as cursor:
+    # Connect to the PUB_DB_NAME and create a publication
+    with endpoint.cursor(dbname=PUB_DB_NAME) as cursor:
         cursor.execute("CREATE PUBLICATION mypub FOR ALL TABLES")
         cursor.execute("select pg_catalog.pg_create_logical_replication_slot('mysub', 'pgoutput');")
         cursor.execute("CREATE TABLE t(a int)")
         cursor.execute("INSERT INTO t VALUES (1)")
         cursor.execute("CHECKPOINT")
 
-    # connect to the subscriber_db and create a subscription
-    # Note that we need to create subscription with
-    connstr = endpoint.connstr(dbname="publisher_db").replace("'", "''")
-    with endpoint.cursor(dbname="subscriber_db") as cursor:
+    # Connect to the SUB_DB_NAME and create a subscription
+    # Note that we need to create subscription with the following connstr:
+    connstr = endpoint.connstr(dbname=PUB_DB_NAME).replace("'", "''")
+    with endpoint.cursor(dbname=SUB_DB_NAME) as cursor:
         cursor.execute("CREATE TABLE t(a int)")
         cursor.execute(
-            f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub  WITH (create_slot = false) "
+            f"CREATE SUBSCRIPTION mysub CONNECTION '{connstr}' PUBLICATION mypub WITH (create_slot = false) "
         )
 
-    # wait for the subscription to be active
+    # Wait for the subscription to be active
     logical_replication_sync(
         endpoint,
         endpoint,
         "mysub",
-        sub_dbname="subscriber_db",
-        pub_dbname="publisher_db",
+        sub_dbname=SUB_DB_NAME,
+        pub_dbname=PUB_DB_NAME,
     )
 
     # Check that replication is working
-    with endpoint.cursor(dbname="subscriber_db") as cursor:
+    with endpoint.cursor(dbname=SUB_DB_NAME) as cursor:
         cursor.execute("SELECT * FROM t")
         rows = cursor.fetchall()
         assert len(rows) == 1
         assert rows[0][0] == 1
 
-    # drop the subscriber_db from the list
+    # Drop the SUB_DB_NAME from the list
     TEST_DB_NAMES_NEW = [
         {
             "name": "neondb",
             "owner": "cloud_admin",
         },
         {
-            "name": "publisher_db",
+            "name": PUB_DB_NAME,
             "owner": "cloud_admin",
         },
     ]
@@ -230,7 +296,7 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
                 "databases": TEST_DB_NAMES_NEW,
             },
             "delta_operations": [
-                {"action": "delete_db", "name": "subscriber_db"},
+                {"action": "delete_db", "name": SUB_DB_NAME},
                 # also test the case when we try to delete a non-existent database
                 # shouldn't happen in normal operation,
                 # but can occur when failed operations are retried
@@ -239,22 +305,22 @@ def test_dropdb_with_subscription(neon_simple_env: NeonEnv):
         }
     )
 
-    logging.info("Reconfiguring the endpoint to drop the subscriber_db")
+    logging.info(f"Reconfiguring the endpoint to drop the {SUB_DB_NAME} database")
     endpoint.reconfigure()
 
-    # Check that the subscriber_db is dropped
+    # Check that the SUB_DB_NAME is dropped
     with endpoint.cursor() as cursor:
-        cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", ("subscriber_db",))
+        cursor.execute("SELECT datname FROM pg_database WHERE datname = %s", (SUB_DB_NAME,))
         catalog_db = cursor.fetchone()
         assert catalog_db is None
 
-    # Check that we can still connect to the publisher_db
-    with endpoint.cursor(dbname="publisher_db") as cursor:
+    # Check that we can still connect to the PUB_DB_NAME
+    with endpoint.cursor(dbname=PUB_DB_NAME) as cursor:
         cursor.execute("SELECT * FROM current_database()")
         curr_db = cursor.fetchone()
         assert curr_db is not None
         assert len(curr_db) == 1
-        assert curr_db[0] == "publisher_db"
+        assert curr_db[0] == PUB_DB_NAME
 
 
 def test_compute_drop_role(neon_simple_env: NeonEnv):
@@ -265,6 +331,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv):
     """
     env = neon_simple_env
     TEST_DB_NAME = "db_with_permissions"
+    TEST_GRANTEE = "'); MALFORMED SQL $$ $x$ $/;5%$ %I"
 
     endpoint = env.endpoints.create_start("main")
 
@@ -301,16 +368,18 @@ def test_compute_drop_role(neon_simple_env: NeonEnv):
         cursor.execute("create view test_view as select * from test_table")
 
     with endpoint.cursor(dbname=TEST_DB_NAME, user="neon") as cursor:
-        cursor.execute("create role readonly")
+        cursor.execute(f'create role "{TEST_GRANTEE}"')
         # We (`compute_ctl`) make 'neon' the owner of schema 'public' in the owned database.
         # Postgres has all sorts of permissions and grants that we may not handle well,
         # but this is the shortest repro grant for the issue
         # https://github.com/neondatabase/cloud/issues/13582
-        cursor.execute("grant select on all tables in schema public to readonly")
+        cursor.execute(f'grant select on all tables in schema public to "{TEST_GRANTEE}"')
 
     # Check that role was created
     with endpoint.cursor() as cursor:
-        cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'")
+        cursor.execute(
+            "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE}
+        )
         role = cursor.fetchone()
         assert role is not None
 
@@ -318,7 +387,8 @@ def test_compute_drop_role(neon_simple_env: NeonEnv):
     # that may block our ability to drop the role.
     with endpoint.cursor(dbname=TEST_DB_NAME) as cursor:
         cursor.execute(
-            "select grantor from information_schema.role_table_grants where grantee = 'readonly'"
+            "select grantor from information_schema.role_table_grants where grantee = %(grantee)s",
+            {"grantee": TEST_GRANTEE},
         )
         res = cursor.fetchall()
         assert len(res) == 2, f"Expected 2 table grants, got {len(res)}"
@@ -332,7 +402,7 @@ def test_compute_drop_role(neon_simple_env: NeonEnv):
             "delta_operations": [
                 {
                     "action": "delete_role",
-                    "name": "readonly",
+                    "name": TEST_GRANTEE,
                 },
             ],
         }
@@ -341,7 +411,9 @@ def test_compute_drop_role(neon_simple_env: NeonEnv):
 
     # Check that role is dropped
     with endpoint.cursor() as cursor:
-        cursor.execute("SELECT rolname FROM pg_roles WHERE rolname = 'readonly'")
+        cursor.execute(
+            "SELECT rolname FROM pg_roles WHERE rolname = %(role)s", {"role": TEST_GRANTEE}
+        )
         role = cursor.fetchone()
         assert role is None
 
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index 7f12c14073..2ff525464d 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -137,6 +137,8 @@ def test_remote_extensions(
     metrics = parse_metrics(raw_metrics)
     remote_ext_requests = metrics.query_all(
         "compute_ctl_remote_ext_requests_total",
+        # Check that we properly report the filename in the metrics
+        {"filename": "anon.tar.zst"},
     )
     assert len(remote_ext_requests) == 1
     for sample in remote_ext_requests:
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 55fd7a8608..17ffeca23b 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -7,7 +7,7 @@ from fixtures.neon_fixtures import (
     NeonEnvBuilder,
 )
 from fixtures.pageserver.http import PageserverHttpClient
-from fixtures.utils import wait_until
+from fixtures.utils import run_only_on_default_postgres, wait_until
 
 
 def check_client(env: NeonEnv, client: PageserverHttpClient):
@@ -138,3 +138,25 @@ def test_pageserver_http_api_client_auth_enabled(neon_env_builder: NeonEnvBuilde
 
     with env.pageserver.http_client(auth_token=pageserver_token) as client:
         check_client(env, client)
+
+
+@run_only_on_default_postgres("it does not use any postgres functionality")
+def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    with env.pageserver.http_client() as client:
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "migrating"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "migrating"
+        # This is invalid in practice: we should never roll back the migrating state to legacy.
+        # But we do it here to test the API.
+        client.timeline_patch_index_part(
+            tenant_id,
+            timeline_id,
+            {"rel_size_migration": "legacy"},
+        )
+        assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py
index a9b897b741..b9e2934505 100644
--- a/test_runner/regress/test_pageserver_secondary.py
+++ b/test_runner/regress/test_pageserver_secondary.py
@@ -938,9 +938,12 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     # Expect lots of layers
     assert len(ps_attached.list_layers(tenant_id, timeline_id)) > 10
 
-    # Simulate large data by making layer downloads artifically slow
     for ps in env.pageservers:
+        # Simulate large data by making layer downloads artificially slow
         ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "return(1000)")])
+        # Make the initial logical size calculation lie. Otherwise it downloads layers
+        # on demand, which makes accounting difficult.
+        ps.http_client().configure_failpoints(("skip-logical-size-calculation", "return"))
 
     def timeline_heatmap(tlid):
         assert env.pageserver_remote_storage is not None
@@ -952,20 +955,16 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
 
         raise RuntimeError(f"No heatmap for timeline: {tlid}")
 
-    # Upload a heatmap, so that secondaries have something to download
-    ps_attached.http_client().tenant_heatmap_upload(tenant_id)
-    heatmap_before_migration = timeline_heatmap(timeline_id)
+    def count_timeline_heatmap_layers(tlid) -> tuple[int, int]:
+        cold, hot = 0, 0
+        layers = timeline_heatmap(tlid)["layers"]
+        for layer in layers:
+            if layer["cold"]:
+                cold += 1
+            else:
+                hot += 1
 
-    # This has no chance to succeed: we have lots of layers and each one takes at least 1000ms.
-    # However, it pulls the heatmap, which will be important later.
-    http_client = env.storage_controller.pageserver_api()
-    (status, progress) = http_client.tenant_secondary_download(tenant_id, wait_ms=4000)
-    assert status == 202
-    assert progress["heatmap_mtime"] is not None
-    assert progress["layers_downloaded"] > 0
-    assert progress["bytes_downloaded"] > 0
-    assert progress["layers_total"] > progress["layers_downloaded"]
-    assert progress["bytes_total"] > progress["bytes_downloaded"]
+        return cold, hot
 
     env.storage_controller.allowed_errors.extend(
         [
@@ -975,6 +974,7 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
 
     # Use a custom configuration that gives up earlier than usual.
     # We can't hydrate everything anyway because of the failpoints.
+    # Implicitly, this also uploads a heatmap from the current attached location.
     config = StorageControllerMigrationConfig(
         secondary_warmup_timeout="5s", secondary_download_request_timeout="2s"
     )
@@ -988,31 +988,33 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
     heatmap_after_migration = timeline_heatmap(timeline_id)
 
-    assert len(heatmap_before_migration["layers"]) > 0
+    local_layers = ps_secondary.list_layers(tenant_id, timeline_id)
+    # We download 1 layer per second and give up within 5 seconds.
+    assert len(local_layers) < 10
 
     after_migration_heatmap_layers_count = len(heatmap_after_migration["layers"])
-    assert len(heatmap_before_migration["layers"]) <= after_migration_heatmap_layers_count
-
     log.info(f"Heatmap size after cold migration is {after_migration_heatmap_layers_count}")
 
     env.storage_controller.download_heatmap_layers(
         TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id
     )
 
-    # Now simulate the case where a child timeline is archived, parent layers
-    # are evicted and the child is unarchived. When the child is unarchived,
-    # itself and the parent update their heatmaps to contain layers needed by the
-    # child. One can warm up the timeline hierarchy since the heatmaps are ready.
-
-    def all_layers_downloaded(expected_layer_count: int):
-        local_layers_count = len(ps_secondary.list_layers(tenant_id, timeline_id))
+    def all_layers_downloaded(node, expected_layer_count: int):
+        local_layers_count = len(node.list_layers(tenant_id, timeline_id))
 
         log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}")
         assert local_layers_count >= expected_layer_count
 
-    wait_until(lambda: all_layers_downloaded(after_migration_heatmap_layers_count))
-    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
+    def no_layers_downloaded(node):
+        local_layers_count = len(node.list_layers(tenant_id, timeline_id))
 
+        log.info(f"{local_layers_count=} {after_migration_heatmap_layers_count=}")
+        assert local_layers_count == 0
+
+    wait_until(lambda: all_layers_downloaded(ps_secondary, after_migration_heatmap_layers_count))
+
+    # Read everything and make sure that we're not downloading anything extra.
+    # All hot layers should be available locally now.
     before = (
         ps_secondary.http_client()
         .get_metrics()
@@ -1030,6 +1032,11 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     workload.stop()
     assert before == after
 
+    # Now simulate the case where a child timeline is archived, parent layers
+    # are evicted and the child is unarchived. When the child is unarchived,
+    # itself and the parent update their heatmaps to contain layers needed by the
+    # child. One can warm up the timeline hierarchy since the heatmaps are ready.
+
     def check_archival_state(state: TimelineArchivalState, tline):
         timelines = (
             timeline["timeline_id"]
@@ -1057,13 +1064,35 @@ def test_migration_to_cold_secondary(neon_env_builder: NeonEnvBuilder):
     wait_until(lambda: check_archival_state(TimelineArchivalState.UNARCHIVED, child_timeline_id))
 
     ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
-    log.info(f"Parent timeline heatmap size: {len(timeline_heatmap(timeline_id)['layers'])}")
-    log.info(f"Child timeline heatmap size: {len(timeline_heatmap(child_timeline_id)['layers'])}")
 
-    expected_locally = len(timeline_heatmap(timeline_id)["layers"])
-    assert expected_locally > 0
+    parent_cold, parent_hot = count_timeline_heatmap_layers(timeline_id)
+    child_cold, child_hot = count_timeline_heatmap_layers(child_timeline_id)
+
+    log.info(f"Parent timeline heatmap size: cold={parent_cold}, hot={parent_hot}")
+    log.info(f"Child timeline heatmap size: cold={child_cold}, hot={child_hot}")
+
+    # All layers in the heatmap should come from the generation on unarchival.
+    # Hence, they should be cold.
+    assert parent_cold > 0
+    assert parent_hot == 0
+
+    expected_locally = parent_cold
 
     env.storage_controller.download_heatmap_layers(
-        TenantShardId(tenant_id, shard_number=0, shard_count=0), timeline_id
+        TenantShardId(tenant_id, shard_number=0, shard_count=0), child_timeline_id, recurse=True
     )
-    wait_until(lambda: all_layers_downloaded(expected_locally))
+    wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally))
+
+    for ps in env.pageservers:
+        ps.http_client().configure_failpoints([("secondary-layer-download-sleep", "off")])
+
+    # The uploaded heatmap is still empty. Clean up all layers on the secondary.
+    ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100)
+    wait_until(lambda: no_layers_downloaded(ps_attached))
+
+    # Upload a new heatmap. The previously cold layers become hot since they're now resident.
+    ps_secondary.http_client().tenant_heatmap_upload(tenant_id)
+
+    # Warm up the current secondary.
+    ps_attached.http_client().tenant_secondary_download(tenant_id, wait_ms=100)
+    wait_until(lambda: all_layers_downloaded(ps_secondary, expected_locally))
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 6a76ad5ca8..1d9f385358 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -5,7 +5,7 @@ from __future__ import annotations
 
 from concurrent.futures import ThreadPoolExecutor
 from pathlib import Path
-from typing import TYPE_CHECKING, cast
+from typing import TYPE_CHECKING, Any, cast
 
 import pytest
 from fixtures.log_helper import log
@@ -118,10 +118,20 @@ def post_checks(env: NeonEnv, test_output_dir: Path, db_name: str, endpoint: End
         pageserver.http_client().timeline_gc(shard, env.initial_timeline, None)
 
 
+def patch_tenant_conf(tenant_conf: dict[str, Any], reldir_type: str) -> dict[str, Any]:
+    tenant_conf = tenant_conf.copy()
+    if reldir_type == "v2":
+        tenant_conf["rel_size_v2_enabled"] = "true"
+    else:
+        tenant_conf["rel_size_v2_enabled"] = "false"
+    return tenant_conf
+
+
 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
 @pytest.mark.timeout(3000)  # Contains many sub-tests, is slow in debug builds
 @pytest.mark.parametrize("shard_count", [None, 4])
+@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
 def test_pg_regress(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
@@ -130,6 +140,7 @@ def test_pg_regress(
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: int | None,
+    reldir_type: str,
 ):
     DBNAME = "regression"
 
@@ -142,7 +153,7 @@ def test_pg_regress(
 
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     env = neon_env_builder.init_start(
-        initial_tenant_conf=TENANT_CONF,
+        initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type),
         initial_tenant_shard_count=shard_count,
     )
 
@@ -196,6 +207,7 @@ def test_pg_regress(
 #
 @pytest.mark.timeout(1500)  # Contains many sub-tests, is slow in debug builds
 @pytest.mark.parametrize("shard_count", [None, 4])
+@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
 def test_isolation(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
@@ -204,6 +216,7 @@ def test_isolation(
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: int | None,
+    reldir_type: str,
 ):
     DBNAME = "isolation_regression"
 
@@ -211,7 +224,8 @@ def test_isolation(
         neon_env_builder.num_pageservers = shard_count
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     env = neon_env_builder.init_start(
-        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+        initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type),
+        initial_tenant_shard_count=shard_count,
     )
 
     # Connect to postgres and create a database called "regression".
@@ -267,6 +281,7 @@ def test_isolation(
 # Run extra Neon-specific pg_regress-based tests. The tests and their
 # schedule file are in the sql_regress/ directory.
 @pytest.mark.parametrize("shard_count", [None, 4])
+@pytest.mark.parametrize("reldir_type", ["v1", "v2"])
 def test_sql_regress(
     neon_env_builder: NeonEnvBuilder,
     test_output_dir: Path,
@@ -275,6 +290,7 @@ def test_sql_regress(
     base_dir: Path,
     pg_distrib_dir: Path,
     shard_count: int | None,
+    reldir_type: str,
 ):
     DBNAME = "regression"
 
@@ -282,7 +298,8 @@ def test_sql_regress(
         neon_env_builder.num_pageservers = shard_count
     neon_env_builder.enable_pageserver_remote_storage(s3_storage())
     env = neon_env_builder.init_start(
-        initial_tenant_conf=TENANT_CONF, initial_tenant_shard_count=shard_count
+        initial_tenant_conf=patch_tenant_conf(TENANT_CONF, reldir_type),
+        initial_tenant_shard_count=shard_count,
     )
 
     # Connect to postgres and create a database called "regression".
@@ -345,9 +362,7 @@ def test_tx_abort_with_many_relations(
     """
 
     env = neon_env_builder.init_start(
-        initial_tenant_conf={
-            "rel_size_v2_enabled": "true" if reldir_type == "v2" else "false",
-        }
+        initial_tenant_conf=patch_tenant_conf({}, reldir_type),
     )
     ep = env.endpoints.create_start(
         "main",
@@ -358,14 +373,25 @@ def test_tx_abort_with_many_relations(
         ],
     )
 
+    if reldir_type == "v1":
+        assert (
+            env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
+                "rel_size_migration"
+            ]
+            == "legacy"
+        )
+    else:
+        assert (
+            env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
+                "rel_size_migration"
+            ]
+            != "legacy"
+        )
+
     # How many relations: this number is tuned to be long enough to take tens of seconds
     # if the rollback code path is buggy, tripping the test's timeout.
-    if reldir_type == "v1":
-        n = 4000
-        step = 4000
-    else:
-        n = 20000
-        step = 5000
+    n = 5000
+    step = 2500
 
     def create():
         # Create many relations
diff --git a/test_runner/regress/test_relations.py b/test_runner/regress/test_relations.py
index 3e29c92a96..07eacfc775 100644
--- a/test_runner/regress/test_relations.py
+++ b/test_runner/regress/test_relations.py
@@ -19,6 +19,17 @@ def test_pageserver_reldir_v2(
     endpoint.safe_psql("CREATE TABLE foo1 (id INTEGER PRIMARY KEY, val text)")
     endpoint.safe_psql("CREATE TABLE foo2 (id INTEGER PRIMARY KEY, val text)")
 
+    assert (
+        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
+            "rel_size_migration"
+        ]
+        == "legacy"
+    )
+
+    # Ensure the pageserver accepts the table creation SQLs before the migration. In theory, we can also do
+    # a "wait_flush_lsn" here, but it's easier to just do a restart.
+    env.pageserver.restart()
+
     # Switch to v2
     env.pageserver.http_client().update_tenant_config(
         env.initial_tenant,
@@ -27,6 +38,13 @@ def test_pageserver_reldir_v2(
         },
     )
 
+    assert (
+        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
+            "rel_size_migration"
+        ]
+        == "legacy"
+    )
+
     # Check if both relations are still accessible
     endpoint.safe_psql("SELECT * FROM foo1")
     endpoint.safe_psql("SELECT * FROM foo2")
@@ -41,12 +59,14 @@ def test_pageserver_reldir_v2(
 
     # Create a relation in v2
     endpoint.safe_psql("CREATE TABLE foo3 (id INTEGER PRIMARY KEY, val text)")
+    endpoint.safe_psql("CREATE TABLE foo4 (id INTEGER PRIMARY KEY, val text)")
     # Delete a relation in v1
     endpoint.safe_psql("DROP TABLE foo1")
 
     # Check if both relations are still accessible
     endpoint.safe_psql("SELECT * FROM foo2")
     endpoint.safe_psql("SELECT * FROM foo3")
+    endpoint.safe_psql("SELECT * FROM foo4")
 
     # Restart the endpoint
     endpoint.stop()
@@ -57,7 +77,7 @@ def test_pageserver_reldir_v2(
     endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
     endpoint.safe_psql("SELECT * FROM foo2")
     endpoint.safe_psql("SELECT * FROM foo3")
-
+    endpoint.safe_psql("SELECT * FROM foo4")
     endpoint.safe_psql("DROP TABLE foo3")
     endpoint.stop()
     endpoint.start()
@@ -66,3 +86,25 @@ def test_pageserver_reldir_v2(
     endpoint.safe_psql("DROP TABLE IF EXISTS foo1")
     endpoint.safe_psql("SELECT * FROM foo2")
     endpoint.safe_psql("DROP TABLE IF EXISTS foo3")
+    endpoint.safe_psql("SELECT * FROM foo4")
+
+    # Set the config to false to emulate the case where the config is not persisted when the tenant gets detached/attached.
+    env.pageserver.http_client().update_tenant_config(
+        env.initial_tenant,
+        {
+            "rel_size_v2_enabled": False,
+        },
+    )
+
+    # Check if the relation is still accessible
+    endpoint.safe_psql("SELECT * FROM foo2")
+    endpoint.safe_psql("SELECT * FROM foo4")
+
+    env.pageserver.restart()
+
+    assert (
+        env.pageserver.http_client().timeline_detail(env.initial_tenant, env.initial_timeline)[
+            "rel_size_migration"
+        ]
+        == "migrating"
+    )
diff --git a/test_runner/regress/test_safekeeper_deletion.py b/test_runner/regress/test_safekeeper_deletion.py
new file mode 100644
index 0000000000..b46095d583
--- /dev/null
+++ b/test_runner/regress/test_safekeeper_deletion.py
@@ -0,0 +1,331 @@
+from __future__ import annotations
+
+import threading
+import time
+from contextlib import closing
+from enum import StrEnum
+
+import pytest
+import requests
+from fixtures.common_types import Lsn, TimelineId
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    Endpoint,
+    NeonEnvBuilder,
+)
+from fixtures.remote_storage import S3Storage, s3_storage
+from fixtures.safekeeper_utils import is_segment_offloaded
+from fixtures.utils import wait_until
+
+
+@pytest.mark.parametrize("auth_enabled", [False, True])
+def test_safekeeper_delete_timeline(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
+    neon_env_builder.auth_enabled = auth_enabled
+    env = neon_env_builder.init_start()
+
+    # FIXME: are these expected?
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*Timeline .* was not found in global map.*",
+            ".*Timeline .* was cancelled and cannot be used anymore.*",
+        ]
+    )
+
+    # Create two tenants: one will be deleted, the other should be preserved.
+    tenant_id = env.initial_tenant
+    timeline_id_1 = env.create_branch("br1")  # Active, delete explicitly
+    timeline_id_2 = env.create_branch("br2")  # Inactive, delete explicitly
+    timeline_id_3 = env.create_branch("br3")  # Active, delete with the tenant
+    timeline_id_4 = env.create_branch("br4")  # Inactive, delete with the tenant
+
+    tenant_id_other, timeline_id_other = env.create_tenant()
+
+    # Populate branches
+    endpoint_1 = env.endpoints.create_start("br1")
+    endpoint_2 = env.endpoints.create_start("br2")
+    endpoint_3 = env.endpoints.create_start("br3")
+    endpoint_4 = env.endpoints.create_start("br4")
+    endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other)
+    for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]:
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("CREATE TABLE t(key int primary key)")
+    sk = env.safekeepers[0]
+    sk_data_dir = sk.data_dir
+    if not auth_enabled:
+        sk_http = sk.http_client()
+        sk_http_other = sk_http
+    else:
+        sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
+        sk_http_other = sk.http_client(
+            auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)
+        )
+        sk_http_noauth = sk.http_client(gen_sk_wide_token=False)
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state.
+    endpoint_2.stop_and_destroy()
+    endpoint_4.stop_and_destroy()
+    sk.stop()
+    sk.start()
+
+    # Ensure connections to Safekeeper are established
+    for endpoint in [endpoint_1, endpoint_3, endpoint_other]:
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("INSERT INTO t (key) VALUES (1)")
+
+    # Stop all computes gracefully before safekeepers stop responding to them
+    endpoint_1.stop_and_destroy()
+    endpoint_3.stop_and_destroy()
+
+    # Remove initial tenant's br1 (active)
+    assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
+    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Ensure repeated deletion succeeds
+    assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
+    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    if auth_enabled:
+        # Ensure we cannot delete the other tenant
+        for sk_h in [sk_http, sk_http_noauth]:
+            with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
+                assert sk_h.timeline_delete(tenant_id_other, timeline_id_other)
+            with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
+                assert sk_h.tenant_delete_force(tenant_id_other)
+        assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Remove initial tenant's br2 (inactive)
+    assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"]
+    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
+    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Remove non-existing branch, should succeed
+    assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"]
+    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
+    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists()
+    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Remove initial tenant fully (two branches are active)
+    response = sk_http.tenant_delete_force(tenant_id)
+    assert response[str(timeline_id_3)]["dir_existed"]
+    assert not (sk_data_dir / str(tenant_id)).exists()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Remove initial tenant again.
+    response = sk_http.tenant_delete_force(tenant_id)
+    # assert response == {}
+    assert not (sk_data_dir / str(tenant_id)).exists()
+    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
+
+    # Ensure the other tenant still works
+    sk_http_other.timeline_status(tenant_id_other, timeline_id_other)
+    with closing(endpoint_other.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("INSERT INTO t (key) VALUES (123)")
+
+
+def test_safekeeper_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder):
+    """
+    Test deleting timelines on a safekeeper while they're under load.
+
+    This should not happen under normal operation, but it can happen if
+    there is some rogue compute/pageserver that is writing/reading to a
+    safekeeper that we're migrating a timeline away from, or if the timeline
+    is being deleted while such a rogue client is running.
+    """
+    neon_env_builder.auth_enabled = True
+    env = neon_env_builder.init_start()
+
+    # Create two endpoints that will generate load
+    timeline_id_a = env.create_branch("deleteme_a")
+    timeline_id_b = env.create_branch("deleteme_b")
+
+    endpoint_a = env.endpoints.create("deleteme_a")
+    endpoint_a.start()
+    endpoint_b = env.endpoints.create("deleteme_b")
+    endpoint_b.start()
+
+    # Get tenant and timeline IDs
+    tenant_id = env.initial_tenant
+
+    # Start generating load on both timelines
+    def generate_load(endpoint: Endpoint):
+        with closing(endpoint.connect()) as conn:
+            with conn.cursor() as cur:
+                cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+                while True:
+                    try:
+                        cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'")
+                    except:  # noqa
+                        # Ignore errors since timeline may be deleted
+                        break
+
+    t_a = threading.Thread(target=generate_load, args=(endpoint_a,))
+    t_b = threading.Thread(target=generate_load, args=(endpoint_b,))
+    try:
+        t_a.start()
+        t_b.start()
+
+        # Let the load run for a bit
+        log.info("Warming up...")
+        time.sleep(2)
+
+        # Safekeeper errors will propagate to the pageserver: it is correct that these are
+        # logged at error severity because they indicate the pageserver is trying to read
+        # a timeline that it shouldn't.
+        env.pageserver.allowed_errors.extend(
+            [
+                ".*Timeline.*was cancelled.*",
+                ".*Timeline.*was not found.*",
+            ]
+        )
+
+        # Try deleting timelines while under load
+        sk = env.safekeepers[0]
+        sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
+
+        # Delete first timeline
+        log.info(f"Deleting {timeline_id_a}...")
+        assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"]
+
+        # Delete second timeline
+        log.info(f"Deleting {timeline_id_b}...")
+        assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"]
+
+        # Verify timelines are gone from disk
+        sk_data_dir = sk.data_dir
+        assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists()
+        # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists()
+
+    finally:
+        log.info("Stopping endpoints...")
+        # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang
+        endpoint_a.stop(mode="immediate")
+        endpoint_b.stop(mode="immediate")
+        log.info("Joining threads...")
+        t_a.join()
+        t_b.join()
+
+
+class RemoteDeleteFailpoint(StrEnum):
+    PAUSE = "sk-delete-timeline-remote-pause"
+    FAIL = "sk-delete-timeline-remote"
+
+
+@pytest.mark.parametrize("failpoint", [RemoteDeleteFailpoint.PAUSE, RemoteDeleteFailpoint.FAIL])
+def test_safekeeper_delete_remote_errors(
+    neon_env_builder: NeonEnvBuilder, failpoint: RemoteDeleteFailpoint
+):
+    """
+    Test that errors and delays during remote deletion are handled correctly.
+    """
+
+    # Configure safekeepers with ultra-fast eviction policy
+    neon_env_builder.safekeeper_extra_opts = [
+        "--enable-offload",
+        "--delete-offloaded-wal",
+        "--control-file-save-interval",
+        "1s",
+    ]
+    neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
+    env = neon_env_builder.init_start()
+
+    # FIXME: pageserver is intermittently emitting this
+    env.pageserver.allowed_errors.extend(
+        [
+            ".*unsupported command START_WAL_PUSH in START_WAL_PUSH.*",
+        ]
+    )
+
+    timeline_id_a = env.create_branch("deleteme_a")
+    endpoint_a = env.endpoints.create("deleteme_a")
+    endpoint_a.start()
+    with closing(endpoint_a.connect()) as conn:
+        with conn.cursor() as cur:
+            # roughly fills one segment
+            cur.execute("create table t(key int, value text)")
+            cur.execute("insert into t select generate_series(1,250000), 'payload'")
+    endpoint_a.stop()
+
+    # Ensure something is uploaded to remote storage
+    def assert_is_uploaded():
+        assert is_segment_offloaded(
+            env.safekeepers[0], env.initial_tenant, timeline_id_a, Lsn("0/2000000")
+        )
+
+    wait_until(assert_is_uploaded)
+
+    def list_timeline_remote():
+        assert isinstance(env.safekeepers_remote_storage, S3Storage)
+        prefix = f"{env.safekeepers_remote_storage.safekeeper_timeline_path(env.initial_tenant, timeline_id_a)}/"
+
+        listing = env.safekeepers_remote_storage.client.list_objects_v2(
+            Bucket=env.safekeepers_remote_storage.bucket_name,
+            Prefix=prefix,
+        )
+        return listing.get("Contents", [])
+
+    assert list_timeline_remote() != []
+
+    sk_http = env.safekeepers[0].http_client()
+    env.pageserver.http_client().timeline_delete(env.initial_tenant, timeline_id_a)
+
+    # Set up failpoint
+    if failpoint == RemoteDeleteFailpoint.PAUSE:
+        sk_http.configure_failpoints((failpoint, "pause"))
+    elif failpoint == RemoteDeleteFailpoint.FAIL:
+        sk_http.configure_failpoints((failpoint, "return"))
+    else:
+        raise NotImplementedError(f"Unknown failpoint: {failpoint}")
+
+    # Delete the timeline - this should hit the configured failpoint
+    if failpoint == RemoteDeleteFailpoint.PAUSE:
+        # Expect a timeout
+        with pytest.raises(requests.exceptions.ReadTimeout, match="timed out"):
+            sk_http.timeline_delete(env.initial_tenant, timeline_id_a, timeout=5)
+
+        # Assert deletion didn't happen yet
+        assert list_timeline_remote() != []
+
+        # Unblock the background task that should still be running
+        sk_http.configure_failpoints((failpoint, "off"))
+
+        # Expect that after unblocking, remote deletion proceeds
+        def assert_remote_deleted():
+            assert list_timeline_remote() == []
+
+        wait_until(assert_remote_deleted)
+
+    elif failpoint == RemoteDeleteFailpoint.FAIL:
+        # Expect immediate failure
+        with pytest.raises(sk_http.HTTPError, match="Internal Server Error"):
+            sk_http.timeline_delete(env.initial_tenant, timeline_id_a)
+
+        sk_http.configure_failpoints((failpoint, "off"))
+    else:
+        raise NotImplementedError(f"Unknown failpoint: {failpoint}")
+
+    # Retry should succeed
+    sk_http.timeline_delete(env.initial_tenant, timeline_id_a)
+
+    # Remote storage should be empty
+    assert list_timeline_remote() == []
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index f58bbcd3c0..cb28f5b12d 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1814,14 +1814,3 @@ def test_sharding_gc(
         shard_gc_cutoff_lsn = Lsn(shard_index["metadata_bytes"]["latest_gc_cutoff_lsn"])
         log.info(f"Shard {shard_number} cutoff LSN: {shard_gc_cutoff_lsn}")
         assert shard_gc_cutoff_lsn == shard_0_gc_cutoff_lsn
-
-    for ps in env.pageservers:
-        # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
-        # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
-        # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
-        ps.allowed_errors.extend(
-            [
-                ".*could not find data for key.*",
-                ".*could not ingest record.*",
-            ]
-        )
diff --git a/test_runner/regress/test_signed_char.py b/test_runner/regress/test_signed_char.py
new file mode 100644
index 0000000000..8752a1ff3f
--- /dev/null
+++ b/test_runner/regress/test_signed_char.py
@@ -0,0 +1,64 @@
+from pathlib import Path
+
+from fixtures.neon_fixtures import NeonEnv
+
+SIGNED_CHAR_EXTRACT = """
+    WITH
+    -- Generates an intermediate table with block numbers of the index
+  pagenumbers AS (
+    SELECT num FROM generate_series(0, (pg_relation_size('test_payload_idx') / 8192) - 1) it(num)
+  )
+    SELECT num,
+    -- Gets the data of the page, skipping the first 8 bytes which is the LSN
+    substr(page, 9, 8192-8),
+    -- Returns information about the GIN index opaque area
+    (gin_page_opaque_info(page)).*
+    FROM pagenumbers,
+    -- Gets a page from the respective blocks of the table
+    LATERAL (SELECT get_raw_page('test_payload_idx', num)) AS p(page)
+    -- Filters to only return leaf pages from the GIN Index
+    WHERE ARRAY['leaf'] = ((gin_page_opaque_info(page)).flags);
+    """
+
+
+def test_signed_char(neon_simple_env: NeonEnv):
+    """
+    Test that postgres was compiled with -fsigned-char.
+    ---
+    In multi-character keys, the GIN index creates a CRC hash of the first 3 bytes of the key.
+    The first bit of that hash may be set or unset, so a consistent representation of char
+    across architectures is needed for consistent results. GIN stores these keys by their
+    hashes, which determines the order in which the keys are obtained from the GIN index.
+    Using -fsigned-char enforces this order across platforms, making it consistent.
+    The SIGNED_CHAR_EXTRACT query gets all the data present in the leaf pages of the GIN index,
+    which is ordered by the CRC hash and is consistent across platforms.
+    """
+    env = neon_simple_env
+    endpoint = env.endpoints.create_start("main")
+
+    with endpoint.connect().cursor() as ses1:
+        # Add the required extensions
+        ses1.execute("CREATE EXTENSION pg_trgm;")
+        ses1.execute("CREATE EXTENSION pageinspect;")
+        # Create a test table
+        ses1.execute("CREATE TABLE test (payload text);")
+        # Create a GIN based index
+        ses1.execute(
+            "CREATE INDEX test_payload_idx ON test USING gin (payload gin_trgm_ops) WITH (gin_pending_list_limit = 64);"
+        )
+        # insert a multibyte character to trigger order-dependent hashing
+        ses1.execute(
+            "INSERT INTO test SELECT '123456789BV' || CHR(127153) /* ace of spades, a multibyte character */ || i::text from generate_series(1, 40) as i(i);"
+        )
+        ses1.execute("INSERT INTO test SELECT 'Bóbr';")
+        # Clean pending list to flush data to pages
+        ses1.execute("select gin_clean_pending_list('test_payload_idx'::regclass);")
+        ses1.execute(SIGNED_CHAR_EXTRACT)
+        pages = ses1.fetchall()
+    # Compare expected output
+    page1 = pages[0]
+    data = bytes(page1[1]).hex()
+    with open(Path(__file__).parent / "data" / "test_signed_char.out", encoding="utf-8") as f:
+        expected = f.read().rstrip()
+
+    assert data == expected
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index d44c176b35..0f4e5688a9 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -312,17 +312,6 @@ def test_scrubber_physical_gc_ancestors(neon_env_builder: NeonEnvBuilder, shard_
     drop_local_state(env, tenant_id)
     workload.validate()
 
-    for ps in env.pageservers:
-        # This is not okay, but it's not a scrubber bug: it's a pageserver issue that is exposed by
-        # the specific pattern of aggressive checkpointing+image layer generation + GC that this test does.
-        # TODO: remove when https://github.com/neondatabase/neon/issues/10720 is fixed
-        ps.allowed_errors.extend(
-            [
-                ".*could not find data for key.*",
-                ".*could not ingest record.*",
-            ]
-        )
-
 
 def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder):
     """
diff --git a/test_runner/regress/test_vm_bits.py b/test_runner/regress/test_vm_bits.py
index 4865178ca8..b30c02e0e4 100644
--- a/test_runner/regress/test_vm_bits.py
+++ b/test_runner/regress/test_vm_bits.py
@@ -327,9 +327,9 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
         log.info(f"pgbench run {i+1}/{PGBENCH_RUNS}")
         endpoint.safe_psql(f"create database {dbname}")
         connstr = endpoint.connstr(dbname=dbname)
-        # pgbench -i will automatically vacuum the tables. This creates the visibility map.
-        pg_bin.run(["pgbench", "-i", "-s", "10", connstr])
-        # Freeze the tuples to set the initial frozen bit.
+        # Initialize the data set, but don't vacuum yet.
+        pg_bin.run(["pgbench", "-i", "-s", "8", "-n", connstr])
+        # Vacuum to create the visibility map, and freeze the tuples to set the frozen bit.
         endpoint.safe_psql("vacuum freeze", dbname=dbname)
         # Run pgbench.
         pg_bin.run(["pgbench", "-c", "32", "-j", "8", "-T", "10", connstr])
@@ -354,19 +354,3 @@ def test_check_visibility_map(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             row = cur.fetchone()
             assert row is not None
             assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)"
-
-    # Vacuum and freeze the tables, and check that the visibility map is still accurate.
-    for dbname in dbnames:
-        log.info(f"Vacuuming and checking visibility map for {dbname}")
-        with endpoint.cursor(dbname=dbname) as cur:
-            cur.execute("vacuum freeze")
-
-            cur.execute("select count(*) from pg_check_visible('pgbench_accounts')")
-            row = cur.fetchone()
-            assert row is not None
-            assert row[0] == 0, f"{row[0]} inconsistent VM pages (visible)"
-
-            cur.execute("select count(*) from pg_check_frozen('pgbench_accounts')")
-            row = cur.fetchone()
-            assert row is not None
-            assert row[0] == 0, f"{row[0]} inconsistent VM pages (frozen)"
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 0a05189bfb..0366e88389 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -27,7 +27,6 @@ from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import (
     Endpoint,
     NeonEnvBuilder,
-    NeonPageserver,
     PgBin,
     PgProtocol,
     Safekeeper,
@@ -38,8 +37,6 @@ from fixtures.pageserver.utils import (
     assert_prefix_empty,
     assert_prefix_not_empty,
     timeline_delete_wait_completed,
-    wait_for_last_record_lsn,
-    wait_for_upload,
 )
 from fixtures.pg_version import PgVersion
 from fixtures.port_distributor import PortDistributor
@@ -55,9 +52,16 @@ from fixtures.safekeeper.http import (
     TimelineCreateRequest,
 )
 from fixtures.safekeeper.utils import wait_walreceivers_absent
+from fixtures.safekeeper_utils import (
+    is_flush_lsn_caught_up,
+    is_segment_offloaded,
+    is_wal_trimmed,
+    wait_lsn_force_checkpoint,
+    wait_lsn_force_checkpoint_at,
+    wait_lsn_force_checkpoint_at_sk,
+)
 from fixtures.utils import (
     PropagatingThread,
-    get_dir_size,
     query_scalar,
     run_only_on_default_postgres,
     skip_in_debug_build,
@@ -69,68 +73,6 @@ if TYPE_CHECKING:
     from typing import Any, Self
 
 
-def wait_lsn_force_checkpoint(
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    endpoint: Endpoint,
-    ps: NeonPageserver,
-    pageserver_conn_options=None,
-):
-    pageserver_conn_options = pageserver_conn_options or {}
-    lsn = Lsn(endpoint.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0])
-    log.info(f"pg_current_wal_flush_lsn is {lsn}, waiting for it on pageserver")
-
-    wait_lsn_force_checkpoint_at(lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
-
-
-def wait_lsn_force_checkpoint_at_sk(
-    safekeeper: Safekeeper,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    ps: NeonPageserver,
-    pageserver_conn_options=None,
-):
-    sk_flush_lsn = safekeeper.get_flush_lsn(tenant_id, timeline_id)
-    wait_lsn_force_checkpoint_at(sk_flush_lsn, tenant_id, timeline_id, ps, pageserver_conn_options)
-
-
-def wait_lsn_force_checkpoint_at(
-    lsn: Lsn,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    ps: NeonPageserver,
-    pageserver_conn_options=None,
-):
-    """
-    Wait until pageserver receives given lsn, force checkpoint and wait for
-    upload, i.e. remote_consistent_lsn advancement.
-    """
-    pageserver_conn_options = pageserver_conn_options or {}
-
-    auth_token = None
-    if "password" in pageserver_conn_options:
-        auth_token = pageserver_conn_options["password"]
-
-    # wait for the pageserver to catch up
-    wait_for_last_record_lsn(
-        ps.http_client(auth_token=auth_token),
-        tenant_id,
-        timeline_id,
-        lsn,
-    )
-
-    # force checkpoint to advance remote_consistent_lsn
-    ps.http_client(auth_token).timeline_checkpoint(tenant_id, timeline_id)
-
-    # ensure that remote_consistent_lsn is advanced
-    wait_for_upload(
-        ps.http_client(auth_token=auth_token),
-        tenant_id,
-        timeline_id,
-        lsn,
-    )
-
-
 @dataclass
 class TimelineMetrics:
     timeline_id: TimelineId
@@ -475,31 +417,6 @@ def wait(f, desc, timeout=30, wait_f=None):
             wait_f()
 
 
-def is_segment_offloaded(
-    sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, seg_end: Lsn
-):
-    http_cli = sk.http_client()
-    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(f"sk status is {tli_status}")
-    return tli_status.backup_lsn >= seg_end
-
-
-def is_flush_lsn_caught_up(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, lsn: Lsn):
-    http_cli = sk.http_client()
-    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
-    log.info(f"sk status is {tli_status}")
-    return tli_status.flush_lsn >= lsn
-
-
-def is_wal_trimmed(sk: Safekeeper, tenant_id: TenantId, timeline_id: TimelineId, target_size_mb):
-    http_cli = sk.http_client()
-    tli_status = http_cli.timeline_status(tenant_id, timeline_id)
-    sk_wal_size = get_dir_size(sk.timeline_dir(tenant_id, timeline_id))
-    sk_wal_size_mb = sk_wal_size / 1024 / 1024
-    log.info(f"Safekeeper id={sk.id} wal_size={sk_wal_size_mb:.2f}MB status={tli_status}")
-    return sk_wal_size_mb <= target_size_mb
-
-
 def test_wal_backup(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     remote_storage_kind = s3_storage()
@@ -1685,214 +1602,6 @@ def test_replace_safekeeper(neon_env_builder: NeonEnvBuilder):
     show_statuses(env.safekeepers, tenant_id, timeline_id)
 
 
-@pytest.mark.parametrize("auth_enabled", [False, True])
-def test_delete(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
-    neon_env_builder.auth_enabled = auth_enabled
-    env = neon_env_builder.init_start()
-
-    # FIXME: are these expected?
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*Timeline .* was not found in global map.*",
-            ".*Timeline .* was cancelled and cannot be used anymore.*",
-        ]
-    )
-
-    # Create two tenants: one will be deleted, other should be preserved.
-    tenant_id = env.initial_tenant
-    timeline_id_1 = env.create_branch("br1")  # Active, delete explicitly
-    timeline_id_2 = env.create_branch("br2")  # Inactive, delete explicitly
-    timeline_id_3 = env.create_branch("br3")  # Active, delete with the tenant
-    timeline_id_4 = env.create_branch("br4")  # Inactive, delete with the tenant
-
-    tenant_id_other, timeline_id_other = env.create_tenant()
-
-    # Populate branches
-    endpoint_1 = env.endpoints.create_start("br1")
-    endpoint_2 = env.endpoints.create_start("br2")
-    endpoint_3 = env.endpoints.create_start("br3")
-    endpoint_4 = env.endpoints.create_start("br4")
-    endpoint_other = env.endpoints.create_start("main", tenant_id=tenant_id_other)
-    for endpoint in [endpoint_1, endpoint_2, endpoint_3, endpoint_4, endpoint_other]:
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                cur.execute("CREATE TABLE t(key int primary key)")
-    sk = env.safekeepers[0]
-    sk_data_dir = sk.data_dir
-    if not auth_enabled:
-        sk_http = sk.http_client()
-        sk_http_other = sk_http
-    else:
-        sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
-        sk_http_other = sk.http_client(
-            auth_token=env.auth_keys.generate_tenant_token(tenant_id_other)
-        )
-        sk_http_noauth = sk.http_client(gen_sk_wide_token=False)
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_1)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Stop branches which should be inactive and restart Safekeeper to drop its in-memory state.
-    endpoint_2.stop_and_destroy()
-    endpoint_4.stop_and_destroy()
-    sk.stop()
-    sk.start()
-
-    # Ensure connections to Safekeeper are established
-    for endpoint in [endpoint_1, endpoint_3, endpoint_other]:
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                cur.execute("INSERT INTO t (key) VALUES (1)")
-
-    # Stop all computes gracefully before safekeepers stop responding to them
-    endpoint_1.stop_and_destroy()
-    endpoint_3.stop_and_destroy()
-
-    # Remove initial tenant's br1 (active)
-    assert sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
-    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Ensure repeated deletion succeeds
-    assert not sk_http.timeline_delete(tenant_id, timeline_id_1)["dir_existed"]
-    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_2)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    if auth_enabled:
-        # Ensure we cannot delete the other tenant
-        for sk_h in [sk_http, sk_http_noauth]:
-            with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
-                assert sk_h.timeline_delete(tenant_id_other, timeline_id_other)
-            with pytest.raises(sk_h.HTTPError, match="Forbidden|Unauthorized"):
-                assert sk_h.tenant_delete_force(tenant_id_other)
-        assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Remove initial tenant's br2 (inactive)
-    assert sk_http.timeline_delete(tenant_id, timeline_id_2)["dir_existed"]
-    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
-    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).is_dir()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Remove non-existing branch, should succeed
-    assert not sk_http.timeline_delete(tenant_id, TimelineId("00" * 16))["dir_existed"]
-    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_1)).exists()
-    assert not (sk_data_dir / str(tenant_id) / str(timeline_id_2)).exists()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_3)).exists()
-    assert (sk_data_dir / str(tenant_id) / str(timeline_id_4)).is_dir()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Remove initial tenant fully (two branches are active)
-    response = sk_http.tenant_delete_force(tenant_id)
-    assert response[str(timeline_id_3)]["dir_existed"]
-    assert not (sk_data_dir / str(tenant_id)).exists()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Remove initial tenant again.
-    response = sk_http.tenant_delete_force(tenant_id)
-    # assert response == {}
-    assert not (sk_data_dir / str(tenant_id)).exists()
-    assert (sk_data_dir / str(tenant_id_other) / str(timeline_id_other)).is_dir()
-
-    # Ensure the other tenant still works
-    sk_http_other.timeline_status(tenant_id_other, timeline_id_other)
-    with closing(endpoint_other.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("INSERT INTO t (key) VALUES (123)")
-
-
-def test_delete_timeline_under_load(neon_env_builder: NeonEnvBuilder):
-    """
-    Test deleting timelines on a safekeeper while they're under load.
-
-    This should not happen under normal operation, but it can happen if
-    there is some rogue compute/pageserver that is writing/reading to a
-    safekeeper that we're migrating a timeline away from, or if the timeline
-    is being deleted while such a rogue client is running.
-    """
-    neon_env_builder.auth_enabled = True
-    env = neon_env_builder.init_start()
-
-    # Create two endpoints that will generate load
-    timeline_id_a = env.create_branch("deleteme_a")
-    timeline_id_b = env.create_branch("deleteme_b")
-
-    endpoint_a = env.endpoints.create("deleteme_a")
-    endpoint_a.start()
-    endpoint_b = env.endpoints.create("deleteme_b")
-    endpoint_b.start()
-
-    # Get tenant and timeline IDs
-    tenant_id = env.initial_tenant
-
-    # Start generating load on both timelines
-    def generate_load(endpoint: Endpoint):
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                cur.execute("CREATE TABLE IF NOT EXISTS t(key int, value text)")
-                while True:
-                    try:
-                        cur.execute("INSERT INTO t SELECT generate_series(1,1000), 'data'")
-                    except:  # noqa
-                        # Ignore errors since timeline may be deleted
-                        break
-
-    t_a = threading.Thread(target=generate_load, args=(endpoint_a,))
-    t_b = threading.Thread(target=generate_load, args=(endpoint_b,))
-    try:
-        t_a.start()
-        t_b.start()
-
-        # Let the load run for a bit
-        log.info("Warming up...")
-        time.sleep(2)
-
-        # Safekeeper errors will propagate to the pageserver: it is correct that these are
-        # logged at error severity because they indicate the pageserver is trying to read
-        # a timeline that it shouldn't.
-        env.pageserver.allowed_errors.extend(
-            [
-                ".*Timeline.*was cancelled.*",
-                ".*Timeline.*was not found.*",
-            ]
-        )
-
-        # Try deleting timelines while under load
-        sk = env.safekeepers[0]
-        sk_http = sk.http_client(auth_token=env.auth_keys.generate_tenant_token(tenant_id))
-
-        # Delete first timeline
-        log.info(f"Deleting {timeline_id_a}...")
-        assert sk_http.timeline_delete(tenant_id, timeline_id_a, only_local=True)["dir_existed"]
-
-        # Delete second timeline
-        log.info(f"Deleting {timeline_id_b}...")
-        assert sk_http.timeline_delete(tenant_id, timeline_id_b, only_local=True)["dir_existed"]
-
-        # Verify timelines are gone from disk
-        sk_data_dir = sk.data_dir
-        assert not (sk_data_dir / str(tenant_id) / str(timeline_id_a)).exists()
-        # assert not (sk_data_dir / str(tenant_id) / str(timeline_id_b)).exists()
-
-    finally:
-        log.info("Stopping endpoints...")
-        # Stop endpoints with immediate mode because we deleted the timeline out from under the compute, which may cause it to hang
-        endpoint_a.stop(mode="immediate")
-        endpoint_b.stop(mode="immediate")
-        log.info("Joining threads...")
-        t_a.join()
-        t_b.join()
-
-
 # Basic pull_timeline test.
 # When live_sk_change is False, compute is restarted to change set of
 # safekeepers; otherwise it is live reload.
@@ -2281,6 +1990,54 @@ def test_membership_api(neon_env_builder: NeonEnvBuilder):
         http_cli.timeline_status(tenant_id, timeline_id)
 
 
+def test_explicit_timeline_creation(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that a neon.safekeepers value starting with g#n: with non-zero n enables
+    generations, which as a side effect disables automatic timeline creation.
+
+    This is a kind of bootstrapping test: here the membership conf and timeline
+    are created manually; later storcon will do that.
+    """
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    ps = env.pageservers[0]
+    ps_http_cli = ps.http_client()
+
+    http_clis = [sk.http_client() for sk in env.safekeepers]
+
+    config_lines = [
+        "neon.safekeeper_proto_version = 3",
+    ]
+    ep = env.endpoints.create("main", config_lines=config_lines)
+
+    # expected to fail because timeline is not created on safekeepers
+    with pytest.raises(Exception, match=r".*timed out.*"):
+        ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3], timeout="2s")
+    # figure out initial LSN.
+    ps_timeline_detail = ps_http_cli.timeline_detail(tenant_id, timeline_id)
+    init_lsn = ps_timeline_detail["last_record_lsn"]
+    log.info(f"initial LSN: {init_lsn}")
+    # sk timeline creation request expects minor version
+    pg_version = ps_timeline_detail["pg_version"] * 10000
+    # create initial mconf
+    sk_ids = [SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only) for sk in env.safekeepers]
+    mconf = Configuration(generation=1, members=sk_ids, new_members=None)
+    create_r = TimelineCreateRequest(
+        tenant_id, timeline_id, mconf, pg_version, Lsn(init_lsn), commit_lsn=None
+    )
+    log.info(f"sending timeline create: {create_r.to_json()}")
+
+    for sk_http_cli in http_clis:
+        sk_http_cli.timeline_create(create_r)
+    # Once the timeline is created, the endpoint should start.
+    ep.start(safekeeper_generation=1, safekeepers=[1, 2, 3])
+    ep.safe_psql("CREATE TABLE IF NOT EXISTS t(key int, value text)")
+
+
 # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries
 # when compute is active, but there are no writes to the timeline. In that case
 # pageserver should maintain a single connection to safekeeper and don't attempt
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 6254ab9b44..b1425505c6 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 6254ab9b4496c3e481bc037ae69d859bbc2bdd7d
+Subproject commit b1425505c6f9a622a5aadf3ee362740519993310
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 9b118b1cff..533be42f7d 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 9b118b1cffa6e4ca0d63389b57b54d11e207e9a8
+Subproject commit 533be42f7da97e614ce1c494fafe3e49f53991b1
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 799e7a08dd..78050f965f 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 799e7a08dd171aa06a7395dd326f4243aaeb9f93
+Subproject commit 78050f965f2e550fd6e58f837394cb3d080d7d42
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 517b8dc244..780efda2ef 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 517b8dc244abf3e56f0089849e464af76f70b94e
+Subproject commit 780efda2ef8d629495cc289624534ba8cde40779
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 8dde46a01e..1a811cfa3d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.4",
-    "517b8dc244abf3e56f0089849e464af76f70b94e"
+    "780efda2ef8d629495cc289624534ba8cde40779"
   ],
   "v16": [
     "16.8",
-    "799e7a08dd171aa06a7395dd326f4243aaeb9f93"
+    "78050f965f2e550fd6e58f837394cb3d080d7d42"
   ],
   "v15": [
     "15.12",
-    "9b118b1cffa6e4ca0d63389b57b54d11e207e9a8"
+    "533be42f7da97e614ce1c494fafe3e49f53991b1"
   ],
   "v14": [
     "14.17",
-    "6254ab9b4496c3e481bc037ae69d859bbc2bdd7d"
+    "b1425505c6f9a622a5aadf3ee362740519993310"
   ]
 }
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 1b7c376560..183cc66ab9 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -18,7 +18,7 @@ license.workspace = true
 ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
 base64-594e8ee84c453af0 = { package = "base64", version = "0.13", features = ["alloc"] }
-base64-647d43efb71741da = { package = "base64", version = "0.21", features = ["alloc"] }
+base64-647d43efb71741da = { package = "base64", version = "0.21" }
 base64ct = { version = "1", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
 camino = { version = "1", default-features = false, features = ["serde1"] }