add target arch

add amd64 suffix
feat: update vm-builder for TLS support
2026-02-12 15:10:38 +00:00 · 2025-02-21 15:52:30 +00:00 · 2025-02-21 15:18:38 +00:00 · 2025-02-21 14:37:00 +00:00 · 2025-02-21 11:02:02 +00:00 · 2025-02-21 11:01:57 +00:00
182 changed files with 5964 additions and 1690 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -28,3 +28,7 @@ config-variables:
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
  - SLACK_CICD_CHANNEL_ID
+  - SLACK_STORAGE_CHANNEL_ID
+  - NEON_DEV_AWS_ACCOUNT_ID
+  - NEON_PROD_AWS_ACCOUNT_ID
+  - AWS_ECR_REGION
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -19,7 +19,11 @@ inputs:
    default: '[1, 1]'
  # settings below only needed if you want the project to be sharded from the beginning
  shard_split_project:
-    description: 'by default new projects are not shard-split, specify true to shard-split'
+    description: 'by default new projects are not shard-split initiailly, but only when shard-split threshold is reached, specify true to explicitly shard-split initially'
+    required: false
+    default: 'false'
+  disable_sharding:
+    description: 'by default new projects use storage controller default policy to shard-split when shard-split threshold is reached, specify true to explicitly disable sharding'
    required: false
    default: 'false'
  admin_api_key:
@@ -107,6 +111,21 @@ runs:
            -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
            -d "{\"new_shard_count\": $SHARD_COUNT, \"new_stripe_size\": $STRIPE_SIZE}"
        fi
+        if [ "${DISABLE_SHARDING}" = "true" ]; then
+          # determine tenant ID
+          TENANT_ID=`${PSQL} ${dsn} -t -A -c "SHOW neon.tenant_id"`
+
+          echo "Explicitly disabling shard-splitting for project ${project_id} with tenant_id ${TENANT_ID}"
+
+          echo "Sending PUT request to https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy"
+          echo "with body {\"scheduling\": \"Essential\"}"
+
+          # we need an ADMIN API KEY to invoke storage controller API for shard splitting (bash -u above checks that the variable is set)
+          curl -X PUT \
+            "https://${API_HOST}/regions/${REGION_ID}/api/v1/admin/storage/proxy/control/v1/tenant/${TENANT_ID}/policy" \
+            -H "Accept: application/json" -H "Content-Type: application/json" -H "Authorization: Bearer ${ADMIN_API_KEY}" \
+            -d "{\"scheduling\": \"Essential\"}"
+        fi

      env:
        API_HOST: ${{ inputs.api_host }}
@@ -116,6 +135,7 @@ runs:
        MIN_CU: ${{ fromJSON(inputs.compute_units)[0] }}
        MAX_CU: ${{ fromJSON(inputs.compute_units)[1] }}
        SHARD_SPLIT_PROJECT: ${{ inputs.shard_split_project }}
+        DISABLE_SHARDING: ${{ inputs.disable_sharding }}
        ADMIN_API_KEY: ${{ inputs.admin_api_key }}
        SHARD_COUNT: ${{ inputs.shard_count }}
        STRIPE_SIZE: ${{ inputs.stripe_size }}
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -348,6 +348,10 @@ jobs:
          rerun_failed: true
          pg_version: ${{ matrix.pg_version }}
          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+          # `--session-timeout` is equal to (timeout-minutes - 10 minutes) * 60 seconds.
+          # Attempt to stop tests gracefully to generate test reports
+          # until they are forcibly stopped by the stricter `timeout-minutes` limit.
+          extra_params: --session-timeout=${{ inputs.sanitizers != 'enabled' && 3000 || 10200 }}
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
--- a/.github/workflows/_push-to-container-registry.yml
+++ b/.github/workflows/_push-to-container-registry.yml
@@ -2,7 +2,7 @@ name: Push images to Container Registry
 on:
  workflow_call:
    inputs:
-      # Example: {"docker.io/neondatabase/neon:13196061314":["369495373322.dkr.ecr.eu-central-1.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
+      # Example: {"docker.io/neondatabase/neon:13196061314":["${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/neon:13196061314","neoneastus2.azurecr.io/neondatabase/neon:13196061314"]}
      image-map:
        description: JSON map of images, mapping from a source image to an array of target images that should be pushed.
        required: true
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -68,7 +68,7 @@ jobs:
  tag:
    needs: [ check-permissions ]
    runs-on: [ self-hosted, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

@@ -263,8 +263,9 @@ jobs:
          echo "json=$(jq --compact-output '.' /tmp/benchmark_durations.json)" >> $GITHUB_OUTPUT

  benchmarks:
-    if: github.ref_name == 'main' || contains(github.event.pull_request.labels.*.name, 'run-benchmarks')
-    needs: [ check-permissions, build-and-test-locally, build-build-tools-image, get-benchmarks-durations ]
+    # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `deploy` in PRs
+    if: github.ref_name == 'main' || (contains(github.event.pull_request.labels.*.name, 'run-benchmarks') && !failure() && !cancelled())
+    needs: [ check-permissions, build-build-tools-image, get-benchmarks-durations, deploy ]
    permissions:
      id-token: write # aws-actions/configure-aws-credentials
      statuses: write
@@ -721,14 +722,14 @@ jobs:
          - pg: v17
            debian: bookworm
    env:
-      VM_BUILDER_VERSION: v0.37.1
+      VM_BUILDER_VERSION: v0.42.1

    steps:
      - uses: actions/checkout@v4

      - name: Downloading vm-builder
        run: |
-          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
+          curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder-amd64 -o vm-builder
          chmod +x vm-builder

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
@@ -749,7 +750,8 @@ jobs:
            -size=2G \
            -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \
            -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
-            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}
+            -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \
+            -target-arch linux/amd64

      - name: Pushing vm-compute-node image
        run: |
@@ -858,14 +860,17 @@ jobs:
          BRANCH: "${{ github.ref_name }}"
          DEV_ACR: "${{ vars.AZURE_DEV_REGISTRY_NAME }}"
          PROD_ACR: "${{ vars.AZURE_PROD_REGISTRY_NAME }}"
+          DEV_AWS: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
+          PROD_AWS: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
+          AWS_REGION: "${{ vars.AWS_ECR_REGION }}"

  push-neon-image-dev:
    needs: [ generate-image-maps, neon-image ]
    uses: ./.github/workflows/_push-to-container-registry.yml
    with:
      image-map: '${{ needs.generate-image-maps.outputs.neon-dev }}'
-      aws-region: eu-central-1
-      aws-account-ids: "369495373322"
+      aws-region: ${{ vars.AWS_ECR_REGION }}
+      aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
      azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
      azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
      azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -880,8 +885,8 @@ jobs:
    uses: ./.github/workflows/_push-to-container-registry.yml
    with:
      image-map: '${{ needs.generate-image-maps.outputs.compute-dev }}'
-      aws-region: eu-central-1
-      aws-account-ids: "369495373322"
+      aws-region: ${{ vars.AWS_ECR_REGION }}
+      aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
      azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
      azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
      azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -892,13 +897,13 @@ jobs:
      docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

  push-neon-image-prod:
-    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
+    if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
    needs: [ generate-image-maps, neon-image, test-images ]
    uses: ./.github/workflows/_push-to-container-registry.yml
    with:
      image-map: '${{ needs.generate-image-maps.outputs.neon-prod }}'
-      aws-region: eu-central-1
-      aws-account-ids: "093970136003"
+      aws-region: ${{ vars.AWS_ECR_REGION }}
+      aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
      azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
      azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
      azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -909,13 +914,13 @@ jobs:
      docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

  push-compute-image-prod:
-    if: github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
+    if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute'
    needs: [ generate-image-maps, vm-compute-node-image, test-images ]
    uses: ./.github/workflows/_push-to-container-registry.yml
    with:
      image-map: '${{ needs.generate-image-maps.outputs.compute-prod }}'
-      aws-region: eu-central-1
-      aws-account-ids: "093970136003"
+      aws-region: ${{ vars.AWS_ECR_REGION }}
+      aws-account-ids: "${{ vars.NEON_PROD_AWS_ACCOUNT_ID }}"
      azure-client-id: ${{ vars.AZURE_PROD_CLIENT_ID }}
      azure-subscription-id: ${{ vars.AZURE_PROD_SUBSCRIPTION_ID }}
      azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
@@ -1028,7 +1033,7 @@ jobs:
      statuses: write
      contents: write
    runs-on: [ self-hosted, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
+    container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/ansible:latest
    steps:
      - uses: actions/checkout@v4

@@ -1177,6 +1182,22 @@ jobs:
            exit 1
          fi

+  notify-storage-release-deploy-failure:
+    needs: [ deploy ]
+    # We want this to run even if (transitive) dependencies are skipped, because deploy should really be successful on release branch workflow runs.
+    if: github.ref_name == 'release' && needs.deploy.result != 'success' && always()
+    runs-on: ubuntu-22.04
+    steps:
+      - name: Post release-deploy failure to team-storage slack channel
+        uses: slackapi/slack-github-action@v2
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_STORAGE_CHANNEL_ID }}
+            text: |
+              🔴 @oncall-storage: deploy job on release branch had unexpected status "${{ needs.deploy.result }}" <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>.
+
  # The job runs on `release` branch and copies compatibility data and Neon artifact from the last *release PR* to the latest directory
  promote-compatibility-data:
    needs: [ deploy ]
@@ -1273,7 +1294,7 @@ jobs:
          done

  pin-build-tools-image:
-    needs: [ build-build-tools-image, push-compute-image-prod, push-neon-image-prod, build-and-test-locally ]
+    needs: [ build-build-tools-image, test-images, build-and-test-locally ]
    if: github.ref_name == 'main'
    uses: ./.github/workflows/pin-build-tools-image.yml
    with:
--- a/.github/workflows/build_and_test_with_sanitizers.yml
+++ b/.github/workflows/build_and_test_with_sanitizers.yml
@@ -27,7 +27,7 @@ env:
 jobs:
  tag:
    runs-on: [ self-hosted, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
+    container: ${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/base:pinned
    outputs:
      build-tag: ${{steps.build-tag.outputs.tag}}

--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -0,0 +1,76 @@
+name: Force Test Upgrading of Extension
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │ ┌───────────── hour (0 - 23)
+    #          │ │ ┌───────────── day of the month (1 - 31)
+    #          │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '45 2 * * *' # run once a day, timezone is utc
+  workflow_dispatch: # adds ability to run this manually
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+concurrency:
+  # Allow only one workflow
+  group: ${{ github.workflow }}
+  cancel-in-progress: true
+
+permissions:
+  id-token: write # aws-actions/configure-aws-credentials
+  statuses: write
+  contents: read
+
+jobs:
+  regress:
+    strategy:
+      fail-fast: false
+      matrix:
+        pg-version: [16, 17]
+
+    runs-on: small
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: false
+
+      - name: Get the last compute release tag
+        id: get-last-compute-release-tag
+        env:
+          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        run: |
+          tag=$(gh api -q '[.[].tag_name | select(startswith("release-compute"))][0]'\
+            -H "Accept: application/vnd.github+json" \
+            -H "X-GitHub-Api-Version: 2022-11-28" \
+            "/repos/${GITHUB_REPOSITORY}/releases")
+          echo tag=${tag} >> ${GITHUB_OUTPUT}
+
+      - name: Test extension upgrade
+        timeout-minutes: 20
+        env:
+          NEWTAG: latest
+          OLDTAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
+          PG_VERSION: ${{ matrix.pg-version }}
+          FORCE_ALL_UPGRADE_TESTS: true
+        run: ./docker-compose/test_extensions_upgrade.sh
+
+      - name: Print logs and clean up
+        if: always()
+        run: |
+          docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml logs || true
+          docker compose --profile test-extensions -f ./docker-compose/docker-compose.yml down
+
+      - name: Post to the Slack channel
+        if: ${{ github.event.schedule && failure() }}
+        uses: slackapi/slack-github-action@v1
+        with:
+          channel-id: ${{ vars.SLACK_ON_CALL_QA_STAGING_STREAM }}
+          slack-message: |
+            Test upgrading of extensions: ${{ job.status }}
+            <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+        env:
+          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
--- a/.github/workflows/ingest_benchmark.yml
+++ b/.github/workflows/ingest_benchmark.yml
@@ -32,18 +32,27 @@ jobs:
          - target_project: new_empty_project_stripe_size_2048 
            stripe_size: 2048 # 16 MiB
            postgres_version: 16
+            disable_sharding: false
          - target_project: new_empty_project_stripe_size_32768
            stripe_size: 32768 # 256 MiB # note that this is different from null because using null will shard_split the project only if it reaches the threshold
                               # while here it is sharded from the beginning with a shard size of 256 MiB
+            disable_sharding: false
            postgres_version: 16
          - target_project: new_empty_project
            stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
+            disable_sharding: false
            postgres_version: 16
          - target_project: new_empty_project
            stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
+            disable_sharding: false
            postgres_version: 17
          - target_project: large_existing_project
            stripe_size: null # cannot re-shared or choose different stripe size for existing, already sharded project
+            disable_sharding: false
+            postgres_version: 16
+          - target_project: new_empty_project_unsharded
+            stripe_size: null # run with neon defaults which will shard split only when reaching the threshold
+            disable_sharding: true
            postgres_version: 16
      max-parallel: 1 # we want to run each stripe size sequentially to be able to compare the results
    permissions:
@@ -96,6 +105,7 @@ jobs:
        admin_api_key: ${{ secrets.NEON_STAGING_ADMIN_API_KEY }} 
        shard_count: 8
        stripe_size: ${{ matrix.stripe_size }}
+        disable_sharding: ${{ matrix.disable_sharding }} 

    - name: Initialize Neon project
      if: ${{ startsWith(matrix.target_project, 'new_empty_project') }}
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -33,10 +33,6 @@ concurrency:
 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}

-env:
-  FROM_TAG: ${{ inputs.from-tag }}
-  TO_TAG: pinned
-
 jobs:
  check-manifests:
    runs-on: ubuntu-22.04
@@ -46,11 +42,14 @@ jobs:
    steps:
      - name: Check if we really need to pin the image
        id: check-manifests
+        env:
+          FROM_TAG: ${{ inputs.from-tag }}
+          TO_TAG: pinned
        run: |
-          docker manifest inspect neondatabase/build-tools:${FROM_TAG} > ${FROM_TAG}.json
-          docker manifest inspect neondatabase/build-tools:${TO_TAG}   > ${TO_TAG}.json
+          docker manifest inspect "docker.io/neondatabase/build-tools:${FROM_TAG}" > "${FROM_TAG}.json"
+          docker manifest inspect "docker.io/neondatabase/build-tools:${TO_TAG}"   > "${TO_TAG}.json"

-          if diff ${FROM_TAG}.json ${TO_TAG}.json; then
+          if diff "${FROM_TAG}.json" "${TO_TAG}.json"; then
            skip=true
          else
            skip=false
@@ -64,55 +63,34 @@ jobs:
    # use format(..) to catch both inputs.force = true AND inputs.force = 'true'
    if: needs.check-manifests.outputs.skip == 'false' || format('{0}', inputs.force) == 'true'

-    runs-on: ubuntu-22.04
-
    permissions:
-      id-token: write # for `azure/login` and aws auth
+      id-token: write  # Required for aws/azure login

-    steps:
-      - uses: docker/login-action@v3
-        with:
-          username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
-          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
-
-      - name: Configure AWS credentials
-        uses: aws-actions/configure-aws-credentials@v4
-        with:
-          aws-region: eu-central-1
-          role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
-          role-duration-seconds: 3600
-
-      - name: Login to Amazon Dev ECR
-        uses: aws-actions/amazon-ecr-login@v2
-
-      - name: Azure login
-        uses: azure/login@6c251865b4e6290e7b78be643ea2d005bc51f69a  # @v2.1.1
-        with:
-          client-id: ${{ secrets.AZURE_DEV_CLIENT_ID }}
-          tenant-id: ${{ secrets.AZURE_TENANT_ID }}
-          subscription-id: ${{ secrets.AZURE_DEV_SUBSCRIPTION_ID }}
-
-      - name: Login to ACR
-        run: |
-          az acr login --name=neoneastus2
-
-      - name: Tag build-tools with `${{ env.TO_TAG }}` in Docker Hub, ECR, and ACR
-        env:
-          DEFAULT_DEBIAN_VERSION: bookworm
-        run: |
-          for debian_version in bullseye bookworm; do
-            tags=()
-
-            tags+=("-t" "neondatabase/build-tools:${TO_TAG}-${debian_version}")
-            tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}-${debian_version}")
-            tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}-${debian_version}")
-
-            if [ "${debian_version}" == "${DEFAULT_DEBIAN_VERSION}" ]; then
-              tags+=("-t" "neondatabase/build-tools:${TO_TAG}")
-              tags+=("-t" "369495373322.dkr.ecr.eu-central-1.amazonaws.com/build-tools:${TO_TAG}")
-              tags+=("-t" "neoneastus2.azurecr.io/neondatabase/build-tools:${TO_TAG}")
-            fi
-
-            docker buildx imagetools create "${tags[@]}" \
-                                              neondatabase/build-tools:${FROM_TAG}-${debian_version}
-          done
+    uses: ./.github/workflows/_push-to-container-registry.yml
+    with:
+      image-map: |
+        {
+          "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bullseye": [
+            "docker.io/neondatabase/build-tools:pinned-bullseye",
+            "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bullseye",
+            "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bullseye"
+          ],
+          "docker.io/neondatabase/build-tools:${{ inputs.from-tag }}-bookworm": [
+            "docker.io/neondatabase/build-tools:pinned-bookworm",
+            "docker.io/neondatabase/build-tools:pinned",
+            "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned-bookworm",
+            "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}.dkr.ecr.${{ vars.AWS_ECR_REGION }}.amazonaws.com/build-tools:pinned",
+            "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned-bookworm",
+            "${{ vars.AZURE_DEV_REGISTRY_NAME }}.azurecr.io/neondatabase/build-tools:pinned"
+          ]
+        }
+      aws-region: ${{ vars.AWS_ECR_REGION }}
+      aws-account-ids: "${{ vars.NEON_DEV_AWS_ACCOUNT_ID }}"
+      azure-client-id: ${{ vars.AZURE_DEV_CLIENT_ID }}
+      azure-subscription-id: ${{ vars.AZURE_DEV_SUBSCRIPTION_ID }}
+      azure-tenant-id: ${{ vars.AZURE_TENANT_ID }}
+      acr-registry-name: ${{ vars.AZURE_DEV_REGISTRY_NAME }}
+    secrets:
+      aws-role-to-assume: "${{ vars.DEV_AWS_OIDC_ROLE_ARN }}"
+      docker-hub-username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+      docker-hub-password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
--- a/.github/workflows/regenerate-pg-setting.yml
+++ b/.github/workflows/regenerate-pg-setting.yml
@@ -0,0 +1,41 @@
+name: Regenerate Postgres Settings
+
+on:
+  pull_request:
+    types:
+      - opened
+      - synchronize
+      - reopened
+    paths:
+      - pgxn/neon/**.c
+      - vendor/postgres-v*
+      - vendor/revisions.json
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref }}
+  cancel-in-progress: true
+
+permissions:
+  pull-requests: write
+
+jobs:
+  regenerate-pg-settings:
+    runs-on: ubuntu-22.04
+
+    steps:
+      - name: Add comment
+        uses: thollander/actions-comment-pull-request@v3
+        with:
+          comment-tag: ${{ github.job }}
+          pr-number: ${{ github.event.number }}
+          message: |
+            If this PR added a GUC in the Postgres fork or `neon` extension,
+            please regenerate the Postgres settings in the `cloud` repo:
+
+            ```
+            make NEON_WORKDIR=path/to/neon/checkout \
+              -C goapp/internal/shareddomain/postgres generate
+            ```
+
+            If you're an external contributor, a Neon employee will assist in
+            making sure this step is done.
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -88,7 +88,7 @@ jobs:
          BUILD_AND_TEST_RUN_ID=${TAG}
          while true; do
            gh run --repo ${GITHUB_REPOSITORY} view ${BUILD_AND_TEST_RUN_ID} --json jobs --jq '[.jobs[] | select((.name | startswith("push-neon-image-dev")) or (.name | startswith("push-compute-image-dev"))) | {"name": .name, "conclusion": .conclusion, "url": .url}]' > jobs.json
-            if [ $(jq '[.[] | select(.conclusion == "success")]' jobs.json) -eq 2 ]; then
+            if [ $(jq '[.[] | select(.conclusion == "success")] | length' jobs.json) -eq 2 ]; then
              break
            fi
            jq -c '.[]' jobs.json | while read -r job; do
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -786,7 +786,7 @@ dependencies = [
 [[package]]
 name = "azure_core"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "async-trait",
 "base64 0.22.1",
@@ -815,7 +815,7 @@ dependencies = [
 [[package]]
 name = "azure_identity"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "async-lock",
 "async-trait",
@@ -834,7 +834,7 @@ dependencies = [
 [[package]]
 name = "azure_storage"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "RustyXML",
 "async-lock",
@@ -852,7 +852,7 @@ dependencies = [
 [[package]]
 name = "azure_storage_blobs"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "RustyXML",
 "azure_core",
@@ -872,7 +872,7 @@ dependencies = [
 [[package]]
 name = "azure_svc_blobstorage"
 version = "0.21.0"
-source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#c36ed4c039bb3d59b5a1705f2cc337636c73b541"
+source = "git+https://github.com/neondatabase/azure-sdk-for-rust.git?branch=neon#f64bd57262ced51afce5d8909c06dcb11a6dd85a"
 dependencies = [
 "azure_core",
 "bytes",
@@ -1029,12 +1029,6 @@ dependencies = [
 "generic-array",
 ]

-[[package]]
-name = "boxcar"
-version = "0.2.8"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2721c3c5a6f0e7f7e607125d963fedeb765f545f67adc9d71ed934693881eb42"
-
 [[package]]
 name = "bstr"
 version = "1.5.0"
@@ -1293,6 +1287,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "chrono",
+ "jsonwebtoken",
 "regex",
 "remote_storage",
 "serde",
@@ -1308,6 +1303,7 @@ dependencies = [
 "aws-config",
 "aws-sdk-kms",
 "aws-sdk-s3",
+ "aws-smithy-types",
 "axum",
 "base64 0.13.1",
 "bytes",
@@ -1329,7 +1325,6 @@ dependencies = [
 "opentelemetry_sdk",
 "postgres",
 "postgres_initdb",
- "prometheus",
 "regex",
 "remote_storage",
 "reqwest",
@@ -1348,13 +1343,13 @@ dependencies = [
 "tower 0.5.2",
 "tower-http",
 "tracing",
- "tracing-opentelemetry",
 "tracing-subscriber",
 "tracing-utils",
 "url",
 "utils",
 "uuid",
 "vm_monitor",
+ "walkdir",
 "workspace_hack",
 "zstd",
 ]
@@ -4927,7 +4922,6 @@ dependencies = [
 "aws-sdk-iam",
 "aws-sigv4",
 "base64 0.13.1",
- "boxcar",
 "bstr",
 "bytes",
 "camino",
@@ -4979,7 +4973,6 @@ dependencies = [
 "postgres-protocol2",
 "postgres_backend",
 "pq_proto",
- "prometheus",
 "rand 0.8.5",
 "rand_distr",
 "rcgen",
@@ -5004,7 +4997,6 @@ dependencies = [
 "smallvec",
 "smol_str",
 "socket2",
- "strum",
 "strum_macros",
 "subtle",
 "thiserror 1.0.69",
@@ -5019,7 +5011,6 @@ dependencies = [
 "tracing",
 "tracing-log",
 "tracing-opentelemetry",
- "tracing-serde",
 "tracing-subscriber",
 "tracing-utils",
 "try-lock",
@@ -6460,10 +6451,13 @@ dependencies = [
 "pageserver_client",
 "postgres_connection",
 "rand 0.8.5",
+ "regex",
 "reqwest",
 "routerify",
 "rustls 0.23.18",
 "rustls-native-certs 0.8.0",
+ "safekeeper_api",
+ "safekeeper_client",
 "scoped-futures",
 "scopeguard",
 "serde",
@@ -6471,6 +6465,7 @@ dependencies = [
 "strum",
 "strum_macros",
 "thiserror 1.0.69",
+ "tikv-jemallocator",
 "tokio",
 "tokio-postgres",
 "tokio-postgres-rustls",
@@ -7024,14 +7019,11 @@ dependencies = [
 name = "tokio-postgres2"
 version = "0.1.0"
 dependencies = [
- "async-trait",
- "byteorder",
 "bytes",
 "fallible-iterator",
 "futures-util",
 "log",
 "parking_lot 0.12.1",
- "percent-encoding",
 "phf",
 "pin-project-lite",
 "postgres-protocol2",
@@ -7618,13 +7610,13 @@ dependencies = [
 "hex",
 "hex-literal",
 "humantime",
- "inferno 0.12.0",
 "jsonwebtoken",
 "metrics",
 "nix 0.27.1",
 "once_cell",
 "pin-project-lite",
 "postgres_connection",
+ "pprof",
 "pq_proto",
 "rand 0.8.5",
 "regex",
--- a/16
+++ b/16
@@ -50,6 +50,14 @@ RUN set -e \
    && rm -rf pg_install/build \
    && tar -C pg_install -czf /home/nonroot/postgres_install.tar.gz .

+# Prepare cargo-chef recipe
+FROM $REPOSITORY/$IMAGE:$TAG AS plan
+WORKDIR /home/nonroot
+
+COPY --chown=nonroot . .
+
+RUN cargo chef prepare --recipe-path recipe.json
+
 # Build neon binaries
 FROM $REPOSITORY/$IMAGE:$TAG AS build
 WORKDIR /home/nonroot
@@ -63,9 +71,15 @@ COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_i
 COPY --from=pg-build /home/nonroot/pg_install/v17/include/postgresql/server pg_install/v17/include/postgresql/server
 COPY --from=pg-build /home/nonroot/pg_install/v16/lib                       pg_install/v16/lib
 COPY --from=pg-build /home/nonroot/pg_install/v17/lib                       pg_install/v17/lib
+COPY --from=plan     /home/nonroot/recipe.json                              recipe.json
+
+ARG ADDITIONAL_RUSTFLAGS=""
+
+RUN set -e \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo chef cook --locked --release --recipe-path recipe.json
+
 COPY --chown=nonroot . .

-ARG ADDITIONAL_RUSTFLAGS
 RUN set -e \
    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment -Cforce-frame-pointers=yes ${ADDITIONAL_RUSTFLAGS}" cargo build \
      --bin pg_sni_router  \
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.84.1
+ENV RUSTC_VERSION=1.85.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
@@ -300,6 +300,7 @@ ARG CARGO_HAKARI_VERSION=0.9.33
 ARG CARGO_DENY_VERSION=0.16.2
 ARG CARGO_HACK_VERSION=0.6.33
 ARG CARGO_NEXTEST_VERSION=0.9.85
+ARG CARGO_CHEF_VERSION=0.1.71
 ARG CARGO_DIESEL_CLI_VERSION=2.2.6
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
@@ -314,6 +315,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install cargo-deny --locked --version ${CARGO_DENY_VERSION} && \
    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} && \
    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install cargo-chef --locked --version ${CARGO_CHEF_VERSION} && \
    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} \
                                      --features postgres-bundled --no-default-features && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -148,7 +148,7 @@ RUN case $DEBIAN_VERSION in \
    apt install --no-install-recommends --no-install-suggests -y \
    ninja-build git autoconf automake libtool build-essential bison flex libreadline-dev \
    zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \
-    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip \
+    libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd curl unzip g++ \
    $VERSION_INSTALLS \
    && apt clean && rm -rf /var/lib/apt/lists/*

@@ -395,15 +395,22 @@ RUN case "${PG_VERSION:?}" in \
    cd plv8-src && \
    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi

-FROM pg-build AS plv8-build
+# Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use
+# 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds.
+# (The V8 engine takes a very long time to build)
+FROM build-deps AS plv8-build
 ARG PG_VERSION
+WORKDIR /ext-src/plv8-src
 RUN apt update && \
    apt install --no-install-recommends --no-install-suggests -y \
    ninja-build python3-dev libncurses5 binutils clang \
    && apt clean && rm -rf /var/lib/apt/lists/*
-
 COPY --from=plv8-src /ext-src/ /ext-src/
-WORKDIR /ext-src/plv8-src
+RUN make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) v8
+
+# Step 2: Build the PostgreSQL-dependent parts
+COPY --from=pg-build /usr/local/pgsql /usr/local/pgsql
+ENV PATH="/usr/local/pgsql/bin:$PATH"
 RUN \
    # generate and copy upgrade scripts
    make generate_upgrades && \
@@ -1464,6 +1471,31 @@ RUN make release -j $(getconf _NPROCESSORS_ONLN) && \
    make install -j $(getconf _NPROCESSORS_ONLN) && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control

+#########################################################################################
+#
+# Layer "pg-duckdb-pg-build"
+# compile pg_duckdb extension
+#
+#########################################################################################
+FROM build-deps AS pg_duckdb-src
+WORKDIR /ext-src
+COPY compute/patches/pg_duckdb_v031.patch .
+# pg_duckdb build requires source dir to be a git repo to get submodules
+# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only: 
+# - extension management function duckdb.install_extension()
+# - access to duckdb.extensions table and its sequence
+RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
+    cd pg_duckdb-src && \
+    git submodule update --init --recursive && \
+    patch -p1 < /ext-src/pg_duckdb_v031.patch
+
+FROM pg-build AS pg_duckdb-build
+ARG PG_VERSION
+COPY --from=pg_duckdb-src /ext-src/ /ext-src/
+WORKDIR /ext-src/pg_duckdb-src
+RUN make install -j $(getconf _NPROCESSORS_ONLN) && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_duckdb.control 
+        
 #########################################################################################
 #
 # Layer "pg_repack"
@@ -1484,6 +1516,73 @@ WORKDIR /ext-src/pg_repack-src
 RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

+
+#########################################################################################
+#
+# Layer "pgaudit"
+# compile pgaudit extension
+#
+#########################################################################################
+
+FROM build-deps AS pgaudit-src
+ARG PG_VERSION
+WORKDIR /ext-src
+RUN case "${PG_VERSION}" in \
+    "v14") \
+    export PGAUDIT_VERSION=1.6.2 \
+    export PGAUDIT_CHECKSUM=1f350d70a0cbf488c0f2b485e3a5c9b11f78ad9e3cbb95ef6904afa1eb3187eb \
+    ;; \
+    "v15") \
+    export PGAUDIT_VERSION=1.7.0 \
+    export PGAUDIT_CHECKSUM=8f4a73e451c88c567e516e6cba7dc1e23bc91686bb6f1f77f8f3126d428a8bd8 \
+    ;; \
+    "v16") \
+    export PGAUDIT_VERSION=16.0 \
+    export PGAUDIT_CHECKSUM=d53ef985f2d0b15ba25c512c4ce967dce07b94fd4422c95bd04c4c1a055fe738 \
+    ;; \
+    "v17") \
+    export PGAUDIT_VERSION=17.0 \
+    export PGAUDIT_CHECKSUM=7d0d08d030275d525f36cd48b38c6455f1023da863385badff0cec44965bfd8c \
+    ;; \
+    *) \
+    echo "pgaudit is not supported on this PostgreSQL version" && exit 1;; \
+    esac && \
+    wget https://github.com/pgaudit/pgaudit/archive/refs/tags/${PGAUDIT_VERSION}.tar.gz -O pgaudit.tar.gz && \
+    echo "${PGAUDIT_CHECKSUM} pgaudit.tar.gz" | sha256sum --check && \
+    mkdir pgaudit-src && cd pgaudit-src && tar xzf ../pgaudit.tar.gz --strip-components=1 -C .
+
+FROM pg-build AS pgaudit-build
+COPY --from=pgaudit-src /ext-src/ /ext-src/
+WORKDIR /ext-src/pgaudit-src
+RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
+
+#########################################################################################
+#
+# Layer "pgauditlogtofile"
+# compile pgauditlogtofile extension
+#
+#########################################################################################
+
+FROM build-deps AS pgauditlogtofile-src
+ARG PG_VERSION
+WORKDIR /ext-src
+RUN case "${PG_VERSION}" in \
+    "v14" | "v15" | "v16" | "v17") \
+    export PGAUDITLOGTOFILE_VERSION=v1.6.4 \
+    export PGAUDITLOGTOFILE_CHECKSUM=ef801eb09c26aaa935c0dabd92c81eb9ebe338930daa9674d420a280c6bc2d70 \
+    ;; \
+    *) \
+    echo "pgauditlogtofile is not supported on this PostgreSQL version" && exit 1;; \
+    esac && \
+    wget https://github.com/fmbiete/pgauditlogtofile/archive/refs/tags/${PGAUDITLOGTOFILE_VERSION}.tar.gz -O pgauditlogtofile.tar.gz && \
+    echo "${PGAUDITLOGTOFILE_CHECKSUM} pgauditlogtofile.tar.gz" | sha256sum --check && \
+    mkdir pgauditlogtofile-src && cd pgauditlogtofile-src && tar xzf ../pgauditlogtofile.tar.gz --strip-components=1 -C .
+
+FROM pg-build AS pgauditlogtofile-build
+COPY --from=pgauditlogtofile-src /ext-src/ /ext-src/
+WORKDIR /ext-src/pgauditlogtofile-src
+RUN make install USE_PGXS=1 -j $(getconf _NPROCESSORS_ONLN)
+
 #########################################################################################
 #
 # Layer "neon-ext-build"
@@ -1577,7 +1676,14 @@ COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
+
+# Disabled temporarily, because it clashed with pg_mooncake. pg_mooncake
+# also depends on libduckdb, but a different version.
+#COPY --from=pg_duckdb-build /usr/local/pgsql/ /usr/local/pgsql/
+
 COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgaudit-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgauditlogtofile-build /usr/local/pgsql/ /usr/local/pgsql/

 #########################################################################################
 #
@@ -1669,29 +1775,6 @@ RUN if [ "$TARGETARCH" = "amd64" ]; then\
    && echo "${pgbouncer_exporter_sha256} pgbouncer_exporter" | sha256sum -c -\
    && echo "${sql_exporter_sha256} sql_exporter" | sha256sum -c -

-#########################################################################################
-#
-# Layer "awscli"
-#
-#########################################################################################
-FROM build-deps AS awscli
-ARG TARGETARCH
-RUN set -ex; \
-    if [ "${TARGETARCH}" = "amd64" ]; then \
-        TARGETARCH_ALT="x86_64"; \
-        CHECKSUM="c9a9df3770a3ff9259cb469b6179e02829687a464e0824d5c32d378820b53a00"; \
-    elif [ "${TARGETARCH}" = "arm64" ]; then \
-        TARGETARCH_ALT="aarch64"; \
-        CHECKSUM="8181730be7891582b38b028112e81b4899ca817e8c616aad807c9e9d1289223a"; \
-    else \
-        echo "Unsupported architecture: ${TARGETARCH}"; exit 1; \
-    fi; \
-    curl --retry 5 -L "https://awscli.amazonaws.com/awscli-exe-linux-${TARGETARCH_ALT}-2.17.5.zip" -o /tmp/awscliv2.zip; \
-    echo "${CHECKSUM}  /tmp/awscliv2.zip" | sha256sum -c -; \
-    unzip /tmp/awscliv2.zip -d /tmp/awscliv2; \
-    /tmp/awscliv2/aws/install; \
-    rm -rf /tmp/awscliv2.zip /tmp/awscliv2
-
 #########################################################################################
 #
 # Clean up postgres folder before inclusion
@@ -1750,7 +1833,7 @@ COPY --from=pg_graphql-src /ext-src/ /ext-src/
 COPY --from=hypopg-src /ext-src/ /ext-src/
 COPY --from=pg_hashids-src /ext-src/ /ext-src/
 COPY --from=rum-src /ext-src/ /ext-src/
-#COPY --from=pgtap-src /ext-src/ /ext-src/
+COPY --from=pgtap-src /ext-src/ /ext-src/
 COPY --from=ip4r-src /ext-src/ /ext-src/
 COPY --from=prefix-src /ext-src/ /ext-src/
 COPY --from=hll-src /ext-src/ /ext-src/
@@ -1772,14 +1855,20 @@ COPY --from=pg_semver-src /ext-src/ /ext-src/
 COPY --from=pg_ivm-src /ext-src/ /ext-src/
 COPY --from=pg_partman-src /ext-src/ /ext-src/
 #COPY --from=pg_mooncake-src /ext-src/ /ext-src/
-#COPY --from=pg_repack-src /ext-src/ /ext-src/
+COPY --from=pg_repack-src /ext-src/ /ext-src/
+COPY --from=pg_repack-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY compute/patches/pg_repack.patch /ext-src
+RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /ext-src/pg_repack.patch

 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
+RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl\
+   && apt clean && rm -rf /ext-src/*.tar.gz /var/lib/apt/lists/*
 ENV PATH=/usr/local/pgsql/bin:$PATH
 ENV PGHOST=compute
 ENV PGPORT=55433
 ENV PGUSER=cloud_admin
 ENV PGDATABASE=postgres
+ENV PG_VERSION=${PG_VERSION:?}

 #########################################################################################
 #
@@ -1861,9 +1950,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    mkdir /usr/local/download_extensions && \
    chown -R postgres:postgres /usr/local/download_extensions

-# aws cli is used by fast_import
-COPY --from=awscli /usr/local/aws-cli /usr/local/aws-cli
-
 # pgbouncer and its config
 COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer
 COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini
--- a/compute/patches/pg_duckdb_v031.patch
+++ b/compute/patches/pg_duckdb_v031.patch
@@ -0,0 +1,11 @@
+diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
+index d777d76..af60106 100644
+--- a/sql/pg_duckdb--0.2.0--0.3.0.sql
+++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
+@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
+ GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
+ GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
+ GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;
--- a/compute/patches/pg_repack.patch
+++ b/compute/patches/pg_repack.patch
@@ -0,0 +1,72 @@
+diff --git a/regress/Makefile b/regress/Makefile
+index bf6edcb..89b4c7f 100644
+--- a/regress/Makefile
+++ b/regress/Makefile
+@@ -17,7 +17,7 @@ INTVERSION := $(shell echo $$(($$(echo $(VERSION).0 | sed 's/\([[:digit:]]\{1,\}
+ # Test suite
+ #
+ 
+-REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper tablespace get_order_by trigger
+REGRESS := init-extension repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger
+ 
+ USE_PGXS = 1	# use pgxs if not in contrib directory
+ PGXS := $(shell $(PG_CONFIG) --pgxs)
+diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
+index 8d0a94e..63b68bf 100644
+--- a/regress/expected/nosuper.out
+++ b/regress/expected/nosuper.out
+@@ -4,22 +4,22 @@
+ SET client_min_messages = error;
+ DROP ROLE IF EXISTS nosuper;
+ SET client_min_messages = warning;
+-CREATE ROLE nosuper WITH LOGIN;
+CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD';
+ -- => OK
+ \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check
+ INFO: repacking table "public.tbl_cluster"
+ -- => ERROR
+-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper
+\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper
+ ERROR: pg_repack failed with error: You must be a superuser to use pg_repack
+ -- => ERROR
+-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+ ERROR: pg_repack failed with error: ERROR:  permission denied for schema repack
+ LINE 1: select repack.version(), repack.version_sql()
+                ^
+ GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper;
+ GRANT USAGE ON SCHEMA repack TO nosuper;
+ -- => ERROR
+-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+ INFO: repacking table "public.tbl_cluster"
+ ERROR: query failed: ERROR:  current transaction is aborted, commands ignored until end of transaction block
+ DETAIL: query was: RESET lock_timeout
+diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
+index 072f0fa..dbe60f8 100644
+--- a/regress/sql/nosuper.sql
+++ b/regress/sql/nosuper.sql
+@@ -4,19 +4,19 @@
+ SET client_min_messages = error;
+ DROP ROLE IF EXISTS nosuper;
+ SET client_min_messages = warning;
+-CREATE ROLE nosuper WITH LOGIN;
+CREATE ROLE nosuper WITH LOGIN PASSWORD 'NoSuPeRpAsSwOrD';
+ -- => OK
+ \! pg_repack --dbname=contrib_regression --table=tbl_cluster --no-superuser-check
+ -- => ERROR
+-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper
+\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper
+ -- => ERROR
+-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+ 
+ GRANT ALL ON ALL TABLES IN SCHEMA repack TO nosuper;
+ GRANT USAGE ON SCHEMA repack TO nosuper;
+ 
+ -- => ERROR
+-\! pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+\! PGPASSWORD=NoSuPeRpAsSwOrD pg_repack --dbname=contrib_regression --table=tbl_cluster --username=nosuper --no-superuser-check
+ 
+ REVOKE ALL ON ALL TABLES IN SCHEMA repack FROM nosuper;
+ REVOKE USAGE ON SCHEMA repack FROM nosuper;
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -47,7 +47,9 @@ files:
      # Allow postgres user (which is what compute_ctl runs as) to run /neonvm/bin/resize-swap
      # and /neonvm/bin/set-disk-quota as root without requiring entering a password (NOPASSWD),
      # regardless of hostname (ALL)
-      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota
+      #
+      # Also allow it to shut down the VM. The fast_import job does that when it's finished.
+      postgres ALL=(root) NOPASSWD: /neonvm/bin/resize-swap, /neonvm/bin/set-disk-quota, /neonvm/bin/poweroff
  - filename: cgconfig.conf
    content: |
      # Configuration for cgroups in VM compute nodes
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -14,6 +14,7 @@ base64.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-sdk-kms.workspace = true
+aws-smithy-types.workspace = true
 anyhow.workspace = true
 axum = { workspace = true, features = [] }
 camino.workspace = true
@@ -46,13 +47,12 @@ tokio-postgres.workspace = true
 tokio-util.workspace = true
 tokio-stream.workspace = true
 tracing.workspace = true
-tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
 thiserror.workspace = true
 url.workspace = true
 uuid.workspace = true
-prometheus.workspace = true
+walkdir.workspace = true

 postgres_initdb.workspace = true
 compute_api.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -41,7 +41,6 @@ use std::process::exit;
 use std::str::FromStr;
 use std::sync::atomic::Ordering;
 use std::sync::{mpsc, Arc, Condvar, Mutex, RwLock};
-use std::time::SystemTime;
 use std::{thread, time::Duration};

 use anyhow::{Context, Result};
@@ -55,7 +54,7 @@ use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info, warn};
 use url::Url;

-use compute_api::responses::ComputeStatus;
+use compute_api::responses::{ComputeCtlConfig, ComputeStatus};
 use compute_api::spec::ComputeSpec;

 use compute_tools::compute::{
@@ -86,19 +85,6 @@ fn parse_remote_ext_config(arg: &str) -> Result<String> {
    }
 }

-/// Generate a compute ID if one is not supplied. This exists to keep forward
-/// compatibility tests working, but will be removed in a future iteration.
-fn generate_compute_id() -> String {
-    let now = SystemTime::now();
-
-    format!(
-        "compute-{}",
-        now.duration_since(SystemTime::UNIX_EPOCH)
-            .unwrap()
-            .as_secs()
-    )
-}
-
 #[derive(Parser)]
 #[command(rename_all = "kebab-case")]
 struct Cli {
@@ -112,16 +98,13 @@ struct Cli {
    /// outside the compute will talk to the compute through this port. Keep
    /// the previous name for this argument around for a smoother release
    /// with the control plane.
-    ///
-    /// TODO: Remove the alias after the control plane release which teaches the
-    /// control plane about the renamed argument.
-    #[arg(long, alias = "http-port", default_value_t = 3080)]
+    #[arg(long, default_value_t = 3080)]
    pub external_http_port: u16,

-    /// The port to bind the internal listening HTTP server to. Clients like
+    /// The port to bind the internal listening HTTP server to. Clients include
    /// the neon extension (for installing remote extensions) and local_proxy.
-    #[arg(long)]
-    pub internal_http_port: Option<u16>,
+    #[arg(long, default_value_t = 3081)]
+    pub internal_http_port: u16,

    #[arg(short = 'D', long, value_name = "DATADIR")]
    pub pgdata: String,
@@ -156,7 +139,7 @@ struct Cli {
    #[arg(short = 'S', long, group = "spec-path")]
    pub spec_path: Option<OsString>,

-    #[arg(short = 'i', long, group = "compute-id", default_value = generate_compute_id())]
+    #[arg(short = 'i', long, group = "compute-id")]
    pub compute_id: String,

    #[arg(short = 'p', long, conflicts_with_all = ["spec", "spec-path"], value_name = "CONTROL_PLANE_API_BASE_URL")]
@@ -281,6 +264,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
        info!("got spec from cli argument {}", spec_json);
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_str(spec_json)?),
+            compute_ctl_config: ComputeCtlConfig::default(),
            live_config_allowed: false,
        });
    }
@@ -290,6 +274,7 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
        let file = File::open(Path::new(spec_path))?;
        return Ok(CliSpecParams {
            spec: Some(serde_json::from_reader(file)?),
+            compute_ctl_config: ComputeCtlConfig::default(),
            live_config_allowed: true,
        });
    }
@@ -299,8 +284,9 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
    };

    match get_spec_from_control_plane(cli.control_plane_uri.as_ref().unwrap(), &cli.compute_id) {
-        Ok(spec) => Ok(CliSpecParams {
-            spec,
+        Ok(resp) => Ok(CliSpecParams {
+            spec: resp.0,
+            compute_ctl_config: resp.1,
            live_config_allowed: true,
        }),
        Err(e) => {
@@ -317,6 +303,8 @@ fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
 struct CliSpecParams {
    /// If a spec was provided via CLI or file, the [`ComputeSpec`]
    spec: Option<ComputeSpec>,
+    #[allow(dead_code)]
+    compute_ctl_config: ComputeCtlConfig,
    live_config_allowed: bool,
 }

@@ -326,6 +314,7 @@ fn wait_spec(
    CliSpecParams {
        spec,
        live_config_allowed,
+        compute_ctl_config: _,
    }: CliSpecParams,
 ) -> Result<Arc<ComputeNode>> {
    let mut new_state = ComputeState::new();
@@ -353,7 +342,7 @@ fn wait_spec(
        pgbin: cli.pgbin.clone(),
        pgversion: get_pg_version_string(&cli.pgbin),
        external_http_port: cli.external_http_port,
-        internal_http_port: cli.internal_http_port.unwrap_or(cli.external_http_port + 1),
+        internal_http_port: cli.internal_http_port,
        live_config_allowed,
        state: Mutex::new(new_state),
        state_changed: Condvar::new(),
@@ -377,7 +366,7 @@ fn wait_spec(

    // The internal HTTP server could be launched later, but there isn't much
    // sense in waiting.
-    Server::Internal(cli.internal_http_port.unwrap_or(cli.external_http_port + 1)).launch(&compute);
+    Server::Internal(cli.internal_http_port).launch(&compute);

    if !spec_set {
        // No spec provided, hang waiting for it.
--- a/compute_tools/src/bin/fast_import.rs
+++ b/compute_tools/src/bin/fast_import.rs
@@ -25,10 +25,10 @@
 //! docker push localhost:3030/localregistry/compute-node-v14:latest
 //! ```

-use anyhow::Context;
+use anyhow::{bail, Context};
 use aws_config::BehaviorVersion;
 use camino::{Utf8Path, Utf8PathBuf};
-use clap::Parser;
+use clap::{Parser, Subcommand};
 use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion};
 use nix::unistd::Pid;
 use tracing::{error, info, info_span, warn, Instrument};
@@ -44,32 +44,59 @@ mod s3_uri;
 const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600);
 const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300);

+#[derive(Subcommand, Debug)]
+enum Command {
+    /// Runs local postgres (neon binary), restores into it,
+    /// uploads pgdata to s3 to be consumed by pageservers
+    Pgdata {
+        /// Raw connection string to the source database. Used only in tests,
+        /// real scenario uses encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        source_connection_string: Option<String>,
+        /// If specified, will not shut down the local postgres after the import. Used in local testing
+        #[clap(short, long)]
+        interactive: bool,
+        /// Port to run postgres on. Default is 5432.
+        #[clap(long, default_value_t = 5432)]
+        pg_port: u16, // port to run postgres on, 5432 is default
+
+        /// Number of CPUs in the system. This is used to configure # of
+        /// parallel worker processes, for index creation.
+        #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
+        num_cpus: Option<usize>,
+
+        /// Amount of RAM in the system. This is used to configure shared_buffers
+        /// and maintenance_work_mem.
+        #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
+        memory_mb: Option<usize>,
+    },
+
+    /// Runs pg_dump-pg_restore from source to destination without running local postgres.
+    DumpRestore {
+        /// Raw connection string to the source database. Used only in tests,
+        /// real scenario uses encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        source_connection_string: Option<String>,
+        /// Raw connection string to the destination database. Used only in tests,
+        /// real scenario uses encrypted connection string in spec.json from s3.
+        #[clap(long)]
+        destination_connection_string: Option<String>,
+    },
+}
+
 #[derive(clap::Parser)]
 struct Args {
-    #[clap(long)]
+    #[clap(long, env = "NEON_IMPORTER_WORKDIR")]
    working_directory: Utf8PathBuf,
    #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")]
    s3_prefix: Option<s3_uri::S3Uri>,
-    #[clap(long)]
-    source_connection_string: Option<String>,
-    #[clap(short, long)]
-    interactive: bool,
-    #[clap(long)]
+    #[clap(long, env = "NEON_IMPORTER_PG_BIN_DIR")]
    pg_bin_dir: Utf8PathBuf,
-    #[clap(long)]
+    #[clap(long, env = "NEON_IMPORTER_PG_LIB_DIR")]
    pg_lib_dir: Utf8PathBuf,
-    #[clap(long)]
-    pg_port: Option<u16>, // port to run postgres on, 5432 is default

-    /// Number of CPUs in the system. This is used to configure # of
-    /// parallel worker processes, for index creation.
-    #[clap(long, env = "NEON_IMPORTER_NUM_CPUS")]
-    num_cpus: Option<usize>,
-
-    /// Amount of RAM in the system. This is used to configure shared_buffers
-    /// and maintenance_work_mem.
-    #[clap(long, env = "NEON_IMPORTER_MEMORY_MB")]
-    memory_mb: Option<usize>,
+    #[clap(subcommand)]
+    command: Command,
 }

 #[serde_with::serde_as]
@@ -78,6 +105,8 @@ struct Spec {
    encryption_secret: EncryptionSecret,
    #[serde_as(as = "serde_with::base64::Base64")]
    source_connstring_ciphertext_base64: Vec<u8>,
+    #[serde_as(as = "Option<serde_with::base64::Base64>")]
+    destination_connstring_ciphertext_base64: Option<Vec<u8>>,
 }

 #[derive(serde::Deserialize)]
@@ -93,192 +122,150 @@ const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
    "C.UTF-8"
 };

-#[tokio::main]
-pub(crate) async fn main() -> anyhow::Result<()> {
-    utils::logging::init(
-        utils::logging::LogFormat::Plain,
-        utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
-        utils::logging::Output::Stdout,
-    )?;
-
-    info!("starting");
-
-    let args = Args::parse();
-
-    // Validate arguments
-    if args.s3_prefix.is_none() && args.source_connection_string.is_none() {
-        anyhow::bail!("either s3_prefix or source_connection_string must be specified");
-    }
-    if args.s3_prefix.is_some() && args.source_connection_string.is_some() {
-        anyhow::bail!("only one of s3_prefix or source_connection_string can be specified");
-    }
-
-    let working_directory = args.working_directory;
-    let pg_bin_dir = args.pg_bin_dir;
-    let pg_lib_dir = args.pg_lib_dir;
-    let pg_port = args.pg_port.unwrap_or_else(|| {
-        info!("pg_port not specified, using default 5432");
-        5432
-    });
-
-    // Initialize AWS clients only if s3_prefix is specified
-    let (aws_config, kms_client) = if args.s3_prefix.is_some() {
-        let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
-        let kms = aws_sdk_kms::Client::new(&config);
-        (Some(config), Some(kms))
-    } else {
-        (None, None)
-    };
-
-    // Get source connection string either from S3 spec or direct argument
-    let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix {
-        let spec: Spec = {
-            let spec_key = s3_prefix.append("/spec.json");
-            let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap());
-            let object = s3_client
-                .get_object()
-                .bucket(&spec_key.bucket)
-                .key(spec_key.key)
-                .send()
-                .await
-                .context("get spec from s3")?
-                .body
-                .collect()
-                .await
-                .context("download spec body")?;
-            serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
-        };
-
-        match spec.encryption_secret {
-            EncryptionSecret::KMS { key_id } => {
-                let mut output = kms_client
-                    .unwrap()
-                    .decrypt()
-                    .key_id(key_id)
-                    .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
-                        spec.source_connstring_ciphertext_base64,
-                    ))
-                    .send()
-                    .await
-                    .context("decrypt source connection string")?;
-                let plaintext = output
-                    .plaintext
-                    .take()
-                    .context("get plaintext source connection string")?;
-                String::from_utf8(plaintext.into_inner())
-                    .context("parse source connection string as utf8")?
-            }
-        }
-    } else {
-        args.source_connection_string.unwrap()
-    };
-
-    match tokio::fs::create_dir(&working_directory).await {
-        Ok(()) => {}
-        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
-            if !is_directory_empty(&working_directory)
-                .await
-                .context("check if working directory is empty")?
-            {
-                anyhow::bail!("working directory is not empty");
-            } else {
-                // ok
-            }
-        }
-        Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
-    }
-
-    let pgdata_dir = working_directory.join("pgdata");
-    tokio::fs::create_dir(&pgdata_dir)
+async fn decode_connstring(
+    kms_client: &aws_sdk_kms::Client,
+    key_id: &String,
+    connstring_ciphertext_base64: Vec<u8>,
+) -> Result<String, anyhow::Error> {
+    let mut output = kms_client
+        .decrypt()
+        .key_id(key_id)
+        .ciphertext_blob(aws_sdk_s3::primitives::Blob::new(
+            connstring_ciphertext_base64,
+        ))
+        .send()
        .await
-        .context("create pgdata directory")?;
+        .context("decrypt connection string")?;

-    let pgbin = pg_bin_dir.join("postgres");
-    let pg_version = match get_pg_version(pgbin.as_ref()) {
-        PostgresMajorVersion::V14 => 14,
-        PostgresMajorVersion::V15 => 15,
-        PostgresMajorVersion::V16 => 16,
-        PostgresMajorVersion::V17 => 17,
-    };
-    let superuser = "cloud_admin"; // XXX: this shouldn't be hard-coded
-    postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
-        superuser,
-        locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
-        pg_version,
-        initdb_bin: pg_bin_dir.join("initdb").as_ref(),
-        library_search_path: &pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
-        pgdata: &pgdata_dir,
-    })
-    .await
-    .context("initdb")?;
+    let plaintext = output
+        .plaintext
+        .take()
+        .context("get plaintext connection string")?;

-    // If the caller didn't specify CPU / RAM to use for sizing, default to
-    // number of CPUs in the system, and pretty arbitrarily, 256 MB of RAM.
-    let nproc = args.num_cpus.unwrap_or_else(num_cpus::get);
-    let memory_mb = args.memory_mb.unwrap_or(256);
+    String::from_utf8(plaintext.into_inner()).context("parse connection string as utf8")
+}

-    // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
-    // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
-    // available for misc other stuff that PostgreSQL uses memory for.
-    let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
-    let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
+struct PostgresProcess {
+    pgdata_dir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pgbin: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+    postgres_proc: Option<tokio::process::Child>,
+}

-    //
-    // Launch postgres process
-    //
-    let mut postgres_proc = tokio::process::Command::new(pgbin)
-        .arg("-D")
-        .arg(&pgdata_dir)
-        .args(["-p", &format!("{pg_port}")])
-        .args(["-c", "wal_level=minimal"])
-        .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
-        .args(["-c", "max_wal_senders=0"])
-        .args(["-c", "fsync=off"])
-        .args(["-c", "full_page_writes=off"])
-        .args(["-c", "synchronous_commit=off"])
-        .args([
-            "-c",
-            &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
-        ])
-        .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
-        .args(["-c", &format!("max_parallel_workers={nproc}")])
-        .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
-        .args(["-c", &format!("max_worker_processes={nproc}")])
-        .args([
-            "-c",
-            &format!(
-                "effective_io_concurrency={}",
-                if cfg!(target_os = "macos") { 0 } else { 100 }
-            ),
-        ])
-        .env_clear()
-        .env("LD_LIBRARY_PATH", &pg_lib_dir)
-        .env(
-            "ASAN_OPTIONS",
-            std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+impl PostgresProcess {
+    fn new(pgdata_dir: Utf8PathBuf, pg_bin_dir: Utf8PathBuf, pg_lib_dir: Utf8PathBuf) -> Self {
+        Self {
+            pgdata_dir,
+            pgbin: pg_bin_dir.join("postgres"),
+            pg_bin_dir,
+            pg_lib_dir,
+            postgres_proc: None,
+        }
+    }
+
+    async fn prepare(&self, initdb_user: &str) -> Result<(), anyhow::Error> {
+        tokio::fs::create_dir(&self.pgdata_dir)
+            .await
+            .context("create pgdata directory")?;
+
+        let pg_version = match get_pg_version(self.pgbin.as_ref()) {
+            PostgresMajorVersion::V14 => 14,
+            PostgresMajorVersion::V15 => 15,
+            PostgresMajorVersion::V16 => 16,
+            PostgresMajorVersion::V17 => 17,
+        };
+        postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
+            superuser: initdb_user,
+            locale: DEFAULT_LOCALE, // XXX: this shouldn't be hard-coded,
+            pg_version,
+            initdb_bin: self.pg_bin_dir.join("initdb").as_ref(),
+            library_search_path: &self.pg_lib_dir, // TODO: is this right? Prob works in compute image, not sure about neon_local.
+            pgdata: &self.pgdata_dir,
+        })
+        .await
+        .context("initdb")
+    }
+
+    async fn start(
+        &mut self,
+        initdb_user: &str,
+        port: u16,
+        nproc: usize,
+        memory_mb: usize,
+    ) -> Result<&tokio::process::Child, anyhow::Error> {
+        self.prepare(initdb_user).await?;
+
+        // Somewhat arbitrarily, use 10 % of memory for shared buffer cache, 70% for
+        // maintenance_work_mem (i.e. for sorting during index creation), and leave the rest
+        // available for misc other stuff that PostgreSQL uses memory for.
+        let shared_buffers_mb = ((memory_mb as f32) * 0.10) as usize;
+        let maintenance_work_mem_mb = ((memory_mb as f32) * 0.70) as usize;
+
+        //
+        // Launch postgres process
+        //
+        let mut proc = tokio::process::Command::new(&self.pgbin)
+            .arg("-D")
+            .arg(&self.pgdata_dir)
+            .args(["-p", &format!("{port}")])
+            .args(["-c", "wal_level=minimal"])
+            .args(["-c", &format!("shared_buffers={shared_buffers_mb}MB")])
+            .args(["-c", "max_wal_senders=0"])
+            .args(["-c", "fsync=off"])
+            .args(["-c", "full_page_writes=off"])
+            .args(["-c", "synchronous_commit=off"])
+            .args([
+                "-c",
+                &format!("maintenance_work_mem={maintenance_work_mem_mb}MB"),
+            ])
+            .args(["-c", &format!("max_parallel_maintenance_workers={nproc}")])
+            .args(["-c", &format!("max_parallel_workers={nproc}")])
+            .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")])
+            .args(["-c", &format!("max_worker_processes={nproc}")])
+            .args(["-c", "effective_io_concurrency=100"])
+            .env_clear()
+            .env("LD_LIBRARY_PATH", &self.pg_lib_dir)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            )
+            .stdout(std::process::Stdio::piped())
+            .stderr(std::process::Stdio::piped())
+            .spawn()
+            .context("spawn postgres")?;
+
+        info!("spawned postgres, waiting for it to become ready");
+        tokio::spawn(
+            child_stdio_to_log::relay_process_output(proc.stdout.take(), proc.stderr.take())
+                .instrument(info_span!("postgres")),
+        );
+
+        self.postgres_proc = Some(proc);
+        Ok(self.postgres_proc.as_ref().unwrap())
+    }
+
+    async fn shutdown(&mut self) -> Result<(), anyhow::Error> {
+        let proc: &mut tokio::process::Child = self.postgres_proc.as_mut().unwrap();
+        info!("shutdown postgres");
+        nix::sys::signal::kill(
+            Pid::from_raw(i32::try_from(proc.id().unwrap()).expect("convert child pid to i32")),
+            nix::sys::signal::SIGTERM,
        )
-        .env(
-            "UBSAN_OPTIONS",
-            std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
-        )
-        .stdout(std::process::Stdio::piped())
-        .stderr(std::process::Stdio::piped())
-        .spawn()
-        .context("spawn postgres")?;
-
-    info!("spawned postgres, waiting for it to become ready");
-    tokio::spawn(
-        child_stdio_to_log::relay_process_output(
-            postgres_proc.stdout.take(),
-            postgres_proc.stderr.take(),
-        )
-        .instrument(info_span!("postgres")),
-    );
+        .context("signal postgres to shut down")?;
+        proc.wait()
+            .await
+            .context("wait for postgres to shut down")
+            .map(|_| ())
+    }
+}

+async fn wait_until_ready(connstring: String, create_dbname: String) {
    // Create neondb database in the running postgres
-    let restore_pg_connstring =
-        format!("host=localhost port={pg_port} user={superuser} dbname=postgres");
-
    let start_time = std::time::Instant::now();

    loop {
@@ -289,7 +276,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
            std::process::exit(1);
        }

-        match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await {
+        match tokio_postgres::connect(
+            &connstring.replace("dbname=neondb", "dbname=postgres"),
+            tokio_postgres::NoTls,
+        )
+        .await
+        {
            Ok((client, connection)) => {
                // Spawn the connection handling task to maintain the connection
                tokio::spawn(async move {
@@ -298,9 +290,12 @@ pub(crate) async fn main() -> anyhow::Result<()> {
                    }
                });

-                match client.simple_query("CREATE DATABASE neondb;").await {
+                match client
+                    .simple_query(format!("CREATE DATABASE {create_dbname};").as_str())
+                    .await
+                {
                    Ok(_) => {
-                        info!("created neondb database");
+                        info!("created {} database", create_dbname);
                        break;
                    }
                    Err(e) => {
@@ -324,10 +319,16 @@ pub(crate) async fn main() -> anyhow::Result<()> {
            }
        }
    }
+}

-    let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb");
-
-    let dumpdir = working_directory.join("dumpdir");
+async fn run_dump_restore(
+    workdir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+    source_connstring: String,
+    destination_connstring: String,
+) -> Result<(), anyhow::Error> {
+    let dumpdir = workdir.join("dumpdir");

    let common_args = [
        // schema mapping (prob suffices to specify them on one side)
@@ -356,10 +357,18 @@ pub(crate) async fn main() -> anyhow::Result<()> {
            .arg("--no-sync")
            // POSITIONAL args
            // source db (db name included in connection string)
-            .arg(&source_connection_string)
+            .arg(&source_connstring)
            // how we run it
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            )
            .kill_on_drop(true)
            .stdout(std::process::Stdio::piped())
            .stderr(std::process::Stdio::piped())
@@ -376,24 +385,31 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        let st = pg_dump.wait().await.context("wait for pg_dump")?;
        info!(status=?st, "pg_dump exited");
        if !st.success() {
-            warn!(status=%st, "pg_dump failed, restore will likely fail as well");
+            error!(status=%st, "pg_dump failed, restore will likely fail as well");
+            bail!("pg_dump failed");
        }
    }

-    // TODO: do it in a streaming way, plenty of internal research done on this already
+    // TODO: maybe do it in a streaming way, plenty of internal research done on this already
    // TODO: do the unlogged table trick
-
-    info!("restore from working directory into vanilla postgres");
    {
        let mut pg_restore = tokio::process::Command::new(pg_bin_dir.join("pg_restore"))
            .args(&common_args)
            .arg("-d")
-            .arg(&restore_pg_connstring)
+            .arg(&destination_connstring)
            // POSITIONAL args
            .arg(&dumpdir)
            // how we run it
            .env_clear()
            .env("LD_LIBRARY_PATH", &pg_lib_dir)
+            .env(
+                "ASAN_OPTIONS",
+                std::env::var("ASAN_OPTIONS").unwrap_or_default(),
+            )
+            .env(
+                "UBSAN_OPTIONS",
+                std::env::var("UBSAN_OPTIONS").unwrap_or_default(),
+            )
            .kill_on_drop(true)
            .stdout(std::process::Stdio::piped())
            .stderr(std::process::Stdio::piped())
@@ -411,48 +427,259 @@ pub(crate) async fn main() -> anyhow::Result<()> {
        let st = pg_restore.wait().await.context("wait for pg_restore")?;
        info!(status=?st, "pg_restore exited");
        if !st.success() {
-            warn!(status=%st, "pg_restore failed, restore will likely fail as well");
-        }
-    }
-
-    // If interactive mode, wait for Ctrl+C
-    if args.interactive {
-        info!("Running in interactive mode. Press Ctrl+C to shut down.");
-        tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
-    }
-
-    info!("shutdown postgres");
-    {
-        nix::sys::signal::kill(
-            Pid::from_raw(
-                i32::try_from(postgres_proc.id().unwrap()).expect("convert child pid to i32"),
-            ),
-            nix::sys::signal::SIGTERM,
-        )
-        .context("signal postgres to shut down")?;
-        postgres_proc
-            .wait()
-            .await
-            .context("wait for postgres to shut down")?;
-    }
-
-    // Only sync if s3_prefix was specified
-    if let Some(s3_prefix) = args.s3_prefix {
-        info!("upload pgdata");
-        aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/"))
-            .await
-            .context("sync dump directory to destination")?;
-
-        info!("write status");
-        {
-            let status_dir = working_directory.join("status");
-            std::fs::create_dir(&status_dir).context("create status directory")?;
-            let status_file = status_dir.join("pgdata");
-            std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
-                .context("write status file")?;
-            aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/"))
-                .await
-                .context("sync status directory to destination")?;
+            error!(status=%st, "pg_restore failed, restore will likely fail as well");
+            bail!("pg_restore failed");
+        }
+    }
+
+    Ok(())
+}
+
+#[allow(clippy::too_many_arguments)]
+async fn cmd_pgdata(
+    s3_client: Option<aws_sdk_s3::Client>,
+    kms_client: Option<aws_sdk_kms::Client>,
+    maybe_s3_prefix: Option<s3_uri::S3Uri>,
+    maybe_spec: Option<Spec>,
+    source_connection_string: Option<String>,
+    interactive: bool,
+    pg_port: u16,
+    workdir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+    num_cpus: Option<usize>,
+    memory_mb: Option<usize>,
+) -> Result<(), anyhow::Error> {
+    if maybe_spec.is_none() && source_connection_string.is_none() {
+        bail!("spec must be provided for pgdata command");
+    }
+    if maybe_spec.is_some() && source_connection_string.is_some() {
+        bail!("only one of spec or source_connection_string can be provided");
+    }
+
+    let source_connection_string = if let Some(spec) = maybe_spec {
+        match spec.encryption_secret {
+            EncryptionSecret::KMS { key_id } => {
+                decode_connstring(
+                    kms_client.as_ref().unwrap(),
+                    &key_id,
+                    spec.source_connstring_ciphertext_base64,
+                )
+                .await?
+            }
+        }
+    } else {
+        source_connection_string.unwrap()
+    };
+
+    let superuser = "cloud_admin";
+    let destination_connstring = format!(
+        "host=localhost port={} user={} dbname=neondb",
+        pg_port, superuser
+    );
+
+    let pgdata_dir = workdir.join("pgdata");
+    let mut proc = PostgresProcess::new(pgdata_dir.clone(), pg_bin_dir.clone(), pg_lib_dir.clone());
+    let nproc = num_cpus.unwrap_or_else(num_cpus::get);
+    let memory_mb = memory_mb.unwrap_or(256);
+    proc.start(superuser, pg_port, nproc, memory_mb).await?;
+    wait_until_ready(destination_connstring.clone(), "neondb".to_string()).await;
+
+    run_dump_restore(
+        workdir.clone(),
+        pg_bin_dir,
+        pg_lib_dir,
+        source_connection_string,
+        destination_connstring,
+    )
+    .await?;
+
+    // If interactive mode, wait for Ctrl+C
+    if interactive {
+        info!("Running in interactive mode. Press Ctrl+C to shut down.");
+        tokio::signal::ctrl_c().await.context("wait for ctrl-c")?;
+    }
+
+    proc.shutdown().await?;
+
+    // Only sync if s3_prefix was specified
+    if let Some(s3_prefix) = maybe_s3_prefix {
+        info!("upload pgdata");
+        aws_s3_sync::upload_dir_recursive(
+            s3_client.as_ref().unwrap(),
+            Utf8Path::new(&pgdata_dir),
+            &s3_prefix.append("/pgdata/"),
+        )
+        .await
+        .context("sync dump directory to destination")?;
+
+        info!("write status");
+        {
+            let status_dir = workdir.join("status");
+            std::fs::create_dir(&status_dir).context("create status directory")?;
+            let status_file = status_dir.join("pgdata");
+            std::fs::write(&status_file, serde_json::json!({"done": true}).to_string())
+                .context("write status file")?;
+            aws_s3_sync::upload_dir_recursive(
+                s3_client.as_ref().unwrap(),
+                &status_dir,
+                &s3_prefix.append("/status/"),
+            )
+            .await
+            .context("sync status directory to destination")?;
+        }
+    }
+
+    Ok(())
+}
+
+async fn cmd_dumprestore(
+    kms_client: Option<aws_sdk_kms::Client>,
+    maybe_spec: Option<Spec>,
+    source_connection_string: Option<String>,
+    destination_connection_string: Option<String>,
+    workdir: Utf8PathBuf,
+    pg_bin_dir: Utf8PathBuf,
+    pg_lib_dir: Utf8PathBuf,
+) -> Result<(), anyhow::Error> {
+    let (source_connstring, destination_connstring) = if let Some(spec) = maybe_spec {
+        match spec.encryption_secret {
+            EncryptionSecret::KMS { key_id } => {
+                let source = decode_connstring(
+                    kms_client.as_ref().unwrap(),
+                    &key_id,
+                    spec.source_connstring_ciphertext_base64,
+                )
+                .await?;
+
+                let dest = if let Some(dest_ciphertext) =
+                    spec.destination_connstring_ciphertext_base64
+                {
+                    decode_connstring(kms_client.as_ref().unwrap(), &key_id, dest_ciphertext)
+                        .await?
+                } else {
+                    bail!("destination connection string must be provided in spec for dump_restore command");
+                };
+
+                (source, dest)
+            }
+        }
+    } else {
+        (
+            source_connection_string.unwrap(),
+            if let Some(val) = destination_connection_string {
+                val
+            } else {
+                bail!("destination connection string must be provided for dump_restore command");
+            },
+        )
+    };
+
+    run_dump_restore(
+        workdir,
+        pg_bin_dir,
+        pg_lib_dir,
+        source_connstring,
+        destination_connstring,
+    )
+    .await
+}
+
+#[tokio::main]
+pub(crate) async fn main() -> anyhow::Result<()> {
+    utils::logging::init(
+        utils::logging::LogFormat::Json,
+        utils::logging::TracingErrorLayerEnablement::EnableWithRustLogFilter,
+        utils::logging::Output::Stdout,
+    )?;
+
+    info!("starting");
+
+    let args = Args::parse();
+
+    // Initialize AWS clients only if s3_prefix is specified
+    let (s3_client, kms_client) = if args.s3_prefix.is_some() {
+        let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await;
+        let s3_client = aws_sdk_s3::Client::new(&config);
+        let kms = aws_sdk_kms::Client::new(&config);
+        (Some(s3_client), Some(kms))
+    } else {
+        (None, None)
+    };
+
+    let spec: Option<Spec> = if let Some(s3_prefix) = &args.s3_prefix {
+        let spec_key = s3_prefix.append("/spec.json");
+        let object = s3_client
+            .as_ref()
+            .unwrap()
+            .get_object()
+            .bucket(&spec_key.bucket)
+            .key(spec_key.key)
+            .send()
+            .await
+            .context("get spec from s3")?
+            .body
+            .collect()
+            .await
+            .context("download spec body")?;
+        serde_json::from_slice(&object.into_bytes()).context("parse spec as json")?
+    } else {
+        None
+    };
+
+    match tokio::fs::create_dir(&args.working_directory).await {
+        Ok(()) => {}
+        Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+            if !is_directory_empty(&args.working_directory)
+                .await
+                .context("check if working directory is empty")?
+            {
+                bail!("working directory is not empty");
+            } else {
+                // ok
+            }
+        }
+        Err(e) => return Err(anyhow::Error::new(e).context("create working directory")),
+    }
+
+    match args.command {
+        Command::Pgdata {
+            source_connection_string,
+            interactive,
+            pg_port,
+            num_cpus,
+            memory_mb,
+        } => {
+            cmd_pgdata(
+                s3_client,
+                kms_client,
+                args.s3_prefix,
+                spec,
+                source_connection_string,
+                interactive,
+                pg_port,
+                args.working_directory,
+                args.pg_bin_dir,
+                args.pg_lib_dir,
+                num_cpus,
+                memory_mb,
+            )
+            .await?;
+        }
+        Command::DumpRestore {
+            source_connection_string,
+            destination_connection_string,
+        } => {
+            cmd_dumprestore(
+                kms_client,
+                spec,
+                source_connection_string,
+                destination_connection_string,
+                args.working_directory,
+                args.pg_bin_dir,
+                args.pg_lib_dir,
+            )
+            .await?;
        }
    }

--- a/compute_tools/src/bin/fast_import/aws_s3_sync.rs
+++ b/compute_tools/src/bin/fast_import/aws_s3_sync.rs
@@ -1,24 +1,102 @@
-use anyhow::Context;
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
+use tokio::task::JoinSet;
+use walkdir::WalkDir;

 use super::s3_uri::S3Uri;

-pub(crate) async fn sync(local: &Utf8Path, remote: &S3Uri) -> anyhow::Result<()> {
-    let mut builder = tokio::process::Command::new("aws");
-    builder
-        .arg("s3")
-        .arg("sync")
-        .arg(local.as_str())
-        .arg(remote.to_string());
-    let st = builder
-        .spawn()
-        .context("spawn aws s3 sync")?
-        .wait()
-        .await
-        .context("wait for aws s3 sync")?;
-    if st.success() {
-        Ok(())
-    } else {
-        Err(anyhow::anyhow!("aws s3 sync failed"))
+use tracing::{info, warn};
+
+const MAX_PARALLEL_UPLOADS: usize = 10;
+
+/// Upload all files from 'local' to 'remote'
+pub(crate) async fn upload_dir_recursive(
+    s3_client: &aws_sdk_s3::Client,
+    local: &Utf8Path,
+    remote: &S3Uri,
+) -> anyhow::Result<()> {
+    // Recursively scan directory
+    let mut dirwalker = WalkDir::new(local)
+        .into_iter()
+        .map(|entry| {
+            let entry = entry?;
+            let file_type = entry.file_type();
+            let path = <&Utf8Path>::try_from(entry.path())?.to_path_buf();
+            Ok((file_type, path))
+        })
+        .filter_map(|e: anyhow::Result<(std::fs::FileType, Utf8PathBuf)>| {
+            match e {
+                Ok((file_type, path)) if file_type.is_file() => Some(Ok(path)),
+                Ok((file_type, _path)) if file_type.is_dir() => {
+                    // The WalkDir iterator will recurse into directories, but we don't want
+                    // to do anything with directories as such. There's no concept of uploading
+                    // an empty directory to S3.
+                    None
+                }
+                Ok((file_type, path)) if file_type.is_symlink() => {
+                    // huh, didn't expect a symlink. Can't upload that to S3. Warn and skip.
+                    warn!("cannot upload symlink ({})", path);
+                    None
+                }
+                Ok((_file_type, path)) => {
+                    // should not happen
+                    warn!("directory entry has unexpected type ({})", path);
+                    None
+                }
+                Err(e) => Some(Err(e)),
+            }
+        });
+
+    // Spawn upload tasks for each file, keeping MAX_PARALLEL_UPLOADS active in
+    // parallel.
+    let mut joinset = JoinSet::new();
+    loop {
+        // Could we upload more?
+        while joinset.len() < MAX_PARALLEL_UPLOADS {
+            if let Some(full_local_path) = dirwalker.next() {
+                let full_local_path = full_local_path?;
+                let relative_local_path = full_local_path
+                    .strip_prefix(local)
+                    .expect("all paths start from the walkdir root");
+                let remote_path = remote.append(relative_local_path.as_str());
+                info!(
+                    "starting upload of {} to {}",
+                    &full_local_path, &remote_path
+                );
+                let upload_task = upload_file(s3_client.clone(), full_local_path, remote_path);
+                joinset.spawn(upload_task);
+            } else {
+                info!("draining upload tasks");
+                break;
+            }
+        }
+
+        // Wait for an upload to complete
+        if let Some(res) = joinset.join_next().await {
+            let _ = res?;
+        } else {
+            // all done!
+            break;
+        }
    }
+    Ok(())
+}
+
+pub(crate) async fn upload_file(
+    s3_client: aws_sdk_s3::Client,
+    local_path: Utf8PathBuf,
+    remote: S3Uri,
+) -> anyhow::Result<()> {
+    use aws_smithy_types::byte_stream::ByteStream;
+    let stream = ByteStream::from_path(&local_path).await?;
+
+    let _result = s3_client
+        .put_object()
+        .bucket(remote.bucket)
+        .key(&remote.key)
+        .body(stream)
+        .send()
+        .await?;
+    info!("upload of {} to {} finished", &local_path, &remote.key);
+
+    Ok(())
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -11,7 +11,9 @@ use crate::migration::MigrationRunner;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

-use compute_api::responses::{ControlPlaneComputeStatus, ControlPlaneSpecResponse};
+use compute_api::responses::{
+    ComputeCtlConfig, ControlPlaneComputeStatus, ControlPlaneSpecResponse,
+};
 use compute_api::spec::ComputeSpec;

 // Do control plane request and return response if any. In case of error it
@@ -73,14 +75,13 @@ fn do_control_plane_request(
 pub fn get_spec_from_control_plane(
    base_uri: &str,
    compute_id: &str,
-) -> Result<Option<ComputeSpec>> {
+) -> Result<(Option<ComputeSpec>, ComputeCtlConfig)> {
    let cp_uri = format!("{base_uri}/compute/api/v2/computes/{compute_id}/spec");
    let jwt: String = match std::env::var("NEON_CONTROL_PLANE_TOKEN") {
        Ok(v) => v,
        Err(_) => "".to_string(),
    };
    let mut attempt = 1;
-    let mut spec: Result<Option<ComputeSpec>> = Ok(None);

    info!("getting spec from control plane: {}", cp_uri);

@@ -90,7 +91,7 @@ pub fn get_spec_from_control_plane(
    // - no spec for compute yet (Empty state) -> return Ok(None)
    // - got spec -> return Ok(Some(spec))
    while attempt < 4 {
-        spec = match do_control_plane_request(&cp_uri, &jwt) {
+        let result = match do_control_plane_request(&cp_uri, &jwt) {
            Ok(spec_resp) => {
                CPLANE_REQUESTS_TOTAL
                    .with_label_values(&[
@@ -99,10 +100,10 @@ pub fn get_spec_from_control_plane(
                    ])
                    .inc();
                match spec_resp.status {
-                    ControlPlaneComputeStatus::Empty => Ok(None),
+                    ControlPlaneComputeStatus::Empty => Ok((None, spec_resp.compute_ctl_config)),
                    ControlPlaneComputeStatus::Attached => {
                        if let Some(spec) = spec_resp.spec {
-                            Ok(Some(spec))
+                            Ok((Some(spec), spec_resp.compute_ctl_config))
                        } else {
                            bail!("compute is attached, but spec is empty")
                        }
@@ -121,10 +122,10 @@ pub fn get_spec_from_control_plane(
            }
        };

-        if let Err(e) = &spec {
+        if let Err(e) = &result {
            error!("attempt {} to get spec failed with: {}", attempt, e);
        } else {
-            return spec;
+            return result;
        }

        attempt += 1;
@@ -132,7 +133,9 @@ pub fn get_spec_from_control_plane(
    }

    // All attempts failed, return error.
-    spec
+    Err(anyhow::anyhow!(
+        "Exhausted all attempts to retrieve the spec from the control plane"
+    ))
 }

 /// Check `pg_hba.conf` and update if needed to allow external connections.
--- a/compute_tools/src/sql/drop_subscriptions.sql
+++ b/compute_tools/src/sql/drop_subscriptions.sql
@@ -2,6 +2,7 @@ DO $$
 DECLARE
    subname TEXT;
 BEGIN
+    LOCK TABLE pg_subscription IN ACCESS EXCLUSIVE MODE;
    FOR subname IN SELECT pg_subscription.subname FROM pg_subscription WHERE subdbid = (SELECT oid FROM pg_database WHERE datname = {datname_str}) LOOP
        EXECUTE format('ALTER SUBSCRIPTION %I DISABLE;', subname);
        EXECUTE format('ALTER SUBSCRIPTION %I SET (slot_name = NONE);', subname);
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -46,8 +46,12 @@ use std::process::Command;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
+use std::time::SystemTime;
+use std::time::UNIX_EPOCH;

 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::requests::ConfigurationRequest;
+use compute_api::responses::ComputeCtlConfig;
 use compute_api::spec::Database;
 use compute_api::spec::PgIdent;
 use compute_api::spec::RemoteExtSpec;
@@ -57,6 +61,7 @@ use nix::sys::signal::Signal;
 use pageserver_api::shard::ShardStripeSize;
 use reqwest::header::CONTENT_TYPE;
 use serde::{Deserialize, Serialize};
+use tracing::debug;
 use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

@@ -79,8 +84,10 @@ pub struct EndpointConf {
    internal_http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
+    reconfigure_concurrency: usize,
    drop_subscriptions_before_start: bool,
    features: Vec<ComputeFeature>,
+    cluster: Option<Cluster>,
 }

 //
@@ -177,7 +184,9 @@ impl ComputeControlPlane {
            // we also skip catalog updates in the cloud.
            skip_pg_catalog_updates,
            drop_subscriptions_before_start,
+            reconfigure_concurrency: 1,
            features: vec![],
+            cluster: None,
        });

        ep.create_endpoint_dir()?;
@@ -194,7 +203,9 @@ impl ComputeControlPlane {
                pg_version,
                skip_pg_catalog_updates,
                drop_subscriptions_before_start,
+                reconfigure_concurrency: 1,
                features: vec![],
+                cluster: None,
            })?,
        )?;
        std::fs::write(
@@ -259,8 +270,11 @@ pub struct Endpoint {
    skip_pg_catalog_updates: bool,

    drop_subscriptions_before_start: bool,
+    reconfigure_concurrency: usize,
    // Feature flags
    features: Vec<ComputeFeature>,
+    // Cluster settings
+    cluster: Option<Cluster>,
 }

 #[derive(PartialEq, Eq)]
@@ -300,6 +314,8 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

+        debug!("serialized endpoint conf: {:?}", conf);
+
        Ok(Endpoint {
            pg_address: SocketAddr::new(IpAddr::from(Ipv4Addr::LOCALHOST), conf.pg_port),
            external_http_address: SocketAddr::new(
@@ -317,8 +333,10 @@ impl Endpoint {
            tenant_id: conf.tenant_id,
            pg_version: conf.pg_version,
            skip_pg_catalog_updates: conf.skip_pg_catalog_updates,
+            reconfigure_concurrency: conf.reconfigure_concurrency,
            drop_subscriptions_before_start: conf.drop_subscriptions_before_start,
            features: conf.features,
+            cluster: conf.cluster,
        })
    }

@@ -605,7 +623,7 @@ impl Endpoint {
        };

        // Create spec file
-        let spec = ComputeSpec {
+        let mut spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
@@ -638,7 +656,7 @@ impl Endpoint {
                    Vec::new()
                },
                settings: None,
-                postgresql_conf: Some(postgresql_conf),
+                postgresql_conf: Some(postgresql_conf.clone()),
            },
            delta_operations: None,
            tenant_id: Some(self.tenant_id),
@@ -651,9 +669,35 @@ impl Endpoint {
            pgbouncer_settings: None,
            shard_stripe_size: Some(shard_stripe_size),
            local_proxy_config: None,
-            reconfigure_concurrency: 1,
+            reconfigure_concurrency: self.reconfigure_concurrency,
            drop_subscriptions_before_start: self.drop_subscriptions_before_start,
        };
+
+        // this strange code is needed to support respec() in tests
+        if self.cluster.is_some() {
+            debug!("Cluster is already set in the endpoint spec, using it");
+            spec.cluster = self.cluster.clone().unwrap();
+
+            debug!("spec.cluster {:?}", spec.cluster);
+
+            // fill missing fields again
+            if create_test_user {
+                spec.cluster.roles.push(Role {
+                    name: PgIdent::from_str("test").unwrap(),
+                    encrypted_password: None,
+                    options: None,
+                });
+                spec.cluster.databases.push(Database {
+                    name: PgIdent::from_str("neondb").unwrap(),
+                    owner: PgIdent::from_str("test").unwrap(),
+                    options: None,
+                    restrict_conn: false,
+                    invalid: false,
+                });
+            }
+            spec.cluster.postgresql_conf = Some(postgresql_conf);
+        }
+
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;

@@ -671,18 +715,14 @@ impl Endpoint {
            println!("Also at '{}'", conn_str);
        }
        let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-        //cmd.args([
-        //    "--external-http-port",
-        //    &self.external_http_address.port().to_string(),
-        //])
-        //.args([
-        //    "--internal-http-port",
-        //    &self.internal_http_address.port().to_string(),
-        //])
        cmd.args([
-            "--http-port",
+            "--external-http-port",
            &self.external_http_address.port().to_string(),
        ])
+        .args([
+            "--internal-http-port",
+            &self.internal_http_address.port().to_string(),
+        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
        .args([
@@ -699,20 +739,16 @@ impl Endpoint {
        ])
        // TODO: It would be nice if we generated compute IDs with the same
        // algorithm as the real control plane.
-        //
-        // TODO: Add this back when
-        // https://github.com/neondatabase/neon/pull/10747 is merged.
-        //
-        //.args([
-        //    "--compute-id",
-        //    &format!(
-        //        "compute-{}",
-        //        SystemTime::now()
-        //            .duration_since(UNIX_EPOCH)
-        //            .unwrap()
-        //            .as_secs()
-        //    ),
-        //])
+        .args([
+            "--compute-id",
+            &format!(
+                "compute-{}",
+                SystemTime::now()
+                    .duration_since(UNIX_EPOCH)
+                    .unwrap()
+                    .as_secs()
+            ),
+        ])
        .stdin(std::process::Stdio::null())
        .stderr(logfile.try_clone()?)
        .stdout(logfile);
@@ -880,10 +916,13 @@ impl Endpoint {
                self.external_http_address.port()
            ))
            .header(CONTENT_TYPE.as_str(), "application/json")
-            .body(format!(
-                "{{\"spec\":{}}}",
-                serde_json::to_string_pretty(&spec)?
-            ))
+            .body(
+                serde_json::to_string(&ConfigurationRequest {
+                    spec,
+                    compute_ctl_config: ComputeCtlConfig::default(),
+                })
+                .unwrap(),
+            )
            .send()
            .await?;

--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -838,7 +838,10 @@ impl StorageController {
        self.dispatch(
            Method::PUT,
            format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-            Some(TenantShardMigrateRequest { node_id }),
+            Some(TenantShardMigrateRequest {
+                node_id,
+                migration_config: None,
+            }),
        )
        .await
    }
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -22,7 +22,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::{self};
 use reqwest::{Method, StatusCode, Url};
-use utils::id::{NodeId, TenantId};
+use utils::id::{NodeId, TenantId, TimelineId};

 use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
@@ -47,6 +47,9 @@ enum Command {
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
+        #[arg(long)]
+        listen_https_port: Option<u16>,
+
        #[arg(long)]
        availability_zone_id: String,
    },
@@ -239,6 +242,19 @@ enum Command {
        #[arg(long)]
        scheduling_policy: SkSchedulingPolicyArg,
    },
+    /// Downloads any missing heatmap layers for all shard for a given timeline
+    DownloadHeatmapLayers {
+        /// Tenant ID or tenant shard ID. When an unsharded tenant ID is specified,
+        /// the operation is performed on all shards. When a sharded tenant ID is
+        /// specified, the operation is only performed on the specified shard.
+        #[arg(long)]
+        tenant_shard_id: TenantShardId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+        /// Optional: Maximum download concurrency (default is 16)
+        #[arg(long)]
+        concurrency: Option<usize>,
+    },
 }

 #[derive(Parser)]
@@ -381,6 +397,7 @@ async fn main() -> anyhow::Result<()> {
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
+            listen_https_port,
            availability_zone_id,
        } => {
            storcon_client
@@ -393,6 +410,7 @@ async fn main() -> anyhow::Result<()> {
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
+                        listen_https_port,
                        availability_zone_id: AvailabilityZone(availability_zone_id),
                    }),
                )
@@ -609,7 +627,10 @@ async fn main() -> anyhow::Result<()> {
            tenant_shard_id,
            node,
        } => {
-            let req = TenantShardMigrateRequest { node_id: node };
+            let req = TenantShardMigrateRequest {
+                node_id: node,
+                migration_config: None,
+            };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -623,7 +644,10 @@ async fn main() -> anyhow::Result<()> {
            tenant_shard_id,
            node,
        } => {
-            let req = TenantShardMigrateRequest { node_id: node };
+            let req = TenantShardMigrateRequest {
+                node_id: node,
+                migration_config: None,
+            };

            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
@@ -1082,7 +1106,10 @@ async fn main() -> anyhow::Result<()> {
                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                                Method::PUT,
                                format!("control/v1/tenant/{}/migrate", mv.tenant_shard_id),
-                                Some(TenantShardMigrateRequest { node_id: mv.to }),
+                                Some(TenantShardMigrateRequest {
+                                    node_id: mv.to,
+                                    migration_config: None,
+                                }),
                            )
                            .await
                            .map_err(|e| (mv.tenant_shard_id, mv.from, mv.to, e))
@@ -1238,6 +1265,24 @@ async fn main() -> anyhow::Result<()> {
                String::from(scheduling_policy)
            );
        }
+        Command::DownloadHeatmapLayers {
+            tenant_shard_id,
+            timeline_id,
+            concurrency,
+        } => {
+            let mut path = format!(
+                "/v1/tenant/{}/timeline/{}/download_heatmap_layers",
+                tenant_shard_id, timeline_id,
+            );
+
+            if let Some(c) = concurrency {
+                path = format!("{path}?concurrency={c}");
+            }
+
+            storcon_client
+                .dispatch::<(), ()>(Method::POST, path, None)
+                .await?;
+        }
    }

    Ok(())
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -77,4 +77,5 @@ echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
+     --compute-id "compute-$RANDOM"                          \
     -S ${SPEC_FILE}
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -71,7 +71,7 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
        cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
        # We are running tests now
        rm -f testout.txt testout_contrib.txt
-        docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pgtap-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
+        docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pgx_ulid-src,pg_tiktoken-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src \
        $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
        docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
        $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
@@ -81,15 +81,8 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
            [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
            [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}')
            for d in $FAILED $CONTRIB_FAILED; do
-                dn="$(basename $d)"
-                rm -rf $dn
-                mkdir $dn
-                docker cp $TEST_CONTAINER_NAME:$d/regression.diffs $dn || [ $? -eq 1 ]
-                docker cp $TEST_CONTAINER_NAME:$d/regression.out $dn || [ $? -eq 1 ]
-                cat $dn/regression.out $dn/regression.diffs || true
-                rm -rf $dn
+                docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ]
            done
-        rm -rf $FAILED
        exit 1
        fi
    fi
--- a/docker-compose/ext-src/pg_repack-src/test-upgrade.sh
+++ b/docker-compose/ext-src/pg_repack-src/test-upgrade.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+set -ex
+cd "$(dirname ${0})"
+PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
+${PG_REGRESS} --use-existing --inputdir=./regress --bindir='/usr/local/pgsql/bin' --dbname=contrib_regression repack-setup repack-run error-on-invalid-idx no-error-on-invalid-idx after-schema repack-check nosuper get_order_by trigger
--- a/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch
+++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-16.patch
--- a/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch
+++ b/docker-compose/ext-src/pg_semver-src/test-upgrade-17.patch
@@ -0,0 +1,24 @@
+diff --git a/test/sql/base.sql b/test/sql/base.sql
+index 53adb30..2eed91b 100644
+--- a/test/sql/base.sql
+++ b/test/sql/base.sql
+@@ -2,7 +2,6 @@
+ BEGIN;
+ 
+ \i test/pgtap-core.sql
+-CREATE EXTENSION semver;
+ 
+ SELECT plan(334);
+ --SELECT * FROM no_plan();
+diff --git a/test/sql/corpus.sql b/test/sql/corpus.sql
+index c0fe98e..39cdd2e 100644
+--- a/test/sql/corpus.sql
+++ b/test/sql/corpus.sql
+@@ -4,7 +4,6 @@ BEGIN;
+ -- Test the SemVer corpus from https://regex101.com/r/Ly7O1x/3/.
+ 
+ \i test/pgtap-core.sql
+-CREATE EXTENSION semver;
+ 
+ SELECT plan(76);
+ --SELECT * FROM no_plan();
--- a/docker-compose/ext-src/pg_semver-src/test-upgrade.sh
+++ b/docker-compose/ext-src/pg_semver-src/test-upgrade.sh
@@ -1,6 +1,7 @@
 #!/bin/sh
 set -ex
 cd "$(dirname ${0})"
-patch -p1 <test-upgrade.patch
+patch -p1 <test-upgrade-${PG_VERSION}.patch
+psql -d contrib_regression -c "DROP EXTENSION IF EXISTS pgtap"
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
 ${PG_REGRESS} --use-existing --inputdir=./ --bindir='/usr/local/pgsql/bin'    --inputdir=test --dbname=contrib_regression base corpus
--- a/docker-compose/ext-src/pgtap-src/test-upgrade.patch
+++ b/docker-compose/ext-src/pgtap-src/test-upgrade.patch
@@ -0,0 +1,28 @@
+diff --git a/Makefile b/Makefile
+index f255fe6..0a0fa65 100644
+--- a/Makefile
+++ b/Makefile
+@@ -346,7 +346,7 @@ test: test-serial test-parallel
+ TB_DIR = test/build
+ GENERATED_SCHEDULE_DEPS = $(TB_DIR)/all_tests $(TB_DIR)/exclude_tests
+ REGRESS = --schedule $(TB_DIR)/run.sch # Set this again just to be safe
+-REGRESS_OPTS = --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
+REGRESS_OPTS = --use-existing --dbname=pgtap_regression --inputdir=test --max-connections=$(PARALLEL_CONN) --schedule $(SETUP_SCH) $(REGRESS_CONF)
+ SETUP_SCH = test/schedule/main.sch # schedule to use for test setup; this can be forcibly changed by some targets!
+ IGNORE_TESTS = $(notdir $(EXCLUDE_TEST_FILES:.sql=))
+ PARALLEL_TESTS = $(filter-out $(IGNORE_TESTS),$(filter-out $(SERIAL_TESTS),$(ALL_TESTS)))
+diff --git a/test/schedule/create.sql b/test/schedule/create.sql
+index ba355ed..7e250f5 100644
+--- a/test/schedule/create.sql
+++ b/test/schedule/create.sql
+@@ -1,3 +1,2 @@
+ \unset ECHO
+ \i test/psql.sql
+-CREATE EXTENSION pgtap;
+diff --git a/test/schedule/main.sch b/test/schedule/main.sch
+index a8a5fbc..0463fc4 100644
+--- a/test/schedule/main.sch
+++ b/test/schedule/main.sch
+@@ -1,2 +1 @@
+-test: build
+ test: create
--- a/docker-compose/ext-src/pgtap-src/test-upgrade.sh
+++ b/docker-compose/ext-src/pgtap-src/test-upgrade.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+set -ex
+cd "$(dirname ${0})"
+patch -p1 <test-upgrade.patch
+make installcheck
--- a/docker-compose/ext-src/plv8-src/test-upgrade.sh
+++ b/docker-compose/ext-src/plv8-src/test-upgrade.sh
@@ -2,4 +2,5 @@
 set -ex
 cd "$(dirname ${0})"
 PG_REGRESS=$(dirname "$(pg_config --pgxs)")/../test/regress/pg_regress
-${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin'  --use-existing --dbname=contrib_regression plv8 plv8-errors scalar_args inline json startup_pre startup varparam json_conv jsonb_conv window guc es6 arraybuffer composites currentresource startup_perms bytea find_function_perms memory_limits reset show array_spread regression dialect bigint procedure
+REGRESS="$(make -n installcheck | awk '{print substr($0,index($0,"init-extension")+15);}')"
+${PG_REGRESS} --inputdir=./ --bindir='/usr/local/pgsql/bin'  --use-existing --dbname=contrib_regression ${REGRESS}
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -11,6 +11,7 @@ if [ -z ${OLDTAG+x} ] || [ -z ${NEWTAG+x} ] || [ -z "${OLDTAG}" ] || [ -z "${NEW
  exit 1
 fi
 export PG_VERSION=${PG_VERSION:-16}
+export PG_TEST_VERSION=${PG_VERSION}
 function wait_for_ready {
  TIME=0
  while ! docker compose logs compute_is_ready | grep -q "accepting connections" && [ ${TIME} -le 300 ] ; do
@@ -41,7 +42,9 @@ EXTENSIONS='[
 {"extname": "roaringbitmap", "extdir": "pg_roaringbitmap-src"},
 {"extname": "semver", "extdir": "pg_semver-src"},
 {"extname": "pg_ivm", "extdir": "pg_ivm-src"},
-{"extname": "pgjwt", "extdir": "pgjwt-src"}
+{"extname": "pgjwt", "extdir": "pgjwt-src"},
+{"extname": "pgtap", "extdir": "pgtap-src"},
+{"extname": "pg_repack", "extdir": "pg_repack-src"}
 ]'
 EXTNAMES=$(echo ${EXTENSIONS} | jq -r '.[].extname' | paste -sd ' ' -)
 TAG=${NEWTAG} docker compose --profile test-extensions up --quiet-pull --build -d
@@ -57,9 +60,15 @@ wait_for_ready
 docker compose cp  ext-src neon-test-extensions:/
 docker compose exec neon-test-extensions psql -c "DROP DATABASE IF EXISTS contrib_regression"
 docker compose exec neon-test-extensions psql -c "CREATE DATABASE contrib_regression"
+docker compose exec neon-test-extensions psql -c "CREATE DATABASE pgtap_regression"
+docker compose exec neon-test-extensions psql -d pgtap_regression -c "CREATE EXTENSION pgtap"
 create_extensions "${EXTNAMES}"
-query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
-exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
+if [ "${FORCE_ALL_UPGRADE_TESTS:-false}" = true ]; then
+  exts="${EXTNAMES}"
+else
+  query="select pge.extname from pg_extension pge join (select key as extname, value as extversion from json_each_text('${new_vers}')) x on pge.extname=x.extname and pge.extversion <> x.extversion"
+  exts=$(docker compose exec neon-test-extensions psql -Aqt -d contrib_regression -c "$query")
+fi
 if [ -z "${exts}" ]; then
  echo "No extensions were upgraded"
 else
@@ -87,7 +96,10 @@ else
      exit 1
    fi
    docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
-    docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh
+    if ! docker compose exec neon-test-extensions sh -c /ext-src/${EXTDIR}/test-upgrade.sh; then
+      docker  compose exec neon-test-extensions  cat /ext-src/${EXTDIR}/regression.diffs
+      exit 1
+    fi
    docker compose exec neon-test-extensions psql -d contrib_regression -c "alter extension ${ext} update"
    docker compose exec neon-test-extensions psql -d contrib_regression -c "\dx ${ext}"
  done
--- a/libs/compute_api/Cargo.toml
+++ b/libs/compute_api/Cargo.toml
@@ -7,6 +7,7 @@ license.workspace = true
 [dependencies]
 anyhow.workspace = true
 chrono.workspace = true
+jsonwebtoken.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 regex.workspace = true
--- a/libs/compute_api/src/requests.rs
+++ b/libs/compute_api/src/requests.rs
@@ -1,18 +1,20 @@
 //! Structs representing the JSON formats used in the compute_ctl's HTTP API.
 use crate::{
    privilege::Privilege,
+    responses::ComputeCtlConfig,
    spec::{ComputeSpec, ExtVersion, PgIdent},
 };
-use serde::Deserialize;
+use serde::{Deserialize, Serialize};

 /// Request of the /configure API
 ///
 /// We now pass only `spec` in the configuration request, but later we can
 /// extend it and something like `restart: bool` or something else. So put
 /// `spec` into a struct initially to be more flexible in the future.
-#[derive(Deserialize, Debug)]
+#[derive(Debug, Deserialize, Serialize)]
 pub struct ConfigurationRequest {
    pub spec: ComputeSpec,
+    pub compute_ctl_config: ComputeCtlConfig,
 }

 #[derive(Deserialize, Debug)]
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -3,6 +3,7 @@
 use std::fmt::Display;

 use chrono::{DateTime, Utc};
+use jsonwebtoken::jwk::JwkSet;
 use serde::{Deserialize, Serialize, Serializer};

 use crate::{
@@ -135,13 +136,27 @@ pub struct CatalogObjects {
    pub databases: Vec<Database>,
 }

+#[derive(Debug, Deserialize, Serialize)]
+pub struct ComputeCtlConfig {
+    pub jwks: JwkSet,
+}
+
+impl Default for ComputeCtlConfig {
+    fn default() -> Self {
+        Self {
+            jwks: JwkSet {
+                keys: Vec::default(),
+            },
+        }
+    }
+}
+
 /// Response of the `/computes/{compute_id}/spec` control-plane API.
-/// This is not actually a compute API response, so consider moving
-/// to a different place.
 #[derive(Deserialize, Debug)]
 pub struct ControlPlaneSpecResponse {
    pub spec: Option<ComputeSpec>,
    pub status: ControlPlaneComputeStatus,
+    pub compute_ctl_config: ComputeCtlConfig,
 }

 #[derive(Deserialize, Clone, Copy, Debug, PartialEq, Eq)]
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -252,7 +252,7 @@ pub enum ComputeMode {
    Replica,
 }

-#[derive(Clone, Debug, Default, Deserialize, Serialize)]
+#[derive(Clone, Debug, Default, Deserialize, Serialize, PartialEq, Eq)]
 pub struct Cluster {
    pub cluster_id: Option<String>,
    pub name: Option<String>,
@@ -283,7 +283,7 @@ pub struct DeltaOp {

 /// Rust representation of Postgres role info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
@@ -292,7 +292,7 @@ pub struct Role {

 /// Rust representation of Postgres database info with only those fields
 /// that matter for us.
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
 pub struct Database {
    pub name: PgIdent,
    pub owner: PgIdent,
@@ -308,7 +308,7 @@ pub struct Database {
 /// Common type representing both SQL statement params with or without value,
 /// like `LOGIN` or `OWNER username` in the `CREATE/ALTER ROLE`, and config
 /// options like `wal_level = logical`.
-#[derive(Clone, Debug, Deserialize, Serialize)]
+#[derive(Clone, Debug, Deserialize, Serialize, PartialEq, Eq)]
 pub struct GenericOption {
    pub name: String,
    pub value: Option<String>,
--- a/libs/http-utils/src/pprof.rs
+++ b/libs/http-utils/src/pprof.rs
@@ -2,7 +2,6 @@ use anyhow::bail;
 use flate2::write::{GzDecoder, GzEncoder};
 use flate2::Compression;
 use itertools::Itertools as _;
-use once_cell::sync::Lazy;
 use pprof::protos::{Function, Line, Location, Message as _, Profile};
 use regex::Regex;

@@ -58,38 +57,30 @@ pub fn symbolize(mut profile: Profile) -> anyhow::Result<Profile> {

        // Resolve the line and function for each location.
        backtrace::resolve(loc.address as *mut c_void, |symbol| {
-            let Some(symname) = symbol.name() else {
+            let Some(symbol_name) = symbol.name() else {
                return;
            };
-            let mut name = symname.to_string();

-            // Strip the Rust monomorphization suffix from the symbol name.
-            static SUFFIX_REGEX: Lazy<Regex> =
-                Lazy::new(|| Regex::new("::h[0-9a-f]{16}$").expect("invalid regex"));
-            if let Some(m) = SUFFIX_REGEX.find(&name) {
-                name.truncate(m.start());
-            }
-
-            let function_id = match functions.get(&name) {
-                Some(function) => function.id,
-                None => {
-                    let id = functions.len() as u64 + 1;
-                    let system_name = String::from_utf8_lossy(symname.as_bytes());
+            let function_name = format!("{symbol_name:#}");
+            let functions_len = functions.len();
+            let function_id = functions
+                .entry(function_name)
+                .or_insert_with_key(|function_name| {
+                    let function_id = functions_len as u64 + 1;
+                    let system_name = String::from_utf8_lossy(symbol_name.as_bytes());
                    let filename = symbol
                        .filename()
                        .map(|path| path.to_string_lossy())
                        .unwrap_or(Cow::Borrowed(""));
-                    let function = Function {
-                        id,
-                        name: string_id(&name),
+                    Function {
+                        id: function_id,
+                        name: string_id(function_name),
                        system_name: string_id(&system_name),
                        filename: string_id(&filename),
                        ..Default::default()
-                    };
-                    functions.insert(name, function);
-                    id
-                }
-            };
+                    }
+                })
+                .id;
            loc.line.push(Line {
                function_id,
                line: symbol.lineno().unwrap_or(0) as i64,
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -122,6 +122,8 @@ pub struct ConfigToml {
    pub page_service_pipelining: PageServicePipeliningConfig,
    pub get_vectored_concurrent_io: GetVectoredConcurrentIo,
    pub enable_read_path_debugging: Option<bool>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub validate_wal_contiguity: Option<bool>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -351,7 +353,7 @@ pub struct TenantConfigToml {

    /// Enable rel_size_v2 for this tenant. Once enabled, the tenant will persist this information into
    /// `index_part.json`, and it cannot be reversed.
-    pub rel_size_v2_enabled: Option<bool>,
+    pub rel_size_v2_enabled: bool,

    // gc-compaction related configs
    /// Enable automatic gc-compaction trigger on this tenant.
@@ -521,6 +523,7 @@ impl Default for ConfigToml {
            } else {
                None
            },
+            validate_wal_contiguity: None,
        }
    }
 }
@@ -544,10 +547,11 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_COMPACTION_PERIOD: &str = "20 s";
    pub const DEFAULT_COMPACTION_THRESHOLD: usize = 10;

-    // This value needs to be tuned to avoid OOM. We have 3/4 of the total CPU threads to do background works, that's 16*3/4=9 on
-    // most of our pageservers. Compaction ~50 layers requires about 2GB memory (could be reduced later by optimizing L0 hole
-    // calculation to avoid loading all keys into the memory). So with this config, we can get a maximum peak compaction usage of 18GB.
-    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 50;
+    // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
+    // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
+    // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
+    // with this config, we can get a maximum peak compaction usage of 9 GB.
+    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
    pub const DEFAULT_COMPACTION_L0_FIRST: bool = false;
    pub const DEFAULT_COMPACTION_L0_SEMAPHORE: bool = true;

@@ -633,7 +637,7 @@ impl Default for TenantConfigToml {
            lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS,
            timeline_offloading: true,
            wal_receiver_protocol_override: None,
-            rel_size_v2_enabled: None,
+            rel_size_v2_enabled: false,
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -57,6 +57,7 @@ pub struct NodeRegisterRequest {

    pub listen_http_addr: String,
    pub listen_http_port: u16,
+    pub listen_https_port: Option<u16>,

    pub availability_zone_id: AvailabilityZone,
 }
@@ -105,6 +106,7 @@ pub struct TenantLocateResponseShard {

    pub listen_http_addr: String,
    pub listen_http_port: u16,
+    pub listen_https_port: Option<u16>,
 }

 #[derive(Serialize, Deserialize)]
@@ -148,6 +150,7 @@ pub struct NodeDescribeResponse {

    pub listen_http_addr: String,
    pub listen_http_port: u16,
+    pub listen_https_port: Option<u16>,

    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
@@ -182,6 +185,18 @@ pub struct TenantDescribeResponseShard {
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
+    #[serde(default)]
+    pub migration_config: Option<MigrationConfig>,
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+pub struct MigrationConfig {
+    #[serde(default)]
+    #[serde(with = "humantime_serde")]
+    pub secondary_warmup_timeout: Option<Duration>,
+    #[serde(default)]
+    #[serde(with = "humantime_serde")]
+    pub secondary_download_request_timeout: Option<Duration>,
 }

 #[derive(Serialize, Clone, Debug)]
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,10 +1,12 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
+use bytes::Bytes;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::Oid;
 use postgres_ffi::RepOriginId;
 use serde::{Deserialize, Serialize};
 use std::{fmt, ops::Range};
+use utils::const_assert;

 use crate::reltag::{BlockNumber, RelTag, SlruKind};

@@ -49,6 +51,64 @@ pub const AUX_KEY_PREFIX: u8 = 0x62;
 /// The key prefix of ReplOrigin keys.
 pub const REPL_ORIGIN_KEY_PREFIX: u8 = 0x63;

+/// The key prefix of db directory keys.
+pub const DB_DIR_KEY_PREFIX: u8 = 0x64;
+
+/// The key prefix of rel directory keys.
+pub const REL_DIR_KEY_PREFIX: u8 = 0x65;
+
+#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+pub enum RelDirExists {
+    Exists,
+    Removed,
+}
+
+#[derive(Debug)]
+pub struct DecodeError;
+
+impl fmt::Display for DecodeError {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "invalid marker")
+    }
+}
+
+impl std::error::Error for DecodeError {}
+
+impl RelDirExists {
+    /// The value of the rel directory keys that indicates the existence of a relation.
+    const REL_EXISTS_MARKER: Bytes = Bytes::from_static(b"r");
+
+    pub fn encode(&self) -> Bytes {
+        match self {
+            Self::Exists => Self::REL_EXISTS_MARKER.clone(),
+            Self::Removed => SPARSE_TOMBSTONE_MARKER.clone(),
+        }
+    }
+
+    pub fn decode_option(data: Option<impl AsRef<[u8]>>) -> Result<Self, DecodeError> {
+        match data {
+            Some(marker) if marker.as_ref() == Self::REL_EXISTS_MARKER => Ok(Self::Exists),
+            // Any other marker is invalid
+            Some(_) => Err(DecodeError),
+            None => Ok(Self::Removed),
+        }
+    }
+
+    pub fn decode(data: impl AsRef<[u8]>) -> Result<Self, DecodeError> {
+        let data = data.as_ref();
+        if data == Self::REL_EXISTS_MARKER {
+            Ok(Self::Exists)
+        } else if data == SPARSE_TOMBSTONE_MARKER {
+            Ok(Self::Removed)
+        } else {
+            Err(DecodeError)
+        }
+    }
+}
+
+/// A tombstone in the sparse keyspace, which is an empty buffer.
+pub const SPARSE_TOMBSTONE_MARKER: Bytes = Bytes::from_static(b"");
+
 /// Check if the key falls in the range of metadata keys.
 pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
@@ -110,6 +170,24 @@ impl Key {
        }
    }

+    pub fn rel_dir_sparse_key_range() -> Range<Self> {
+        Key {
+            field1: REL_DIR_KEY_PREFIX,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }..Key {
+            field1: REL_DIR_KEY_PREFIX + 1,
+            field2: 0,
+            field3: 0,
+            field4: 0,
+            field5: 0,
+            field6: 0,
+        }
+    }
+
    /// This function checks more extensively what keys we can take on the write path.
    /// If a key beginning with 00 does not have a global/default tablespace OID, it
    /// will be rejected on the write path.
@@ -440,6 +518,36 @@ pub fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

+#[inline(always)]
+pub fn rel_tag_sparse_key(spcnode: Oid, dbnode: Oid, relnode: Oid, forknum: u8) -> Key {
+    Key {
+        field1: REL_DIR_KEY_PREFIX,
+        field2: spcnode,
+        field3: dbnode,
+        field4: relnode,
+        field5: forknum,
+        field6: 1,
+    }
+}
+
+pub fn rel_tag_sparse_key_range(spcnode: Oid, dbnode: Oid) -> Range<Key> {
+    Key {
+        field1: REL_DIR_KEY_PREFIX,
+        field2: spcnode,
+        field3: dbnode,
+        field4: 0,
+        field5: 0,
+        field6: 0,
+    }..Key {
+        field1: REL_DIR_KEY_PREFIX,
+        field2: spcnode,
+        field3: dbnode,
+        field4: u32::MAX,
+        field5: u8::MAX,
+        field6: u32::MAX,
+    } // it's fine to exclude the last key b/c we only use field6 == 1
+}
+
 #[inline(always)]
 pub fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
@@ -734,9 +842,9 @@ impl Key {
        self.field1 == RELATION_SIZE_PREFIX
    }

-    pub fn sparse_non_inherited_keyspace() -> Range<Key> {
+    pub const fn sparse_non_inherited_keyspace() -> Range<Key> {
        // The two keys are adjacent; if we will have non-adjancent keys in the future, we should return a keyspace
-        debug_assert_eq!(AUX_KEY_PREFIX + 1, REPL_ORIGIN_KEY_PREFIX);
+        const_assert!(AUX_KEY_PREFIX + 1 == REPL_ORIGIN_KEY_PREFIX);
        Key {
            field1: AUX_KEY_PREFIX,
            field2: 0,
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1080,8 +1080,7 @@ pub struct TenantInfo {

    /// Opaque explanation if gc is being blocked.
    ///
-    /// Only looked up for the individual tenant detail, not the listing. This is purely for
-    /// debugging, not included in openapi.
+    /// Only looked up for the individual tenant detail, not the listing.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_blocking: Option<String>,
 }
@@ -1136,7 +1135,26 @@ pub struct TimelineInfo {
    pub ancestor_lsn: Option<Lsn>,
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option<Lsn>,
+
+    /// Legacy field for compat with control plane.  Synonym of `min_readable_lsn`.
+    /// TODO: remove once control plane no longer reads it.
    pub latest_gc_cutoff_lsn: Lsn,
+
+    /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
+    /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
+    /// as it is easier to reason about.
+    #[serde(default)]
+    pub applied_gc_cutoff_lsn: Lsn,
+
+    /// The upper bound of data which is either already GC'ed, or elegible to be GC'ed at any time based on PITR interval.
+    /// This LSN represents the "end of history" for this timeline, and callers should use it to figure out the oldest
+    /// LSN at which it is legal to create a branch or ephemeral endpoint.
+    ///
+    /// Note that holders of valid LSN leases may be able to create branches and read pages earlier
+    /// than this LSN, but new leases may not be taken out earlier than this LSN.
+    #[serde(default)]
+    pub min_readable_lsn: Lsn,
+
    pub disk_consistent_lsn: Lsn,

    /// The LSN that we have succesfully uploaded to remote storage
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -9,6 +9,8 @@ use bytes::Bytes;
 use serde::{Deserialize, Serialize};
 use std::io::ErrorKind;
 use std::net::SocketAddr;
+use std::os::fd::AsRawFd;
+use std::os::fd::RawFd;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::task::{ready, Poll};
@@ -268,6 +270,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> MaybeWriteOnly<IO> {
 }

 pub struct PostgresBackend<IO> {
+    pub socket_fd: RawFd,
    framed: MaybeWriteOnly<IO>,

    pub state: ProtoState,
@@ -293,9 +296,11 @@ impl PostgresBackend<tokio::net::TcpStream> {
        tls_config: Option<Arc<rustls::ServerConfig>>,
    ) -> io::Result<Self> {
        let peer_addr = socket.peer_addr()?;
+        let socket_fd = socket.as_raw_fd();
        let stream = MaybeTlsStream::Unencrypted(socket);

        Ok(Self {
+            socket_fd,
            framed: MaybeWriteOnly::Full(Framed::new(stream)),
            state: ProtoState::Initialization,
            auth_type,
@@ -307,6 +312,7 @@ impl PostgresBackend<tokio::net::TcpStream> {

 impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
    pub fn new_from_io(
+        socket_fd: RawFd,
        socket: IO,
        peer_addr: SocketAddr,
        auth_type: AuthType,
@@ -315,6 +321,7 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
        let stream = MaybeTlsStream::Unencrypted(socket);

        Ok(Self {
+            socket_fd,
            framed: MaybeWriteOnly::Full(Framed::new(stream)),
            state: ProtoState::Initialization,
            auth_type,
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -278,7 +278,7 @@ pub fn generate_pg_control(
    checkpoint_bytes: &[u8],
    lsn: Lsn,
    pg_version: u32,
-) -> anyhow::Result<(Bytes, u64)> {
+) -> anyhow::Result<(Bytes, u64, bool)> {
    dispatch_pgversion!(
        pg_version,
        pgv::xlog_utils::generate_pg_control(pg_control_bytes, checkpoint_bytes, lsn),
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -124,23 +124,59 @@ pub fn normalize_lsn(lsn: Lsn, seg_sz: usize) -> Lsn {
    }
 }

+/// Generate a pg_control file, for a basebackup for starting up Postgres at the given LSN
+///
+/// 'pg_control_bytes' and 'checkpoint_bytes' are the contents of those keys persisted in
+/// the pageserver. They use the same format as the PostgreSQL control file and the
+/// checkpoint record, but see walingest.rs for how exactly they are kept up to date.
+/// 'lsn' is the LSN at which we're starting up.
+///
+/// Returns:
+/// - pg_control file contents
+/// - system_identifier, extracted from the persisted information
+/// - true, if we're starting up from a "clean shutdown", i.e. if there was a shutdown
+///   checkpoint at the given LSN
 pub fn generate_pg_control(
    pg_control_bytes: &[u8],
    checkpoint_bytes: &[u8],
    lsn: Lsn,
-) -> anyhow::Result<(Bytes, u64)> {
+) -> anyhow::Result<(Bytes, u64, bool)> {
    let mut pg_control = ControlFileData::decode(pg_control_bytes)?;
    let mut checkpoint = CheckPoint::decode(checkpoint_bytes)?;

    // Generate new pg_control needed for bootstrap
+    //
+    // NB: In the checkpoint struct that we persist in the pageserver, we have a different
+    // convention for the 'redo' field than in PostgreSQL: On a shutdown checkpoint,
+    // 'redo' points the *end* of the checkpoint WAL record. On PostgreSQL, it points to
+    // the beginning. Furthermore, on an online checkpoint, 'redo' is set to 0.
+    //
+    // We didn't always have this convention however, and old persisted records will have
+    // old REDO values that point to some old LSN.
+    //
+    // The upshot is that if 'redo' is equal to the "current" LSN, there was a shutdown
+    // checkpoint record at that point in WAL, with no new WAL records after it. That case
+    // can be treated as starting from a clean shutdown. All other cases are treated as
+    // non-clean shutdown. In Neon, we don't do WAL replay at startup in either case, so
+    // that distinction doesn't matter very much. As of this writing, it only affects
+    // whether the persisted pg_stats information can be used or not.
+    //
+    // In the Checkpoint struct in the returned pg_control file, the redo pointer is
+    // always set to the LSN we're starting at, to hint that no WAL replay is required.
+    // (There's some neon-specific code in Postgres startup to make that work, though.
+    // Just setting the redo pointer is not sufficient.)
+    let was_shutdown = Lsn(checkpoint.redo) == lsn;
    checkpoint.redo = normalize_lsn(lsn, WAL_SEGMENT_SIZE).0;

-    //save new values in pg_control
+    // We use DBState_DB_SHUTDOWNED even if it was not a clean shutdown.  The
+    // neon-specific code at postgres startup ignores the state stored in the control
+    // file, similar to archive recovery in standalone PostgreSQL. Similarly, the
+    // checkPoint pointer is ignored, so just set it to 0.
    pg_control.checkPoint = 0;
    pg_control.checkPointCopy = checkpoint;
    pg_control.state = DBState_DB_SHUTDOWNED;

-    Ok((pg_control.encode(), pg_control.system_identifier))
+    Ok((pg_control.encode(), pg_control.system_identifier, was_shutdown))
 }

 pub fn get_current_timestamp() -> TimestampTz {
--- a/libs/proxy/tokio-postgres2/Cargo.toml
+++ b/libs/proxy/tokio-postgres2/Cargo.toml
@@ -5,18 +5,15 @@ edition = "2021"
 license = "MIT/Apache-2.0"

 [dependencies]
-async-trait.workspace = true
 bytes.workspace = true
-byteorder.workspace = true
 fallible-iterator.workspace = true
 futures-util = { workspace = true, features = ["sink"] }
 log = "0.4"
 parking_lot.workspace = true
-percent-encoding = "2.0"
 pin-project-lite.workspace = true
 phf = "0.11"
 postgres-protocol2 = { path = "../postgres-protocol2" }
 postgres-types2 = { path = "../postgres-types2" }
 tokio = { workspace = true, features = ["io-util", "time", "net"] }
 tokio-util = { workspace = true, features = ["codec"] }
-serde = { workspace = true, features = ["derive"] }
+serde = { workspace = true, features = ["derive"] }
--- a/libs/proxy/tokio-postgres2/src/client.rs
+++ b/libs/proxy/tokio-postgres2/src/client.rs
@@ -10,8 +10,8 @@ use crate::simple_query::SimpleQueryStream;
 use crate::types::{Oid, ToSql, Type};

 use crate::{
-    prepare, query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
-    SimpleQueryMessage, Statement, ToStatement, Transaction, TransactionBuilder,
+    query, simple_query, slice_iter, CancelToken, Error, ReadyForQueryStatus, Row,
+    SimpleQueryMessage, Statement, Transaction, TransactionBuilder,
 };
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
@@ -54,18 +54,18 @@ impl Responses {
 }

 /// A cache of type info and prepared statements for fetching type info
-/// (corresponding to the queries in the [prepare] module).
+/// (corresponding to the queries in the [crate::prepare] module).
 #[derive(Default)]
 struct CachedTypeInfo {
    /// A statement for basic information for a type from its
-    /// OID. Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_QUERY) (or its
+    /// OID. Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_QUERY) (or its
    /// fallback).
    typeinfo: Option<Statement>,
    /// A statement for getting information for a composite type from its OID.
-    /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY).
+    /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY).
    typeinfo_composite: Option<Statement>,
    /// A statement for getting information for a composite type from its OID.
-    /// Corresponds to [TYPEINFO_QUERY](prepare::TYPEINFO_COMPOSITE_QUERY) (or
+    /// Corresponds to [TYPEINFO_QUERY](crate::prepare::TYPEINFO_COMPOSITE_QUERY) (or
    /// its fallback).
    typeinfo_enum: Option<Statement>,

@@ -190,26 +190,6 @@ impl Client {
        &self.inner
    }

-    /// Creates a new prepared statement.
-    ///
-    /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc),
-    /// which are set when executed. Prepared statements can only be used with the connection that created them.
-    pub async fn prepare(&self, query: &str) -> Result<Statement, Error> {
-        self.prepare_typed(query, &[]).await
-    }
-
-    /// Like `prepare`, but allows the types of query parameters to be explicitly specified.
-    ///
-    /// The list of types may be smaller than the number of parameters - the types of the remaining parameters will be
-    /// inferred. For example, `client.prepare_typed(query, &[])` is equivalent to `client.prepare(query)`.
-    pub async fn prepare_typed(
-        &self,
-        query: &str,
-        parameter_types: &[Type],
-    ) -> Result<Statement, Error> {
-        prepare::prepare(&self.inner, query, parameter_types).await
-    }
-
    /// Executes a statement, returning a vector of the resulting rows.
    ///
    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
@@ -222,14 +202,11 @@ impl Client {
    /// # Panics
    ///
    /// Panics if the number of parameters provided does not match the number expected.
-    pub async fn query<T>(
+    pub async fn query(
        &self,
-        statement: &T,
+        statement: Statement,
        params: &[&(dyn ToSql + Sync)],
-    ) -> Result<Vec<Row>, Error>
-    where
-        T: ?Sized + ToStatement,
-    {
+    ) -> Result<Vec<Row>, Error> {
        self.query_raw(statement, slice_iter(params))
            .await?
            .try_collect()
@@ -250,13 +227,15 @@ impl Client {
    /// Panics if the number of parameters provided does not match the number expected.
    ///
    /// [`query`]: #method.query
-    pub async fn query_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<RowStream, Error>
+    pub async fn query_raw<'a, I>(
+        &self,
+        statement: Statement,
+        params: I,
+    ) -> Result<RowStream, Error>
    where
-        T: ?Sized + ToStatement,
        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
        I::IntoIter: ExactSizeIterator,
    {
-        let statement = statement.__convert().into_statement(self).await?;
        query::query(&self.inner, statement, params).await
    }

@@ -271,55 +250,6 @@ impl Client {
        query::query_txt(&self.inner, statement, params).await
    }

-    /// Executes a statement, returning the number of rows modified.
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// If the statement does not modify any rows (e.g. `SELECT`), 0 is returned.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    pub async fn execute<T>(
-        &self,
-        statement: &T,
-        params: &[&(dyn ToSql + Sync)],
-    ) -> Result<u64, Error>
-    where
-        T: ?Sized + ToStatement,
-    {
-        self.execute_raw(statement, slice_iter(params)).await
-    }
-
-    /// The maximally flexible version of [`execute`].
-    ///
-    /// A statement may contain parameters, specified by `$n`, where `n` is the index of the parameter of the list
-    /// provided, 1-indexed.
-    ///
-    /// The `statement` argument can either be a `Statement`, or a raw query string. If the same statement will be
-    /// repeatedly executed (perhaps with different query parameters), consider preparing the statement up front
-    /// with the `prepare` method.
-    ///
-    /// # Panics
-    ///
-    /// Panics if the number of parameters provided does not match the number expected.
-    ///
-    /// [`execute`]: #method.execute
-    pub async fn execute_raw<'a, T, I>(&self, statement: &T, params: I) -> Result<u64, Error>
-    where
-        T: ?Sized + ToStatement,
-        I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-        I::IntoIter: ExactSizeIterator,
-    {
-        let statement = statement.__convert().into_statement(self).await?;
-        query::execute(self.inner(), statement, params).await
-    }
-
    /// Executes a sequence of SQL statements using the simple query protocol, returning the resulting rows.
    ///
    /// Statements should be separated by semicolons. If an error occurs, execution of the sequence will stop at that
--- a/libs/proxy/tokio-postgres2/src/generic_client.rs
+++ b/libs/proxy/tokio-postgres2/src/generic_client.rs
@@ -1,7 +1,8 @@
+#![allow(async_fn_in_trait)]
+
 use crate::query::RowStream;
 use crate::types::Type;
 use crate::{Client, Error, Transaction};
-use async_trait::async_trait;
 use postgres_protocol2::Oid;

 mod private {
@@ -11,7 +12,6 @@ mod private {
 /// A trait allowing abstraction over connections and transactions.
 ///
 /// This trait is "sealed", and cannot be implemented outside of this crate.
-#[async_trait]
 pub trait GenericClient: private::Sealed {
    /// Like `Client::query_raw_txt`.
    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
@@ -26,7 +26,6 @@ pub trait GenericClient: private::Sealed {

 impl private::Sealed for Client {}

-#[async_trait]
 impl GenericClient for Client {
    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
@@ -39,14 +38,12 @@ impl GenericClient for Client {

    /// Query for type information
    async fn get_type(&self, oid: Oid) -> Result<Type, Error> {
-        self.get_type(oid).await
+        crate::prepare::get_type(self.inner(), oid).await
    }
 }

 impl private::Sealed for Transaction<'_> {}

-#[async_trait]
-#[allow(clippy::needless_lifetimes)]
 impl GenericClient for Transaction<'_> {
    async fn query_raw_txt<S, I>(&self, statement: &str, params: I) -> Result<RowStream, Error>
    where
--- a/libs/proxy/tokio-postgres2/src/lib.rs
+++ b/libs/proxy/tokio-postgres2/src/lib.rs
@@ -14,7 +14,6 @@ pub use crate::row::{Row, SimpleQueryRow};
 pub use crate::simple_query::SimpleQueryStream;
 pub use crate::statement::{Column, Statement};
 pub use crate::tls::NoTls;
-pub use crate::to_statement::ToStatement;
 pub use crate::transaction::Transaction;
 pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder};
 use crate::types::ToSql;
@@ -65,7 +64,6 @@ pub mod row;
 mod simple_query;
 mod statement;
 pub mod tls;
-mod to_statement;
 mod transaction;
 mod transaction_builder;
 pub mod types;
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -1,7 +1,6 @@
 use crate::client::InnerClient;
 use crate::codec::FrontendMessage;
 use crate::connection::RequestMessages;
-use crate::error::SqlState;
 use crate::types::{Field, Kind, Oid, Type};
 use crate::{query, slice_iter};
 use crate::{Column, Error, Statement};
@@ -13,7 +12,6 @@ use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use std::future::Future;
 use std::pin::Pin;
-use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::Arc;

 pub(crate) const TYPEINFO_QUERY: &str = "\
@@ -24,14 +22,6 @@ INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
 WHERE t.oid = $1
 ";

-// Range types weren't added until Postgres 9.2, so pg_range may not exist
-const TYPEINFO_FALLBACK_QUERY: &str = "\
-SELECT t.typname, t.typtype, t.typelem, NULL::OID, t.typbasetype, n.nspname, t.typrelid
-FROM pg_catalog.pg_type t
-INNER JOIN pg_catalog.pg_namespace n ON t.typnamespace = n.oid
-WHERE t.oid = $1
-";
-
 const TYPEINFO_ENUM_QUERY: &str = "\
 SELECT enumlabel
 FROM pg_catalog.pg_enum
@@ -39,14 +29,6 @@ WHERE enumtypid = $1
 ORDER BY enumsortorder
 ";

-// Postgres 9.0 didn't have enumsortorder
-const TYPEINFO_ENUM_FALLBACK_QUERY: &str = "\
-SELECT enumlabel
-FROM pg_catalog.pg_enum
-WHERE enumtypid = $1
-ORDER BY oid
-";
-
 pub(crate) const TYPEINFO_COMPOSITE_QUERY: &str = "\
 SELECT attname, atttypid
 FROM pg_catalog.pg_attribute
@@ -56,15 +38,13 @@ AND attnum > 0
 ORDER BY attnum
 ";

-static NEXT_ID: AtomicUsize = AtomicUsize::new(0);
-
 pub async fn prepare(
    client: &Arc<InnerClient>,
+    name: &'static str,
    query: &str,
    types: &[Type],
 ) -> Result<Statement, Error> {
-    let name = format!("s{}", NEXT_ID.fetch_add(1, Ordering::SeqCst));
-    let buf = encode(client, &name, query, types)?;
+    let buf = encode(client, name, query, types)?;
    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

    match responses.next().await? {
@@ -105,10 +85,11 @@ pub async fn prepare(

 fn prepare_rec<'a>(
    client: &'a Arc<InnerClient>,
+    name: &'static str,
    query: &'a str,
    types: &'a [Type],
 ) -> Pin<Box<dyn Future<Output = Result<Statement, Error>> + 'a + Send>> {
-    Box::pin(prepare(client, query, types))
+    Box::pin(prepare(client, name, query, types))
 }

 fn encode(client: &InnerClient, name: &str, query: &str, types: &[Type]) -> Result<Bytes, Error> {
@@ -192,13 +173,8 @@ async fn typeinfo_statement(client: &Arc<InnerClient>) -> Result<Statement, Erro
        return Ok(stmt);
    }

-    let stmt = match prepare_rec(client, TYPEINFO_QUERY, &[]).await {
-        Ok(stmt) => stmt,
-        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_TABLE) => {
-            prepare_rec(client, TYPEINFO_FALLBACK_QUERY, &[]).await?
-        }
-        Err(e) => return Err(e),
-    };
+    let typeinfo = "neon_proxy_typeinfo";
+    let stmt = prepare_rec(client, typeinfo, TYPEINFO_QUERY, &[]).await?;

    client.set_typeinfo(&stmt);
    Ok(stmt)
@@ -219,13 +195,8 @@ async fn typeinfo_enum_statement(client: &Arc<InnerClient>) -> Result<Statement,
        return Ok(stmt);
    }

-    let stmt = match prepare_rec(client, TYPEINFO_ENUM_QUERY, &[]).await {
-        Ok(stmt) => stmt,
-        Err(ref e) if e.code() == Some(&SqlState::UNDEFINED_COLUMN) => {
-            prepare_rec(client, TYPEINFO_ENUM_FALLBACK_QUERY, &[]).await?
-        }
-        Err(e) => return Err(e),
-    };
+    let typeinfo = "neon_proxy_typeinfo_enum";
+    let stmt = prepare_rec(client, typeinfo, TYPEINFO_ENUM_QUERY, &[]).await?;

    client.set_typeinfo_enum(&stmt);
    Ok(stmt)
@@ -255,7 +226,8 @@ async fn typeinfo_composite_statement(client: &Arc<InnerClient>) -> Result<State
        return Ok(stmt);
    }

-    let stmt = prepare_rec(client, TYPEINFO_COMPOSITE_QUERY, &[]).await?;
+    let typeinfo = "neon_proxy_typeinfo_composite";
+    let stmt = prepare_rec(client, typeinfo, TYPEINFO_COMPOSITE_QUERY, &[]).await?;

    client.set_typeinfo_composite(&stmt);
    Ok(stmt)
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -157,49 +157,6 @@ where
    })
 }

-pub async fn execute<'a, I>(
-    client: &InnerClient,
-    statement: Statement,
-    params: I,
-) -> Result<u64, Error>
-where
-    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
-    I::IntoIter: ExactSizeIterator,
-{
-    let buf = if log_enabled!(Level::Debug) {
-        let params = params.into_iter().collect::<Vec<_>>();
-        debug!(
-            "executing statement {} with parameters: {:?}",
-            statement.name(),
-            BorrowToSqlParamsDebug(params.as_slice()),
-        );
-        encode(client, &statement, params)?
-    } else {
-        encode(client, &statement, params)?
-    };
-    let mut responses = start(client, buf).await?;
-
-    let mut rows = 0;
-    loop {
-        match responses.next().await? {
-            Message::DataRow(_) => {}
-            Message::CommandComplete(body) => {
-                rows = body
-                    .tag()
-                    .map_err(Error::parse)?
-                    .rsplit(' ')
-                    .next()
-                    .unwrap()
-                    .parse()
-                    .unwrap_or(0);
-            }
-            Message::EmptyQueryResponse => rows = 0,
-            Message::ReadyForQuery(_) => return Ok(rows),
-            _ => return Err(Error::unexpected_message()),
-        }
-    }
-}
-
 async fn start(client: &InnerClient, buf: Bytes) -> Result<Responses, Error> {
    let mut responses = client.send(RequestMessages::Single(FrontendMessage::Raw(buf)))?;

--- a/libs/proxy/tokio-postgres2/src/statement.rs
+++ b/libs/proxy/tokio-postgres2/src/statement.rs
@@ -13,7 +13,7 @@ use std::{

 struct StatementInner {
    client: Weak<InnerClient>,
-    name: String,
+    name: &'static str,
    params: Vec<Type>,
    columns: Vec<Column>,
 }
@@ -22,7 +22,7 @@ impl Drop for StatementInner {
    fn drop(&mut self) {
        if let Some(client) = self.client.upgrade() {
            let buf = client.with_buf(|buf| {
-                frontend::close(b'S', &self.name, buf).unwrap();
+                frontend::close(b'S', self.name, buf).unwrap();
                frontend::sync(buf);
                buf.split().freeze()
            });
@@ -40,7 +40,7 @@ pub struct Statement(Arc<StatementInner>);
 impl Statement {
    pub(crate) fn new(
        inner: &Arc<InnerClient>,
-        name: String,
+        name: &'static str,
        params: Vec<Type>,
        columns: Vec<Column>,
    ) -> Statement {
@@ -55,14 +55,14 @@ impl Statement {
    pub(crate) fn new_anonymous(params: Vec<Type>, columns: Vec<Column>) -> Statement {
        Statement(Arc::new(StatementInner {
            client: Weak::new(),
-            name: String::new(),
+            name: "<anonymous>",
            params,
            columns,
        }))
    }

    pub(crate) fn name(&self) -> &str {
-        &self.0.name
+        self.0.name
    }

    /// Returns the expected types of the statement's parameters.
--- a/libs/proxy/tokio-postgres2/src/to_statement.rs
+++ b/libs/proxy/tokio-postgres2/src/to_statement.rs
@@ -1,57 +0,0 @@
-use crate::to_statement::private::{Sealed, ToStatementType};
-use crate::Statement;
-
-mod private {
-    use crate::{Client, Error, Statement};
-
-    pub trait Sealed {}
-
-    pub enum ToStatementType<'a> {
-        Statement(&'a Statement),
-        Query(&'a str),
-    }
-
-    impl ToStatementType<'_> {
-        pub async fn into_statement(self, client: &Client) -> Result<Statement, Error> {
-            match self {
-                ToStatementType::Statement(s) => Ok(s.clone()),
-                ToStatementType::Query(s) => client.prepare(s).await,
-            }
-        }
-    }
-}
-
-/// A trait abstracting over prepared and unprepared statements.
-///
-/// Many methods are generic over this bound, so that they support both a raw query string as well as a statement which
-/// was prepared previously.
-///
-/// This trait is "sealed" and cannot be implemented by anything outside this crate.
-pub trait ToStatement: Sealed {
-    #[doc(hidden)]
-    fn __convert(&self) -> ToStatementType<'_>;
-}
-
-impl ToStatement for Statement {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Statement(self)
-    }
-}
-
-impl Sealed for Statement {}
-
-impl ToStatement for str {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Query(self)
-    }
-}
-
-impl Sealed for str {}
-
-impl ToStatement for String {
-    fn __convert(&self) -> ToStatementType<'_> {
-        ToStatementType::Query(self)
-    }
-}
-
-impl Sealed for String {}
--- a/libs/safekeeper_api/src/membership.rs
+++ b/libs/safekeeper_api/src/membership.rs
@@ -9,13 +9,43 @@ use anyhow::bail;
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;

-/// Number uniquely identifying safekeeper configuration.
-/// Note: it is a part of sk control file.
-pub type Generation = u32;
 /// 1 is the first valid generation, 0 is used as
 /// a placeholder before we fully migrate to generations.
-pub const INVALID_GENERATION: Generation = 0;
-pub const INITIAL_GENERATION: Generation = 1;
+pub const INVALID_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(0);
+pub const INITIAL_GENERATION: SafekeeperGeneration = SafekeeperGeneration::new(1);
+
+/// Number uniquely identifying safekeeper configuration.
+/// Note: it is a part of sk control file.
+///
+/// Like tenant generations, but for safekeepers.
+#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize)]
+pub struct SafekeeperGeneration(u32);
+
+impl SafekeeperGeneration {
+    pub const fn new(v: u32) -> Self {
+        Self(v)
+    }
+
+    #[track_caller]
+    pub fn previous(&self) -> Option<Self> {
+        Some(Self(self.0.checked_sub(1)?))
+    }
+
+    #[track_caller]
+    pub fn next(&self) -> Self {
+        Self(self.0 + 1)
+    }
+
+    pub fn into_inner(self) -> u32 {
+        self.0
+    }
+}
+
+impl Display for SafekeeperGeneration {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}

 /// Membership is defined by ids so e.g. walproposer uses them to figure out
 /// quorums, but we also carry host and port to give wp idea where to connect.
@@ -89,7 +119,7 @@ impl Display for MemberSet {
 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
 pub struct Configuration {
    /// Unique id.
-    pub generation: Generation,
+    pub generation: SafekeeperGeneration,
    /// Current members of the configuration.
    pub members: MemberSet,
    /// Some means it is a joint conf.
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -282,3 +282,18 @@ pub struct TimelineTermBumpResponse {
 pub struct SafekeeperUtilization {
    pub timeline_count: u64,
 }
+
+/// pull_timeline request body.
+#[derive(Debug, Deserialize, Serialize)]
+pub struct PullTimelineRequest {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub http_hosts: Vec<String>,
+}
+
+#[derive(Debug, Serialize, Deserialize)]
+pub struct PullTimelineResponse {
+    // Donor safekeeper host
+    pub safekeeper_host: String,
+    // TODO: add more fields?
+}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -24,11 +24,10 @@ diatomic-waker.workspace = true
 git-version.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
-inferno.workspace = true
 fail.workspace = true
 futures = { workspace = true }
 jsonwebtoken.workspace = true
-nix.workspace = true
+nix = { workspace = true, features = ["ioctl"] }
 once_cell.workspace = true
 pin-project-lite.workspace = true
 regex.workspace = true
@@ -62,6 +61,7 @@ bytes.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 camino-tempfile.workspace = true
+pprof.workspace = true
 serde_assert.workspace = true
 tokio = { workspace = true, features = ["test-util"] }

--- a/libs/utils/benches/README.md
+++ b/libs/utils/benches/README.md
@@ -0,0 +1,26 @@
+## Utils Benchmarks
+
+To run benchmarks:
+
+```sh
+# All benchmarks.
+cargo bench --package utils
+
+# Specific file.
+cargo bench --package utils --bench benchmarks
+
+# Specific benchmark.
+cargo bench --package utils --bench benchmarks warn_slow/enabled=true
+
+# List available benchmarks.
+cargo bench --package utils --benches -- --list
+
+# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds.
+# Output in target/criterion/*/profile/flamegraph.svg.
+cargo bench --package utils --bench benchmarks warn_slow/enabled=true --profile-time 10
+```
+
+Additional charts and statistics are available in `target/criterion/report/index.html`.
+
+Benchmarks are automatically compared against the previous run. To compare against other runs, see
+`--baseline` and `--save-baseline`.
--- a/libs/utils/benches/benchmarks.rs
+++ b/libs/utils/benches/benchmarks.rs
@@ -1,5 +1,18 @@
-use criterion::{criterion_group, criterion_main, Criterion};
+use std::time::Duration;
+
+use criterion::{criterion_group, criterion_main, Bencher, Criterion};
+use pprof::criterion::{Output, PProfProfiler};
 use utils::id;
+use utils::logging::warn_slow;
+
+// Register benchmarks with Criterion.
+criterion_group!(
+    name = benches;
+    config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets = bench_id_stringify,
+    bench_warn_slow,
+);
+criterion_main!(benches);

 pub fn bench_id_stringify(c: &mut Criterion) {
    // Can only use public methods.
@@ -16,5 +29,31 @@ pub fn bench_id_stringify(c: &mut Criterion) {
    });
 }

-criterion_group!(benches, bench_id_stringify);
-criterion_main!(benches);
+pub fn bench_warn_slow(c: &mut Criterion) {
+    for enabled in [false, true] {
+        c.bench_function(&format!("warn_slow/enabled={enabled}"), |b| {
+            run_bench(b, enabled).unwrap()
+        });
+    }
+
+    // The actual benchmark.
+    fn run_bench(b: &mut Bencher, enabled: bool) -> anyhow::Result<()> {
+        const THRESHOLD: Duration = Duration::from_secs(1);
+
+        // Use a multi-threaded runtime to avoid thread parking overhead when yielding.
+        let runtime = tokio::runtime::Builder::new_multi_thread()
+            .enable_all()
+            .build()?;
+
+        // Test both with and without warn_slow, since we're essentially measuring Tokio scheduling
+        // performance too. Use a simple noop future that yields once, to avoid any scheduler fast
+        // paths for a ready future.
+        if enabled {
+            b.iter(|| runtime.block_on(warn_slow("ready", THRESHOLD, tokio::task::yield_now())));
+        } else {
+            b.iter(|| runtime.block_on(tokio::task::yield_now()));
+        }
+
+        Ok(())
+    }
+}
--- a/libs/utils/src/bin_ser.rs
+++ b/libs/utils/src/bin_ser.rs
@@ -286,6 +286,11 @@ mod tests {
    const SHORT2_ENC_LE: &[u8] = &[8, 0, 0, 3, 7];
    const SHORT2_ENC_LE_TRAILING: &[u8] = &[8, 0, 0, 3, 7, 0xff, 0xff, 0xff];

+    #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
+    struct NewTypeStruct(u32);
+    const NT1: NewTypeStruct = NewTypeStruct(414243);
+    const NT1_INNER: u32 = 414243;
+
    #[derive(Debug, PartialEq, Eq, Serialize, Deserialize)]
    pub struct LongMsg {
        pub tag: u8,
@@ -408,4 +413,42 @@ mod tests {
        let msg2 = LongMsg::des(&encoded).unwrap();
        assert_eq!(msg, msg2);
    }
+
+    #[test]
+    /// Ensure that newtype wrappers around u32 don't change the serialization format
+    fn be_nt() {
+        use super::BeSer;
+
+        assert_eq!(NT1.serialized_size().unwrap(), 4);
+
+        let msg = NT1;
+
+        let encoded = msg.ser().unwrap();
+        let expected = hex_literal::hex!("0006 5223");
+        assert_eq!(encoded, expected);
+
+        assert_eq!(encoded, NT1_INNER.ser().unwrap());
+
+        let msg2 = NewTypeStruct::des(&encoded).unwrap();
+        assert_eq!(msg, msg2);
+    }
+
+    #[test]
+    /// Ensure that newtype wrappers around u32 don't change the serialization format
+    fn le_nt() {
+        use super::LeSer;
+
+        assert_eq!(NT1.serialized_size().unwrap(), 4);
+
+        let msg = NT1;
+
+        let encoded = msg.ser().unwrap();
+        let expected = hex_literal::hex!("2352 0600");
+        assert_eq!(encoded, expected);
+
+        assert_eq!(encoded, NT1_INNER.ser().unwrap());
+
+        let msg2 = NewTypeStruct::des(&encoded).unwrap();
+        assert_eq!(msg, msg2);
+    }
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -93,6 +93,9 @@ pub mod try_rcu;

 pub mod guard_arc_swap;

+#[cfg(target_os = "linux")]
+pub mod linux_socket_ioctl;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
--- a/libs/utils/src/linux_socket_ioctl.rs
+++ b/libs/utils/src/linux_socket_ioctl.rs
@@ -0,0 +1,35 @@
+//! Linux-specific socket ioctls.
+//!
+//! <https://elixir.bootlin.com/linux/v6.1.128/source/include/uapi/linux/sockios.h#L25-L27>
+
+use std::{
+    io,
+    mem::MaybeUninit,
+    os::{fd::RawFd, raw::c_int},
+};
+
+use nix::libc::{FIONREAD, TIOCOUTQ};
+
+unsafe fn do_ioctl(socket_fd: RawFd, cmd: nix::libc::Ioctl) -> io::Result<c_int> {
+    let mut inq: MaybeUninit<c_int> = MaybeUninit::uninit();
+    let err = nix::libc::ioctl(socket_fd, cmd, inq.as_mut_ptr());
+    if err == 0 {
+        Ok(inq.assume_init())
+    } else {
+        Err(io::Error::last_os_error())
+    }
+}
+
+/// # Safety
+///
+/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn inq(socket_fd: RawFd) -> io::Result<c_int> {
+    do_ioctl(socket_fd, FIONREAD)
+}
+
+/// # Safety
+///
+/// Caller must ensure that `socket_fd` is a valid TCP socket file descriptor.
+pub unsafe fn outq(socket_fd: RawFd) -> io::Result<c_int> {
+    do_ioctl(socket_fd, TIOCOUTQ)
+}
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,9 +1,13 @@
+use std::future::Future;
 use std::str::FromStr;
+use std::time::Duration;

 use anyhow::Context;
 use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};
+use tokio::time::Instant;
+use tracing::warn;

 /// Logs a critical error, similarly to `tracing::error!`. This will:
 ///
@@ -318,6 +322,41 @@ impl std::fmt::Debug for SecretString {
    }
 }

+/// Logs a periodic warning if a future is slow to complete.
+///
+/// This is performance-sensitive as it's used on the GetPage read path.
+#[inline]
+pub async fn warn_slow<O>(name: &str, threshold: Duration, f: impl Future<Output = O>) -> O {
+    // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
+    // won't fit on the stack.
+    let mut f = Box::pin(f);
+
+    let started = Instant::now();
+    let mut attempt = 1;
+
+    loop {
+        // NB: use timeout_at() instead of timeout() to avoid an extra clock reading in the common
+        // case where the timeout doesn't fire.
+        let deadline = started + attempt * threshold;
+        if let Ok(output) = tokio::time::timeout_at(deadline, &mut f).await {
+            // NB: we check if we exceeded the threshold even if the timeout never fired, because
+            // scheduling or execution delays may cause the future to succeed even if it exceeds the
+            // timeout. This costs an extra unconditional clock reading, but seems worth it to avoid
+            // false negatives.
+            let elapsed = started.elapsed();
+            if elapsed >= threshold {
+                warn!("slow {name} completed after {:.3}s", elapsed.as_secs_f64());
+            }
+            return output;
+        }
+
+        let elapsed = started.elapsed().as_secs_f64();
+        warn!("slow {name} still running after {elapsed:.3}s",);
+
+        attempt += 1;
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use metrics::{core::Opts, IntCounterVec};
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -117,6 +117,10 @@ impl TenantShardId {
        )
    }

+    pub fn range(&self) -> RangeInclusive<Self> {
+        RangeInclusive::new(*self, *self)
+    }
+
    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
        ShardSlug(self)
    }
--- a/libs/wal_decoder/proto/interpreted_wal.proto
+++ b/libs/wal_decoder/proto/interpreted_wal.proto
@@ -5,6 +5,7 @@ package interpreted_wal;
 message InterpretedWalRecords {
  repeated InterpretedWalRecord records = 1;
  optional uint64 next_record_lsn = 2;
+  optional uint64 raw_wal_start_lsn = 3;
 }

 message InterpretedWalRecord {
--- a/libs/wal_decoder/src/models.rs
+++ b/libs/wal_decoder/src/models.rs
@@ -60,7 +60,11 @@ pub struct InterpretedWalRecords {
    pub records: Vec<InterpretedWalRecord>,
    // Start LSN of the next record after the batch.
    // Note that said record may not belong to the current shard.
-    pub next_record_lsn: Option<Lsn>,
+    pub next_record_lsn: Lsn,
+    // Inclusive start LSN of the PG WAL from which the interpreted
+    // WAL records were extracted. Note that this is not necessarily the
+    // start LSN of the first interpreted record in the batch.
+    pub raw_wal_start_lsn: Option<Lsn>,
 }

 /// An interpreted Postgres WAL record, ready to be handled by the pageserver
--- a/libs/wal_decoder/src/wire_format.rs
+++ b/libs/wal_decoder/src/wire_format.rs
@@ -167,7 +167,8 @@ impl TryFrom<InterpretedWalRecords> for proto::InterpretedWalRecords {
            .collect::<Result<Vec<_>, _>>()?;
        Ok(proto::InterpretedWalRecords {
            records,
-            next_record_lsn: value.next_record_lsn.map(|l| l.0),
+            next_record_lsn: Some(value.next_record_lsn.0),
+            raw_wal_start_lsn: value.raw_wal_start_lsn.map(|l| l.0),
        })
    }
 }
@@ -254,7 +255,11 @@ impl TryFrom<proto::InterpretedWalRecords> for InterpretedWalRecords {

        Ok(InterpretedWalRecords {
            records,
-            next_record_lsn: value.next_record_lsn.map(Lsn::from),
+            next_record_lsn: value
+                .next_record_lsn
+                .map(Lsn::from)
+                .expect("Always provided"),
+            raw_wal_start_lsn: value.raw_wal_start_lsn.map(Lsn::from),
        })
    }
 }
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -477,6 +477,26 @@ impl Client {
        self.request(Method::POST, &uri, ()).await.map(|_| ())
    }

+    pub async fn timeline_download_heatmap_layers(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        concurrency: Option<usize>,
+    ) -> Result<()> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{}/timeline/{}/download_heatmap_layers",
+            self.mgmt_api_endpoint, tenant_shard_id, timeline_id
+        ))
+        .expect("Cannot build URL");
+
+        if let Some(concurrency) = concurrency {
+            path.query_pairs_mut()
+                .append_pair("concurrency", &format!("{}", concurrency));
+        }
+
+        self.request(Method::POST, path, ()).await.map(|_| ())
+    }
+
    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
        let uri = format!(
            "{}/v1/tenant/{}/reset",
--- a/pageserver/ctl/src/key.rs
+++ b/pageserver/ctl/src/key.rs
@@ -345,6 +345,7 @@ impl AuxFileV2 {
                AuxFileV2::Recognized("pg_logical/replorigin_checkpoint", hash)
            }
            (2, 1) => AuxFileV2::Recognized("pg_replslot/", hash),
+            (3, 1) => AuxFileV2::Recognized("pg_stat/pgstat.stat", hash),
            (1, 0xff) => AuxFileV2::OtherWithPrefix("pg_logical/", hash),
            (0xff, 0xff) => AuxFileV2::Other(hash),
            _ => return None,
--- a/pageserver/src/aux_file.rs
+++ b/pageserver/src/aux_file.rs
@@ -39,6 +39,7 @@ fn aux_hash_to_metadata_key(dir_level1: u8, dir_level2: u8, data: &[u8]) -> Key

 const AUX_DIR_PG_LOGICAL: u8 = 0x01;
 const AUX_DIR_PG_REPLSLOT: u8 = 0x02;
+const AUX_DIR_PG_STAT: u8 = 0x03;
 const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;

 /// Encode the aux file into a fixed-size key.
@@ -53,6 +54,7 @@ const AUX_DIR_PG_UNKNOWN: u8 = 0xFF;
 /// * pg_logical/replorigin_checkpoint -> 0x0103
 /// * pg_logical/others -> 0x01FF
 /// * pg_replslot/ -> 0x0201
+/// * pg_stat/pgstat.stat -> 0x0301
 /// * others -> 0xFFFF
 ///
 /// If you add new AUX files to this function, please also add a test case to `test_encoding_portable`.
@@ -75,6 +77,8 @@ pub fn encode_aux_file_key(path: &str) -> Key {
        aux_hash_to_metadata_key(AUX_DIR_PG_LOGICAL, 0xFF, fname.as_bytes())
    } else if let Some(fname) = path.strip_prefix("pg_replslot/") {
        aux_hash_to_metadata_key(AUX_DIR_PG_REPLSLOT, 0x01, fname.as_bytes())
+    } else if let Some(fname) = path.strip_prefix("pg_stat/") {
+        aux_hash_to_metadata_key(AUX_DIR_PG_STAT, 0x01, fname.as_bytes())
    } else {
        if cfg!(debug_assertions) {
            warn!(
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -13,7 +13,7 @@
 use anyhow::{anyhow, Context};
 use bytes::{BufMut, Bytes, BytesMut};
 use fail::fail_point;
-use pageserver_api::key::Key;
+use pageserver_api::key::{rel_block_to_key, Key};
 use postgres_ffi::pg_constants;
 use std::fmt::Write as FmtWrite;
 use std::time::{Instant, SystemTime};
@@ -42,8 +42,8 @@ use utils::lsn::Lsn;
 pub enum BasebackupError {
    #[error("basebackup pageserver error {0:#}")]
    Server(#[from] anyhow::Error),
-    #[error("basebackup client error {0:#}")]
-    Client(#[source] io::Error),
+    #[error("basebackup client error {0:#} when {1}")]
+    Client(#[source] io::Error, &'static str),
 }

 /// Create basebackup with non-rel data in it.
@@ -234,7 +234,7 @@ where
        self.ar
            .append(&header, self.buf.as_slice())
            .await
-            .map_err(BasebackupError::Client)?;
+            .map_err(|e| BasebackupError::Client(e, "flush"))?;

        self.total_blocks += nblocks;
        debug!("Added to basebackup slru {} relsize {}", segname, nblocks);
@@ -264,6 +264,31 @@ where
    async fn send_tarball(mut self) -> Result<(), BasebackupError> {
        // TODO include checksum

+        // Construct the pg_control file from the persisted checkpoint and pg_control
+        // information. But we only add this to the tarball at the end, so that if the
+        // writing is interrupted half-way through, the resulting incomplete tarball will
+        // be missing the pg_control file, which prevents PostgreSQL from starting up on
+        // it. With proper error handling, you should never try to start up from an
+        // incomplete basebackup in the first place, of course, but this is a nice little
+        // extra safety measure.
+        let checkpoint_bytes = self
+            .timeline
+            .get_checkpoint(self.lsn, self.ctx)
+            .await
+            .context("failed to get checkpoint bytes")?;
+        let pg_control_bytes = self
+            .timeline
+            .get_control_file(self.lsn, self.ctx)
+            .await
+            .context("failed to get control bytes")?;
+        let (pg_control_bytes, system_identifier, was_shutdown) =
+            postgres_ffi::generate_pg_control(
+                &pg_control_bytes,
+                &checkpoint_bytes,
+                self.lsn,
+                self.timeline.pg_version,
+            )?;
+
        let lazy_slru_download = self.timeline.get_lazy_slru_download() && !self.full_backup;

        let pgversion = self.timeline.pg_version;
@@ -273,9 +298,9 @@ where
        for dir in subdirs.iter() {
            let header = new_tar_header_dir(dir)?;
            self.ar
-                .append(&header, &mut io::empty())
+                .append(&header, io::empty())
                .await
-                .context("could not add directory to basebackup tarball")?;
+                .map_err(|e| BasebackupError::Client(e, "send_tarball"))?;
        }

        // Send config files.
@@ -286,13 +311,13 @@ where
                self.ar
                    .append(&header, data)
                    .await
-                    .context("could not add config file to basebackup tarball")?;
+                    .map_err(|e| BasebackupError::Client(e, "send_tarball,pg_hba.conf"))?;
            } else {
                let header = new_tar_header(filepath, 0)?;
                self.ar
-                    .append(&header, &mut io::empty())
+                    .append(&header, io::empty())
                    .await
-                    .context("could not add config file to basebackup tarball")?;
+                    .map_err(|e| BasebackupError::Client(e, "send_tarball,add_config_file"))?;
            }
        }
        if !lazy_slru_download {
@@ -401,12 +426,16 @@ where
                // In future we will not generate AUX record for "pg_logical/replorigin_checkpoint" at all,
                // but now we should handle (skip) it for backward compatibility.
                continue;
+            } else if path == "pg_stat/pgstat.stat" && !was_shutdown {
+                // Drop statistic in case of abnormal termination, i.e. if we're not starting from the exact LSN
+                // of a shutdown checkpoint.
+                continue;
            }
            let header = new_tar_header(&path, content.len() as u64)?;
            self.ar
                .append(&header, &*content)
                .await
-                .context("could not add aux file to basebackup tarball")?;
+                .map_err(|e| BasebackupError::Client(e, "send_tarball,add_aux_file"))?;
        }

        if min_restart_lsn != Lsn::MAX {
@@ -419,7 +448,7 @@ where
            self.ar
                .append(&header, &data[..])
                .await
-                .context("could not add restart.lsn file to basebackup tarball")?;
+                .map_err(|e| BasebackupError::Client(e, "send_tarball,restart.lsn"))?;
        }
        for xid in self
            .timeline
@@ -451,9 +480,9 @@ where
            let crc32 = crc32c::crc32c(&content);
            content.extend_from_slice(&crc32.to_le_bytes());
            let header = new_tar_header("pg_logical/replorigin_checkpoint", content.len() as u64)?;
-            self.ar.append(&header, &*content).await.context(
-                "could not add pg_logical/replorigin_checkpoint file to basebackup tarball",
-            )?;
+            self.ar.append(&header, &*content).await.map_err(|e| {
+                BasebackupError::Client(e, "send_tarball,pg_logical/replorigin_checkpoint")
+            })?;
        }

        fail_point!("basebackup-before-control-file", |_| {
@@ -462,9 +491,13 @@ where
            )))
        });

-        // Generate pg_control and bootstrap WAL segment.
-        self.add_pgcontrol_file().await?;
-        self.ar.finish().await.map_err(BasebackupError::Client)?;
+        // Last, add the pg_control file and bootstrap WAL segment.
+        self.add_pgcontrol_file(pg_control_bytes, system_identifier)
+            .await?;
+        self.ar
+            .finish()
+            .await
+            .map_err(|e| BasebackupError::Client(e, "send_tarball,finish"))?;
        debug!("all tarred up!");
        Ok(())
    }
@@ -482,9 +515,9 @@ where
            let file_name = dst.to_segfile_name(0);
            let header = new_tar_header(&file_name, 0)?;
            self.ar
-                .append(&header, &mut io::empty())
+                .append(&header, io::empty())
                .await
-                .map_err(BasebackupError::Client)?;
+                .map_err(|e| BasebackupError::Client(e, "add_rel,empty"))?;
            return Ok(());
        }

@@ -498,13 +531,9 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(
-                        src,
-                        blknum,
-                        Version::Lsn(self.lsn),
-                        self.ctx,
-                        self.io_concurrency.clone(),
-                    )
+                    // TODO: investigate using get_vectored for the entire startblk..endblk range.
+                    // But this code path is not on the critical path for most basebackups (?).
+                    .get(rel_block_to_key(src, blknum), self.lsn, self.ctx)
                    .await
                    .map_err(|e| BasebackupError::Server(e.into()))?;
                segment_data.extend_from_slice(&img[..]);
@@ -515,7 +544,7 @@ where
            self.ar
                .append(&header, segment_data.as_slice())
                .await
-                .map_err(BasebackupError::Client)?;
+                .map_err(|e| BasebackupError::Client(e, "add_rel,segment"))?;

            seg += 1;
            startblk = endblk;
@@ -566,7 +595,7 @@ where
            self.ar
                .append(&header, pg_version_str.as_bytes())
                .await
-                .map_err(BasebackupError::Client)?;
+                .map_err(|e| BasebackupError::Client(e, "add_dbdir,PG_VERSION"))?;

            info!("timeline.pg_version {}", self.timeline.pg_version);

@@ -576,7 +605,7 @@ where
                self.ar
                    .append(&header, &img[..])
                    .await
-                    .map_err(BasebackupError::Client)?;
+                    .map_err(|e| BasebackupError::Client(e, "add_dbdir,global/pg_filenode.map"))?;
            } else {
                warn!("global/pg_filenode.map is missing");
            }
@@ -612,9 +641,9 @@ where
            let path = format!("base/{}", dbnode);
            let header = new_tar_header_dir(&path)?;
            self.ar
-                .append(&header, &mut io::empty())
+                .append(&header, io::empty())
                .await
-                .map_err(BasebackupError::Client)?;
+                .map_err(|e| BasebackupError::Client(e, "add_dbdir,base"))?;

            if let Some(img) = relmap_img {
                let dst_path = format!("base/{}/PG_VERSION", dbnode);
@@ -627,14 +656,14 @@ where
                self.ar
                    .append(&header, pg_version_str.as_bytes())
                    .await
-                    .map_err(BasebackupError::Client)?;
+                    .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/PG_VERSION"))?;

                let relmap_path = format!("base/{}/pg_filenode.map", dbnode);
                let header = new_tar_header(&relmap_path, img.len() as u64)?;
                self.ar
                    .append(&header, &img[..])
                    .await
-                    .map_err(BasebackupError::Client)?;
+                    .map_err(|e| BasebackupError::Client(e, "add_dbdir,base/pg_filenode.map"))?;
            }
        };
        Ok(())
@@ -663,7 +692,7 @@ where
        self.ar
            .append(&header, &buf[..])
            .await
-            .map_err(BasebackupError::Client)?;
+            .map_err(|e| BasebackupError::Client(e, "add_twophase_file"))?;

        Ok(())
    }
@@ -672,7 +701,11 @@ where
    // Add generated pg_control file and bootstrap WAL segment.
    // Also send zenith.signal file with extra bootstrap data.
    //
-    async fn add_pgcontrol_file(&mut self) -> Result<(), BasebackupError> {
+    async fn add_pgcontrol_file(
+        &mut self,
+        pg_control_bytes: Bytes,
+        system_identifier: u64,
+    ) -> Result<(), BasebackupError> {
        // add zenith.signal file
        let mut zenith_signal = String::new();
        if self.prev_record_lsn == Lsn(0) {
@@ -693,32 +726,14 @@ where
                zenith_signal.as_bytes(),
            )
            .await
-            .map_err(BasebackupError::Client)?;
-
-        let checkpoint_bytes = self
-            .timeline
-            .get_checkpoint(self.lsn, self.ctx)
-            .await
-            .context("failed to get checkpoint bytes")?;
-        let pg_control_bytes = self
-            .timeline
-            .get_control_file(self.lsn, self.ctx)
-            .await
-            .context("failed get control bytes")?;
-
-        let (pg_control_bytes, system_identifier) = postgres_ffi::generate_pg_control(
-            &pg_control_bytes,
-            &checkpoint_bytes,
-            self.lsn,
-            self.timeline.pg_version,
-        )?;
+            .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;

        //send pg_control
        let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
        self.ar
            .append(&header, &pg_control_bytes[..])
            .await
-            .map_err(BasebackupError::Client)?;
+            .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,pg_control"))?;

        //send wal segment
        let segno = self.lsn.segment_number(WAL_SEGMENT_SIZE);
@@ -742,7 +757,7 @@ where
        self.ar
            .append(&header, &wal_seg[..])
            .await
-            .map_err(BasebackupError::Client)?;
+            .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,wal_segment"))?;
        Ok(())
    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -134,6 +134,7 @@ fn main() -> anyhow::Result<()> {
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
    info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
+    info!(?conf.validate_wal_contiguity, "starting with WAL contiguity validation");
    info!(?conf.page_service_pipelining, "starting with page service pipelining config");
    info!(?conf.get_vectored_concurrent_io, "starting with get_vectored IO concurrency config");

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -197,6 +197,10 @@ pub struct PageServerConf {
    /// Enable read path debugging. If enabled, read key errors will print a backtrace of the layer
    /// files read.
    pub enable_read_path_debugging: bool,
+
+    /// Interpreted protocol feature: if enabled, validate that the logical WAL received from
+    /// safekeepers does not have gaps.
+    pub validate_wal_contiguity: bool,
 }

 /// Token for authentication to safekeepers
@@ -360,6 +364,7 @@ impl PageServerConf {
            page_service_pipelining,
            get_vectored_concurrent_io,
            enable_read_path_debugging,
+            validate_wal_contiguity,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -446,6 +451,7 @@ impl PageServerConf {
            virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()),
            no_sync: no_sync.unwrap_or(false),
            enable_read_path_debugging: enable_read_path_debugging.unwrap_or(false),
+            validate_wal_contiguity: validate_wal_contiguity.unwrap_or(false),
        };

        // ------------------------------------------------------------
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -98,6 +98,7 @@ pub struct RequestContext {
    download_behavior: DownloadBehavior,
    access_stats_behavior: AccessStatsBehavior,
    page_content_kind: PageContentKind,
+    read_path_debug: bool,
 }

 /// The kind of access to the page cache.
@@ -155,6 +156,7 @@ impl RequestContextBuilder {
                download_behavior: DownloadBehavior::Download,
                access_stats_behavior: AccessStatsBehavior::Update,
                page_content_kind: PageContentKind::Unknown,
+                read_path_debug: false,
            },
        }
    }
@@ -168,6 +170,7 @@ impl RequestContextBuilder {
                download_behavior: original.download_behavior,
                access_stats_behavior: original.access_stats_behavior,
                page_content_kind: original.page_content_kind,
+                read_path_debug: original.read_path_debug,
            },
        }
    }
@@ -191,6 +194,11 @@ impl RequestContextBuilder {
        self
    }

+    pub(crate) fn read_path_debug(mut self, b: bool) -> Self {
+        self.inner.read_path_debug = b;
+        self
+    }
+
    pub fn build(self) -> RequestContext {
        self.inner
    }
@@ -291,4 +299,8 @@ impl RequestContext {
    pub(crate) fn page_content_kind(&self) -> PageContentKind {
        self.page_content_kind
    }
+
+    pub(crate) fn read_path_debug(&self) -> bool {
+        self.read_path_debug
+    }
 }
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -173,6 +173,7 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient {
                        listen_pg_port: m.postgres_port,
                        listen_http_addr: m.http_host,
                        listen_http_port: m.http_port,
+                        listen_https_port: None, // TODO: Support https.
                        availability_zone_id: az_id.expect("Checked above"),
                    })
                }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -824,6 +824,38 @@ paths:
              schema:
                $ref: "#/components/schemas/TenantConfigResponse"

+  /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers:
+    parameters:
+      - name: tenant_shard_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: concurrency
+        description: Maximum number of concurrent downloads (capped at remote storage concurrency)
+        in: query
+        required: false
+        schema:
+          type: integer
+    post:
+      description: |
+        Download all layers in the specified timeline's heatmap. The `tenant_shard_id` parameter
+        may be used to target all shards of a tenant when the unsharded form is used, or a specific
+        tenant shard with the sharded form.
+      responses:
+        "200":
+          description: Success
+    delete:
+      description: Stop any on-going background downloads of heatmap layers for the specified timeline.
+      responses:
+        "200":
+          description: Success
+
  /v1/utilization:
    get:
      description: |
@@ -882,6 +914,8 @@ components:
              properties:
                reason:
                  type: string
+        gc_blocking:
+          type: string

    TenantCreateRequest:
      allOf:
@@ -1080,9 +1114,15 @@ components:
          type: integer
        state:
          type: string
+        min_readable_lsn:
+          type: string
+          format: hex
        latest_gc_cutoff_lsn:
          type: string
          format: hex
+        applied_gc_cutoff_lsn:
+          type: string
+          format: hex

    SyntheticSizeResponse:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -68,6 +68,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::*;

 use crate::config::PageServerConf;
+use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -482,6 +483,11 @@ async fn build_timeline_info_common(

    let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();

+    let min_readable_lsn = std::cmp::max(
+        timeline.get_gc_cutoff_lsn(),
+        *timeline.get_applied_gc_cutoff_lsn(),
+    );
+
    let info = TimelineInfo {
        tenant_id: timeline.tenant_shard_id,
        timeline_id: timeline.timeline_id,
@@ -493,7 +499,12 @@ async fn build_timeline_info_common(
        initdb_lsn,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
-        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
+        // Externally, expose the lowest LSN that can be used to create a branch as the "GC cutoff", although internally
+        // we distinguish between the "planned" GC cutoff (PITR point) and the "latest" GC cutoff (where we
+        // actually trimmed data to), which can pass each other when PITR is changed.
+        latest_gc_cutoff_lsn: min_readable_lsn,
+        min_readable_lsn,
+        applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
        current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
        current_logical_size_is_accurate: match current_logical_size.accuracy() {
            tenant::timeline::logical_size::Accuracy::Approximate => false,
@@ -1453,6 +1464,59 @@ async fn timeline_layer_scan_disposable_keys(
    )
 }

+async fn timeline_download_heatmap_layers_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    // Only used in the case where remote storage is not configured.
+    const DEFAULT_MAX_CONCURRENCY: usize = 100;
+    // A conservative default.
+    const DEFAULT_CONCURRENCY: usize = 16;
+
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    let desired_concurrency =
+        parse_query_param(&request, "concurrency")?.unwrap_or(DEFAULT_CONCURRENCY);
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    let max_concurrency = get_config(&request)
+        .remote_storage_config
+        .as_ref()
+        .map(|c| c.concurrency_limit())
+        .unwrap_or(DEFAULT_MAX_CONCURRENCY);
+    let concurrency = std::cmp::min(max_concurrency, desired_concurrency);
+
+    timeline.start_heatmap_layers_download(concurrency).await?;
+
+    json_response(StatusCode::ACCEPTED, ())
+}
+
+async fn timeline_shutdown_download_heatmap_layers_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
+
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let timeline =
+        active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id)
+            .await?;
+
+    timeline.stop_and_drain_heatmap_layers_download().await;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn layer_download_handler(
    request: Request<Body>,
    _cancel: CancellationToken,
@@ -2331,6 +2395,7 @@ async fn timeline_checkpoint_handler(
                    match e {
                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
                        CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
+                        CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
                        CompactionError::Other(e) => ApiError::InternalServerError(e)
                    }
                )?;
@@ -2508,14 +2573,30 @@ async fn deletion_queue_flush(
    }
 }

-/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
 async fn getpage_at_lsn_handler(
+    request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    getpage_at_lsn_handler_inner(false, request, cancel).await
+}
+
+async fn touchpage_at_lsn_handler(
+    request: Request<Body>,
+    cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    getpage_at_lsn_handler_inner(true, request, cancel).await
+}
+
+/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
+async fn getpage_at_lsn_handler_inner(
+    touch: bool,
    request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+    // Require pageserver admin permission for this API instead of only tenant-level token.
+    check_permission(&request, None)?;
    let state = get_state(&request);

    struct Key(pageserver_api::key::Key);
@@ -2530,22 +2611,29 @@ async fn getpage_at_lsn_handler(

    let key: Key = parse_query_param(&request, "key")?
        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'key' query parameter")))?;
-    let lsn: Lsn = parse_query_param(&request, "lsn")?
-        .ok_or_else(|| ApiError::BadRequest(anyhow!("missing 'lsn' query parameter")))?;
+    let lsn: Option<Lsn> = parse_query_param(&request, "lsn")?;

    async {
        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+        // Enable read path debugging
+        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true).build();
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;

+        // Use last_record_lsn if no lsn is provided
+        let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
        let page = timeline.get(key.0, lsn, &ctx).await?;

-        Result::<_, ApiError>::Ok(
-            Response::builder()
-                .status(StatusCode::OK)
-                .header(header::CONTENT_TYPE, "application/octet-stream")
-                .body(hyper::Body::from(page))
-                .unwrap(),
-        )
+        if touch {
+            json_response(StatusCode::OK, ())
+        } else {
+            Result::<_, ApiError>::Ok(
+                Response::builder()
+                    .status(StatusCode::OK)
+                    .header(header::CONTENT_TYPE, "application/octet-stream")
+                    .body(hyper::Body::from(page))
+                    .unwrap(),
+            )
+        }
    }
    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
    .await
@@ -3616,6 +3704,14 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
            |r| api_handler(r, layer_map_info_handler),
        )
+        .post(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers",
+            |r| api_handler(r, timeline_download_heatmap_layers_handler),
+        )
+        .delete(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers",
+            |r| api_handler(r, timeline_shutdown_download_heatmap_layers_handler),
+        )
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
            |r| api_handler(r, layer_download_handler),
@@ -3672,6 +3768,10 @@ pub fn make_router(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
        )
+        .get(
+            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
+            |r| api_handler(r, touchpage_at_lsn_handler),
+        )
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
            |r| api_handler(r, timeline_collect_keyspace),
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
+use std::os::fd::RawFd;
 use std::pin::Pin;
 use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
@@ -129,7 +130,7 @@ pub(crate) static LAYERS_PER_READ: Lazy<HistogramVec> = Lazy::new(|| {
        "Layers visited to serve a single read (read amplification). In a batch, all visited layers count towards every read.",
        &["tenant_id", "shard_id", "timeline_id"],
        // Low resolution to reduce cardinality.
-        vec![1.0, 5.0, 10.0, 25.0, 50.0, 100.0],
+        vec![4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0],
    )
    .expect("failed to define a metric")
 });
@@ -1439,27 +1440,66 @@ impl Drop for SmgrOpTimer {
 }

 impl SmgrOpFlushInProgress {
-    pub(crate) async fn measure<Fut, O>(self, mut started_at: Instant, mut fut: Fut) -> O
+    /// The caller must guarantee that `socket_fd`` outlives this function.
+    pub(crate) async fn measure<Fut, O>(
+        self,
+        started_at: Instant,
+        mut fut: Fut,
+        socket_fd: RawFd,
+    ) -> O
    where
        Fut: std::future::Future<Output = O>,
    {
        let mut fut = std::pin::pin!(fut);

-        // Whenever observe_guard gets called, or dropped,
-        // it adds the time elapsed since its last call to metrics.
-        // Last call is tracked in `now`.
+        let mut logged = false;
+        let mut last_counter_increment_at = started_at;
        let mut observe_guard = scopeguard::guard(
-            || {
+            |is_timeout| {
                let now = Instant::now();
-                let elapsed = now - started_at;
-                self.global_micros
-                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                self.per_timeline_micros
-                    .inc_by(u64::try_from(elapsed.as_micros()).unwrap());
-                started_at = now;
+
+                // Increment counter
+                {
+                    let elapsed_since_last_observe = now - last_counter_increment_at;
+                    self.global_micros
+                        .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
+                    self.per_timeline_micros
+                        .inc_by(u64::try_from(elapsed_since_last_observe.as_micros()).unwrap());
+                    last_counter_increment_at = now;
+                }
+
+                // Log something on every timeout, and on completion but only if we hit a timeout.
+                if is_timeout || logged {
+                    logged = true;
+                    let elapsed_total = now - started_at;
+                    let msg = if is_timeout {
+                        "slow flush ongoing"
+                    } else {
+                        "slow flush completed or cancelled"
+                    };
+
+                    let (inq, outq) = {
+                        // SAFETY: caller guarantees that `socket_fd` outlives this function.
+                        #[cfg(target_os = "linux")]
+                        unsafe {
+                            (
+                                utils::linux_socket_ioctl::inq(socket_fd).unwrap_or(-2),
+                                utils::linux_socket_ioctl::outq(socket_fd).unwrap_or(-2),
+                            )
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            _ = socket_fd; // appease unused lint on macOS
+                            (-1, -1)
+                        }
+                    };
+
+                    let elapsed_total_secs = format!("{:.6}", elapsed_total.as_secs_f64());
+                    tracing::info!(elapsed_total_secs, inq, outq, msg);
+                }
            },
            |mut observe| {
-                observe();
+                observe(false);
            },
        );

@@ -1467,7 +1507,7 @@ impl SmgrOpFlushInProgress {
            match tokio::time::timeout(Duration::from_secs(10), &mut fut).await {
                Ok(v) => return v,
                Err(_timeout) => {
-                    (*observe_guard)();
+                    (*observe_guard)(true);
                }
            }
        }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -34,11 +34,13 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::SystemTime;
 use std::time::{Duration, Instant};
+use strum_macros::IntoStaticStr;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::io::{AsyncWriteExt, BufWriter};
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use utils::logging::warn_slow;
 use utils::sync::gate::{Gate, GateGuard};
 use utils::sync::spsc_fold;
 use utils::{
@@ -73,6 +75,7 @@ use pageserver_api::models::PageTraceEvent;
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
+use std::os::fd::AsRawFd;

 /// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
 /// is not yet in state [`TenantState::Active`].
@@ -80,6 +83,9 @@ use postgres_ffi::BLCKSZ;
 /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
 const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

+/// Threshold at which to log a warning about slow GetPage requests.
+const WARN_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
+
 ///////////////////////////////////////////////////////////////////////////////

 pub struct Listener {
@@ -236,7 +242,7 @@ pub async fn libpq_listener_main(

 type ConnectionHandlerResult = anyhow::Result<()>;

-#[instrument(skip_all, fields(peer_addr))]
+#[instrument(skip_all, fields(peer_addr, application_name))]
 #[allow(clippy::too_many_arguments)]
 async fn page_service_conn_main(
    conf: &'static PageServerConf,
@@ -257,6 +263,8 @@ async fn page_service_conn_main(
        .set_nodelay(true)
        .context("could not set TCP_NODELAY")?;

+    let socket_fd = socket.as_raw_fd();
+
    let peer_addr = socket.peer_addr().context("get peer address")?;
    tracing::Span::current().record("peer_addr", field::display(peer_addr));

@@ -305,7 +313,7 @@ async fn page_service_conn_main(
        cancel.clone(),
        gate_guard,
    );
-    let pgbackend = PostgresBackend::new_from_io(socket, peer_addr, auth_type, None)?;
+    let pgbackend = PostgresBackend::new_from_io(socket_fd, socket, peer_addr, auth_type, None)?;

    match pgbackend.run(&mut conn_handler, &cancel).await {
        Ok(()) => {
@@ -591,6 +599,7 @@ struct BatchedTestRequest {
 /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
 /// so that we don't keep the [`Timeline::gate`] open while the batch
 /// is being built up inside the [`spsc_fold`] (pagestream pipelining).
+#[derive(IntoStaticStr)]
 enum BatchedFeMessage {
    Exists {
        span: Span,
@@ -635,6 +644,10 @@ enum BatchedFeMessage {
 }

 impl BatchedFeMessage {
+    fn as_static_str(&self) -> &'static str {
+        self.into()
+    }
+
    fn observe_execution_start(&mut self, at: Instant) {
        match self {
            BatchedFeMessage::Exists { timer, .. }
@@ -914,7 +927,7 @@ impl PageServerHandler {
                    &shard,
                    req.hdr.request_lsn,
                    req.hdr.not_modified_since,
-                    &shard.get_latest_gc_cutoff_lsn(),
+                    &shard.get_applied_gc_cutoff_lsn(),
                    ctx,
                )
                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
@@ -1286,12 +1299,15 @@ impl PageServerHandler {
            ))?;

            // what we want to do
+            let socket_fd = pgb_writer.socket_fd;
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
            let flush_fut = match flushing_timer {
-                Some(flushing_timer) => {
-                    futures::future::Either::Left(flushing_timer.measure(Instant::now(), flush_fut))
-                }
+                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+                    Instant::now(),
+                    flush_fut,
+                    socket_fd,
+                )),
                None => futures::future::Either::Right(flush_fut),
            };
            // do it while respecting cancellation
@@ -1457,17 +1473,20 @@ impl PageServerHandler {
                }
            };

-            let err = self
-                .pagesteam_handle_batched_message(
+            let result = warn_slow(
+                msg.as_static_str(),
+                WARN_SLOW_GETPAGE_THRESHOLD,
+                self.pagesteam_handle_batched_message(
                    pgb_writer,
                    msg,
                    io_concurrency.clone(),
                    &cancel,
                    protocol_version,
                    ctx,
-                )
-                .await;
-            match err {
+                ),
+            )
+            .await;
+            match result {
                Ok(()) => {}
                Err(e) => break e,
            }
@@ -1630,13 +1649,17 @@ impl PageServerHandler {
                            return Err(e);
                        }
                    };
-                    self.pagesteam_handle_batched_message(
-                        pgb_writer,
-                        batch,
-                        io_concurrency.clone(),
-                        &cancel,
-                        protocol_version,
-                        &ctx,
+                    warn_slow(
+                        batch.as_static_str(),
+                        WARN_SLOW_GETPAGE_THRESHOLD,
+                        self.pagesteam_handle_batched_message(
+                            pgb_writer,
+                            batch,
+                            io_concurrency.clone(),
+                            &cancel,
+                            protocol_version,
+                            &ctx,
+                        ),
                    )
                    .await?;
                }
@@ -1793,6 +1816,13 @@ impl PageServerHandler {
                .as_millis()
                .to_string()
        });
+
+        info!(
+            "acquired lease for {} until {}",
+            lsn,
+            valid_until_str.as_deref().unwrap_or("<unknown>")
+        );
+
        let bytes = valid_until_str.as_ref().map(|x| x.as_bytes());

        pgb.write_message_noflush(&BeMessage::RowDescription(&[RowDescriptor::text_col(
@@ -1810,7 +1840,7 @@ impl PageServerHandler {
        req: &PagestreamExistsRequest,
        ctx: &RequestContext,
    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
            req.hdr.request_lsn,
@@ -1837,7 +1867,7 @@ impl PageServerHandler {
        req: &PagestreamNblocksRequest,
        ctx: &RequestContext,
    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
            req.hdr.request_lsn,
@@ -1864,7 +1894,7 @@ impl PageServerHandler {
        req: &PagestreamDbSizeRequest,
        ctx: &RequestContext,
    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
            req.hdr.request_lsn,
@@ -1954,7 +1984,7 @@ impl PageServerHandler {
        req: &PagestreamGetSlruSegmentRequest,
        ctx: &RequestContext,
    ) -> Result<PagestreamBeMessage, PageStreamError> {
-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
        let lsn = Self::wait_or_get_last_lsn(
            timeline,
            req.hdr.request_lsn,
@@ -2050,7 +2080,8 @@ impl PageServerHandler {
    {
        fn map_basebackup_error(err: BasebackupError) -> QueryError {
            match err {
-                BasebackupError::Client(e) => QueryError::Disconnected(ConnectionError::Io(e)),
+                // TODO: passthrough the error site to the final error message?
+                BasebackupError::Client(e, _) => QueryError::Disconnected(ConnectionError::Io(e)),
                BasebackupError::Server(e) => QueryError::Other(e),
            }
        }
@@ -2071,7 +2102,7 @@ impl PageServerHandler {
            //return Err(QueryError::NotFound("timeline is archived".into()))
        }

-        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff_lsn = timeline.get_applied_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
            // Backup was requested at a particular LSN. Wait for it to arrive.
            info!("waiting for {}", lsn);
@@ -2151,10 +2182,12 @@ impl PageServerHandler {
                .await
                .map_err(map_basebackup_error)?;
            }
-            writer
-                .flush()
-                .await
-                .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?;
+            writer.flush().await.map_err(|e| {
+                map_basebackup_error(BasebackupError::Client(
+                    e,
+                    "handle_basebackup_request,flush",
+                ))
+            })?;
        }

        pgb.write_message_noflush(&BeMessage::CopyDone)
@@ -2454,9 +2487,16 @@ where
    fn startup(
        &mut self,
        _pgb: &mut PostgresBackend<IO>,
-        _sm: &FeStartupPacket,
+        sm: &FeStartupPacket,
    ) -> Result<(), QueryError> {
        fail::fail_point!("ps::connection-start::startup-packet");
+
+        if let FeStartupPacket::StartupMessage { params, .. } = sm {
+            if let Some(app_name) = params.get("application_name") {
+                Span::current().record("application_name", field::display(app_name));
+            }
+        };
+
        Ok(())
    }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -23,13 +23,14 @@ use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
 use itertools::Itertools;
-use pageserver_api::key::Key;
 use pageserver_api::key::{
    dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key,
-    relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key,
-    slru_segment_key_range, slru_segment_size_to_key, twophase_file_key, twophase_key_range,
-    CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
+    rel_tag_sparse_key_range, relmap_file_key, repl_origin_key, repl_origin_key_range,
+    slru_block_to_key, slru_dir_to_key, slru_segment_key_range, slru_segment_size_to_key,
+    twophase_file_key, twophase_key_range, CompactKey, RelDirExists, AUX_FILES_KEY, CHECKPOINT_KEY,
+    CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY,
 };
+use pageserver_api::key::{rel_tag_sparse_key, Key};
 use pageserver_api::keyspace::SparseKeySpace;
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
@@ -44,7 +45,7 @@ use std::ops::ControlFlow;
 use std::ops::Range;
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, trace, warn};
+use tracing::{debug, info, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::pausable_failpoint;
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -490,12 +491,33 @@ impl Timeline {
        if !dbdirs.contains_key(&(tag.spcnode, tag.dbnode)) {
            return Ok(false);
        }
-        // fetch directory listing
+
+        // Read path: first read the new reldir keyspace. Early return if the relation exists.
+        // Otherwise, read the old reldir keyspace.
+        // TODO: if IndexPart::rel_size_migration is `Migrated`, we only need to read from v2.
+
+        if self.get_rel_size_v2_enabled() {
+            // fetch directory listing (new)
+            let key = rel_tag_sparse_key(tag.spcnode, tag.dbnode, tag.relnode, tag.forknum);
+            let buf = RelDirExists::decode_option(version.sparse_get(self, key, ctx).await?)
+                .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            let exists_v2 = buf == RelDirExists::Exists;
+            // Fast path: if the relation exists in the new format, return true.
+            // TODO: we should have a verification mode that checks both keyspaces
+            // to ensure the relation only exists in one of them.
+            if exists_v2 {
+                return Ok(true);
+            }
+        }
+
+        // fetch directory listing (old)
+
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
        let buf = version.get(self, key, ctx).await?;

        let dir = RelDirectory::des(&buf)?;
-        Ok(dir.rels.contains(&(tag.relnode, tag.forknum)))
+        let exists_v1 = dir.rels.contains(&(tag.relnode, tag.forknum));
+        Ok(exists_v1)
    }

    /// Get a list of all existing relations in given tablespace and database.
@@ -513,12 +535,12 @@ impl Timeline {
        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<HashSet<RelTag>, PageReconstructError> {
-        // fetch directory listing
+        // fetch directory listing (old)
        let key = rel_dir_to_key(spcnode, dbnode);
        let buf = version.get(self, key, ctx).await?;

        let dir = RelDirectory::des(&buf)?;
-        let rels: HashSet<RelTag> =
+        let rels_v1: HashSet<RelTag> =
            HashSet::from_iter(dir.rels.iter().map(|(relnode, forknum)| RelTag {
                spcnode,
                dbnode,
@@ -526,6 +548,46 @@ impl Timeline {
                forknum: *forknum,
            }));

+        if !self.get_rel_size_v2_enabled() {
+            return Ok(rels_v1);
+        }
+
+        // scan directory listing (new), merge with the old results
+        let key_range = rel_tag_sparse_key_range(spcnode, dbnode);
+        let io_concurrency = IoConcurrency::spawn_from_conf(
+            self.conf,
+            self.gate
+                .enter()
+                .map_err(|_| PageReconstructError::Cancelled)?,
+        );
+        let results = self
+            .scan(
+                KeySpace::single(key_range),
+                version.get_lsn(),
+                ctx,
+                io_concurrency,
+            )
+            .await?;
+        let mut rels = rels_v1;
+        for (key, val) in results {
+            let val = RelDirExists::decode(&val?)
+                .map_err(|_| PageReconstructError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            assert_eq!(key.field6, 1);
+            assert_eq!(key.field2, spcnode);
+            assert_eq!(key.field3, dbnode);
+            let tag = RelTag {
+                spcnode,
+                dbnode,
+                relnode: key.field4,
+                forknum: key.field5,
+            };
+            if val == RelDirExists::Removed {
+                debug_assert!(!rels.contains(&tag), "removed reltag in v2");
+                continue;
+            }
+            let did_not_contain = rels.insert(tag);
+            debug_assert!(did_not_contain, "duplicate reltag in v2");
+        }
        Ok(rels)
    }

@@ -611,7 +673,7 @@ impl Timeline {
    ) -> Result<LsnForTimestamp, PageReconstructError> {
        pausable_failpoint!("find-lsn-for-timestamp-pausable");

-        let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
+        let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
        let gc_cutoff_planned = {
            let gc_info = self.gc_info.read().unwrap();
            gc_info.min_cutoff()
@@ -1144,7 +1206,11 @@ impl Timeline {

        let dense_keyspace = result.to_keyspace();
        let sparse_keyspace = SparseKeySpace(KeySpace {
-            ranges: vec![Key::metadata_aux_key_range(), repl_origin_key_range()],
+            ranges: vec![
+                Key::metadata_aux_key_range(),
+                repl_origin_key_range(),
+                Key::rel_dir_sparse_key_range(),
+            ],
        });

        if cfg!(debug_assertions) {
@@ -1274,12 +1340,22 @@ pub struct DatadirModification<'a> {

    /// For special "directory" keys that store key-value maps, track the size of the map
    /// if it was updated in this modification.
-    pending_directory_entries: Vec<(DirectoryKind, usize)>,
+    pending_directory_entries: Vec<(DirectoryKind, MetricsUpdate)>,

    /// An **approximation** of how many metadata bytes will be written to the EphemeralFile.
    pending_metadata_bytes: usize,
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum MetricsUpdate {
+    /// Set the metrics to this value
+    Set(u64),
+    /// Increment the metrics by this value
+    Add(u64),
+    /// Decrement the metrics by this value
+    Sub(u64),
+}
+
 impl DatadirModification<'_> {
    // When a DatadirModification is committed, we do a monolithic serialization of all its contents.  WAL records can
    // contain multiple pages, so the pageserver's record-based batch size isn't sufficient to bound this allocation: we
@@ -1359,7 +1435,8 @@ impl DatadirModification<'_> {
        let buf = DbDirectory::ser(&DbDirectory {
            dbdirs: HashMap::new(),
        })?;
-        self.pending_directory_entries.push((DirectoryKind::Db, 0));
+        self.pending_directory_entries
+            .push((DirectoryKind::Db, MetricsUpdate::Set(0)));
        self.put(DBDIR_KEY, Value::Image(buf.into()));

        let buf = if self.tline.pg_version >= 17 {
@@ -1372,7 +1449,7 @@ impl DatadirModification<'_> {
            })
        }?;
        self.pending_directory_entries
-            .push((DirectoryKind::TwoPhase, 0));
+            .push((DirectoryKind::TwoPhase, MetricsUpdate::Set(0)));
        self.put(TWOPHASEDIR_KEY, Value::Image(buf.into()));

        let buf: Bytes = SlruSegmentDirectory::ser(&SlruSegmentDirectory::default())?.into();
@@ -1382,17 +1459,23 @@ impl DatadirModification<'_> {
        // harmless but they'd just be dropped on later compaction.
        if self.tline.tenant_shard_id.is_shard_zero() {
            self.put(slru_dir_to_key(SlruKind::Clog), empty_dir.clone());
-            self.pending_directory_entries
-                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.pending_directory_entries.push((
+                DirectoryKind::SlruSegment(SlruKind::Clog),
+                MetricsUpdate::Set(0),
+            ));
            self.put(
                slru_dir_to_key(SlruKind::MultiXactMembers),
                empty_dir.clone(),
            );
-            self.pending_directory_entries
-                .push((DirectoryKind::SlruSegment(SlruKind::Clog), 0));
+            self.pending_directory_entries.push((
+                DirectoryKind::SlruSegment(SlruKind::Clog),
+                MetricsUpdate::Set(0),
+            ));
            self.put(slru_dir_to_key(SlruKind::MultiXactOffsets), empty_dir);
-            self.pending_directory_entries
-                .push((DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets), 0));
+            self.pending_directory_entries.push((
+                DirectoryKind::SlruSegment(SlruKind::MultiXactOffsets),
+                MetricsUpdate::Set(0),
+            ));
        }

        Ok(())
@@ -1658,10 +1741,16 @@ impl DatadirModification<'_> {
        }
        if r.is_none() {
            // Create RelDirectory
+            // TODO: if we have fully migrated to v2, no need to create this directory
            let buf = RelDirectory::ser(&RelDirectory {
                rels: HashSet::new(),
            })?;
-            self.pending_directory_entries.push((DirectoryKind::Rel, 0));
+            self.pending_directory_entries
+                .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
+            if self.tline.get_rel_size_v2_enabled() {
+                self.pending_directory_entries
+                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
+            }
            self.put(
                rel_dir_to_key(spcnode, dbnode),
                Value::Image(Bytes::from(buf)),
@@ -1685,8 +1774,10 @@ impl DatadirModification<'_> {
            if !dir.xids.insert(xid) {
                anyhow::bail!("twophase file for xid {} already exists", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
            let xid = xid as u32;
@@ -1694,8 +1785,10 @@ impl DatadirModification<'_> {
            if !dir.xids.insert(xid) {
                anyhow::bail!("twophase file for xid {} already exists", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
        };
        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -1744,8 +1837,10 @@ impl DatadirModification<'_> {
        let mut dir = DbDirectory::des(&buf)?;
        if dir.dbdirs.remove(&(spcnode, dbnode)).is_some() {
            let buf = DbDirectory::ser(&dir)?;
-            self.pending_directory_entries
-                .push((DirectoryKind::Db, dir.dbdirs.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::Db,
+                MetricsUpdate::Set(dir.dbdirs.len() as u64),
+            ));
            self.put(DBDIR_KEY, Value::Image(buf.into()));
        } else {
            warn!(
@@ -1778,39 +1873,85 @@ impl DatadirModification<'_> {
        // tablespace.  Create the reldir entry for it if so.
        let mut dbdir = DbDirectory::des(&self.get(DBDIR_KEY, ctx).await.context("read db")?)
            .context("deserialize db")?;
-        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
-        let mut rel_dir =
+
+        let dbdir_exists =
            if let hash_map::Entry::Vacant(e) = dbdir.dbdirs.entry((rel.spcnode, rel.dbnode)) {
                // Didn't exist. Update dbdir
                e.insert(false);
                let buf = DbDirectory::ser(&dbdir).context("serialize db")?;
-                self.pending_directory_entries
-                    .push((DirectoryKind::Db, dbdir.dbdirs.len()));
+                self.pending_directory_entries.push((
+                    DirectoryKind::Db,
+                    MetricsUpdate::Set(dbdir.dbdirs.len() as u64),
+                ));
                self.put(DBDIR_KEY, Value::Image(buf.into()));
-
-                // and create the RelDirectory
-                RelDirectory::default()
+                false
            } else {
-                // reldir already exists, fetch it
-                RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
-                    .context("deserialize db")?
+                true
            };

+        let rel_dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode);
+        let mut rel_dir = if !dbdir_exists {
+            // Create the RelDirectory
+            RelDirectory::default()
+        } else {
+            // reldir already exists, fetch it
+            RelDirectory::des(&self.get(rel_dir_key, ctx).await.context("read db")?)
+                .context("deserialize db")?
+        };
+
        // Add the new relation to the rel directory entry, and write it back
        if !rel_dir.rels.insert((rel.relnode, rel.forknum)) {
            return Err(RelationError::AlreadyExists);
        }

-        self.pending_directory_entries
-            .push((DirectoryKind::Rel, rel_dir.rels.len()));
-
-        self.put(
-            rel_dir_key,
-            Value::Image(Bytes::from(
-                RelDirectory::ser(&rel_dir).context("serialize")?,
-            )),
-        );
-
+        if self.tline.get_rel_size_v2_enabled() {
+            let sparse_rel_dir_key =
+                rel_tag_sparse_key(rel.spcnode, rel.dbnode, rel.relnode, rel.forknum);
+            // check if the rel_dir_key exists in v2
+            let val = self
+                .sparse_get(sparse_rel_dir_key, ctx)
+                .await
+                .map_err(|e| RelationError::Other(e.into()))?;
+            let val = RelDirExists::decode_option(val)
+                .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+            if val == RelDirExists::Exists {
+                return Err(RelationError::AlreadyExists);
+            }
+            self.put(
+                sparse_rel_dir_key,
+                Value::Image(RelDirExists::Exists.encode()),
+            );
+            if !dbdir_exists {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)));
+                self.pending_directory_entries
+                    .push((DirectoryKind::RelV2, MetricsUpdate::Set(0)));
+                // We don't write `rel_dir_key -> rel_dir.rels` back to the storage in the v2 path unless it's the initial creation.
+                // TODO: if we have fully migrated to v2, no need to create this directory. Otherwise, there
+                // will be key not found errors if we don't create an empty one for rel_size_v2.
+                self.put(
+                    rel_dir_key,
+                    Value::Image(Bytes::from(
+                        RelDirectory::ser(&RelDirectory::default()).context("serialize")?,
+                    )),
+                );
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::RelV2, MetricsUpdate::Add(1)));
+        } else {
+            if !dbdir_exists {
+                self.pending_directory_entries
+                    .push((DirectoryKind::Rel, MetricsUpdate::Set(0)))
+            }
+            self.pending_directory_entries
+                .push((DirectoryKind::Rel, MetricsUpdate::Add(1)));
+            self.put(
+                rel_dir_key,
+                Value::Image(Bytes::from(
+                    RelDirectory::ser(&rel_dir).context("serialize")?,
+                )),
+            );
+        }
        // Put size
        let size_key = rel_size_to_key(rel);
        let buf = nblocks.to_le_bytes();
@@ -1896,9 +2037,34 @@ impl DatadirModification<'_> {

            let mut dirty = false;
            for rel_tag in rel_tags {
-                if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                let found = if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) {
+                    self.pending_directory_entries
+                        .push((DirectoryKind::Rel, MetricsUpdate::Sub(1)));
                    dirty = true;
+                    true
+                } else if self.tline.get_rel_size_v2_enabled() {
+                    // The rel is not found in the old reldir key, so we need to check the new sparse keyspace.
+                    // Note that a relation can only exist in one of the two keyspaces (guaranteed by the ingestion
+                    // logic).
+                    let key =
+                        rel_tag_sparse_key(spc_node, db_node, rel_tag.relnode, rel_tag.forknum);
+                    let val = RelDirExists::decode_option(self.sparse_get(key, ctx).await?)
+                        .map_err(|_| RelationError::Other(anyhow::anyhow!("invalid reldir key")))?;
+                    if val == RelDirExists::Exists {
+                        self.pending_directory_entries
+                            .push((DirectoryKind::RelV2, MetricsUpdate::Sub(1)));
+                        // put tombstone
+                        self.put(key, Value::Image(RelDirExists::Removed.encode()));
+                        // no need to set dirty to true
+                        true
+                    } else {
+                        false
+                    }
+                } else {
+                    false
+                };

+                if found {
                    // update logical size
                    let size_key = rel_size_to_key(rel_tag);
                    let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1914,8 +2080,6 @@ impl DatadirModification<'_> {

            if dirty {
                self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?)));
-                self.pending_directory_entries
-                    .push((DirectoryKind::Rel, dir.rels.len()));
            }
        }

@@ -1939,8 +2103,10 @@ impl DatadirModification<'_> {
        if !dir.segments.insert(segno) {
            anyhow::bail!("slru segment {kind:?}/{segno} already exists");
        }
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
+        self.pending_directory_entries.push((
+            DirectoryKind::SlruSegment(kind),
+            MetricsUpdate::Set(dir.segments.len() as u64),
+        ));
        self.put(
            dir_key,
            Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -1987,8 +2153,10 @@ impl DatadirModification<'_> {
        if !dir.segments.remove(&segno) {
            warn!("slru segment {:?}/{} does not exist", kind, segno);
        }
-        self.pending_directory_entries
-            .push((DirectoryKind::SlruSegment(kind), dir.segments.len()));
+        self.pending_directory_entries.push((
+            DirectoryKind::SlruSegment(kind),
+            MetricsUpdate::Set(dir.segments.len() as u64),
+        ));
        self.put(
            dir_key,
            Value::Image(Bytes::from(SlruSegmentDirectory::ser(&dir)?)),
@@ -2020,8 +2188,10 @@ impl DatadirModification<'_> {
            if !dir.xids.remove(&xid) {
                warn!("twophase file for xid {} does not exist", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectoryV17::ser(&dir)?)
        } else {
            let xid: u32 = u32::try_from(xid)?;
@@ -2030,8 +2200,10 @@ impl DatadirModification<'_> {
            if !dir.xids.remove(&xid) {
                warn!("twophase file for xid {} does not exist", xid);
            }
-            self.pending_directory_entries
-                .push((DirectoryKind::TwoPhase, dir.xids.len()));
+            self.pending_directory_entries.push((
+                DirectoryKind::TwoPhase,
+                MetricsUpdate::Set(dir.xids.len() as u64),
+            ));
            Bytes::from(TwoPhaseDirectory::ser(&dir)?)
        };
        self.put(TWOPHASEDIR_KEY, Value::Image(newdirbuf));
@@ -2092,6 +2264,13 @@ impl DatadirModification<'_> {
                self.tline.aux_file_size_estimator.on_add(content.len());
                new_files.push((path, content));
            }
+            // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.
+            // Compute doesn't know if previous version of this file exists or not, so
+            // attempt to delete non-existing file can cause this message.
+            // To avoid false alarms, log it as info rather than warning.
+            (None, true) if path.starts_with("pg_stat/") => {
+                info!("removing non-existing pg_stat file: {}", path)
+            }
            (None, true) => warn!("removing non-existing aux file: {}", path),
        }
        let new_val = aux_file::encode_file_value(&new_files)?;
@@ -2147,7 +2326,7 @@ impl DatadirModification<'_> {
        }

        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
-            writer.update_directory_entries_count(kind, count as u64);
+            writer.update_directory_entries_count(kind, count);
        }

        Ok(())
@@ -2233,7 +2412,7 @@ impl DatadirModification<'_> {
        }

        for (kind, count) in std::mem::take(&mut self.pending_directory_entries) {
-            writer.update_directory_entries_count(kind, count as u64);
+            writer.update_directory_entries_count(kind, count);
        }

        self.pending_metadata_bytes = 0;
@@ -2297,6 +2476,22 @@ impl DatadirModification<'_> {
        self.tline.get(key, lsn, ctx).await
    }

+    /// Get a key from the sparse keyspace. Automatically converts the missing key error
+    /// and the empty value into None.
+    async fn sparse_get(
+        &self,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, PageReconstructError> {
+        let val = self.get(key, ctx).await;
+        match val {
+            Ok(val) if val.is_empty() => Ok(None),
+            Ok(val) => Ok(Some(val)),
+            Err(PageReconstructError::MissingKey(_)) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
    fn put(&mut self, key: Key, val: Value) {
        if Self::is_data_key(&key) {
            self.put_data(key.to_compact(), val)
@@ -2379,6 +2574,23 @@ impl Version<'_> {
        }
    }

+    /// Get a key from the sparse keyspace. Automatically converts the missing key error
+    /// and the empty value into None.
+    async fn sparse_get(
+        &self,
+        timeline: &Timeline,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Option<Bytes>, PageReconstructError> {
+        let val = self.get(timeline, key, ctx).await;
+        match val {
+            Ok(val) if val.is_empty() => Ok(None),
+            Ok(val) => Ok(Some(val)),
+            Err(PageReconstructError::MissingKey(_)) => Ok(None),
+            Err(e) => Err(e),
+        }
+    }
+
    fn get_lsn(&self) -> Lsn {
        match self {
            Version::Lsn(lsn) => *lsn,
@@ -2438,6 +2650,7 @@ pub(crate) enum DirectoryKind {
    Rel,
    AuxFiles,
    SlruSegment(SlruKind),
+    RelV2,
 }

 impl DirectoryKind {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -40,6 +40,8 @@ use remote_timeline_client::manifest::{
 use remote_timeline_client::UploadQueueNotReadyError;
 use remote_timeline_client::FAILED_REMOTE_OP_RETRIES;
 use remote_timeline_client::FAILED_UPLOAD_WARN_THRESHOLD;
+use secondary::heatmap::HeatMapTenant;
+use secondary::heatmap::HeatMapTimeline;
 use std::collections::BTreeMap;
 use std::fmt;
 use std::future::Future;
@@ -55,6 +57,7 @@ use timeline::offload::OffloadError;
 use timeline::CompactFlags;
 use timeline::CompactOptions;
 use timeline::CompactionError;
+use timeline::PreviousHeatmap;
 use timeline::ShutdownMode;
 use tokio::io::BufReader;
 use tokio::sync::watch;
@@ -262,6 +265,7 @@ struct TimelinePreload {
    timeline_id: TimelineId,
    client: RemoteTimelineClient,
    index_part: Result<MaybeDeletedIndexPart, DownloadError>,
+    previous_heatmap: Option<PreviousHeatmap>,
 }

 pub(crate) struct TenantPreload {
@@ -1128,6 +1132,7 @@ impl Tenant {
        resources: TimelineResources,
        mut index_part: IndexPart,
        metadata: TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
        ancestor: Option<Arc<Timeline>>,
        cause: LoadTimelineCause,
        ctx: &RequestContext,
@@ -1158,6 +1163,7 @@ impl Tenant {
        let timeline = self.create_timeline_struct(
            timeline_id,
            &metadata,
+            previous_heatmap,
            ancestor.clone(),
            resources,
            CreateTimelineCause::Load,
@@ -1557,8 +1563,18 @@ impl Tenant {
            }
        }

+        // TODO(vlad): Could go to S3 if the secondary is freezing cold and hasn't even
+        // pulled the first heatmap. Not entirely necessary since the storage controller
+        // will kick the secondary in any case and cause a download.
+        let maybe_heatmap_at = self.read_on_disk_heatmap().await;
+
        let timelines = self
-            .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel)
+            .load_timelines_metadata(
+                remote_timeline_ids,
+                remote_storage,
+                maybe_heatmap_at,
+                cancel,
+            )
            .await?;

        Ok(TenantPreload {
@@ -1571,6 +1587,26 @@ impl Tenant {
        })
    }

+    async fn read_on_disk_heatmap(&self) -> Option<(HeatMapTenant, std::time::Instant)> {
+        let on_disk_heatmap_path = self.conf.tenant_heatmap_path(&self.tenant_shard_id);
+        match tokio::fs::read_to_string(on_disk_heatmap_path).await {
+            Ok(heatmap) => match serde_json::from_str::<HeatMapTenant>(&heatmap) {
+                Ok(heatmap) => Some((heatmap, std::time::Instant::now())),
+                Err(err) => {
+                    error!("Failed to deserialize old heatmap: {err}");
+                    None
+                }
+            },
+            Err(err) => match err.kind() {
+                std::io::ErrorKind::NotFound => None,
+                _ => {
+                    error!("Unexpected IO error reading old heatmap: {err}");
+                    None
+                }
+            },
+        }
+    }
+
    ///
    /// Background task that downloads all data for a tenant and brings it to Active state.
    ///
@@ -1658,7 +1694,10 @@ impl Tenant {
            match index_part {
                MaybeDeletedIndexPart::IndexPart(index_part) => {
                    timeline_ancestors.insert(timeline_id, index_part.metadata.clone());
-                    remote_index_and_client.insert(timeline_id, (index_part, preload.client));
+                    remote_index_and_client.insert(
+                        timeline_id,
+                        (index_part, preload.client, preload.previous_heatmap),
+                    );
                }
                MaybeDeletedIndexPart::Deleted(index_part) => {
                    info!(
@@ -1677,7 +1716,7 @@ impl Tenant {
        // layer file.
        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
        for (timeline_id, remote_metadata) in sorted_timelines {
-            let (index_part, remote_client) = remote_index_and_client
+            let (index_part, remote_client, previous_heatmap) = remote_index_and_client
                .remove(&timeline_id)
                .expect("just put it in above");

@@ -1697,6 +1736,7 @@ impl Tenant {
                    timeline_id,
                    index_part,
                    remote_metadata,
+                    previous_heatmap,
                    self.get_timeline_resources_for(remote_client),
                    LoadTimelineCause::Attach,
                    ctx,
@@ -1846,11 +1886,13 @@ impl Tenant {
    }

    #[instrument(skip_all, fields(timeline_id=%timeline_id))]
+    #[allow(clippy::too_many_arguments)]
    async fn load_remote_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
        index_part: IndexPart,
        remote_metadata: TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
        resources: TimelineResources,
        cause: LoadTimelineCause,
        ctx: &RequestContext,
@@ -1880,6 +1922,7 @@ impl Tenant {
            resources,
            index_part,
            remote_metadata,
+            previous_heatmap,
            ancestor,
            cause,
            ctx,
@@ -1891,14 +1934,29 @@ impl Tenant {
        self: &Arc<Tenant>,
        timeline_ids: HashSet<TimelineId>,
        remote_storage: &GenericRemoteStorage,
+        heatmap: Option<(HeatMapTenant, std::time::Instant)>,
        cancel: CancellationToken,
    ) -> anyhow::Result<HashMap<TimelineId, TimelinePreload>> {
+        let mut timeline_heatmaps = heatmap.map(|h| (h.0.into_timelines_index(), h.1));
+
        let mut part_downloads = JoinSet::new();
        for timeline_id in timeline_ids {
            let cancel_clone = cancel.clone();
+
+            let previous_timeline_heatmap = timeline_heatmaps.as_mut().and_then(|hs| {
+                hs.0.remove(&timeline_id).map(|h| PreviousHeatmap::Active {
+                    heatmap: h,
+                    read_at: hs.1,
+                })
+            });
            part_downloads.spawn(
-                self.load_timeline_metadata(timeline_id, remote_storage.clone(), cancel_clone)
-                    .instrument(info_span!("download_index_part", %timeline_id)),
+                self.load_timeline_metadata(
+                    timeline_id,
+                    remote_storage.clone(),
+                    previous_timeline_heatmap,
+                    cancel_clone,
+                )
+                .instrument(info_span!("download_index_part", %timeline_id)),
            );
        }

@@ -1946,6 +2004,7 @@ impl Tenant {
        self: &Arc<Tenant>,
        timeline_id: TimelineId,
        remote_storage: GenericRemoteStorage,
+        previous_heatmap: Option<PreviousHeatmap>,
        cancel: CancellationToken,
    ) -> impl Future<Output = TimelinePreload> {
        let client = self.build_timeline_client(timeline_id, remote_storage);
@@ -1961,6 +2020,7 @@ impl Tenant {
                client,
                timeline_id,
                index_part,
+                previous_heatmap,
            }
        }
    }
@@ -2072,7 +2132,12 @@ impl Tenant {
            })?;

        let timeline_preload = self
-            .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone())
+            .load_timeline_metadata(
+                timeline_id,
+                self.remote_storage.clone(),
+                None,
+                cancel.clone(),
+            )
            .await;

        let index_part = match timeline_preload.index_part {
@@ -2106,6 +2171,7 @@ impl Tenant {
            timeline_id,
            index_part,
            remote_metadata,
+            None,
            timeline_resources,
            LoadTimelineCause::Unoffload,
            &ctx,
@@ -2821,7 +2887,7 @@ impl Tenant {
        };
        let metadata = index_part.metadata.clone();
        self
-            .load_remote_timeline(timeline_id, index_part, metadata, resources, LoadTimelineCause::ImportPgdata{
+            .load_remote_timeline(timeline_id, index_part, metadata, None, resources, LoadTimelineCause::ImportPgdata{
                create_guard: timeline_create_guard, activate, }, &ctx)
            .await?
            .ready_to_activate()
@@ -3035,6 +3101,9 @@ impl Tenant {
                if let Some(queue) = queue {
                    outcome = queue
                        .iteration(cancel, ctx, &self.gc_block, &timeline)
+                        .instrument(
+                            info_span!("gc_compact_timeline", timeline_id = %timeline.timeline_id),
+                        )
                        .await?;
                }
            }
@@ -3081,6 +3150,12 @@ impl Tenant {
            // Offload failures don't trip the circuit breaker, since they're cheap to retry and
            // shouldn't block compaction.
            CompactionError::Offload(_) => {}
+            CompactionError::CollectKeySpaceError(err) => {
+                self.compaction_circuit_breaker
+                    .lock()
+                    .unwrap()
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
+            }
            CompactionError::Other(err) => {
                self.compaction_circuit_breaker
                    .lock()
@@ -3858,6 +3933,13 @@ impl Tenant {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    pub fn get_rel_size_v2_enabled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf
+            .rel_size_v2_enabled
+            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
+    }
+
    pub fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
        tenant_conf
@@ -4030,6 +4112,7 @@ impl Tenant {
        &self,
        new_timeline_id: TimelineId,
        new_metadata: &TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
        ancestor: Option<Arc<Timeline>>,
        resources: TimelineResources,
        cause: CreateTimelineCause,
@@ -4053,6 +4136,7 @@ impl Tenant {
            self.conf,
            Arc::clone(&self.tenant_conf),
            new_metadata,
+            previous_heatmap,
            ancestor,
            new_timeline_id,
            self.tenant_shard_id,
@@ -4695,24 +4779,24 @@ impl Tenant {
        // We check it against both the planned GC cutoff stored in 'gc_info',
        // and the 'latest_gc_cutoff' of the last GC that was performed.  The
        // planned GC cutoff in 'gc_info' is normally larger than
-        // 'latest_gc_cutoff_lsn', but beware of corner cases like if you just
+        // 'applied_gc_cutoff_lsn', but beware of corner cases like if you just
        // changed the GC settings for the tenant to make the PITR window
        // larger, but some of the data was already removed by an earlier GC
        // iteration.

        // check against last actual 'latest_gc_cutoff' first
-        let latest_gc_cutoff_lsn = src_timeline.get_latest_gc_cutoff_lsn();
+        let applied_gc_cutoff_lsn = src_timeline.get_applied_gc_cutoff_lsn();
        {
            let gc_info = src_timeline.gc_info.read().unwrap();
            let planned_cutoff = gc_info.min_cutoff();
            if gc_info.lsn_covered_by_lease(start_lsn) {
-                tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *latest_gc_cutoff_lsn);
+                tracing::info!("skipping comparison of {start_lsn} with gc cutoff {} and planned gc cutoff {planned_cutoff} due to lsn lease", *applied_gc_cutoff_lsn);
            } else {
                src_timeline
-                    .check_lsn_is_in_scope(start_lsn, &latest_gc_cutoff_lsn)
+                    .check_lsn_is_in_scope(start_lsn, &applied_gc_cutoff_lsn)
                    .context(format!(
                        "invalid branch start lsn: less than latest GC cutoff {}",
-                        *latest_gc_cutoff_lsn,
+                        *applied_gc_cutoff_lsn,
                    ))
                    .map_err(CreateTimelineError::AncestorLsn)?;

@@ -4751,7 +4835,7 @@ impl Tenant {
            dst_prev,
            Some(src_id),
            start_lsn,
-            *src_timeline.latest_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
+            *src_timeline.applied_gc_cutoff_lsn.read(), // FIXME: should we hold onto this guard longer?
            src_timeline.initdb_lsn,
            src_timeline.pg_version,
        );
@@ -5124,6 +5208,7 @@ impl Tenant {
            .create_timeline_struct(
                new_timeline_id,
                new_metadata,
+                None,
                ancestor,
                resources,
                CreateTimelineCause::Load,
@@ -5571,7 +5656,7 @@ pub(crate) mod harness {
                lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts),
                timeline_offloading: Some(tenant_conf.timeline_offloading),
                wal_receiver_protocol_override: tenant_conf.wal_receiver_protocol_override,
-                rel_size_v2_enabled: tenant_conf.rel_size_v2_enabled,
+                rel_size_v2_enabled: Some(tenant_conf.rel_size_v2_enabled),
                gc_compaction_enabled: Some(tenant_conf.gc_compaction_enabled),
                gc_compaction_initial_threshold_kb: Some(
                    tenant_conf.gc_compaction_initial_threshold_kb,
@@ -6130,8 +6215,8 @@ mod tests {
        make_some_layers(tline.as_ref(), Lsn(0x20), &ctx).await?;

        repo.gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO)?;
-        let latest_gc_cutoff_lsn = tline.get_latest_gc_cutoff_lsn();
-        assert!(*latest_gc_cutoff_lsn > Lsn(0x25));
+        let applied_gc_cutoff_lsn = tline.get_applied_gc_cutoff_lsn();
+        assert!(*applied_gc_cutoff_lsn > Lsn(0x25));
        match tline.get(*TEST_KEY, Lsn(0x25)) {
            Ok(_) => panic!("request for page should have failed"),
            Err(err) => assert!(err.to_string().contains("not found at")),
@@ -7770,18 +7855,6 @@ mod tests {
            }

            tline.freeze_and_flush().await?;
-            // Force layers to L1
-            tline
-                .compact(
-                    &cancel,
-                    {
-                        let mut flags = EnumSet::new();
-                        flags.insert(CompactFlags::ForceL0Compaction);
-                        flags
-                    },
-                    &ctx,
-                )
-                .await?;

            if iter % 5 == 0 {
                let (_, before_delta_file_accessed) =
@@ -7794,7 +7867,6 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::ForceL0Compaction);
                            flags
                        },
                        &ctx,
@@ -8241,8 +8313,6 @@ mod tests {

        let cancel = CancellationToken::new();

-        // Image layer creation happens on the disk_consistent_lsn so we need to force set it now.
-        tline.force_set_disk_consistent_lsn(Lsn(0x40));
        tline
            .compact(
                &cancel,
@@ -8256,7 +8326,8 @@ mod tests {
            )
            .await
            .unwrap();
-        // Image layers are created at repartition LSN
+
+        // Image layers are created at last_record_lsn
        let images = tline
            .inspect_image_layers(Lsn(0x40), &ctx, io_concurrency.clone())
            .await
@@ -8427,7 +8498,7 @@ mod tests {
            .await?;
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
@@ -8535,7 +8606,7 @@ mod tests {
        // increase GC horizon and compact again
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x40))
                .wait()
@@ -8703,8 +8774,8 @@ mod tests {

        // Force set disk consistent lsn so we can get the cutoff at `end_lsn`.
        info!(
-            "latest_gc_cutoff_lsn: {}",
-            *timeline.get_latest_gc_cutoff_lsn()
+            "applied_gc_cutoff_lsn: {}",
+            *timeline.get_applied_gc_cutoff_lsn()
        );
        timeline.force_set_disk_consistent_lsn(end_lsn);

@@ -8730,7 +8801,7 @@ mod tests {

        // Make lease on a already GC-ed LSN.
        // 0/80 does not have a valid lease + is below latest_gc_cutoff
-        assert!(Lsn(0x80) < *timeline.get_latest_gc_cutoff_lsn());
+        assert!(Lsn(0x80) < *timeline.get_applied_gc_cutoff_lsn());
        timeline
            .init_lsn_lease(Lsn(0x80), timeline.get_lsn_lease_length(), &ctx)
            .expect_err("lease request on GC-ed LSN should fail");
@@ -8921,7 +8992,7 @@ mod tests {
        };
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
@@ -9008,7 +9079,7 @@ mod tests {
        // increase GC horizon and compact again
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x40))
                .wait()
@@ -9461,7 +9532,7 @@ mod tests {
            .await?;
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
@@ -9608,7 +9679,7 @@ mod tests {
        // increase GC horizon and compact again
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x38))
                .wait()
@@ -9709,7 +9780,7 @@ mod tests {
            .await?;
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
@@ -9960,7 +10031,7 @@ mod tests {

        {
            parent_tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x10))
                .wait()
@@ -9980,7 +10051,7 @@ mod tests {

        {
            branch_tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x50))
                .wait()
@@ -10336,7 +10407,7 @@ mod tests {

        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
@@ -10721,7 +10792,7 @@ mod tests {
            .await?;
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
@@ -10972,7 +11043,7 @@ mod tests {
            .await?;
        {
            tline
-                .latest_gc_cutoff_lsn
+                .applied_gc_cutoff_lsn
                .lock_for_write()
                .store_and_unlock(Lsn(0x30))
                .wait()
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -485,7 +485,9 @@ impl TenantConfOpt {
            wal_receiver_protocol_override: self
                .wal_receiver_protocol_override
                .or(global_conf.wal_receiver_protocol_override),
-            rel_size_v2_enabled: self.rel_size_v2_enabled.or(global_conf.rel_size_v2_enabled),
+            rel_size_v2_enabled: self
+                .rel_size_v2_enabled
+                .unwrap_or(global_conf.rel_size_v2_enabled),
            gc_compaction_enabled: self
                .gc_compaction_enabled
                .unwrap_or(global_conf.gc_compaction_enabled),
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -130,7 +130,10 @@ struct TimelineMetadataBodyV2 {
    prev_record_lsn: Option<Lsn>,
    ancestor_timeline: Option<TimelineId>,
    ancestor_lsn: Lsn,
+
+    // The LSN at which GC was last executed.  Synonym of [`Timeline::applied_gc_cutoff_lsn`].
    latest_gc_cutoff_lsn: Lsn,
+
    initdb_lsn: Lsn,
    pg_version: u32,
 }
--- a/pageserver/src/tenant/secondary/heatmap.rs
+++ b/pageserver/src/tenant/secondary/heatmap.rs
@@ -1,4 +1,4 @@
-use std::time::SystemTime;
+use std::{collections::HashMap, time::SystemTime};

 use crate::tenant::{remote_timeline_client::index::LayerFileMetadata, storage_layer::LayerName};

@@ -8,7 +8,7 @@ use serde_with::{serde_as, DisplayFromStr, TimestampSeconds};
 use utils::{generation::Generation, id::TimelineId};

 #[derive(Serialize, Deserialize)]
-pub(super) struct HeatMapTenant {
+pub(crate) struct HeatMapTenant {
    /// Generation of the attached location that uploaded the heatmap: this is not required
    /// for correctness, but acts as a hint to secondary locations in order to detect thrashing
    /// in the unlikely event that two attached locations are both uploading conflicting heatmaps.
@@ -25,8 +25,17 @@ pub(super) struct HeatMapTenant {
    pub(super) upload_period_ms: Option<u128>,
 }

+impl HeatMapTenant {
+    pub(crate) fn into_timelines_index(self) -> HashMap<TimelineId, HeatMapTimeline> {
+        self.timelines
+            .into_iter()
+            .map(|htl| (htl.timeline_id, htl))
+            .collect()
+    }
+}
+
 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub(crate) struct HeatMapTimeline {
    #[serde_as(as = "DisplayFromStr")]
    pub(crate) timeline_id: TimelineId,
@@ -35,13 +44,13 @@ pub(crate) struct HeatMapTimeline {
 }

 #[serde_as]
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub(crate) struct HeatMapLayer {
    pub(crate) name: LayerName,
    pub(crate) metadata: LayerFileMetadata,

    #[serde_as(as = "TimestampSeconds<i64>")]
-    pub(super) access_time: SystemTime,
+    pub(crate) access_time: SystemTime,
    // TODO: an actual 'heat' score that would let secondary locations prioritize downloading
    // the hottest layers, rather than trying to simply mirror whatever layers are on-disk on the primary.
 }
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -394,7 +394,7 @@ pub(super) async fn gather_inputs(
            ancestor_lsn,
            last_record: last_record_lsn,
            // this is not used above, because it might not have updated recently enough
-            latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
+            latest_gc_cutoff: *timeline.get_applied_gc_cutoff_lsn(),
            next_pitr_cutoff,
            retention_param_cutoff,
            lease_points,
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -51,8 +51,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use futures::StreamExt;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
-use pageserver_api::key::DBDIR_KEY;
-use pageserver_api::key::{Key, KEY_SIZE};
+use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
@@ -967,7 +966,10 @@ impl DeltaLayerInner {
                .as_slice()
                .iter()
                .filter_map(|(_, blob_meta)| {
-                    if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
+                    if blob_meta.key.is_rel_dir_key()
+                        || blob_meta.key == DBDIR_KEY
+                        || blob_meta.key.is_aux_file_key()
+                    {
                        // The size of values for these keys is unbounded and can
                        // grow very large in pathological cases.
                        None
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -48,8 +48,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use itertools::Itertools;
 use pageserver_api::config::MaxVectoredReadBytes;
-use pageserver_api::key::DBDIR_KEY;
-use pageserver_api::key::{Key, KEY_SIZE};
+use pageserver_api::key::{Key, DBDIR_KEY, KEY_SIZE};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_api::value::Value;
@@ -603,7 +602,10 @@ impl ImageLayerInner {
                    .as_slice()
                    .iter()
                    .filter_map(|(_, blob_meta)| {
-                        if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY {
+                        if blob_meta.key.is_rel_dir_key()
+                            || blob_meta.key == DBDIR_KEY
+                            || blob_meta.key.is_aux_file_key()
+                        {
                            // The size of values for these keys is unbounded and can
                            // grow very large in pathological cases.
                            None
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -136,6 +136,22 @@ pub(crate) fn local_layer_path(
    }
 }

+pub(crate) enum LastEviction {
+    Never,
+    At(std::time::Instant),
+    Evicting,
+}
+
+impl LastEviction {
+    pub(crate) fn happened_after(&self, timepoint: std::time::Instant) -> bool {
+        match self {
+            LastEviction::Never => false,
+            LastEviction::At(evicted_at) => evicted_at > &timepoint,
+            LastEviction::Evicting => true,
+        }
+    }
+}
+
 impl Layer {
    /// Creates a layer value for a file we know to not be resident.
    pub(crate) fn for_evicted(
@@ -405,6 +421,17 @@ impl Layer {
        self.0.metadata()
    }

+    pub(crate) fn last_evicted_at(&self) -> LastEviction {
+        match self.0.last_evicted_at.try_lock() {
+            Ok(lock) => match *lock {
+                None => LastEviction::Never,
+                Some(at) => LastEviction::At(at),
+            },
+            Err(std::sync::TryLockError::WouldBlock) => LastEviction::Evicting,
+            Err(std::sync::TryLockError::Poisoned(p)) => panic!("Lock poisoned: {p}"),
+        }
+    }
+
    pub(crate) fn get_timeline_id(&self) -> Option<TimelineId> {
        self.0
            .timeline
@@ -656,7 +683,9 @@ struct LayerInner {

    /// When the Layer was last evicted but has not been downloaded since.
    ///
-    /// This is used solely for updating metrics. See [`LayerImplMetrics::redownload_after`].
+    /// This is used for skipping evicted layers from the previous heatmap (see
+    /// `[Timeline::generate_heatmap]`) and for updating metrics
+    /// (see [`LayerImplMetrics::redownload_after`]).
    last_evicted_at: std::sync::Mutex<Option<std::time::Instant>>,

    #[cfg(test)]
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -287,6 +287,7 @@ fn log_compaction_error(
    sleep_duration: Duration,
    task_cancelled: bool,
 ) {
+    use crate::pgdatadir_mapping::CollectKeySpaceError;
    use crate::tenant::upload_queue::NotInitialized;
    use crate::tenant::PageReconstructError;
    use CompactionError::*;
@@ -294,6 +295,8 @@ fn log_compaction_error(
    let level = match err {
        ShuttingDown => return,
        Offload(_) => Level::ERROR,
+        CollectKeySpaceError(CollectKeySpaceError::Cancelled) => Level::INFO,
+        CollectKeySpaceError(_) => Level::ERROR,
        _ if task_cancelled => Level::INFO,
        Other(err) => {
            let root_cause = err.root_cause();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -4,6 +4,7 @@ pub mod delete;
 pub(crate) mod detach_ancestor;
 mod eviction_task;
 pub(crate) mod handle;
+mod heatmap_layers_downloader;
 pub(crate) mod import_pgdata;
 mod init;
 pub mod layer_manager;
@@ -21,6 +22,7 @@ use chrono::{DateTime, Utc};
 use compaction::CompactionOutcome;
 use enumset::EnumSet;
 use fail::fail_point;
+use futures::FutureExt;
 use futures::{stream::FuturesUnordered, StreamExt};
 use handle::ShardTimelineId;
 use layer_manager::Shutdown;
@@ -117,7 +119,7 @@ use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::metrics::{TimelineMetrics, DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_GLOBAL};
-use crate::pgdatadir_mapping::CalculateLogicalSizeError;
+use crate::pgdatadir_mapping::{CalculateLogicalSizeError, MetricsUpdate};
 use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;
@@ -150,16 +152,15 @@ use super::{
    config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized,
    MaybeOffloaded,
 };
-use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf};
+use super::{
+    debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf, HeatMapTimeline,
+};
 use super::{remote_timeline_client::index::IndexPart, storage_layer::LayerFringe};
 use super::{
    remote_timeline_client::RemoteTimelineClient, remote_timeline_client::WaitCompletionError,
    storage_layer::ReadableLayer,
 };
-use super::{
-    secondary::heatmap::{HeatMapLayer, HeatMapTimeline},
-    GcError,
-};
+use super::{secondary::heatmap::HeatMapLayer, GcError};

 #[cfg(test)]
 use pageserver_api::value::Value;
@@ -328,6 +329,7 @@ pub struct Timeline {
    // in `crate::page_service` writes these metrics.
    pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,

+    directory_metrics_inited: [AtomicBool; DirectoryKind::KINDS_NUM],
    directory_metrics: [AtomicU64; DirectoryKind::KINDS_NUM],

    /// Ensures layers aren't frozen by checkpointer between
@@ -352,8 +354,11 @@ pub struct Timeline {
    /// to be notified when layer flushing has finished, subscribe to the layer_flush_done channel
    layer_flush_done_tx: tokio::sync::watch::Sender<(u64, Result<(), FlushLayerError>)>,

-    // Needed to ensure that we can't create a branch at a point that was already garbage collected
-    pub latest_gc_cutoff_lsn: Rcu<Lsn>,
+    // The LSN at which we have executed GC: whereas [`Self::gc_info`] records the LSN at which
+    // we _intend_ to GC (i.e. the PITR cutoff), this LSN records where we actually last did it.
+    // Because PITR interval is mutable, it's possible for this LSN to be earlier or later than
+    // the planned GC cutoff.
+    pub applied_gc_cutoff_lsn: Rcu<Lsn>,

    pub(crate) gc_compaction_layer_update_lock: tokio::sync::RwLock<()>,

@@ -462,6 +467,20 @@ pub struct Timeline {

    /// If Some, collects GetPage metadata for an ongoing PageTrace.
    pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>,
+
+    previous_heatmap: ArcSwapOption<PreviousHeatmap>,
+
+    /// May host a background Tokio task which downloads all the layers from the current
+    /// heatmap on demand.
+    heatmap_layers_downloader: Mutex<Option<heatmap_layers_downloader::HeatmapLayersDownloader>>,
+}
+
+pub(crate) enum PreviousHeatmap {
+    Active {
+        heatmap: HeatMapTimeline,
+        read_at: std::time::Instant,
+    },
+    Obsolete,
 }

 pub type TimelineDeleteProgress = Arc<tokio::sync::Mutex<DeleteTimelineFlow>>;
@@ -1077,9 +1096,15 @@ impl Timeline {
        (history, gc_info.within_ancestor_pitr)
    }

-    /// Lock and get timeline's GC cutoff
-    pub(crate) fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
-        self.latest_gc_cutoff_lsn.read()
+    /// Read timeline's GC cutoff: this is the LSN at which GC has started to happen
+    pub(crate) fn get_applied_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
+        self.applied_gc_cutoff_lsn.read()
+    }
+
+    /// Read timeline's planned GC cutoff: this is the logical end of history that users
+    /// are allowed to read (based on configured PITR), even if physically we have more history.
+    pub(crate) fn get_gc_cutoff_lsn(&self) -> Lsn {
+        self.gc_info.read().unwrap().cutoffs.time
    }

    /// Look up given page version.
@@ -1274,7 +1299,7 @@ impl Timeline {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
-        let read_path = if self.conf.enable_read_path_debugging {
+        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
            Some(ReadPath::new(keyspace.clone(), lsn))
        } else {
            None
@@ -1587,7 +1612,7 @@ impl Timeline {
                    };

                    if init || validate {
-                        let latest_gc_cutoff_lsn = self.get_latest_gc_cutoff_lsn();
+                        let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
                        if lsn < *latest_gc_cutoff_lsn {
                            bail!("tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}", lsn, *latest_gc_cutoff_lsn);
                        }
@@ -1857,7 +1882,7 @@ impl Timeline {
        // Signal compaction failure to avoid L0 flush stalls when it's broken.
        match result {
            Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
-            Err(CompactionError::Other(_)) => {
+            Err(CompactionError::Other(_)) | Err(CompactionError::CollectKeySpaceError(_)) => {
                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
            }
            // Don't change the current value on offload failure or shutdown. We don't want to
@@ -2020,6 +2045,11 @@ impl Timeline {
        tracing::debug!("Cancelling CancellationToken");
        self.cancel.cancel();

+        // If we have a background task downloading heatmap layers stop it.
+        // The background downloads are sensitive to timeline cancellation (done above),
+        // so the drain will be immediate.
+        self.stop_and_drain_heatmap_layers_download().await;
+
        // Ensure Prevent new page service requests from starting.
        self.handles.shutdown();

@@ -2337,6 +2367,14 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

+    pub(crate) fn get_rel_size_v2_enabled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .rel_size_v2_enabled
+            .unwrap_or(self.conf.default_tenant_conf.rel_size_v2_enabled)
+    }
+
    fn get_compaction_upper_limit(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2559,6 +2597,7 @@ impl Timeline {
        conf: &'static PageServerConf,
        tenant_conf: Arc<ArcSwap<AttachedTenantConf>>,
        metadata: &TimelineMetadata,
+        previous_heatmap: Option<PreviousHeatmap>,
        ancestor: Option<Arc<Timeline>>,
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
@@ -2645,6 +2684,7 @@ impl Timeline {
                ),

                directory_metrics: array::from_fn(|_| AtomicU64::new(0)),
+                directory_metrics_inited: array::from_fn(|_| AtomicBool::new(false)),

                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

@@ -2659,7 +2699,7 @@ impl Timeline {
                    LastImageLayerCreationStatus::default(),
                )),

-                latest_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
+                applied_gc_cutoff_lsn: Rcu::new(metadata.latest_gc_cutoff_lsn()),
                initdb_lsn: metadata.initdb_lsn(),

                current_logical_size: if disk_consistent_lsn.is_valid() {
@@ -2721,6 +2761,10 @@ impl Timeline {
                create_idempotency,

                page_trace: Default::default(),
+
+                previous_heatmap: ArcSwapOption::from_pointee(previous_heatmap),
+
+                heatmap_layers_downloader: Mutex::new(None),
            };

            result.repartition_threshold =
@@ -2830,6 +2874,7 @@ impl Timeline {
                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                availability_zone: self.conf.availability_zone.clone(),
                ingest_batch_size: self.conf.ingest_batch_size,
+                validate_wal_contiguity: self.conf.validate_wal_contiguity,
            },
            broker_client,
            ctx,
@@ -3409,8 +3454,42 @@ impl Timeline {
        }
    }

-    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: u64) {
-        self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
+        // TODO: this directory metrics is not correct -- we could have multiple reldirs in the system
+        // for each of the database, but we only store one value, and therefore each pgdirmodification
+        // would overwrite the previous value if they modify different databases.
+
+        match count {
+            MetricsUpdate::Set(count) => {
+                self.directory_metrics[kind.offset()].store(count, AtomicOrdering::Relaxed);
+                self.directory_metrics_inited[kind.offset()].store(true, AtomicOrdering::Relaxed);
+            }
+            MetricsUpdate::Add(count) => {
+                // TODO: these operations are not atomic; but we only have one writer to the metrics, so
+                // it's fine.
+                if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
+                    // The metrics has been initialized with `MetricsUpdate::Set` before, so we can add/sub
+                    // the value reliably.
+                    self.directory_metrics[kind.offset()].fetch_add(count, AtomicOrdering::Relaxed);
+                }
+                // Otherwise, ignore this update
+            }
+            MetricsUpdate::Sub(count) => {
+                // TODO: these operations are not atomic; but we only have one writer to the metrics, so
+                // it's fine.
+                if self.directory_metrics_inited[kind.offset()].load(AtomicOrdering::Relaxed) {
+                    // The metrics has been initialized with `MetricsUpdate::Set` before.
+                    // The operation could overflow so we need to normalize the value.
+                    let prev_val =
+                        self.directory_metrics[kind.offset()].load(AtomicOrdering::Relaxed);
+                    let res = prev_val.saturating_sub(count);
+                    self.directory_metrics[kind.offset()].store(res, AtomicOrdering::Relaxed);
+                }
+                // Otherwise, ignore this update
+            }
+        };
+
+        // TODO: remove this, there's no place in the code that updates this aux metrics.
        let aux_metric =
            self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);

@@ -3459,12 +3538,52 @@ impl Timeline {

        let guard = self.layers.read().await;

+        // Firstly, if there's any heatmap left over from when this location
+        // was a secondary, take that into account. Keep layers that are:
+        // * present in the layer map
+        // * visible
+        // * non-resident
+        // * not evicted since we read the heatmap
+        //
+        // Without this, a new cold, attached location would clobber the previous
+        // heatamp.
+        let previous_heatmap = self.previous_heatmap.load();
+        let visible_non_resident = match previous_heatmap.as_deref() {
+            Some(PreviousHeatmap::Active { heatmap, read_at }) => {
+                Some(heatmap.layers.iter().filter_map(|hl| {
+                    let desc: PersistentLayerDesc = hl.name.clone().into();
+                    let layer = guard.try_get_from_key(&desc.key())?;
+
+                    if layer.visibility() == LayerVisibilityHint::Covered {
+                        return None;
+                    }
+
+                    if layer.is_likely_resident() {
+                        return None;
+                    }
+
+                    if layer.last_evicted_at().happened_after(*read_at) {
+                        return None;
+                    }
+
+                    Some((desc, hl.metadata.clone(), hl.access_time))
+                }))
+            }
+            Some(PreviousHeatmap::Obsolete) => None,
+            None => None,
+        };
+
+        // Secondly, all currently visible, resident layers are included.
        let resident = guard.likely_resident_layers().filter_map(|layer| {
            match layer.visibility() {
                LayerVisibilityHint::Visible => {
                    // Layer is visible to one or more read LSNs: elegible for inclusion in layer map
                    let last_activity_ts = layer.latest_activity();
-                    Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
+                    Some((
+                        layer.layer_desc().clone(),
+                        layer.metadata(),
+                        last_activity_ts,
+                    ))
                }
                LayerVisibilityHint::Covered => {
                    // Layer is resident but unlikely to be read: not elegible for inclusion in heatmap.
@@ -3473,7 +3592,18 @@ impl Timeline {
            }
        });

-        let mut layers = resident.collect::<Vec<_>>();
+        let mut layers = match visible_non_resident {
+            Some(non_resident) => {
+                let mut non_resident = non_resident.peekable();
+                if non_resident.peek().is_none() {
+                    self.previous_heatmap
+                        .store(Some(PreviousHeatmap::Obsolete.into()));
+                }
+
+                non_resident.chain(resident).collect::<Vec<_>>()
+            }
+            None => resident.collect::<Vec<_>>(),
+        };

        // Sort layers in order of which to download first.  For a large set of layers to download, we
        // want to prioritize those layers which are most likely to still be in the resident many minutes
@@ -3577,7 +3707,9 @@ impl Timeline {
            // space. If that's not the case, we had at least one key encounter a gap in the image layer
            // and stop the search as a result of that.
            let mut removed = keyspace.remove_overlapping_with(&image_covered_keyspace);
-            // Do not fire missing key error for sparse keys.
+            // Do not fire missing key error and end early for sparse keys. Note that we hava already removed
+            // non-inherited keyspaces before, so we can safely do a full `SPARSE_RANGE` remove instead of
+            // figuring out what is the inherited key range and do a fine-grained pruning.
            removed.remove_overlapping_with(&KeySpace {
                ranges: vec![SPARSE_RANGE],
            });
@@ -3662,7 +3794,7 @@ impl Timeline {
        // the timeline, then it will remove layers that are required for fulfilling
        // the current get request (read-path cannot "look back" and notice the new
        // image layer).
-        let _gc_cutoff_holder = timeline.get_latest_gc_cutoff_lsn();
+        let _gc_cutoff_holder = timeline.get_applied_gc_cutoff_lsn();

        // See `compaction::compact_with_gc` for why we need this.
        let _guard = timeline.gc_compaction_layer_update_lock.read().await;
@@ -4349,7 +4481,7 @@ impl Timeline {
        let update = crate::tenant::metadata::MetadataUpdate::new(
            disk_consistent_lsn,
            ondisk_prev_record_lsn,
-            *self.latest_gc_cutoff_lsn.read(),
+            *self.applied_gc_cutoff_lsn.read(),
        );

        fail_point!("checkpoint-before-saving-metadata", |x| bail!(
@@ -4474,7 +4606,10 @@ impl Timeline {
            ));
        }

-        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
+        let (dense_ks, sparse_ks) = self
+            .collect_keyspace(lsn, ctx)
+            .await
+            .map_err(CompactionError::CollectKeySpaceError)?;
        let dense_partitioning = dense_ks.partition(&self.shard_identity, partition_size);
        let sparse_partitioning = SparseKeyPartitioning {
            parts: vec![sparse_ks],
@@ -4995,20 +5130,26 @@ impl Timeline {
                    // image layer generation taking too long time and blocking L0 compaction. So in this
                    // mode, we also inspect the current number of L0 layers and skip image layer generation
                    // if there are too many of them.
-                    let num_of_l0_layers = {
-                        let layers = self.layers.read().await;
-                        layers.layer_map()?.level0_deltas().len()
-                    };
                    let image_preempt_threshold = self.get_image_creation_preempt_threshold()
                        * self.get_compaction_threshold();
-                    if image_preempt_threshold != 0 && num_of_l0_layers >= image_preempt_threshold {
-                        tracing::info!(
-                        "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers {}",
-                        partition.start().unwrap(), partition.end().unwrap(), num_of_l0_layers
-                    );
-                        last_partition_processed = Some(partition.clone());
-                        all_generated = false;
-                        break;
+                    // TODO: currently we do not respect `get_image_creation_preempt_threshold` and always yield
+                    // when there is a single timeline with more than L0 threshold L0 layers. As long as the
+                    // `get_image_creation_preempt_threshold` is set to a value greater than 0, we will yield for L0 compaction.
+                    if image_preempt_threshold != 0 {
+                        let should_yield = self
+                            .l0_compaction_trigger
+                            .notified()
+                            .now_or_never()
+                            .is_some();
+                        if should_yield {
+                            tracing::info!(
+                                "preempt image layer generation at {lsn} when processing partition {}..{}: too many L0 layers",
+                                partition.start().unwrap(), partition.end().unwrap()
+                            );
+                            last_partition_processed = Some(partition.clone());
+                            all_generated = false;
+                            break;
+                        }
                    }
                }
            }
@@ -5037,14 +5178,16 @@ impl Timeline {
            .map(|l| l.metadata().file_size)
            .sum::<u64>();

-        info!(
-            "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
-            image_layers.len(),
-            total_layer_size,
-            duration.as_secs_f64(),
-            partition_processed,
-            total_partitions
-        );
+        if !image_layers.is_empty() {
+            info!(
+                "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
+                image_layers.len(),
+                total_layer_size,
+                duration.as_secs_f64(),
+                partition_processed,
+                total_partitions
+            );
+        }

        Ok((
            image_layers,
@@ -5187,6 +5330,8 @@ pub(crate) enum CompactionError {
    #[error("Failed to offload timeline: {0}")]
    Offload(OffloadError),
    /// Compaction cannot be done right now; page reconstruction and so on.
+    #[error("Failed to collect keyspace: {0}")]
+    CollectKeySpaceError(CollectKeySpaceError),
    #[error(transparent)]
    Other(anyhow::Error),
 }
@@ -5200,12 +5345,6 @@ impl From<OffloadError> for CompactionError {
    }
 }

-impl CompactionError {
-    pub fn is_cancelled(&self) -> bool {
-        matches!(self, CompactionError::ShuttingDown)
-    }
-}
-
 impl From<CollectKeySpaceError> for CompactionError {
    fn from(err: CollectKeySpaceError) -> Self {
        match err {
@@ -5577,7 +5716,7 @@ impl Timeline {
                // PITR interval is set & we didn't look up a timestamp successfully.  Conservatively assume PITR
                // cannot advance beyond what was already GC'd, and respect space-based retention
                GcCutoffs {
-                    time: *self.get_latest_gc_cutoff_lsn(),
+                    time: *self.get_applied_gc_cutoff_lsn(),
                    space: space_cutoff,
                }
            }
@@ -5698,7 +5837,7 @@ impl Timeline {
        let mut result: GcResult = GcResult::default();

        // Nothing to GC. Return early.
-        let latest_gc_cutoff = *self.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff = *self.get_applied_gc_cutoff_lsn();
        if latest_gc_cutoff >= new_gc_cutoff {
            info!(
                "Nothing to GC: new_gc_cutoff_lsn {new_gc_cutoff}, latest_gc_cutoff_lsn {latest_gc_cutoff}",
@@ -5712,7 +5851,7 @@ impl Timeline {
        //
        // The GC cutoff should only ever move forwards.
        let waitlist = {
-            let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
+            let write_guard = self.applied_gc_cutoff_lsn.lock_for_write();
            if *write_guard > new_gc_cutoff {
                return Err(GcError::BadLsn {
                    why: format!(
@@ -6470,7 +6609,7 @@ impl TimelineWriter<'_> {

        if let Some(wait_threshold) = wait_threshold {
            if l0_count >= wait_threshold {
-                info!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers");
+                debug!("layer roll waiting for flush due to compaction backpressure at {l0_count} L0 layers");
                self.tl.wait_flush_completion(flush_id).await?;
            }
        }
@@ -6652,18 +6791,32 @@ fn is_send() {

 #[cfg(test)]
 mod tests {
+    use std::sync::Arc;
+
    use pageserver_api::key::Key;
    use pageserver_api::value::Value;
+    use tracing::Instrument;
    use utils::{id::TimelineId, lsn::Lsn};

    use crate::tenant::{
        harness::{test_img, TenantHarness},
        layer_map::LayerMap,
-        storage_layer::{Layer, LayerName},
+        storage_layer::{Layer, LayerName, LayerVisibilityHint},
        timeline::{DeltaLayerTestDesc, EvictionError},
-        Timeline,
+        PreviousHeatmap, Timeline,
    };

+    use super::HeatMapTimeline;
+
+    fn assert_heatmaps_have_same_layers(lhs: &HeatMapTimeline, rhs: &HeatMapTimeline) {
+        assert_eq!(lhs.layers.len(), rhs.layers.len());
+        let lhs_rhs = lhs.layers.iter().zip(rhs.layers.iter());
+        for (l, r) in lhs_rhs {
+            assert_eq!(l.name, r.name);
+            assert_eq!(l.metadata, r.metadata);
+        }
+    }
+
    #[tokio::test]
    async fn test_heatmap_generation() {
        let harness = TenantHarness::create("heatmap_generation").await.unwrap();
@@ -6737,7 +6890,7 @@ mod tests {
        assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());

        let mut last_lsn = Lsn::MAX;
-        for layer in heatmap.layers {
+        for layer in &heatmap.layers {
            // Covered layer should be omitted
            assert!(layer.name != covered_delta.layer_name());

@@ -6752,6 +6905,144 @@ mod tests {
                last_lsn = layer_lsn;
            }
        }
+
+        // Evict all the layers and stash the old heatmap in the timeline.
+        // This simulates a migration to a cold secondary location.
+
+        let guard = timeline.layers.read().await;
+        let mut all_layers = Vec::new();
+        let forever = std::time::Duration::from_secs(120);
+        for layer in guard.likely_resident_layers() {
+            all_layers.push(layer.clone());
+            layer.evict_and_wait(forever).await.unwrap();
+        }
+        drop(guard);
+
+        timeline
+            .previous_heatmap
+            .store(Some(Arc::new(PreviousHeatmap::Active {
+                heatmap: heatmap.clone(),
+                read_at: std::time::Instant::now(),
+            })));
+
+        // Generate a new heatmap and assert that it contains the same layers as the old one.
+        let post_migration_heatmap = timeline.generate_heatmap().await.unwrap();
+        assert_heatmaps_have_same_layers(&heatmap, &post_migration_heatmap);
+
+        // Download each layer one by one. Generate the heatmap at each step and check
+        // that it's stable.
+        for layer in all_layers {
+            if layer.visibility() == LayerVisibilityHint::Covered {
+                continue;
+            }
+
+            eprintln!("Downloading {layer} and re-generating heatmap");
+
+            let _resident = layer
+                .download_and_keep_resident()
+                .instrument(tracing::info_span!(
+                    parent: None,
+                    "download_layer",
+                    tenant_id = %timeline.tenant_shard_id.tenant_id,
+                    shard_id = %timeline.tenant_shard_id.shard_slug(),
+                    timeline_id = %timeline.timeline_id
+                ))
+                .await
+                .unwrap();
+
+            let post_download_heatmap = timeline.generate_heatmap().await.unwrap();
+            assert_heatmaps_have_same_layers(&heatmap, &post_download_heatmap);
+        }
+
+        // Everything from the post-migration heatmap is now resident.
+        // Check that we drop it from memory.
+        assert!(matches!(
+            timeline.previous_heatmap.load().as_deref(),
+            Some(PreviousHeatmap::Obsolete)
+        ));
+    }
+
+    #[tokio::test]
+    async fn test_previous_heatmap_obsoletion() {
+        let harness = TenantHarness::create("heatmap_previous_heatmap_obsoletion")
+            .await
+            .unwrap();
+
+        let l0_delta = DeltaLayerTestDesc::new(
+            Lsn(0x20)..Lsn(0x30),
+            Key::from_hex("000000000000000000000000000000000000").unwrap()
+                ..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
+            vec![(
+                Key::from_hex("720000000033333333444444445500000000").unwrap(),
+                Lsn(0x25),
+                Value::Image(test_img("foo")),
+            )],
+        );
+
+        let image_layer = (
+            Lsn(0x40),
+            vec![(
+                Key::from_hex("620000000033333333444444445500000000").unwrap(),
+                test_img("bar"),
+            )],
+        );
+
+        let delta_layers = vec![l0_delta];
+        let image_layers = vec![image_layer];
+
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline_with_layers(
+                TimelineId::generate(),
+                Lsn(0x10),
+                14,
+                &ctx,
+                delta_layers,
+                image_layers,
+                Lsn(0x100),
+            )
+            .await
+            .unwrap();
+
+        // Layer visibility is an input to heatmap generation, so refresh it first
+        timeline.update_layer_visibility().await.unwrap();
+
+        let heatmap = timeline
+            .generate_heatmap()
+            .await
+            .expect("Infallible while timeline is not shut down");
+
+        // Both layers should be in the heatmap
+        assert!(!heatmap.layers.is_empty());
+
+        // Now simulate a migration.
+        timeline
+            .previous_heatmap
+            .store(Some(Arc::new(PreviousHeatmap::Active {
+                heatmap: heatmap.clone(),
+                read_at: std::time::Instant::now(),
+            })));
+
+        // Evict all the layers in the previous heatmap
+        let guard = timeline.layers.read().await;
+        let forever = std::time::Duration::from_secs(120);
+        for layer in guard.likely_resident_layers() {
+            layer.evict_and_wait(forever).await.unwrap();
+        }
+        drop(guard);
+
+        // Generate a new heatmap and check that the previous heatmap
+        // has been marked obsolete.
+        let post_eviction_heatmap = timeline
+            .generate_heatmap()
+            .await
+            .expect("Infallible while timeline is not shut down");
+
+        assert!(post_eviction_heatmap.layers.is_empty());
+        assert!(matches!(
+            timeline.previous_heatmap.load().as_deref(),
+            Some(PreviousHeatmap::Obsolete)
+        ));
    }

    #[tokio::test]
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -11,7 +11,8 @@ use std::sync::Arc;
 use super::layer_manager::LayerManager;
 use super::{
    CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, GetVectoredError,
-    ImageLayerCreationMode, LastImageLayerCreationStatus, RecordedDuration, Timeline,
+    ImageLayerCreationMode, LastImageLayerCreationStatus, PageReconstructError, RecordedDuration,
+    Timeline,
 };

 use anyhow::{anyhow, bail, Context};
@@ -25,12 +26,13 @@ use pageserver_api::models::CompactInfoResponse;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use serde::Serialize;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, info, info_span, trace, warn, Instrument};
+use tracing::{debug, error, info, info_span, trace, warn, Instrument};
 use utils::critical;
 use utils::id::TimelineId;

 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
+use crate::pgdatadir_mapping::CollectKeySpaceError;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
@@ -301,18 +303,12 @@ impl GcCompactionQueue {
                        let mut guard = self.inner.lock().unwrap();
                        guard.gc_guards.insert(id, gc_guard);
                    }
-                    let _ = timeline
-                        .compact_with_options(cancel, options, ctx)
-                        .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
-                        .await?;
+                    let _ = timeline.compact_with_options(cancel, options, ctx).await?;
                    self.notify_and_unblock(id);
                }
            }
            GcCompactionQueueItem::SubCompactionJob(options) => {
-                let _ = timeline
-                    .compact_with_options(cancel, options, ctx)
-                    .instrument(info_span!("scheduled_compact_timeline", %timeline.timeline_id))
-                    .await?;
+                let _ = timeline.compact_with_options(cancel, options, ctx).await?;
            }
            GcCompactionQueueItem::Notify(id) => {
                self.notify_and_unblock(id);
@@ -692,21 +688,6 @@ impl Timeline {

        // Define partitioning schema if needed

-        let l0_l1_boundary_lsn = {
-            // We do the repartition on the L0-L1 boundary. All data below the boundary
-            // are compacted by L0 with low read amplification, thus making the `repartition`
-            // function run fast.
-            let guard = self.layers.read().await;
-            let l0_min_lsn = guard
-                .layer_map()?
-                .level0_deltas()
-                .iter()
-                .map(|l| l.get_lsn_range().start)
-                .min()
-                .unwrap_or(self.get_disk_consistent_lsn());
-            l0_min_lsn.max(self.get_ancestor_lsn())
-        };
-
        // 1. L0 Compact
        let l0_outcome = {
            let timer = self.metrics.compact_time_histo.start_timer();
@@ -733,86 +714,87 @@ impl Timeline {
            return Ok(CompactionOutcome::YieldForL0);
        }

-        if l0_l1_boundary_lsn < self.partitioning.read().1 {
-            // We never go backwards when repartition and create image layers.
-            info!("skipping image layer generation because repartition LSN is greater than L0-L1 boundary LSN.");
-        } else {
-            // 2. Repartition and create image layers if necessary
-            match self
-                .repartition(
-                    l0_l1_boundary_lsn,
-                    self.get_compaction_target_size(),
-                    options.flags,
-                    ctx,
-                )
-                .await
-            {
-                Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
-                    // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
-                    let image_ctx = RequestContextBuilder::extend(ctx)
-                        .access_stats_behavior(AccessStatsBehavior::Skip)
-                        .build();
+        // 2. Repartition and create image layers if necessary
+        match self
+            .repartition(
+                self.get_last_record_lsn(),
+                self.get_compaction_target_size(),
+                options.flags,
+                ctx,
+            )
+            .await
+        {
+            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
+                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
+                let image_ctx = RequestContextBuilder::extend(ctx)
+                    .access_stats_behavior(AccessStatsBehavior::Skip)
+                    .build();

-                    let mut partitioning = dense_partitioning;
-                    partitioning
-                        .parts
-                        .extend(sparse_partitioning.into_dense().parts);
+                let mut partitioning = dense_partitioning;
+                partitioning
+                    .parts
+                    .extend(sparse_partitioning.into_dense().parts);

-                    // 3. Create new image layers for partitions that have been modified "enough".
-                    let (image_layers, outcome) = self
-                        .create_image_layers(
-                            &partitioning,
-                            lsn,
-                            if options
-                                .flags
-                                .contains(CompactFlags::ForceImageLayerCreation)
-                            {
-                                ImageLayerCreationMode::Force
-                            } else {
-                                ImageLayerCreationMode::Try
-                            },
-                            &image_ctx,
-                            self.last_image_layer_creation_status
-                                .load()
-                                .as_ref()
-                                .clone(),
-                            !options.flags.contains(CompactFlags::NoYield),
-                        )
-                        .await
-                        .inspect_err(|err| {
-                            if let CreateImageLayersError::GetVectoredError(
-                                GetVectoredError::MissingKey(_),
-                            ) = err
-                            {
-                                critical!("missing key during compaction: {err:?}");
-                            }
-                        })?;
+                // 3. Create new image layers for partitions that have been modified "enough".
+                let (image_layers, outcome) = self
+                    .create_image_layers(
+                        &partitioning,
+                        lsn,
+                        if options
+                            .flags
+                            .contains(CompactFlags::ForceImageLayerCreation)
+                        {
+                            ImageLayerCreationMode::Force
+                        } else {
+                            ImageLayerCreationMode::Try
+                        },
+                        &image_ctx,
+                        self.last_image_layer_creation_status
+                            .load()
+                            .as_ref()
+                            .clone(),
+                        !options.flags.contains(CompactFlags::NoYield),
+                    )
+                    .await
+                    .inspect_err(|err| {
+                        if let CreateImageLayersError::GetVectoredError(
+                            GetVectoredError::MissingKey(_),
+                        ) = err
+                        {
+                            critical!("missing key during compaction: {err:?}");
+                        }
+                    })?;

-                    self.last_image_layer_creation_status
-                        .store(Arc::new(outcome.clone()));
+                self.last_image_layer_creation_status
+                    .store(Arc::new(outcome.clone()));

-                    self.upload_new_image_layers(image_layers)?;
-                    if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
-                        // Yield and do not do any other kind of compaction.
-                        info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
-                        return Ok(CompactionOutcome::YieldForL0);
-                    }
+                self.upload_new_image_layers(image_layers)?;
+                if let LastImageLayerCreationStatus::Incomplete { .. } = outcome {
+                    // Yield and do not do any other kind of compaction.
+                    info!("skipping shard ancestor compaction due to pending image layer generation tasks (preempted by L0 compaction).");
+                    return Ok(CompactionOutcome::YieldForL0);
                }
-                Err(err) => {
-                    // no partitioning? This is normal, if the timeline was just created
-                    // as an empty timeline. Also in unit tests, when we use the timeline
-                    // as a simple key-value store, ignoring the datadir layout. Log the
-                    // error but continue.
-                    //
-                    // Suppress error when it's due to cancellation
-                    if !self.cancel.is_cancelled() && !err.is_cancelled() {
-                        tracing::error!(
-                            "could not compact, repartitioning keyspace failed: {err:?}"
-                        );
-                    }
-                }
-            };
-        }
+            }
+
+            // Suppress errors when cancelled.
+            Err(_) if self.cancel.is_cancelled() => {}
+            Err(CompactionError::ShuttingDown) => {}
+
+            // Alert on critical errors that indicate data corruption.
+            Err(
+                err @ CompactionError::CollectKeySpaceError(
+                    CollectKeySpaceError::Decode(_)
+                    | CollectKeySpaceError::PageRead(
+                        PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
+                    ),
+                ),
+            ) => critical!("could not compact, repartitioning keyspace failed: {err:?}"),
+
+            // Log other errors. No partitioning? This is normal, if the timeline was just created
+            // as an empty timeline. Also in unit tests, when we use the timeline as a simple
+            // key-value store, ignoring the datadir layout. Log the error but continue.
+            Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"),
+        };

        let partition_count = self.partitioning.read().0 .0.parts.len();

@@ -852,7 +834,7 @@ impl Timeline {
        //
        // Holding this read guard also blocks [`Self::gc_timeline`] from entering while we
        // are rewriting layers.
-        let latest_gc_cutoff = self.get_latest_gc_cutoff_lsn();
+        let latest_gc_cutoff = self.get_applied_gc_cutoff_lsn();

        tracing::info!(
            "latest_gc_cutoff: {}, pitr cutoff {}",
@@ -2202,7 +2184,7 @@ impl Timeline {

        // TODO: ensure the child branches will not use anything below the watermark, or consider
        // them when computing the watermark.
-        gc_cutoff_lsn.min(*self.get_latest_gc_cutoff_lsn())
+        gc_cutoff_lsn.min(*self.get_applied_gc_cutoff_lsn())
    }

    /// Split a gc-compaction job into multiple compaction jobs. The split is based on the key range and the estimated size of the compaction job.
@@ -2230,7 +2212,7 @@ impl Timeline {
        let sub_compaction_max_job_size_mb =
            sub_compaction_max_job_size_mb.unwrap_or(GC_COMPACT_MAX_SIZE_MB);

-        let mut compact_jobs = Vec::new();
+        let mut compact_jobs = Vec::<GcCompactJob>::new();
        // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
        // by estimating the amount of files read for a compaction job. We should also partition on LSN.
        let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
@@ -2317,16 +2299,25 @@ impl Timeline {
                } else {
                    end
                };
-                info!(
-                    "splitting compaction job: {}..{}, estimated_size={}",
-                    start, end, total_size
-                );
-                compact_jobs.push(GcCompactJob {
-                    dry_run: job.dry_run,
-                    compact_key_range: start..end,
-                    compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
-                });
-                current_start = Some(end);
+                if total_size == 0 && !compact_jobs.is_empty() {
+                    info!(
+                        "splitting compaction job: {}..{}, estimated_size={}, extending the previous job",
+                        start, end, total_size
+                    );
+                    compact_jobs.last_mut().unwrap().compact_key_range.end = end;
+                    current_start = Some(end);
+                } else {
+                    info!(
+                        "splitting compaction job: {}..{}, estimated_size={}",
+                        start, end, total_size
+                    );
+                    compact_jobs.push(GcCompactJob {
+                        dry_run: job.dry_run,
+                        compact_key_range: start..end,
+                        compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
+                    });
+                    current_start = Some(end);
+                }
            }
        }
        Ok(compact_jobs)
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -294,6 +294,7 @@ impl DeleteTimelineFlow {
                timeline_id,
                local_metadata,
                None, // Ancestor is not needed for deletion.
+                None, // Previous heatmap is not needed for deletion
                tenant.get_timeline_resources_for(remote_client),
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -0,0 +1,162 @@
+//! Timeline utility module to hydrate everything from the current heatmap.
+//!
+//! Provides utilities to spawn and abort a background task where the downloads happen.
+//! See /v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_heatmap_layers.
+
+use futures::StreamExt;
+use http_utils::error::ApiError;
+use std::sync::{Arc, Mutex};
+use tokio_util::sync::CancellationToken;
+use utils::sync::gate::Gate;
+
+use super::Timeline;
+
+// This status is not strictly necessary now, but gives us a nice place
+// to store progress information if we ever wish to expose it.
+pub(super) enum HeatmapLayersDownloadStatus {
+    InProgress,
+    Complete,
+}
+
+pub(super) struct HeatmapLayersDownloader {
+    handle: tokio::task::JoinHandle<()>,
+    status: Arc<Mutex<HeatmapLayersDownloadStatus>>,
+    cancel: CancellationToken,
+    downloads_guard: Arc<Gate>,
+}
+
+impl HeatmapLayersDownloader {
+    fn new(
+        timeline: Arc<Timeline>,
+        concurrency: usize,
+    ) -> Result<HeatmapLayersDownloader, ApiError> {
+        let tl_guard = timeline.gate.enter().map_err(|_| ApiError::Cancelled)?;
+
+        let cancel = timeline.cancel.child_token();
+        let downloads_guard = Arc::new(Gate::default());
+
+        let status = Arc::new(Mutex::new(HeatmapLayersDownloadStatus::InProgress));
+
+        let handle = tokio::task::spawn({
+            let status = status.clone();
+            let downloads_guard = downloads_guard.clone();
+            let cancel = cancel.clone();
+
+            async move {
+                let _guard = tl_guard;
+
+                scopeguard::defer! {
+                    *status.lock().unwrap() = HeatmapLayersDownloadStatus::Complete;
+                }
+
+                let Some(heatmap) = timeline.generate_heatmap().await else {
+                    tracing::info!("Heatmap layers download failed to generate heatmap");
+                    return;
+                };
+
+                tracing::info!(
+                    resident_size=%timeline.resident_physical_size(),
+                    heatmap_layers=%heatmap.layers.len(),
+                    "Starting heatmap layers download"
+                );
+
+                let stream = futures::stream::iter(heatmap.layers.into_iter().filter_map(
+                    |layer| {
+                        let tl = timeline.clone();
+                        let dl_guard = match downloads_guard.enter() {
+                            Ok(g) => g,
+                            Err(_) => {
+                                // [`Self::shutdown`] was called. Don't spawn any more downloads.
+                                return None;
+                            }
+                        };
+
+                        Some(async move {
+                            let _dl_guard = dl_guard;
+
+                            let res = tl.download_layer(&layer.name).await;
+                            if let Err(err) = res {
+                                if !err.is_cancelled() {
+                                    tracing::warn!(layer=%layer.name,"Failed to download heatmap layer: {err}")
+                                }
+                            }
+                        })
+                    }
+                )).buffered(concurrency);
+
+                tokio::select! {
+                    _ = stream.collect::<()>() => {
+                        tracing::info!(
+                            resident_size=%timeline.resident_physical_size(),
+                            "Heatmap layers download completed"
+                        );
+                    },
+                    _ = cancel.cancelled() => {
+                        tracing::info!("Heatmap layers download cancelled");
+                    }
+                }
+            }
+        });
+
+        Ok(Self {
+            status,
+            handle,
+            cancel,
+            downloads_guard,
+        })
+    }
+
+    fn is_complete(&self) -> bool {
+        matches!(
+            *self.status.lock().unwrap(),
+            HeatmapLayersDownloadStatus::Complete
+        )
+    }
+
+    /// Drive any in-progress downloads to completion and stop spawning any new ones.
+    ///
+    /// This has two callers and they behave differently
+    /// 1. [`Timeline::shutdown`]: the drain will be immediate since downloads themselves
+    ///    are sensitive to timeline cancellation.
+    ///
+    /// 2. Endpoint handler in [`crate::http::routes`]: the drain will wait for any in-progress
+    ///    downloads to complete.
+    async fn stop_and_drain(self) {
+        // Counterintuitive: close the guard before cancelling.
+        // Something needs to poll the already created download futures to completion.
+        // If we cancel first, then the underlying task exits and we lost
+        // the poller.
+        self.downloads_guard.close().await;
+        self.cancel.cancel();
+        if let Err(err) = self.handle.await {
+            tracing::warn!("Failed to join heatmap layer downloader task: {err}");
+        }
+    }
+}
+
+impl Timeline {
+    pub(crate) async fn start_heatmap_layers_download(
+        self: &Arc<Self>,
+        concurrency: usize,
+    ) -> Result<(), ApiError> {
+        let mut locked = self.heatmap_layers_downloader.lock().unwrap();
+        if locked.as_ref().map(|dl| dl.is_complete()).unwrap_or(true) {
+            let dl = HeatmapLayersDownloader::new(self.clone(), concurrency)?;
+            *locked = Some(dl);
+            Ok(())
+        } else {
+            Err(ApiError::Conflict("Already running".to_string()))
+        }
+    }
+
+    pub(crate) async fn stop_and_drain_heatmap_layers_download(&self) {
+        // This can race with the start of a new downloader and lead to a situation
+        // where one donloader is shutting down and another one is in-flight.
+        // The only impact is that we'd end up using more remote storage semaphore
+        // units than expected.
+        let downloader = self.heatmap_layers_downloader.lock().unwrap().take();
+        if let Some(dl) = downloader {
+            dl.stop_and_drain().await;
+        }
+    }
+}
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -7,7 +7,9 @@ use super::Timeline;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
 use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
-use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
+use crate::tenant::{
+    DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
+};

 #[derive(thiserror::Error, Debug)]
 pub(crate) enum OffloadError {
@@ -37,12 +39,25 @@ pub(crate) async fn offload_timeline(
    debug_assert_current_span_has_tenant_and_timeline_id();
    tracing::info!("offloading archived timeline");

-    let (timeline, guard) = make_timeline_delete_guard(
+    let delete_guard_res = make_timeline_delete_guard(
        tenant,
        timeline.timeline_id,
        TimelineDeleteGuardKind::Offload,
-    )
-    .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
+    );
+    if let Err(DeleteTimelineError::HasChildren(children)) = delete_guard_res {
+        let is_archived = timeline.is_archived();
+        if is_archived == Some(true) {
+            tracing::error!("timeline is archived but has non-archived children: {children:?}");
+            return Err(OffloadError::NotArchived);
+        }
+        tracing::info!(
+            ?is_archived,
+            "timeline is not archived and has unarchived children"
+        );
+        return Err(OffloadError::NotArchived);
+    };
+    let (timeline, guard) =
+        delete_guard_res.map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;

    let TimelineOrOffloaded::Timeline(timeline) = timeline else {
        tracing::error!("timeline already offloaded, but given timeline object");
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -56,6 +56,7 @@ pub struct WalReceiverConf {
    pub auth_token: Option<Arc<String>>,
    pub availability_zone: Option<String>,
    pub ingest_batch_size: u64,
+    pub validate_wal_contiguity: bool,
 }

 pub struct WalReceiver {
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -537,6 +537,7 @@ impl ConnectionManagerState {
        let connect_timeout = self.conf.wal_connect_timeout;
        let ingest_batch_size = self.conf.ingest_batch_size;
        let protocol = self.conf.protocol;
+        let validate_wal_contiguity = self.conf.validate_wal_contiguity;
        let timeline = Arc::clone(&self.timeline);
        let ctx = ctx.detached_child(
            TaskKind::WalReceiverConnectionHandler,
@@ -558,6 +559,7 @@ impl ConnectionManagerState {
                    ctx,
                    node_id,
                    ingest_batch_size,
+                    validate_wal_contiguity,
                )
                .await;

@@ -1563,6 +1565,7 @@ mod tests {
                auth_token: None,
                availability_zone: None,
                ingest_batch_size: 1,
+                validate_wal_contiguity: false,
            },
            wal_connection: None,
            wal_stream_candidates: HashMap::new(),
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -120,6 +120,7 @@ pub(super) async fn handle_walreceiver_connection(
    ctx: RequestContext,
    safekeeper_node: NodeId,
    ingest_batch_size: u64,
+    validate_wal_contiguity: bool,
 ) -> Result<(), WalReceiverError> {
    debug_assert_current_span_has_tenant_and_timeline_id();

@@ -274,6 +275,7 @@ pub(super) async fn handle_walreceiver_connection(
        } => Some((format, compression)),
    };

+    let mut expected_wal_start = startpoint;
    while let Some(replication_message) = {
        select! {
            _ = cancellation.cancelled() => {
@@ -340,13 +342,49 @@ pub(super) async fn handle_walreceiver_connection(
                    )
                    })?;

+                // Guard against WAL gaps. If the start LSN of the PG WAL section
+                // from which the interpreted records were extracted, doesn't match
+                // the end of the previous batch (or the starting point for the first batch),
+                // then kill this WAL receiver connection and start a new one.
+                if validate_wal_contiguity {
+                    if let Some(raw_wal_start_lsn) = batch.raw_wal_start_lsn {
+                        match raw_wal_start_lsn.cmp(&expected_wal_start) {
+                            std::cmp::Ordering::Greater => {
+                                let msg = format!(
+                                    "Gap in streamed WAL: [{}, {})",
+                                    expected_wal_start, raw_wal_start_lsn
+                                );
+                                critical!("{msg}");
+                                return Err(WalReceiverError::Other(anyhow!(msg)));
+                            }
+                            std::cmp::Ordering::Less => {
+                                // Other shards are reading WAL behind us.
+                                // This is valid, but check that we received records
+                                // that we haven't seen before.
+                                if let Some(first_rec) = batch.records.first() {
+                                    if first_rec.next_record_lsn < last_rec_lsn {
+                                        let msg = format!(
+                                            "Received record with next_record_lsn multiple times ({} < {})",
+                                            first_rec.next_record_lsn, expected_wal_start
+                                        );
+                                        critical!("{msg}");
+                                        return Err(WalReceiverError::Other(anyhow!(msg)));
+                                    }
+                                }
+                            }
+                            std::cmp::Ordering::Equal => {}
+                        }
+                    }
+                }
+
                let InterpretedWalRecords {
                    records,
                    next_record_lsn,
+                    raw_wal_start_lsn: _,
                } = batch;

                tracing::debug!(
-                    "Received WAL up to {} with next_record_lsn={:?}",
+                    "Received WAL up to {} with next_record_lsn={}",
                    streaming_lsn,
                    next_record_lsn
                );
@@ -423,12 +461,11 @@ pub(super) async fn handle_walreceiver_connection(
                // need to advance last record LSN on all shards. If we've not ingested the latest
                // record, then set the LSN of the modification past it. This way all shards
                // advance their last record LSN at the same time.
-                let needs_last_record_lsn_advance = match next_record_lsn {
-                    Some(lsn) if lsn > modification.get_lsn() => {
-                        modification.set_lsn(lsn).unwrap();
-                        true
-                    }
-                    _ => false,
+                let needs_last_record_lsn_advance = if next_record_lsn > modification.get_lsn() {
+                    modification.set_lsn(next_record_lsn).unwrap();
+                    true
+                } else {
+                    false
                };

                if uncommitted_records > 0 || needs_last_record_lsn_advance {
@@ -446,9 +483,8 @@ pub(super) async fn handle_walreceiver_connection(
                    timeline.get_last_record_lsn()
                );

-                if let Some(lsn) = next_record_lsn {
-                    last_rec_lsn = lsn;
-                }
+                last_rec_lsn = next_record_lsn;
+                expected_wal_start = streaming_lsn;

                Some(streaming_lsn)
            }
--- a/Show More
+++ b/Show More