Proxy release 2025-04-08

Skip hole tags in local_cache view (#11454)
## Problem

If the local file cache is shrunk, so that we punch some holes in the underlying file, the local_cache view displays the holes incorrectly. See https://github.com/neondatabase/neon/issues/10770

## Summary of changes

Skip hole tags in the local_cache view.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2026-06-20 13:50:37 +00:00 · 2025-04-08 06:01:37 +00:00 · 2025-04-08 03:52:50 +00:00 · 2025-04-07 21:19:06 +00:00 · 2025-04-07 19:10:36 +00:00 · 2025-04-07 17:56:56 +00:00
343 changed files with 5942 additions and 4071 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,6 +8,7 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
+  - AWS_ECR_REGION
  - AZURE_DEV_CLIENT_ID
  - AZURE_DEV_REGISTRY_NAME
  - AZURE_DEV_SUBSCRIPTION_ID
@@ -15,23 +16,25 @@ config-variables:
  - AZURE_PROD_REGISTRY_NAME
  - AZURE_PROD_SUBSCRIPTION_ID
  - AZURE_TENANT_ID
+  - BENCHMARK_INGEST_TARGET_PROJECTID
+  - BENCHMARK_LARGE_OLTP_PROJECTID
  - BENCHMARK_PROJECT_ID_PUB
  - BENCHMARK_PROJECT_ID_SUB
-  - REMOTE_STORAGE_AZURE_CONTAINER
-  - REMOTE_STORAGE_AZURE_REGION
-  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
  - DEV_AWS_OIDC_ROLE_ARN
-  - BENCHMARK_INGEST_TARGET_PROJECTID
-  - PGREGRESS_PG16_PROJECT_ID
-  - PGREGRESS_PG17_PROJECT_ID
-  - SLACK_ON_CALL_QA_STAGING_STREAM
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
-  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
-  - SLACK_CICD_CHANNEL_ID
-  - SLACK_STORAGE_CHANNEL_ID
+  - HETZNER_CACHE_BUCKET
+  - HETZNER_CACHE_ENDPOINT
+  - HETZNER_CACHE_REGION
  - NEON_DEV_AWS_ACCOUNT_ID
  - NEON_PROD_AWS_ACCOUNT_ID
-  - AWS_ECR_REGION
-  - BENCHMARK_LARGE_OLTP_PROJECTID
+  - PGREGRESS_PG16_PROJECT_ID
+  - PGREGRESS_PG17_PROJECT_ID
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
+  - SLACK_CICD_CHANNEL_ID
  - SLACK_ON_CALL_DEVPROD_STREAM
+  - SLACK_ON_CALL_QA_STAGING_STREAM
+  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
  - SLACK_RUST_CHANNEL_ID
+  - SLACK_STORAGE_CHANNEL_ID
+  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/scripts/generate_image_maps.py
+++ b/.github/scripts/generate_image_maps.py
@@ -39,12 +39,18 @@ registries = {
    ],
 }

+release_branches = ["release", "release-proxy", "release-compute"]
+
 outputs: dict[str, dict[str, list[str]]] = {}

-target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
-target_stages = (
-    ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
+target_tags = (
+    [target_tag, "latest"]
+    if branch == "main"
+    else [target_tag, "released"]
+    if branch in release_branches
+    else [target_tag]
 )
+target_stages = ["dev", "prod"] if branch in release_branches else ["dev"]

 for component_name, component_images in components.items():
    for stage in target_stages:
--- a/.github/scripts/push_with_image_map.py
+++ b/.github/scripts/push_with_image_map.py
@@ -2,6 +2,9 @@ import json
 import os
 import subprocess

+RED = "\033[91m"
+RESET = "\033[0m"
+
 image_map = os.getenv("IMAGE_MAP")
 if not image_map:
    raise ValueError("IMAGE_MAP environment variable is not set")
@@ -11,12 +14,32 @@ try:
 except json.JSONDecodeError as e:
    raise ValueError("Failed to parse IMAGE_MAP as JSON") from e

-for source, targets in parsed_image_map.items():
-    for target in targets:
-        cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
-        print(f"Running: {' '.join(cmd)}")
-        result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+failures = []

-        if result.returncode != 0:
-            print(f"Error: {result.stdout}")
-            raise RuntimeError(f"Command failed: {' '.join(cmd)}")
+pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets]
+
+while len(pending) > 0:
+    if len(failures) > 10:
+        print("Error: more than 10 failures!")
+        for failure in failures:
+            print(f'"{failure[0]}" failed with the following output:')
+            print(failure[1])
+        raise RuntimeError("Retry limit reached.")
+
+    source, target = pending.pop(0)
+    cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if result.returncode != 0:
+        failures.append((" ".join(cmd), result.stdout, target))
+        pending.append((source, target))
+        print(
+            f"{RED}[RETRY]{RESET} Push failed for {target}. Retrying... (failure count: {len(failures)})"
+        )
+        print(result.stdout)
+
+if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")):
+    failed_targets = [target for _, _, target in failures]
+    with open(github_output, "a") as f:
+        f.write(f"push_failures={json.dumps(failed_targets)}\n")
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -128,29 +128,49 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg_14
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Cache postgres v17 build
        id: cache_pg_17
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

--- a/.github/workflows/_check-codestyle-python.yml
+++ b/.github/workflows/_check-codestyle-python.yml
@@ -37,8 +37,14 @@ jobs:

      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+      - name: Cache poetry deps
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

--- a/.github/workflows/_check-codestyle-rust.yml
+++ b/.github/workflows/_check-codestyle-rust.yml
@@ -48,8 +48,13 @@ jobs:
          submodules: true

      - name: Cache cargo deps
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
--- a/.github/workflows/_meta.yml
+++ b/.github/workflows/_meta.yml
@@ -5,6 +5,9 @@ on:
      github-event-name:
        type: string
        required: true
+      github-event-json:
+        type: string
+        required: true
    outputs:
      build-tag:
        description: "Tag for the current workflow run"
@@ -27,6 +30,9 @@ on:
      release-pr-run-id:
        description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found."
        value: ${{ jobs.tags.outputs.release-pr-run-id }}
+      sha:
+        description: "github.event.pull_request.head.sha on release PRs, github.sha otherwise"
+        value: ${{ jobs.tags.outputs.sha }}

 permissions: {}

@@ -45,6 +51,7 @@ jobs:
      storage: ${{ steps.previous-releases.outputs.storage }}
      run-kind: ${{ steps.run-kind.outputs.run-kind }}
      release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }}
+      sha: ${{ steps.sha.outputs.sha }}
    permissions:
      contents: read
    steps:
@@ -54,10 +61,6 @@ jobs:
        with:
          egress-policy: audit

-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
      - name: Get run kind
        id: run-kind
        env:
@@ -78,6 +81,23 @@ jobs:
        run: |
          echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT

+      - name: Get the right SHA
+        id: sha
+        env:
+          SHA: >
+            ${{
+              contains(fromJSON('["storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), steps.run-kind.outputs.run-kind)
+              && fromJSON(inputs.github-event-json).pull_request.head.sha
+              || github.sha
+            }}
+        run: |
+          echo "sha=$SHA" | tee -a $GITHUB_OUTPUT
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{ steps.sha.outputs.sha }}
+
      - name: Get build tag
        id: build-tag
        env:
@@ -143,7 +163,7 @@ jobs:
        if: ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          CURRENT_SHA: ${{ github.sha }}
        run: |
          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
          echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/_push-to-container-registry.yml
+++ b/.github/workflows/_push-to-container-registry.yml
@@ -104,6 +104,25 @@ jobs:
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

      - name: Copy docker images to target registries
+        id: push
        run: python3 .github/scripts/push_with_image_map.py
        env:
          IMAGE_MAP: ${{ inputs.image-map }}
+
+      - name: Notify Slack if container image pushing fails
+        if: steps.push.outputs.push_failures || failure()
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
+            text: >
+              *Container image pushing ${{
+                steps.push.outcome == 'failure' && 'failed completely' || 'succeeded with some retries'
+              }}* in
+              <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
+
+              ${{ steps.push.outputs.push_failures && format(
+                '*Failed targets:*\n• {0}', join(fromJson(steps.push.outputs.push_failures), '\n• ')
+              ) || '' }}
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -63,8 +63,13 @@ jobs:

      - name: Cache postgres ${{ matrix.postgres-version }} build
        id: cache_pg
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/${{ matrix.postgres-version }}
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

@@ -129,15 +134,25 @@ jobs:

      - name: Cache postgres v17 build
        id: cache_pg
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache walproposer-lib
        id: cache_walproposer_lib
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

@@ -203,32 +218,57 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v15 build
        id: cache_pg_v15
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v16 build
        id: cache_pg_v16
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v17 build
        id: cache_pg_v17
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache cargo deps (only for v17)
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
@@ -238,8 +278,13 @@ jobs:

      - name: Cache walproposer-lib
        id: cache_walproposer_lib
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -80,6 +80,7 @@ jobs:
    uses: ./.github/workflows/_meta.yml
    with:
      github-event-name: ${{ github.event_name }}
+      github-event-json: ${{ toJSON(github.event) }}

  build-build-tools-image:
    needs: [ check-permissions ]
@@ -88,8 +89,8 @@ jobs:

  check-codestyle-python:
    needs: [ meta, check-permissions, build-build-tools-image ]
-    # No need to run on `main` because we this in the merge queue
-    if: ${{ needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    uses: ./.github/workflows/_check-codestyle-python.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -97,7 +98,8 @@ jobs:

  check-codestyle-jsonnet:
    needs: [ meta, check-permissions, build-build-tools-image ]
-    if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -180,8 +182,8 @@ jobs:

  check-codestyle-rust:
    needs: [ meta, check-permissions, build-build-tools-image ]
-    # No need to run on `main` because we this in the merge queue
-    if: ${{ needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    uses: ./.github/workflows/_check-codestyle-rust.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -190,7 +192,8 @@ jobs:

  check-dependencies-rust:
    needs: [ meta, files-changed, build-build-tools-image ]
-    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    uses: ./.github/workflows/cargo-deny.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -198,7 +201,8 @@ jobs:

  build-and-test-locally:
    needs: [ meta, build-build-tools-image ]
-    if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    strategy:
      fail-fast: false
      matrix:
@@ -248,8 +252,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Cache poetry deps
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

@@ -540,6 +549,7 @@ jobs:
    uses: ./.github/workflows/trigger-e2e-tests.yml
    with:
      github-event-name: ${{ github.event_name }}
+      github-event-json: ${{ toJSON(github.event) }}
    secrets: inherit

  neon-image-arch:
@@ -563,6 +573,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          submodules: true
+          ref: ${{ needs.meta.outputs.sha }}

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
@@ -672,6 +683,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          submodules: true
+          ref: ${{ needs.meta.outputs.sha }}

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
@@ -968,7 +980,7 @@ jobs:
          TEST_EXTENSIONS_TAG: >-
            ${{
              contains(fromJSON('["storage-rc-pr", "proxy-rc-pr"]'), needs.meta.outputs.run-kind)
-              && 'latest'
+              && needs.meta.outputs.previous-compute-release
              || needs.meta.outputs.build-tag
            }}
          TEST_VERSION_ONLY: ${{ matrix.pg_version }}
@@ -1556,10 +1568,10 @@ jobs:
        if: |
          contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
-          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
+          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
          || needs.files-changed.result == 'skipped'
          || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
          || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -55,7 +55,7 @@ jobs:
          echo tag=${tag} >> ${GITHUB_OUTPUT}

      - name: Test extension upgrade
-        timeout-minutes: 20
+        timeout-minutes: 60
        env:
          NEW_COMPUTE_TAG: latest
          OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
--- a/.github/workflows/report-workflow-stats-batch.yml
+++ b/.github/workflows/report-workflow-stats-batch.yml
@@ -23,7 +23,7 @@ jobs:
        egress-policy: audit

    - name: Export Workflow Run for the past 2 hours
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
      with:
        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
        db_table: "gh_workflow_stats_neon"
@@ -43,7 +43,7 @@ jobs:
        egress-policy: audit

    - name: Export Workflow Run for the past 48 hours
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
      with:
        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
        db_table: "gh_workflow_stats_neon"
@@ -63,7 +63,7 @@ jobs:
        egress-policy: audit

    - name: Export Workflow Run for the past 30 days
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
      with:
        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
        db_table: "gh_workflow_stats_neon"
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -9,6 +9,9 @@ on:
      github-event-name:
        type: string
        required: true
+      github-event-json:
+        type: string
+        required: true

 defaults:
  run:
@@ -48,6 +51,7 @@ jobs:
    uses: ./.github/workflows/_meta.yml
    with:
      github-event-name: ${{ inputs.github-event-name || github.event_name }}
+      github-event-json: ${{ inputs.github-event-json || toJSON(github.event) }}

  trigger-e2e-tests:
    needs: [ meta ]
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -148,9 +148,9 @@ dependencies = [

 [[package]]
 name = "arc-swap"
-version = "1.6.0"
+version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
+checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"

 [[package]]
 name = "archery"
@@ -3861,11 +3861,10 @@ dependencies = [

 [[package]]
 name = "num-bigint"
-version = "0.4.3"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
 dependencies = [
- "autocfg",
 "num-integer",
 "num-traits",
 ]
@@ -3914,11 +3913,10 @@ dependencies = [

 [[package]]
 name = "num-integer"
-version = "0.1.45"
+version = "0.1.46"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
 dependencies = [
- "autocfg",
 "num-traits",
 ]

@@ -3947,9 +3945,9 @@ dependencies = [

 [[package]]
 name = "num-traits"
-version = "0.2.15"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
 "autocfg",
 "libm",
@@ -4331,6 +4329,7 @@ dependencies = [
 "strum",
 "strum_macros",
 "thiserror 1.0.69",
+ "tracing-utils",
 "utils",
 ]

@@ -5362,26 +5361,25 @@ dependencies = [

 [[package]]
 name = "redis"
-version = "0.25.2"
+version = "0.29.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
+checksum = "b110459d6e323b7cda23980c46c77157601199c9da6241552b284cd565a7a133"
 dependencies = [
- "async-trait",
+ "arc-swap",
 "bytes",
 "combine",
 "futures-util",
 "itoa",
+ "num-bigint",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
- "rustls-pemfile 2.1.1",
- "rustls-pki-types",
+ "rustls 0.23.18",
+ "rustls-native-certs 0.8.0",
 "ryu",
 "sha1_smol",
 "socket2",
 "tokio",
- "tokio-rustls 0.25.0",
+ "tokio-rustls 0.26.0",
 "tokio-util",
 "url",
 ]
@@ -7118,9 +7116,9 @@ dependencies = [

 [[package]]
 name = "tokio"
-version = "1.43.0"
+version = "1.43.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "3d61fa4ffa3de412bfea335c6ecff681de2b609ba3c77ef3e00e521813a9ed9e"
+checksum = "492a604e2fd7f814268a378409e6c92b5525d747d10db9a229723f55a417958c"
 dependencies = [
 "backtrace",
 "bytes",
@@ -7217,15 +7215,14 @@ dependencies = [
 "bytes",
 "fallible-iterator",
 "futures-util",
- "log",
 "parking_lot 0.12.1",
- "phf",
 "pin-project-lite",
 "postgres-protocol2",
 "postgres-types2",
 "serde",
 "tokio",
 "tokio-util",
+ "tracing",
 ]

 [[package]]
@@ -7607,6 +7604,7 @@ dependencies = [
 "opentelemetry-otlp",
 "opentelemetry-semantic-conventions",
 "opentelemetry_sdk",
+ "pin-project-lite",
 "tokio",
 "tracing",
 "tracing-opentelemetry",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -50,7 +50,7 @@ license = "Apache-2.0"
 [workspace.dependencies]
 ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
-arc-swap = "1.6"
+arc-swap = "1.7"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
 flate2 = "1.0.26"
@@ -130,7 +130,7 @@ nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal"
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
 num_cpus = "1.15"
-num-traits = "0.2.15"
+num-traits = "0.2.19"
 once_cell = "1.13"
 opentelemetry = "0.27"
 opentelemetry_sdk = "0.27"
@@ -146,7 +146,7 @@ procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13"
 rand = "0.8"
-redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
@@ -183,7 +183,7 @@ test-context = "0.3"
 thiserror = "1.0"
 tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] }
 tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] }
-tokio = { version = "1.41", features = ["macros"] }
+tokio = { version = "1.43.1", features = ["macros"] }
 tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.85.0
+ENV RUSTC_VERSION=1.86.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -369,7 +369,7 @@ FROM build-deps AS plv8-src
 ARG PG_VERSION
 WORKDIR /ext-src

-COPY compute/patches/plv8-3.1.10.patch .
+COPY compute/patches/plv8* .

 # plv8 3.2.3 supports v17
 # last release v3.2.3 - Sep 7, 2024
@@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \
    git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
    tar -czf plv8.tar.gz --exclude .git plv8-src && \
    cd plv8-src && \
-    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
+    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi

 # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use
 # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds.
@@ -1022,67 +1022,6 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/semver.control

-#########################################################################################
-#
-# Layer "pg_embedding-build"
-# compile pg_embedding extension
-#
-#########################################################################################
-FROM build-deps AS pg_embedding-src
-ARG PG_VERSION
-
-# This is our extension, support stopped in favor of pgvector
-# TODO: deprecate it
-WORKDIR /ext-src
-RUN case "${PG_VERSION:?}" in \
-      "v14" | "v15") \
-        export PG_EMBEDDING_VERSION=0.3.5 \
-        export PG_EMBEDDING_CHECKSUM=0e95b27b8b6196e2cf0a0c9ec143fe2219b82e54c5bb4ee064e76398cbe69ae9 \
-        ;; \
-      *) \
-        echo "pg_embedding not supported on this PostgreSQL version. Use pgvector instead." && exit 0;; \
-    esac && \
-    wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/${PG_EMBEDDING_VERSION}.tar.gz -O pg_embedding.tar.gz && \
-    echo "${PG_EMBEDDING_CHECKSUM} pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xzf ../pg_embedding.tar.gz --strip-components=1 -C .
-
-FROM pg-build AS pg_embedding-build
-COPY --from=pg_embedding-src /ext-src/ /ext-src/
-WORKDIR /ext-src/
-RUN  if [ -d pg_embedding-src ]; then \
-        cd pg_embedding-src && \
-        make -j $(getconf _NPROCESSORS_ONLN) && \
-        make -j $(getconf _NPROCESSORS_ONLN) install; \
-    fi
-
-#########################################################################################
-#
-# Layer "pg_anon-build"
-# compile anon extension
-#
-#########################################################################################
-FROM build-deps AS pg_anon-src
-ARG PG_VERSION
-
-# This is an experimental extension, never got to real production.
-# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
-WORKDIR /ext-src
-RUN case "${PG_VERSION:?}" in "v17") \
-    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
-    esac && \
-    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
-    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C .
-
-FROM pg-build AS pg_anon-build
-COPY --from=pg_anon-src /ext-src/ /ext-src/
-WORKDIR /ext-src
-RUN if [ -d pg_anon-src ]; then \
-        cd pg_anon-src && \
-        make -j $(getconf _NPROCESSORS_ONLN) install && \
-        echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \
-    fi
-
 #########################################################################################
 #
 # Layer "pg build with nonroot user and cargo installed"
@@ -1366,8 +1305,8 @@ ARG PG_VERSION
 # Do not update without approve from proxy team
 # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
 WORKDIR /ext-src
-RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
@@ -1675,9 +1614,7 @@ COPY --from=rdkit-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_uuidv7-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
-COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1853,7 +1790,6 @@ COPY --from=pg_cron-src /ext-src/ /ext-src/
 COPY --from=pg_uuidv7-src /ext-src/ /ext-src/
 COPY --from=pg_roaringbitmap-src /ext-src/ /ext-src/
 COPY --from=pg_semver-src /ext-src/ /ext-src/
-#COPY --from=pg_embedding-src /ext-src/ /ext-src/
 #COPY --from=wal2json-src /ext-src/ /ext-src/
 COPY --from=pg_ivm-src /ext-src/ /ext-src/
 COPY --from=pg_partman-src /ext-src/ /ext-src/
@@ -1916,26 +1852,30 @@ RUN apt update && \
      ;; \
    esac && \
    apt install --no-install-recommends -y \
+        ca-certificates \
        gdb \
-        liblz4-1 \
-        libreadline8 \
+        iproute2 \
        libboost-iostreams1.74.0 \
        libboost-regex1.74.0 \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
-        libossp-uuid16 \
+        libcurl4 \
+        libevent-2.1-7 \
        libgeos-c1v5 \
+        liblz4-1 \
+        libossp-uuid16 \
        libprotobuf-c1 \
+        libreadline8 \
        libsfcgal1 \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
-        libcurl4 \
-        libevent-2.1-7 \
        locales \
+        lsof \
        procps \
-        ca-certificates \
        rsyslog \
+        screen \
+        tcpdump \
        $VERSION_INSTALLS && \
    apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -33,6 +33,7 @@
    import 'sql_exporter/lfc_hits.libsonnet',
    import 'sql_exporter/lfc_misses.libsonnet',
    import 'sql_exporter/lfc_used.libsonnet',
+    import 'sql_exporter/lfc_used_pages.libsonnet',
    import 'sql_exporter/lfc_writes.libsonnet',
    import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
    import 'sql_exporter/max_cluster_size.libsonnet',
--- a/compute/etc/sql_exporter/lfc_used_pages.libsonnet
+++ b/compute/etc/sql_exporter/lfc_used_pages.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_used_pages',
+  type: 'gauge',
+  help: 'LFC pages used',
+  key_labels: null,
+  values: [
+    'lfc_used_pages',
+  ],
+  query: importstr 'sql_exporter/lfc_used_pages.sql',
+}
--- a/compute/etc/sql_exporter/lfc_used_pages.sql
+++ b/compute/etc/sql_exporter/lfc_used_pages.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages';
--- a/compute/patches/cloud_regress_pg16.patch
+++ b/compute/patches/cloud_regress_pg16.patch
@@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644
 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
 ERROR:  must be owner of relation constraint_comments_tbl
 diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
-index 442e7aff2b..525f732b03 100644
+index d785f92561..16377e5ac9 100644
 --- a/src/test/regress/expected/conversion.out
 +++ b/src/test/regress/expected/conversion.out
-@@ -8,7 +8,7 @@
+@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup();
 CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
     AS :'regresslib', 'test_enc_conversion'
     LANGUAGE C STRICT;
@@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644
 SELECT *
    INTO TABLE ramp
 diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out
-index 454db91ec0..01378d7081 100644
+index 4cbdbdf84d..573362850e 100644
 --- a/src/test/regress/expected/database.out
 +++ b/src/test/regress/expected/database.out
-@@ -1,8 +1,7 @@
+@@ -1,8 +1,6 @@
 CREATE DATABASE regression_tbd
 	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
 ALTER DATABASE regression_tbd RENAME TO regression_utf8;
 -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
 -ALTER DATABASE regression_utf8 RESET TABLESPACE;
-+WARNING:  you need to manually restart any running background workers after this command
 ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
 -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
 BEGIN;
@@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644
 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
 diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
-index 6b8c2f2414..8e13b7fa46 100644
+index 84745b9f60..4883c12351 100644
 --- a/src/test/regress/expected/foreign_key.out
 +++ b/src/test/regress/expected/foreign_key.out
@@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -1112,7 +1111,7 @@ index 8475231735..0653946337 100644
 DROP ROLE regress_passwd_sha_len1;
 DROP ROLE regress_passwd_sha_len2;
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
-index 5b9dba7b32..cc408dad42 100644
+index 620fbe8c52..0570102357 100644
 --- a/src/test/regress/expected/privileges.out
 +++ b/src/test/regress/expected/privileges.out
@@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
@@ -1174,8 +1173,8 @@ index 5b9dba7b32..cc408dad42 100644
 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
 GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
- SET SESSION AUTHORIZATION regress_priv_user1;
-@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+ SET SESSION AUTHORIZATION regress_priv_user3;
+@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
 ERROR:  permission denied to grant privileges as role "regress_priv_role"
 DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
 GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
@@ -1192,7 +1191,7 @@ index 5b9dba7b32..cc408dad42 100644
 DROP ROLE regress_priv_role;
 SET SESSION AUTHORIZATION regress_priv_user1;
 SELECT session_user, current_user;
-@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
 
 -- security-restricted operations
 \c -
@@ -1201,7 +1200,7 @@ index 5b9dba7b32..cc408dad42 100644
 -- Check that index expressions and predicates are run as the table's owner
 -- A dummy index function checking current_user
 CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
-@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
+@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer)
 drop cascades to function testns.priv_testproc(integer)
 -- Change owner of the schema & and rename of new schema owner
 \c -
@@ -1212,7 +1211,7 @@ index 5b9dba7b32..cc408dad42 100644
 SET SESSION ROLE regress_schemauser1;
 CREATE SCHEMA testns;
 SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
-@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
+@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7;
 DROP USER regress_priv_user8; -- does not exist
 ERROR:  role "regress_priv_user8" does not exist
 -- permissions with LOCK TABLE
@@ -1221,7 +1220,7 @@ index 5b9dba7b32..cc408dad42 100644
 CREATE TABLE lock_table (a int);
 -- LOCK TABLE and SELECT permission
 GRANT SELECT ON lock_table TO regress_locktable_user;
-@@ -2874,7 +2878,7 @@ DROP USER regress_locktable_user;
+@@ -2881,7 +2885,7 @@ DROP USER regress_locktable_user;
 -- pg_backend_memory_contexts.
 -- switch to superuser
 \c -
@@ -1230,7 +1229,7 @@ index 5b9dba7b32..cc408dad42 100644
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  has_table_privilege 
 ---------------------
-@@ -2918,10 +2922,10 @@ RESET ROLE;
+@@ -2925,10 +2929,10 @@ RESET ROLE;
 -- clean up
 DROP ROLE regress_readallstats;
 -- test role grantor machinery
@@ -1245,7 +1244,7 @@ index 5b9dba7b32..cc408dad42 100644
 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
 GRANT regress_group_direct_manager TO regress_group_indirect_manager;
 SET SESSION AUTHORIZATION regress_group_direct_manager;
-@@ -2950,9 +2954,9 @@ DROP ROLE regress_group_direct_manager;
+@@ -2957,9 +2961,9 @@ DROP ROLE regress_group_direct_manager;
 DROP ROLE regress_group_indirect_manager;
 DROP ROLE regress_group_member;
 -- test SET and INHERIT options with object ownership changes
@@ -1841,7 +1840,7 @@ index 09a255649b..15895f0c53 100644
 CREATE TABLE ruletest_t2 (x int);
 CREATE VIEW ruletest_v1 WITH (security_invoker=true) AS
 diff --git a/src/test/regress/expected/security_label.out b/src/test/regress/expected/security_label.out
-index a8e01a6220..5a9cef4ede 100644
+index a8e01a6220..83543b250a 100644
 --- a/src/test/regress/expected/security_label.out
 +++ b/src/test/regress/expected/security_label.out
@@ -6,8 +6,8 @@ SET client_min_messages TO 'warning';
@@ -1855,34 +1854,6 @@ index a8e01a6220..5a9cef4ede 100644
 CREATE TABLE seclabel_tbl1 (a int, b text);
 CREATE TABLE seclabel_tbl2 (x int, y text);
 CREATE VIEW seclabel_view1 AS SELECT * FROM seclabel_tbl2;
-@@ -19,21 +19,21 @@ ALTER TABLE seclabel_tbl2 OWNER TO regress_seclabel_user2;
- -- Test of SECURITY LABEL statement without a plugin
- --
- SECURITY LABEL ON TABLE seclabel_tbl1 IS 'classified';			-- fail
-ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL FOR 'dummy' ON TABLE seclabel_tbl1 IS 'classified';		-- fail
- ERROR:  security label provider "dummy" is not loaded
- SECURITY LABEL ON TABLE seclabel_tbl1 IS '...invalid label...';		-- fail
-ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL ON TABLE seclabel_tbl3 IS 'unclassified';			-- fail
-ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL ON ROLE regress_seclabel_user1 IS 'classified';			-- fail
-ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL FOR 'dummy' ON ROLE regress_seclabel_user1 IS 'classified';		-- fail
- ERROR:  security label provider "dummy" is not loaded
- SECURITY LABEL ON ROLE regress_seclabel_user1 IS '...invalid label...';		-- fail
-ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- SECURITY LABEL ON ROLE regress_seclabel_user3 IS 'unclassified';			-- fail
-ERROR:  no security label providers have been loaded
-+ERROR:  must specify provider when multiple security label providers have been loaded
- -- clean up objects
- DROP FUNCTION seclabel_four();
- DROP DOMAIN seclabel_domain;
 diff --git a/src/test/regress/expected/select_into.out b/src/test/regress/expected/select_into.out
 index b79fe9a1c0..e29fab88ab 100644
 --- a/src/test/regress/expected/select_into.out
@@ -2413,10 +2384,10 @@ index e3e3bea709..fa86ddc326 100644
 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
 COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
 diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
-index 9a65fca91f..58431a3056 100644
+index b567a1a572..4d1ac2e631 100644
 --- a/src/test/regress/sql/conversion.sql
 +++ b/src/test/regress/sql/conversion.sql
-@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+@@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
     AS :'regresslib', 'test_enc_conversion'
     LANGUAGE C STRICT;
 
@@ -2780,7 +2751,7 @@ index ae6841308b..47bc792e30 100644
 
 SELECT *
 diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql
-index 0367c0e37a..a23b98c4bd 100644
+index 46ad263478..eb05584ed5 100644
 --- a/src/test/regress/sql/database.sql
 +++ b/src/test/regress/sql/database.sql
@@ -1,8 +1,6 @@
@@ -2893,7 +2864,7 @@ index aa147b14a9..370e0dd570 100644
 CREATE FOREIGN DATA WRAPPER dummy;
 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
-index 45c7a534cb..32dd26b8cd 100644
+index 9f4210b26e..620d3fc87e 100644
 --- a/src/test/regress/sql/foreign_key.sql
 +++ b/src/test/regress/sql/foreign_key.sql
@@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -3246,7 +3217,7 @@ index 53e86b0b6c..0303fdfe96 100644
 -- Check that the invalid secrets were re-hashed. A re-hashed secret
 -- should not contain the original salt.
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
-index 249df17a58..b258e7f26a 100644
+index 259f1aedd1..6e1a3d17b7 100644
 --- a/src/test/regress/sql/privileges.sql
 +++ b/src/test/regress/sql/privileges.sql
@@ -24,18 +24,18 @@ RESET client_min_messages;
@@ -3308,7 +3279,7 @@ index 249df17a58..b258e7f26a 100644
 
 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
 
-@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
 
 -- security-restricted operations
 \c -
@@ -3317,7 +3288,7 @@ index 249df17a58..b258e7f26a 100644
 
 -- Check that index expressions and predicates are run as the table's owner
 
-@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
+@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE;
 -- Change owner of the schema & and rename of new schema owner
 \c -
 
@@ -3328,7 +3299,7 @@ index 249df17a58..b258e7f26a 100644
 
 SET SESSION ROLE regress_schemauser1;
 CREATE SCHEMA testns;
-@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist
+@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist
 
 
 -- permissions with LOCK TABLE
@@ -3337,7 +3308,7 @@ index 249df17a58..b258e7f26a 100644
 CREATE TABLE lock_table (a int);
 
 -- LOCK TABLE and SELECT permission
-@@ -1836,7 +1836,7 @@ DROP USER regress_locktable_user;
+@@ -1839,7 +1839,7 @@ DROP USER regress_locktable_user;
 -- switch to superuser
 \c -
 
@@ -3346,7 +3317,7 @@ index 249df17a58..b258e7f26a 100644
 
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
-@@ -1856,10 +1856,10 @@ RESET ROLE;
+@@ -1859,10 +1859,10 @@ RESET ROLE;
 DROP ROLE regress_readallstats;
 
 -- test role grantor machinery
@@ -3361,7 +3332,7 @@ index 249df17a58..b258e7f26a 100644
 
 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
 GRANT regress_group_direct_manager TO regress_group_indirect_manager;
-@@ -1881,9 +1881,9 @@ DROP ROLE regress_group_indirect_manager;
+@@ -1884,9 +1884,9 @@ DROP ROLE regress_group_indirect_manager;
 DROP ROLE regress_group_member;
 
 -- test SET and INHERIT options with object ownership changes
--- a/compute/patches/cloud_regress_pg17.patch
+++ b/compute/patches/cloud_regress_pg17.patch
@@ -202,10 +202,10 @@ index cf0b80d616..e8e2a14a4a 100644
 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
 ERROR:  must be owner of relation constraint_comments_tbl
 diff --git a/src/test/regress/expected/conversion.out b/src/test/regress/expected/conversion.out
-index 442e7aff2b..525f732b03 100644
+index d785f92561..16377e5ac9 100644
 --- a/src/test/regress/expected/conversion.out
 +++ b/src/test/regress/expected/conversion.out
-@@ -8,7 +8,7 @@
+@@ -15,7 +15,7 @@ SELECT FROM test_enc_setup();
 CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, result OUT bytea)
     AS :'regresslib', 'test_enc_conversion'
     LANGUAGE C STRICT;
@@ -587,16 +587,15 @@ index f551624afb..57f1e432d4 100644
 SELECT *
    INTO TABLE ramp
 diff --git a/src/test/regress/expected/database.out b/src/test/regress/expected/database.out
-index 454db91ec0..01378d7081 100644
+index 4cbdbdf84d..573362850e 100644
 --- a/src/test/regress/expected/database.out
 +++ b/src/test/regress/expected/database.out
-@@ -1,8 +1,7 @@
+@@ -1,8 +1,6 @@
 CREATE DATABASE regression_tbd
 	ENCODING utf8 LC_COLLATE "C" LC_CTYPE "C" TEMPLATE template0;
 ALTER DATABASE regression_tbd RENAME TO regression_utf8;
 -ALTER DATABASE regression_utf8 SET TABLESPACE regress_tblspace;
 -ALTER DATABASE regression_utf8 RESET TABLESPACE;
-+WARNING:  you need to manually restart any running background workers after this command
 ALTER DATABASE regression_utf8 CONNECTION_LIMIT 123;
 -- Test PgDatabaseToastTable.  Doing this with GRANT would be slow.
 BEGIN;
@@ -700,7 +699,7 @@ index 6ed50fdcfa..caa00a345d 100644
 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 CREATE FOREIGN DATA WRAPPER postgresql VALIDATOR postgresql_fdw_validator;
 diff --git a/src/test/regress/expected/foreign_key.out b/src/test/regress/expected/foreign_key.out
-index 69994c98e3..129abcfbe8 100644
+index fe6a1015f2..614b387b7d 100644
 --- a/src/test/regress/expected/foreign_key.out
 +++ b/src/test/regress/expected/foreign_key.out
@@ -1985,7 +1985,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -1147,7 +1146,7 @@ index 924d6e001d..7fdda73439 100644
 DROP ROLE regress_passwd_sha_len1;
 DROP ROLE regress_passwd_sha_len2;
 diff --git a/src/test/regress/expected/privileges.out b/src/test/regress/expected/privileges.out
-index 1296da0d57..f43fffa44c 100644
+index e8c668e0a1..03be5c2120 100644
 --- a/src/test/regress/expected/privileges.out
 +++ b/src/test/regress/expected/privileges.out
@@ -20,19 +20,19 @@ SELECT lo_unlink(oid) FROM pg_largeobject_metadata WHERE oid >= 1000 AND oid < 3
@@ -1209,8 +1208,8 @@ index 1296da0d57..f43fffa44c 100644
 +CREATE GROUP regress_priv_group2 WITH ADMIN regress_priv_user1 PASSWORD NEON_PASSWORD_PLACEHOLDER USER regress_priv_user2;
 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
 GRANT regress_priv_group2 TO regress_priv_user2 GRANTED BY regress_priv_user1;
- SET SESSION AUTHORIZATION regress_priv_user1;
-@@ -239,12 +239,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
+ SET SESSION AUTHORIZATION regress_priv_user3;
+@@ -246,12 +246,16 @@ GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY regre
 ERROR:  permission denied to grant privileges as role "regress_priv_role"
 DETAIL:  The grantor must have the ADMIN option on role "regress_priv_role".
 GRANT regress_priv_role TO regress_priv_user1 WITH ADMIN OPTION GRANTED BY CURRENT_ROLE;
@@ -1227,7 +1226,7 @@ index 1296da0d57..f43fffa44c 100644
 DROP ROLE regress_priv_role;
 SET SESSION AUTHORIZATION regress_priv_user1;
 SELECT session_user, current_user;
-@@ -1776,7 +1780,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1783,7 +1787,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
 
 -- security-restricted operations
 \c -
@@ -1236,7 +1235,7 @@ index 1296da0d57..f43fffa44c 100644
 -- Check that index expressions and predicates are run as the table's owner
 -- A dummy index function checking current_user
 CREATE FUNCTION sro_ifun(int) RETURNS int AS $$
-@@ -2668,8 +2672,8 @@ drop cascades to function testns.priv_testagg(integer)
+@@ -2675,8 +2679,8 @@ drop cascades to function testns.priv_testagg(integer)
 drop cascades to function testns.priv_testproc(integer)
 -- Change owner of the schema & and rename of new schema owner
 \c -
@@ -1247,7 +1246,7 @@ index 1296da0d57..f43fffa44c 100644
 SET SESSION ROLE regress_schemauser1;
 CREATE SCHEMA testns;
 SELECT nspname, rolname FROM pg_namespace, pg_roles WHERE pg_namespace.nspname = 'testns' AND pg_namespace.nspowner = pg_roles.oid;
-@@ -2792,7 +2796,7 @@ DROP USER regress_priv_user7;
+@@ -2799,7 +2803,7 @@ DROP USER regress_priv_user7;
 DROP USER regress_priv_user8; -- does not exist
 ERROR:  role "regress_priv_user8" does not exist
 -- permissions with LOCK TABLE
@@ -1256,7 +1255,7 @@ index 1296da0d57..f43fffa44c 100644
 CREATE TABLE lock_table (a int);
 -- LOCK TABLE and SELECT permission
 GRANT SELECT ON lock_table TO regress_locktable_user;
-@@ -2888,7 +2892,7 @@ DROP USER regress_locktable_user;
+@@ -2895,7 +2899,7 @@ DROP USER regress_locktable_user;
 -- pg_backend_memory_contexts.
 -- switch to superuser
 \c -
@@ -1265,7 +1264,7 @@ index 1296da0d57..f43fffa44c 100644
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
  has_table_privilege 
 ---------------------
-@@ -2932,10 +2936,10 @@ RESET ROLE;
+@@ -2939,10 +2943,10 @@ RESET ROLE;
 -- clean up
 DROP ROLE regress_readallstats;
 -- test role grantor machinery
@@ -1280,7 +1279,7 @@ index 1296da0d57..f43fffa44c 100644
 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
 GRANT regress_group_direct_manager TO regress_group_indirect_manager;
 SET SESSION AUTHORIZATION regress_group_direct_manager;
-@@ -2964,9 +2968,9 @@ DROP ROLE regress_group_direct_manager;
+@@ -2971,9 +2975,9 @@ DROP ROLE regress_group_direct_manager;
 DROP ROLE regress_group_indirect_manager;
 DROP ROLE regress_group_member;
 -- test SET and INHERIT options with object ownership changes
@@ -1293,7 +1292,7 @@ index 1296da0d57..f43fffa44c 100644
 CREATE SCHEMA regress_roleoption;
 GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
 GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
-@@ -2995,9 +2999,9 @@ DROP ROLE regress_roleoption_protagonist;
+@@ -3002,9 +3006,9 @@ DROP ROLE regress_roleoption_protagonist;
 DROP ROLE regress_roleoption_donor;
 DROP ROLE regress_roleoption_recipient;
 -- MAINTAIN
@@ -2433,10 +2432,10 @@ index e3e3bea709..fa86ddc326 100644
 COMMENT ON CONSTRAINT the_constraint ON constraint_comments_tbl IS 'no, the comment';
 COMMENT ON CONSTRAINT the_constraint ON DOMAIN constraint_comments_dom IS 'no, another comment';
 diff --git a/src/test/regress/sql/conversion.sql b/src/test/regress/sql/conversion.sql
-index 9a65fca91f..58431a3056 100644
+index b567a1a572..4d1ac2e631 100644
 --- a/src/test/regress/sql/conversion.sql
 +++ b/src/test/regress/sql/conversion.sql
-@@ -12,7 +12,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
+@@ -17,7 +17,7 @@ CREATE FUNCTION test_enc_conversion(bytea, name, name, bool, validlen OUT int, r
     AS :'regresslib', 'test_enc_conversion'
     LANGUAGE C STRICT;
 
@@ -2800,7 +2799,7 @@ index ae6841308b..47bc792e30 100644
 
 SELECT *
 diff --git a/src/test/regress/sql/database.sql b/src/test/regress/sql/database.sql
-index 0367c0e37a..a23b98c4bd 100644
+index 46ad263478..eb05584ed5 100644
 --- a/src/test/regress/sql/database.sql
 +++ b/src/test/regress/sql/database.sql
@@ -1,8 +1,6 @@
@@ -2913,7 +2912,7 @@ index aa147b14a9..370e0dd570 100644
 CREATE FOREIGN DATA WRAPPER dummy;
 COMMENT ON FOREIGN DATA WRAPPER dummy IS 'useless';
 diff --git a/src/test/regress/sql/foreign_key.sql b/src/test/regress/sql/foreign_key.sql
-index 2e710e419c..89cd481a54 100644
+index 8c4e4c7c83..e946cd2119 100644
 --- a/src/test/regress/sql/foreign_key.sql
 +++ b/src/test/regress/sql/foreign_key.sql
@@ -1435,7 +1435,7 @@ ALTER TABLE fk_partitioned_fk_6 ATTACH PARTITION fk_partitioned_pk_6 FOR VALUES
@@ -3301,7 +3300,7 @@ index bb82aa4aa2..dd8a05e24d 100644
 -- Check that the invalid secrets were re-hashed. A re-hashed secret
 -- should not contain the original salt.
 diff --git a/src/test/regress/sql/privileges.sql b/src/test/regress/sql/privileges.sql
-index 5880bc018d..27aa952b18 100644
+index b7e1cb6cdd..6e5a2217f1 100644
 --- a/src/test/regress/sql/privileges.sql
 +++ b/src/test/regress/sql/privileges.sql
@@ -24,18 +24,18 @@ RESET client_min_messages;
@@ -3363,7 +3362,7 @@ index 5880bc018d..27aa952b18 100644
 
 ALTER GROUP regress_priv_group1 ADD USER regress_priv_user4;
 
-@@ -1157,7 +1157,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
+@@ -1160,7 +1160,7 @@ SELECT has_table_privilege('regress_priv_user1', 'atest4', 'SELECT WITH GRANT OP
 
 -- security-restricted operations
 \c -
@@ -3372,7 +3371,7 @@ index 5880bc018d..27aa952b18 100644
 
 -- Check that index expressions and predicates are run as the table's owner
 
-@@ -1653,8 +1653,8 @@ DROP SCHEMA testns CASCADE;
+@@ -1656,8 +1656,8 @@ DROP SCHEMA testns CASCADE;
 -- Change owner of the schema & and rename of new schema owner
 \c -
 
@@ -3383,7 +3382,7 @@ index 5880bc018d..27aa952b18 100644
 
 SET SESSION ROLE regress_schemauser1;
 CREATE SCHEMA testns;
-@@ -1748,7 +1748,7 @@ DROP USER regress_priv_user8; -- does not exist
+@@ -1751,7 +1751,7 @@ DROP USER regress_priv_user8; -- does not exist
 
 
 -- permissions with LOCK TABLE
@@ -3392,7 +3391,7 @@ index 5880bc018d..27aa952b18 100644
 CREATE TABLE lock_table (a int);
 
 -- LOCK TABLE and SELECT permission
-@@ -1851,7 +1851,7 @@ DROP USER regress_locktable_user;
+@@ -1854,7 +1854,7 @@ DROP USER regress_locktable_user;
 -- switch to superuser
 \c -
 
@@ -3401,7 +3400,7 @@ index 5880bc018d..27aa952b18 100644
 
 SELECT has_table_privilege('regress_readallstats','pg_backend_memory_contexts','SELECT'); -- no
 SELECT has_table_privilege('regress_readallstats','pg_shmem_allocations','SELECT'); -- no
-@@ -1871,10 +1871,10 @@ RESET ROLE;
+@@ -1874,10 +1874,10 @@ RESET ROLE;
 DROP ROLE regress_readallstats;
 
 -- test role grantor machinery
@@ -3416,7 +3415,7 @@ index 5880bc018d..27aa952b18 100644
 
 GRANT regress_group TO regress_group_direct_manager WITH INHERIT FALSE, ADMIN TRUE;
 GRANT regress_group_direct_manager TO regress_group_indirect_manager;
-@@ -1896,9 +1896,9 @@ DROP ROLE regress_group_indirect_manager;
+@@ -1899,9 +1899,9 @@ DROP ROLE regress_group_indirect_manager;
 DROP ROLE regress_group_member;
 
 -- test SET and INHERIT options with object ownership changes
@@ -3429,7 +3428,7 @@ index 5880bc018d..27aa952b18 100644
 CREATE SCHEMA regress_roleoption;
 GRANT CREATE, USAGE ON SCHEMA regress_roleoption TO PUBLIC;
 GRANT regress_roleoption_donor TO regress_roleoption_protagonist WITH INHERIT TRUE, SET FALSE;
-@@ -1926,9 +1926,9 @@ DROP ROLE regress_roleoption_donor;
+@@ -1929,9 +1929,9 @@ DROP ROLE regress_roleoption_donor;
 DROP ROLE regress_roleoption_recipient;
 
 -- MAINTAIN
--- a/compute/patches/pg_hint_plan_v16.patch
+++ b/compute/patches/pg_hint_plan_v16.patch
@@ -2,23 +2,6 @@ diff --git a/expected/ut-A.out b/expected/ut-A.out
 index da723b8..5328114 100644
 --- a/expected/ut-A.out
 +++ b/expected/ut-A.out
-@@ -9,13 +9,16 @@ SET search_path TO public;
- ----
- -- No.A-1-1-3
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- -- No.A-1-2-3
- DROP EXTENSION pg_hint_plan;
- -- No.A-1-1-4
- CREATE SCHEMA other_schema;
- CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- DROP SCHEMA other_schema;
- ----
- ---- No. A-5-1 comment pattern
@@ -3175,6 +3178,7 @@ SELECT s.query, s.calls
   FROM public.pg_stat_statements s
   JOIN pg_catalog.pg_database d
@@ -27,18 +10,6 @@ index da723b8..5328114 100644
  ORDER BY 1;
                 query                 | calls 
 --------------------------------------+-------
-diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
-index d372459..6282afe 100644
--- a/expected/ut-fdw.out
-+++ b/expected/ut-fdw.out
-@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
- SET client_min_messages TO LOG;
- SET pg_hint_plan.enable_hint TO on;
- CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
- CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
- CREATE USER MAPPING FOR PUBLIC SERVER file_server;
- CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
 diff --git a/sql/ut-A.sql b/sql/ut-A.sql
 index 7c7d58a..4fd1a07 100644
 --- a/sql/ut-A.sql
--- a/compute/patches/pg_hint_plan_v17.patch
+++ b/compute/patches/pg_hint_plan_v17.patch
@@ -1,24 +1,3 @@
-diff --git a/expected/ut-A.out b/expected/ut-A.out
-index e7d68a1..65a056c 100644
--- a/expected/ut-A.out
-+++ b/expected/ut-A.out
-@@ -9,13 +9,16 @@ SET search_path TO public;
- ----
- -- No.A-1-1-3
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- -- No.A-1-2-3
- DROP EXTENSION pg_hint_plan;
- -- No.A-1-1-4
- CREATE SCHEMA other_schema;
- CREATE EXTENSION pg_hint_plan SCHEMA other_schema;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- ERROR:  extension "pg_hint_plan" must be installed in schema "hint_plan"
- CREATE EXTENSION pg_hint_plan;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/pg_hint_plan
- DROP SCHEMA other_schema;
- ----
- ---- No. A-5-1 comment pattern
 diff --git a/expected/ut-J.out b/expected/ut-J.out
 index 2fa3c70..314e929 100644
 --- a/expected/ut-J.out
@@ -160,15 +139,3 @@ index a09bd34..0ad227c 100644
 error hint:
 
                     explain_filter                    
-diff --git a/expected/ut-fdw.out b/expected/ut-fdw.out
-index 017fa4b..98d989b 100644
--- a/expected/ut-fdw.out
-+++ b/expected/ut-fdw.out
-@@ -7,6 +7,7 @@ SET pg_hint_plan.debug_print TO on;
- SET client_min_messages TO LOG;
- SET pg_hint_plan.enable_hint TO on;
- CREATE EXTENSION file_fdw;
-+LOG:  Sending request to compute_ctl: http://localhost:3081/extension_server/file_fdw
- CREATE SERVER file_server FOREIGN DATA WRAPPER file_fdw;
- CREATE USER MAPPING FOR PUBLIC SERVER file_server;
- CREATE FOREIGN TABLE ft1 (id int, val int) SERVER file_server OPTIONS (format 'csv', filename :'filename');
--- a/compute/patches/plv8_v3.1.10.patch
+++ b/compute/patches/plv8_v3.1.10.patch
@@ -1,12 +1,6 @@
-commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e
-Author: Alexander Bayandin <alexander@neon.tech>
-Date:   Sat Nov 30 18:29:32 2024 +0000
-
-    Fix v8 9.7.37 compilation on Debian 12
-
 diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
 new file mode 100644
-index 0000000..f0a5dc7
+index 0000000..fae1cb3
 --- /dev/null
 +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
@@ -0,0 +1,30 @@
@@ -35,8 +29,21 @@ index 0000000..f0a5dc7
 +@@ -5,6 +5,7 @@
 + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
 + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
-+ 
+
 ++#include <utility>
 + #include <vector>
-+ 
+
 + #include "include/cppgc/prefinalizer.h"
+diff --git a/plv8.cc b/plv8.cc
+index c1ce883..6e47e94 100644
+--- a/plv8.cc
+++ b/plv8.cc
+@@ -379,7 +379,7 @@ _PG_init(void)
+ 							   NULL,
+ 							   &plv8_v8_flags,
+ 							   NULL,
+-							   PGC_USERSET, 0,
+							   PGC_SUSET, 0,
+ #if PG_VERSION_NUM >= 90100
+ 							   NULL,
+ #endif
--- a/compute/patches/plv8_v3.2.3.patch
+++ b/compute/patches/plv8_v3.2.3.patch
@@ -0,0 +1,13 @@
+diff --git a/plv8.cc b/plv8.cc
+index edfa2aa..623e7f2 100644
+--- a/plv8.cc
+++ b/plv8.cc
+@@ -385,7 +385,7 @@ _PG_init(void)
+                                    NULL,
+                                    &plv8_v8_flags,
+                                    NULL,
+-                                   PGC_USERSET, 0,
+                                   PGC_SUSET, 0,
+ #if PG_VERSION_NUM >= 90100
+                                    NULL,
+ #endif
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,9 @@ use anyhow::{Context, Result};
 use clap::Parser;
 use compute_api::responses::ComputeCtlConfig;
 use compute_api::spec::ComputeSpec;
-use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
+use compute_tools::compute::{
+    BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
+};
 use compute_tools::extension_server::get_pg_version_string;
 use compute_tools::logger::*;
 use compute_tools::params::*;
@@ -57,10 +59,6 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

-// this is an arbitrary build tag. Fine as a default / for testing purposes
-// in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "latest";
-
 // Compatibility hack: if the control plane specified any remote-ext-config
 // use the default value for extension storage proxy gateway.
 // Remove this once the control plane is updated to pass the gateway URL
@@ -147,7 +145,7 @@ fn main() -> Result<()> {
        .build()?;
    let _rt_guard = runtime.enter();

-    let build_tag = runtime.block_on(init())?;
+    runtime.block_on(init())?;

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -174,8 +172,6 @@ fn main() -> Result<()> {
            cgroup: cli.cgroup,
            #[cfg(target_os = "linux")]
            vm_monitor_addr: cli.vm_monitor_addr,
-            build_tag,
-
            live_config_allowed: cli_spec.live_config_allowed,
        },
        cli_spec.spec,
@@ -189,7 +185,7 @@ fn main() -> Result<()> {
    deinit_and_exit(exit_code);
 }

-async fn init() -> Result<String> {
+async fn init() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -199,12 +195,9 @@ async fn init() -> Result<String> {
        }
    });

-    let build_tag = option_env!("BUILD_TAG")
-        .unwrap_or(BUILD_TAG_DEFAULT)
-        .to_string();
-    info!("build_tag: {build_tag}");
+    info!("compute build_tag: {}", &BUILD_TAG.to_string());

-    Ok(build_tag)
+    Ok(())
 }

 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,6 +20,7 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
+use once_cell::sync::Lazy;
 use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
@@ -35,6 +36,7 @@ use crate::disk_quota::set_disk_quota;
 use crate::installed_extensions::get_installed_extensions;
 use crate::logger::startup_context_from_env;
 use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
+use crate::metrics::COMPUTE_CTL_UP;
 use crate::monitor::launch_monitor;
 use crate::pg_helpers::*;
 use crate::rsyslog::{
@@ -49,6 +51,17 @@ use crate::{config, extension_server, local_proxy};

 pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
 pub static PG_PID: AtomicU32 = AtomicU32::new(0);
+// This is an arbitrary build tag. It is fine as a default / for testing
+// purposes in case the environment variable is not set.
+const BUILD_TAG_DEFAULT: &str = "latest";
+/// Build tag/version of the compute node binaries/image. It's tricky and ugly
+/// to pass it everywhere as a part of `ComputeNodeParams`, so we use a
+/// global static variable.
+pub static BUILD_TAG: Lazy<String> = Lazy::new(|| {
+    option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string()
+});

 /// Static configuration params that don't change after startup. These mostly
 /// come from the CLI args, or are derived from them.
@@ -72,7 +85,6 @@ pub struct ComputeNodeParams {
    pub pgdata: String,
    pub pgbin: String,
    pub pgversion: String,
-    pub build_tag: String,

    /// The port that the compute's external HTTP server listens on
    pub external_http_port: u16,
@@ -173,6 +185,11 @@ impl ComputeState {
        info!("Changing compute status from {} to {}", prev, status);
        self.status = status;
        state_changed.notify_all();
+
+        COMPUTE_CTL_UP.reset();
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, status.to_string().as_str()])
+            .set(1);
    }

    pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) {
@@ -343,6 +360,14 @@ impl ComputeNode {
            this.prewarm_postgres()?;
        }

+        // Set the up metric with Empty status before starting the HTTP server.
+        // That way on the first metric scrape, an external observer will see us
+        // as 'up' and 'empty' (unless the compute was started with a spec or
+        // already configured by control plane).
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
+            .set(1);
+
        // Launch the external HTTP server first, so that we can serve control plane
        // requests while configuration is still in progress.
        crate::http::server::Server::External {
@@ -2032,12 +2057,8 @@ LIMIT 100",

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(
-                library,
-                true,
-                &self.params.build_tag,
-                &self.params.pgversion,
-            )?;
+            let (ext_name, ext_path) =
+                remote_extensions.get_ext(library, true, &BUILD_TAG, &self.params.pgversion)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -59,9 +59,12 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
        Box::pin(async move {
            let request_id = request.extract_parts::<RequestId>().await.unwrap();

-            // TODO: Remove this check after a successful rollout
-            if jwks.keys.is_empty() {
-                warn!(%request_id, "Authorization has not been configured");
+            // TODO: Remove this stanza after teaching neon_local and the
+            // regression tests to use a JWT + JWKS.
+            //
+            // https://github.com/neondatabase/neon/issues/11316
+            if cfg!(feature = "testing") {
+                warn!(%request_id, "Skipping compute_ctl authorization check");

                return Ok(request);
            }
@@ -110,8 +113,6 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
 impl Authorize {
    /// Verify the token using the JSON Web Key set and return the token data.
    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
-        debug_assert!(!jwks.keys.is_empty());
-
        for jwk in jwks.keys.iter() {
            let decoding_key = match DecodingKey::from_jwk(jwk) {
                Ok(key) => key,
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -5,7 +5,7 @@ use axum::response::{IntoResponse, Response};
 use http::StatusCode;
 use serde::Deserialize;

-use crate::compute::ComputeNode;
+use crate::compute::{BUILD_TAG, ComputeNode};
 use crate::http::JsonResponse;
 use crate::http::extract::{Path, Query};

@@ -47,7 +47,7 @@ pub(in crate::http) async fn download_extension(
        remote_extensions.get_ext(
            &filename,
            ext_server_params.is_library,
-            &compute.params.build_tag,
+            &BUILD_TAG,
            &compute.params.pgversion,
        )
    };
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -1,7 +1,8 @@
 use metrics::core::{AtomicF64, Collector, GenericGauge};
 use metrics::proto::MetricFamily;
 use metrics::{
-    IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
+    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
+    register_int_gauge_vec, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;

@@ -70,8 +71,19 @@ pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(
    .expect("failed to define a metric")
 });

+// Report that `compute_ctl` is up and what's the current compute status.
+pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "compute_ctl_up",
+        "Whether compute_ctl is running",
+        &["build_tag", "status"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub fn collect() -> Vec<MetricFamily> {
-    let mut metrics = INSTALLED_EXTENSIONS.collect();
+    let mut metrics = COMPUTE_CTL_UP.collect();
+    metrics.extend(INSTALLED_EXTENSIONS.collect());
    metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
    metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
    metrics.extend(DB_MIGRATION_FAILED.collect());
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -419,7 +419,7 @@ impl ComputeNode {
                .iter()
                .filter_map(|val| val.parse::<usize>().ok())
                .map(|val| if val > 1 { val - 1 } else { 1 })
-                .last()
+                .next_back()
                .unwrap_or(3)
        }
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -428,11 +428,6 @@ impl PageServerNode {
                .map(|x| x.parse::<usize>())
                .transpose()
                .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?,
-            l0_flush_wait_upload: settings
-                .remove("l0_flush_wait_upload")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?,
            l0_flush_stall_threshold: settings
                .remove("l0_flush_stall_threshold")
                .map(|x| x.parse::<usize>())
@@ -550,6 +545,11 @@ impl PageServerNode {
                .map(|x| x.parse::<u64>())
                .transpose()
                .context("Failed to parse 'gc_compaction_ratio_percent' as integer")?,
+            sampling_ratio: settings
+                .remove("sampling_ratio")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("Falied to parse 'sampling_ratio'")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -385,8 +385,6 @@ where
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();

-    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
-
    let ssl_ca_certs = match &cli.ssl_ca_file {
        Some(ssl_ca_file) => {
            let buf = tokio::fs::read(ssl_ca_file).await?;
@@ -401,9 +399,11 @@ async fn main() -> anyhow::Result<()> {
    }
    let http_client = http_client.build()?;

+    let storcon_client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone());
+
    let mut trimmed = cli.api.to_string();
    trimmed.pop();
-    let vps_client = mgmt_api::Client::new(http_client, trimmed, cli.jwt.as_deref());
+    let vps_client = mgmt_api::Client::new(http_client.clone(), trimmed, cli.jwt.as_deref());

    match cli.command {
        Command::NodeRegister {
@@ -1056,7 +1056,7 @@ async fn main() -> anyhow::Result<()> {
            const DEFAULT_MIGRATE_CONCURRENCY: usize = 8;
            let mut stream = futures::stream::iter(moves)
                .map(|mv| {
-                    let client = Client::new(cli.api.clone(), cli.jwt.clone());
+                    let client = Client::new(http_client.clone(), cli.api.clone(), cli.jwt.clone());
                    async move {
                        client
                            .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -21,6 +21,7 @@ in this repository.
    - [WAL Redo](./pageserver-walredo.md)
    - [Page cache](./pageserver-pagecache.md)
    - [Storage](./pageserver-storage.md)
+    - [Compaction](./pageserver-compaction.md)
    - [Processing a GetPage request](./pageserver-processing-getpage.md)
    - [Processing WAL](./pageserver-processing-wal.md)

--- a/docs/pageserver-compaction.md
+++ b/docs/pageserver-compaction.md
@@ -0,0 +1,110 @@
+# Pageserver Compaction
+
+Lifted from <https://www.notion.so/neondatabase/Rough-Notes-on-Compaction-1baf189e004780859e65ef63b85cfa81?pvs=4>.
+
+Updated 2025-03-26.
+
+## Pages and WAL
+
+Postgres stores data in 8 KB pages, identified by a page number.
+
+The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN. 
+
+Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs.
+
+Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN.
+
+Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN.
+
+## Compaction: Why?
+
+Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree.
+
+When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups).
+
+As WAL writes continue, more layer files accumulate.
+
+Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more layer files reads must search through before they find a page image, aka read amplification.
+
+Compaction’s job is to:
+
+- Reduce read amplification by reorganizing and combining layer files.
+- Remove old garbage from layer files.
+
+As part of this, it may combine several page deltas into a single page image where possible.
+
+## Compaction: How?
+
+Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1.
+
+Compaction runs in two phases: L0→L1 compaction, and L1 image compaction.
+
+L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN. For example:
+
+```
+| Page 0-99 @ LSN 0400-04ff |
+| Page 0-99 @ LSN 0300-03ff |
+| Page 0-99 @ LSN 0200-02ff |
+| Page 0-99 @ LSN 0100-01ff |
+| Page 0-99 @ LSN 0000-00ff |
+```
+
+L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB).
+
+L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. For example:
+
+```
+Delta layers:               |     30-84@0310-04ff      |
+Delta layers:    | 10-42@0200-02ff |           | 65-92@0174-02aa |
+Image layers: |    0-39@0100    |    40-79@0100    |    80-99@0100    |
+```
+
+L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN.
+
+Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR.
+
+Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image.
+
+## Compaction: When?
+
+Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10).
+
+L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10).
+
+L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers.
+
+At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait.
+
+Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down:
+
+- L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`).
+- L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`).
+- If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs).
+
+## Backpressure
+
+With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop.
+
+To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload:
+
+- At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long.
+- At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough.
+
+This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. The compute will significantly slow down WAL writes at:
+
+- `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags
+- `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag
+
+Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard.
+
+Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure.
+
+## Circuit Breaker
+
+Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc.
+
+If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore.
+
+To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not).
+
+Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. However, continually failing would not help either. Tripped circuit breakers trigger an alert and must be investigated promptly.
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -91,14 +91,14 @@ impl Server {
                                        Ok(tls_stream) => tls_stream,
                                        Err(err) => {
                                            if !suppress_io_error(&err) {
-                                                info!("Failed to accept TLS connection: {err:#}");
+                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                            }
                                            return;
                                        }
                                    };
                                    if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
                                        if !suppress_hyper_error(&err) {
-                                            info!("Failed to serve HTTPS connection: {err:#}");
+                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                        }
                                    }
                                }
@@ -106,7 +106,7 @@ impl Server {
                                    // Handle HTTP connection.
                                    if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
                                        if !suppress_hyper_error(&err) {
-                                            info!("Failed to serve HTTP connection: {err:#}");
+                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                        }
                                    }
                                }
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -34,6 +34,7 @@ postgres_backend.workspace = true
 nix = {workspace = true, optional = true}
 reqwest.workspace = true
 rand.workspace = true
+tracing-utils.workspace = true

 [dev-dependencies]
 bincode.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -51,9 +51,54 @@ pub struct NodeMetadata {
 /// If there cannot be a static default value because we need to make runtime
 /// checks to determine the default, make it an `Option` (which defaults to None).
 /// The runtime check should be done in the consuming crate, i.e., `pageserver`.
+///
+/// Unknown fields are silently ignored during deserialization.
+/// The alternative, which we used in the past, was to set `deny_unknown_fields`,
+/// which fails deserialization, and hence pageserver startup, if there is an unknown field.
+/// The reason we don't do that anymore is that it complicates
+/// usage of config fields for feature flagging, which we commonly do for
+/// region-by-region rollouts.
+/// The complications mainly arise because the `pageserver.toml` contents on a
+/// prod server have a separate lifecycle from the pageserver binary.
+/// For instance, `pageserver.toml` contents today are defined in the internal
+/// infra repo, and thus introducing a new config field to pageserver and
+/// rolling it out to prod servers are separate commits in separate repos
+/// that can't be made or rolled back atomically.
+/// Rollbacks in particular pose a risk with deny_unknown_fields because
+/// the old pageserver binary may reject a new config field, resulting in
+/// an outage unless the person doing the pageserver rollback remembers
+/// to also revert the commit that added the config field to the
+/// `pageserver.toml` templates in the internal infra repo.
+/// (A pre-deploy config check would eliminate this risk during rollbacks,
+///  cf [here](https://github.com/neondatabase/cloud/issues/24349).)
+/// In addition to this compatibility problem during emergency rollbacks,
+/// deny_unknown_fields adds further complications when decommissioning a feature
+/// flag: with deny_unknown_fields, we can't remove a flag from the [`ConfigToml`]
+/// until all prod servers' `pageserver.toml` files have been updated to a version
+/// that doesn't specify the flag. Otherwise new software would fail to start up.
+/// This adds the requirement for an intermediate step where the new config field
+/// is accepted but ignored, prolonging the decommissioning process by an entire
+/// release cycle.
+/// By contrast, with unknown fields silently ignored, decommissioning a feature
+/// flag is a one-step process: we can skip the intermediate step and straight
+/// remove the field from the [`ConfigToml`]. We leave the field in the
+/// `pageserver.toml` files on prod servers until we reach certainty that we
+/// will not roll back to old software whose behavior was dependent on config.
+/// Then we can remove the field from the templates in the internal infra repo.
+/// This process is [documented internally](
+/// https://docs.neon.build/storage/pageserver_configuration.html).
+///
+/// Note that the above relaxed compatibility for the config format does NOT APPLY
+/// TO THE STORAGE FORMAT. As general guidance, when introducing storage format
+/// changes, ensure that the potential rollback target version will be compatible
+/// with the new format. This must hold regardless of what flags are set in the `pageserver.toml`:
+/// any format version that exists in an environment must be compatible with the software that runs there.
+/// Use a pageserver.toml flag only to gate whether software _writes_ the new format.
+/// For more compatibility considerations, refer to [internal docs](
+/// https://docs.neon.build/storage/compat.html?highlight=compat#format-versions--compatibility)
 #[serde_as]
 #[derive(Clone, Debug, serde::Deserialize, serde::Serialize)]
-#[serde(default, deny_unknown_fields)]
+#[serde(default)]
 pub struct ConfigToml {
    // types mapped 1:1 into the runtime PageServerConfig type
    pub listen_pg_addr: String,
@@ -134,10 +179,10 @@ pub struct ConfigToml {
    pub load_previous_heatmap: Option<bool>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generate_unarchival_heatmap: Option<bool>,
+    pub tracing: Option<Tracing>,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields)]
 pub struct DiskUsageEvictionTaskConfig {
    pub max_usage_pct: utils::serde_percent::Percent,
    pub min_avail_bytes: u64,
@@ -152,13 +197,11 @@ pub struct DiskUsageEvictionTaskConfig {

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
-#[serde(deny_unknown_fields)]
 pub enum PageServicePipeliningConfig {
    Serial,
    Pipelined(PageServicePipeliningConfigPipelined),
 }
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields)]
 pub struct PageServicePipeliningConfigPipelined {
    /// Causes runtime errors if larger than max get_vectored batch size.
    pub max_batch_size: NonZeroUsize,
@@ -174,7 +217,6 @@ pub enum PageServiceProtocolPipelinedExecutionStrategy {

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 #[serde(tag = "mode", rename_all = "kebab-case")]
-#[serde(deny_unknown_fields)]
 pub enum GetVectoredConcurrentIo {
    /// The read path is fully sequential: layers are visited
    /// one after the other and IOs are issued and waited upon
@@ -191,6 +233,54 @@ pub enum GetVectoredConcurrentIo {
    SidecarTask,
 }

+#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct Ratio {
+    pub numerator: usize,
+    pub denominator: usize,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct OtelExporterConfig {
+    pub endpoint: String,
+    pub protocol: OtelExporterProtocol,
+    #[serde(with = "humantime_serde")]
+    pub timeout: Duration,
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(rename_all = "kebab-case")]
+pub enum OtelExporterProtocol {
+    Grpc,
+    HttpBinary,
+    HttpJson,
+}
+
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+pub struct Tracing {
+    pub sampling_ratio: Ratio,
+    pub export_config: OtelExporterConfig,
+}
+
+impl From<&OtelExporterConfig> for tracing_utils::ExportConfig {
+    fn from(val: &OtelExporterConfig) -> Self {
+        tracing_utils::ExportConfig {
+            endpoint: Some(val.endpoint.clone()),
+            protocol: val.protocol.into(),
+            timeout: val.timeout,
+        }
+    }
+}
+
+impl From<OtelExporterProtocol> for tracing_utils::Protocol {
+    fn from(val: OtelExporterProtocol) -> Self {
+        match val {
+            OtelExporterProtocol::Grpc => tracing_utils::Protocol::Grpc,
+            OtelExporterProtocol::HttpJson => tracing_utils::Protocol::HttpJson,
+            OtelExporterProtocol::HttpBinary => tracing_utils::Protocol::HttpBinary,
+        }
+    }
+}
+
 pub mod statvfs {
    pub mod mock {
        #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -245,7 +335,7 @@ pub struct MaxVectoredReadBytes(pub NonZeroUsize);

 /// Tenant-level configuration values, used for various purposes.
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
-#[serde(deny_unknown_fields, default)]
+#[serde(default)]
 pub struct TenantConfigToml {
    // Flush out an inmemory layer, if it's holding WAL older than this
    // This puts a backstop on how much WAL needs to be re-digested if the
@@ -285,12 +375,6 @@ pub struct TenantConfigToml {
    /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
    /// to avoid deadlock. 0 to disable. Disabled by default.
    pub l0_flush_stall_threshold: Option<usize>,
-    /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
-    /// layer. This is a temporary backpressure mechanism which should be removed once
-    /// l0_flush_{delay,stall}_threshold is fully enabled.
-    ///
-    /// TODO: this is no longer enabled, remove it when the config option is no longer set.
-    pub l0_flush_wait_upload: bool,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -373,6 +457,9 @@ pub struct TenantConfigToml {
    /// The ratio that triggers the auto gc-compaction. If (the total size of layers between L2 LSN and gc-horizon) / (size below the L2 LSN)
    /// is above this ratio, gc-compaction will be triggered.
    pub gc_compaction_ratio_percent: u64,
+    /// Tenant level performance sampling ratio override. Controls the ratio of get page requests
+    /// that will get perf sampling for the tenant.
+    pub sampling_ratio: Option<Ratio>,
 }

 pub mod defaults {
@@ -543,6 +630,7 @@ impl Default for ConfigToml {
            validate_wal_contiguity: None,
            load_previous_heatmap: None,
            generate_unarchival_heatmap: None,
+            tracing: None,
        }
    }
 }
@@ -579,8 +667,6 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
        crate::models::CompactionAlgorithm::Legacy;

-    pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
-
    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
@@ -627,7 +713,6 @@ impl Default for TenantConfigToml {
            compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
            l0_flush_delay_threshold: None,
            l0_flush_stall_threshold: None,
-            l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
@@ -661,6 +746,7 @@ impl Default for TenantConfigToml {
            gc_compaction_enabled: DEFAULT_GC_COMPACTION_ENABLED,
            gc_compaction_initial_threshold_kb: DEFAULT_GC_COMPACTION_INITIAL_THRESHOLD_KB,
            gc_compaction_ratio_percent: DEFAULT_GC_COMPACTION_RATIO_PERCENT,
+            sampling_ratio: None,
        }
    }
 }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -23,6 +23,7 @@ use utils::lsn::Lsn;
 use utils::postgres_client::PostgresClientProtocol;
 use utils::{completion, serde_system_time};

+use crate::config::Ratio;
 use crate::key::{CompactKey, Key};
 use crate::reltag::RelTag;
 use crate::shard::{ShardCount, ShardStripeSize, TenantShardId};
@@ -79,10 +80,22 @@ pub enum TenantState {
    ///
    /// Transitions out of this state are possible through `set_broken()`.
    Stopping {
+        /// The barrier can be used to wait for shutdown to complete. The first caller to set
+        /// Some(Barrier) is responsible for driving shutdown to completion. Subsequent callers
+        /// will wait for the first caller's existing barrier.
+        ///
+        /// None is set when an attach is cancelled, to signal to shutdown that the attach has in
+        /// fact cancelled:
+        ///
+        /// 1. `shutdown` sees `TenantState::Attaching`, and cancels the tenant.
+        /// 2. `attach` sets `TenantState::Stopping(None)` and exits.
+        /// 3. `set_stopping` waits for `TenantState::Stopping(None)` and sets
+        ///    `TenantState::Stopping(Some)` to claim the barrier as the shutdown owner.
+        //
        // Because of https://github.com/serde-rs/serde/issues/2105 this has to be a named field,
        // otherwise it will not be skipped during deserialization
        #[serde(skip)]
-        progress: completion::Barrier,
+        progress: Option<completion::Barrier>,
    },
    /// The tenant is recognized by the pageserver, but can no longer be used for
    /// any operations.
@@ -523,8 +536,6 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_stall_threshold: FieldPatch<usize>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
-    pub l0_flush_wait_upload: FieldPatch<bool>,
-    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_horizon: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_period: FieldPatch<String>,
@@ -570,6 +581,8 @@ pub struct TenantConfigPatch {
    pub gc_compaction_initial_threshold_kb: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_compaction_ratio_percent: FieldPatch<u64>,
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub sampling_ratio: FieldPatch<Option<Ratio>>,
 }

 /// Like [`crate::config::TenantConfigToml`], but preserves the information
@@ -614,9 +627,6 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub l0_flush_stall_threshold: Option<usize>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub l0_flush_wait_upload: Option<bool>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_horizon: Option<u64>,

@@ -693,6 +703,9 @@ pub struct TenantConfig {

    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_compaction_ratio_percent: Option<u64>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub sampling_ratio: Option<Option<Ratio>>,
 }

 impl TenantConfig {
@@ -712,7 +725,6 @@ impl TenantConfig {
            mut compaction_l0_semaphore,
            mut l0_flush_delay_threshold,
            mut l0_flush_stall_threshold,
-            mut l0_flush_wait_upload,
            mut gc_horizon,
            mut gc_period,
            mut image_creation_threshold,
@@ -736,6 +748,7 @@ impl TenantConfig {
            mut gc_compaction_enabled,
            mut gc_compaction_initial_threshold_kb,
            mut gc_compaction_ratio_percent,
+            mut sampling_ratio,
        } = self;

        patch.checkpoint_distance.apply(&mut checkpoint_distance);
@@ -765,7 +778,6 @@ impl TenantConfig {
        patch
            .l0_flush_stall_threshold
            .apply(&mut l0_flush_stall_threshold);
-        patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload);
        patch.gc_horizon.apply(&mut gc_horizon);
        patch
            .gc_period
@@ -831,6 +843,7 @@ impl TenantConfig {
        patch
            .gc_compaction_ratio_percent
            .apply(&mut gc_compaction_ratio_percent);
+        patch.sampling_ratio.apply(&mut sampling_ratio);

        Ok(Self {
            checkpoint_distance,
@@ -844,7 +857,6 @@ impl TenantConfig {
            compaction_l0_semaphore,
            l0_flush_delay_threshold,
            l0_flush_stall_threshold,
-            l0_flush_wait_upload,
            gc_horizon,
            gc_period,
            image_creation_threshold,
@@ -868,6 +880,7 @@ impl TenantConfig {
            gc_compaction_enabled,
            gc_compaction_initial_threshold_kb,
            gc_compaction_ratio_percent,
+            sampling_ratio,
        })
    }

@@ -911,9 +924,6 @@ impl TenantConfig {
            l0_flush_stall_threshold: self
                .l0_flush_stall_threshold
                .or(global_conf.l0_flush_stall_threshold),
-            l0_flush_wait_upload: self
-                .l0_flush_wait_upload
-                .unwrap_or(global_conf.l0_flush_wait_upload),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -972,6 +982,7 @@ impl TenantConfig {
            gc_compaction_ratio_percent: self
                .gc_compaction_ratio_percent
                .unwrap_or(global_conf.gc_compaction_ratio_percent),
+            sampling_ratio: self.sampling_ratio.unwrap_or(global_conf.sampling_ratio),
        }
    }
 }
@@ -1105,7 +1116,7 @@ pub struct CompactionAlgorithmSettings {
 }

 #[derive(Debug, PartialEq, Eq, Clone, Deserialize, Serialize)]
-#[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)]
+#[serde(tag = "mode", rename_all = "kebab-case")]
 pub enum L0FlushConfig {
    #[serde(rename_all = "snake_case")]
    Direct { max_concurrency: NonZeroUsize },
@@ -1429,11 +1440,6 @@ pub struct TimelineInfo {
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option<Lsn>,

-    /// Legacy field, retained for one version to enable old storage controller to
-    /// decode (it was a mandatory field).
-    #[serde(default, rename = "latest_gc_cutoff_lsn")]
-    pub _unused: Lsn,
-
    /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
    /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
    /// as it is easier to reason about.
@@ -2725,10 +2731,15 @@ mod tests {
                "Activating",
            ),
            (line!(), TenantState::Active, "Active"),
+            (
+                line!(),
+                TenantState::Stopping { progress: None },
+                "Stopping",
+            ),
            (
                line!(),
                TenantState::Stopping {
-                    progress: utils::completion::Barrier::default(),
+                    progress: Some(completion::Barrier::default()),
                },
                "Stopping",
            ),
--- a/libs/proxy/tokio-postgres2/Cargo.toml
+++ b/libs/proxy/tokio-postgres2/Cargo.toml
@@ -8,10 +8,9 @@ license = "MIT/Apache-2.0"
 bytes.workspace = true
 fallible-iterator.workspace = true
 futures-util = { workspace = true, features = ["sink"] }
-log = "0.4"
+tracing.workspace = true
 parking_lot.workspace = true
 pin-project-lite.workspace = true
-phf = "0.11"
 postgres-protocol2 = { path = "../postgres-protocol2" }
 postgres-types2 = { path = "../postgres-types2" }
 tokio = { workspace = true, features = ["io-util", "time", "net"] }
--- a/libs/proxy/tokio-postgres2/src/connection.rs
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -6,13 +6,13 @@ use std::task::{Context, Poll};
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{Sink, Stream, ready};
-use log::{info, trace};
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::sync::mpsc;
 use tokio_util::codec::Framed;
 use tokio_util::sync::PollSender;
+use tracing::{info, trace};

 use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
 use crate::error::DbError;
--- a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
+++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -5,9 +5,9 @@ use std::sync::Arc;
 use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
 use futures_util::{TryStreamExt, pin_mut};
-use log::debug;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
+use tracing::debug;

 use crate::client::{CachedTypeInfo, InnerClient};
 use crate::codec::FrontendMessage;
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -7,11 +7,11 @@ use std::task::{Context, Poll};
 use bytes::{BufMut, Bytes, BytesMut};
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
-use log::{Level, debug, log_enabled};
 use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use postgres_types2::{Format, ToSql, Type};
+use tracing::debug;

 use crate::client::{InnerClient, Responses};
 use crate::codec::FrontendMessage;
@@ -36,7 +36,7 @@ where
    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
    I::IntoIter: ExactSizeIterator,
 {
-    let buf = if log_enabled!(Level::Debug) {
+    let buf = if tracing::enabled!(tracing::Level::DEBUG) {
        let params = params.into_iter().collect::<Vec<_>>();
        debug!(
            "executing statement {} with parameters: {:?}",
--- a/libs/proxy/tokio-postgres2/src/simple_query.rs
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -6,10 +6,10 @@ use std::task::{Context, Poll};
 use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
-use log::debug;
 use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
+use tracing::debug;

 use crate::client::{InnerClient, Responses};
 use crate::codec::FrontendMessage;
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -558,7 +558,7 @@ async fn upload_large_enough_file(
 ) -> usize {
    let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
    let body = bytes::Bytes::from(vec![0u8; 1024]);
-    let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
+    let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128));

    let len = contents.clone().fold(0, |acc, next| acc + next.len());

--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -71,6 +71,7 @@ pub struct PeerInfo {
    pub ts: Instant,
    pub pg_connstr: String,
    pub http_connstr: String,
+    pub https_connstr: Option<String>,
 }

 pub type FullTransactionId = u64;
@@ -227,6 +228,8 @@ pub struct TimelineDeleteResult {
    pub dir_existed: bool,
 }

+pub type TenantDeleteResult = std::collections::HashMap<String, TimelineDeleteResult>;
+
 fn lsn_invalid() -> Lsn {
    Lsn::INVALID
 }
@@ -259,6 +262,8 @@ pub struct SkTimelineInfo {
    pub safekeeper_connstr: Option<String>,
    #[serde(default)]
    pub http_connstr: Option<String>,
+    #[serde(default)]
+    pub https_connstr: Option<String>,
    // Minimum of all active RO replicas flush LSN
    #[serde(default = "lsn_invalid")]
    pub standby_horizon: Lsn,
--- a/libs/tracing-utils/Cargo.toml
+++ b/libs/tracing-utils/Cargo.toml
@@ -14,6 +14,7 @@ tokio = { workspace = true, features = ["rt", "rt-multi-thread"] }
 tracing.workspace = true
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+pin-project-lite.workspace = true

 [dev-dependencies]
 tracing-subscriber.workspace = true    # For examples in docs
--- a/libs/tracing-utils/src/lib.rs
+++ b/libs/tracing-utils/src/lib.rs
@@ -31,10 +31,10 @@
 //!         .init();
 //! }
 //! ```
-#![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]

 pub mod http;
+pub mod perf_span;

 use opentelemetry::KeyValue;
 use opentelemetry::trace::TracerProvider;
--- a/libs/tracing-utils/src/perf_span.rs
+++ b/libs/tracing-utils/src/perf_span.rs
@@ -0,0 +1,144 @@
+//! Crutch module to work around tracing infrastructure deficiencies
+//!
+//! We wish to collect granular request spans without impacting performance
+//! by much. Ideally, we should have zero overhead for a sampling rate of 0.
+//!
+//! The approach taken by the pageserver crate is to use a completely different
+//! span hierarchy for the performance spans. Spans are explicitly stored in
+//! the request context and use a different [`tracing::Subscriber`] in order
+//! to avoid expensive filtering.
+//!
+//! [`tracing::Span`] instances record their [`tracing::Dispatch`] and, implicitly,
+//! their [`tracing::Subscriber`] at creation time. However, upon exiting the span,
+//! the global default [`tracing::Dispatch`] is used. This is problematic if one
+//! wishes to juggle different subscribers.
+//!
+//! In order to work around this, this module provides a [`PerfSpan`] type which
+//! wraps a [`Span`] and sets the default subscriber when exiting the span. This
+//! achieves the correct routing.
+//!
+//! There's also a modified version of [`tracing::Instrument`] which works with
+//! [`PerfSpan`].
+
+use core::{
+    future::Future,
+    marker::Sized,
+    mem::ManuallyDrop,
+    pin::Pin,
+    task::{Context, Poll},
+};
+use pin_project_lite::pin_project;
+use tracing::{Dispatch, span::Span};
+
+#[derive(Debug, Clone)]
+pub struct PerfSpan {
+    inner: ManuallyDrop<Span>,
+    dispatch: Dispatch,
+}
+
+#[must_use = "once a span has been entered, it should be exited"]
+pub struct PerfSpanEntered<'a> {
+    span: &'a PerfSpan,
+}
+
+impl PerfSpan {
+    pub fn new(span: Span, dispatch: Dispatch) -> Self {
+        Self {
+            inner: ManuallyDrop::new(span),
+            dispatch,
+        }
+    }
+
+    pub fn enter(&self) -> PerfSpanEntered {
+        if let Some(ref id) = self.inner.id() {
+            self.dispatch.enter(id);
+        }
+
+        PerfSpanEntered { span: self }
+    }
+
+    pub fn inner(&self) -> &Span {
+        &self.inner
+    }
+}
+
+impl Drop for PerfSpan {
+    fn drop(&mut self) {
+        // Bring the desired dispatch into scope before explicitly calling
+        // the span destructor. This routes the span exit to the correct
+        // [`tracing::Subscriber`].
+        let _dispatch_guard = tracing::dispatcher::set_default(&self.dispatch);
+        // SAFETY: ManuallyDrop in Drop implementation
+        unsafe { ManuallyDrop::drop(&mut self.inner) }
+    }
+}
+
+impl Drop for PerfSpanEntered<'_> {
+    fn drop(&mut self) {
+        // Mirror `PerfSpan::enter`: a disabled span has no id and was never entered, so don't panic in `Drop`.
+        let Some(id) = self.span.inner.id() else { return };
+        let _dispatch_guard = tracing::dispatcher::set_default(&self.span.dispatch);
+        self.span.dispatch.exit(&id);
+    }
+}
+
+pub trait PerfInstrument: Sized {
+    fn instrument(self, span: PerfSpan) -> PerfInstrumented<Self> {
+        PerfInstrumented {
+            inner: ManuallyDrop::new(self),
+            span,
+        }
+    }
+}
+
+pin_project! {
+    #[project = PerfInstrumentedProj]
+    #[derive(Debug, Clone)]
+    #[must_use = "futures do nothing unless you `.await` or poll them"]
+    pub struct PerfInstrumented<T> {
+        // `ManuallyDrop` is used here so that `Drop` can be instrumented: we enter
+        // the span and then execute `ManuallyDrop::drop`.
+        #[pin]
+        inner: ManuallyDrop<T>,
+        span: PerfSpan,
+    }
+
+    impl<T> PinnedDrop for PerfInstrumented<T> {
+        fn drop(this: Pin<&mut Self>) {
+            let this = this.project();
+            let _enter = this.span.enter();
+            // SAFETY: 1. `Pin::get_unchecked_mut()` is safe, because this isn't
+            //             different from wrapping `T` in `Option` and calling
+            //             `Pin::set(&mut this.inner, None)`, except avoiding
+            //             additional memory overhead.
+            //         2. `ManuallyDrop::drop()` is safe, because
+            //            `PinnedDrop::drop()` is guaranteed to be called only
+            //            once.
+            unsafe { ManuallyDrop::drop(this.inner.get_unchecked_mut()) }
+        }
+    }
+}
+
+impl<'a, T> PerfInstrumentedProj<'a, T> {
+    /// Get a mutable reference to the [`PerfSpan`] and a pinned mutable reference to
+    /// the wrapped type.
+    fn span_and_inner_pin_mut(self) -> (&'a mut PerfSpan, Pin<&'a mut T>) {
+        // SAFETY: As long as `ManuallyDrop<T>` does not move, `T` won't move
+        //         and `inner` is valid, because `ManuallyDrop::drop` is called
+        //         only inside `Drop` of the `Instrumented`.
+        let inner = unsafe { self.inner.map_unchecked_mut(|v| &mut **v) };
+        (self.span, inner)
+    }
+}
+
+impl<T: Future> Future for PerfInstrumented<T> {
+    type Output = T::Output;
+
+    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
+        let (span, inner) = self.project().span_and_inner_pin_mut();
+        let _enter = span.enter();
+        inner.poll(cx)
+    }
+}
+
+impl<T: Sized> PerfInstrument for T {}
--- a/libs/utils/src/elapsed_accum.rs
+++ b/libs/utils/src/elapsed_accum.rs
@@ -0,0 +1,26 @@
+use std::time::{Duration, Instant};
+
+#[derive(Default)]
+pub struct ElapsedAccum {
+    accum: Duration,
+}
+
+impl ElapsedAccum {
+    pub fn get(&self) -> Duration {
+        self.accum
+    }
+    pub fn guard(&mut self) -> impl Drop + '_ {
+        let start = Instant::now();
+        scopeguard::guard(start, |last_wait_at| {
+            self.accum += Instant::now() - last_wait_at;
+        })
+    }
+
+    pub async fn measure<Fut, O>(&mut self, fut: Fut) -> O
+    where
+        Fut: Future<Output = O>,
+    {
+        let _guard = self.guard();
+        fut.await
+    }
+}
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -93,6 +93,8 @@ pub mod try_rcu;

 pub mod guard_arc_swap;

+pub mod elapsed_accum;
+
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;

--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -111,9 +111,17 @@ impl<T> OnceCell<T> {
        }
    }

+    /// Like [`Self::get_or_init_detached_measured`], but without out parameter for time spent waiting.
+    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
+        self.get_or_init_detached_measured(None).await
+    }
+
    /// Returns a guard to an existing initialized value, or returns an unique initialization
    /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
-    pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
+    pub async fn get_or_init_detached_measured(
+        &self,
+        mut wait_time: Option<&mut crate::elapsed_accum::ElapsedAccum>,
+    ) -> Result<Guard<'_, T>, InitPermit> {
        // It looks like OnceCell::get_or_init could be implemented using this method instead of
        // duplication. However, that makes the future be !Send due to possibly holding on to the
        // MutexGuard over an await point.
@@ -125,12 +133,16 @@ impl<T> OnceCell<T> {
                }
                guard.init_semaphore.clone()
            };
-
            {
                let permit = {
                    // increment the count for the duration of queued
                    let _guard = CountWaitingInitializers::start(self);
-                    sem.acquire().await
+                    let fut = sem.acquire();
+                    if let Some(wait_time) = wait_time.as_mut() {
+                        wait_time.measure(fut).await
+                    } else {
+                        fut.await
+                    }
                };

                let Ok(permit) = permit else {
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -86,17 +86,17 @@ impl Client {
        resp.json().await.map_err(Error::ReceiveBody)
    }

-    /// Get an arbitrary path and returning a streaming Response.  This function is suitable
-    /// for pass-through/proxy use cases where we don't care what the response content looks
-    /// like.
+    /// Send an HTTP request to an arbitrary path with a desired HTTP method and return a streaming
+    /// Response.  This function is suitable for pass-through/proxy use cases where we don't care
+    /// what the response content looks like.
    ///
    /// Use/add one of the properly typed methods below if you know aren't proxying, and
    /// know what kind of response you expect.
-    pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
+    pub async fn op_raw(&self, method: Method, path: String) -> Result<reqwest::Response> {
        debug_assert!(path.starts_with('/'));
        let uri = format!("{}{}", self.mgmt_api_endpoint, path);

-        let mut req = self.client.request(Method::GET, uri);
+        let mut req = self.client.request(method, uri);
        if let Some(value) = &self.authorization_header {
            req = req.header(reqwest::header::AUTHORIZATION, value);
        }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -16,7 +16,7 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
 use metrics::launch_timestamp::{LaunchTimestamp, set_launch_timestamp_metric};
 use metrics::set_build_info_metric;
 use nix::sys::socket::{setsockopt, sockopt};
-use pageserver::config::{PageServerConf, PageserverIdentity};
+use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
@@ -35,6 +35,7 @@ use tokio::signal::unix::SignalKind;
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::*;
+use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
@@ -97,7 +98,7 @@ fn main() -> anyhow::Result<()> {
    env::set_current_dir(&workdir)
        .with_context(|| format!("Failed to set application's current dir to '{workdir}'"))?;

-    let conf = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;
+    let (conf, ignored) = initialize_config(&identity_file_path, &cfg_file_path, &workdir)?;

    // Initialize logging.
    //
@@ -118,6 +119,21 @@ fn main() -> anyhow::Result<()> {
        logging::Output::Stdout,
    )?;

+    let otel_enablement = match &conf.tracing {
+        Some(cfg) => tracing_utils::OtelEnablement::Enabled {
+            service_name: "pageserver".to_string(),
+            export_config: (&cfg.export_config).into(),
+            runtime: *COMPUTE_REQUEST_RUNTIME,
+        },
+        None => tracing_utils::OtelEnablement::Disabled,
+    };
+
+    let otel_guard = tracing_utils::init_performance_tracing(otel_enablement);
+
+    if otel_guard.is_some() {
+        info!(?conf.tracing, "starting with OTEL tracing enabled");
+    }
+
    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
    // disarming this hook on pageserver, because we never tear down tracing.
    logging::replace_panic_hook_with_tracing_panic_hook().forget();
@@ -128,7 +144,17 @@ fn main() -> anyhow::Result<()> {
        &[("node_id", &conf.id.to_string())],
    );

-    // after setting up logging, log the effective IO engine choice and read path implementations
+    // Warn about ignored config items; see pageserver_api::config::ConfigToml
+    // doc comment for rationale why we prefer this over serde(deny_unknown_fields).
+    {
+        let ignored_fields::Paths { paths } = &ignored;
+        for path in paths {
+            warn!(?path, "ignoring unknown configuration item");
+        }
+    }
+
+    // Log configuration items for feature-flag-like config
+    // (maybe we should automate this with a visitor?).
    info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine");
    info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode");
    info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol");
@@ -191,7 +217,7 @@ fn main() -> anyhow::Result<()> {
    tracing::info!("Initializing page_cache...");
    page_cache::init(conf.page_cache_size);

-    start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
+    start_pageserver(launch_ts, conf, ignored, otel_guard).context("Failed to start pageserver")?;

    scenario.teardown();
    Ok(())
@@ -201,7 +227,7 @@ fn initialize_config(
    identity_file_path: &Utf8Path,
    cfg_file_path: &Utf8Path,
    workdir: &Utf8Path,
-) -> anyhow::Result<&'static PageServerConf> {
+) -> anyhow::Result<(&'static PageServerConf, ignored_fields::Paths)> {
    // The deployment orchestrator writes out an indentity file containing the node id
    // for all pageservers. This file is the source of truth for the node id. In order
    // to allow for rolling back pageserver releases, the node id is also included in
@@ -230,16 +256,36 @@ fn initialize_config(

    let config_file_contents =
        std::fs::read_to_string(cfg_file_path).context("read config file from filesystem")?;
-    let config_toml = serde_path_to_error::deserialize(
-        toml_edit::de::Deserializer::from_str(&config_file_contents)
-            .context("build toml deserializer")?,
-    )
-    .context("deserialize config toml")?;

+    // Deserialize the config file contents into a ConfigToml.
+    let config_toml: pageserver_api::config::ConfigToml = {
+        let deserializer = toml_edit::de::Deserializer::from_str(&config_file_contents)
+            .context("build toml deserializer")?;
+        let mut path_to_error_track = serde_path_to_error::Track::new();
+        let deserializer =
+            serde_path_to_error::Deserializer::new(deserializer, &mut path_to_error_track);
+        serde::Deserialize::deserialize(deserializer).context("deserialize config toml")?
+    };
+
+    // Find unknown fields by re-serializing the parsed ConfigToml and comparing it to the on-disk file.
+    // Any fields that are only in the on-disk version are unknown.
+    // (The assumption here is that the ConfigToml doesn't use skip_serializing_if.)
+    // (Make sure to read the ConfigToml doc comment on why we only want to warn about unknown fields, rather than fail startup on them.)
+    let ignored = {
+        let ondisk_toml = config_file_contents
+            .parse::<toml_edit::DocumentMut>()
+            .context("parse original config as toml document")?;
+        let parsed_toml = toml_edit::ser::to_document(&config_toml)
+            .context("re-serialize config to toml document")?;
+        pageserver::config::ignored_fields::find(ondisk_toml, parsed_toml)
+    };
+
+    // Construct the runtime god object (it's called PageServerConf but actually is just global shared state).
    let conf = PageServerConf::parse_and_validate(identity.id, config_toml, workdir)
        .context("runtime-validation of config toml")?;
+    let conf = Box::leak(Box::new(conf));

-    Ok(Box::leak(Box::new(conf)))
+    Ok((conf, ignored))
 }

 struct WaitForPhaseResult<F: std::future::Future + Unpin> {
@@ -290,6 +336,8 @@ fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
 fn start_pageserver(
    launch_ts: &'static LaunchTimestamp,
    conf: &'static PageServerConf,
+    ignored: ignored_fields::Paths,
+    otel_guard: Option<OtelGuard>,
 ) -> anyhow::Result<()> {
    // Monotonic time for later calculating startup duration
    let started_startup_at = Instant::now();
@@ -312,7 +360,7 @@ fn start_pageserver(
        pageserver::metrics::tokio_epoll_uring::Collector::new(),
    ))
    .unwrap();
-    pageserver::preinitialize_metrics(conf);
+    pageserver::preinitialize_metrics(conf, ignored);

    // If any failpoints were set from FAILPOINTS environment variable,
    // print them to the log for debugging purposes
@@ -675,13 +723,21 @@ fn start_pageserver(

    // Spawn a task to listen for libpq connections. It will spawn further tasks
    // for each connection. We created the listener earlier already.
-    let page_service = page_service::spawn(conf, tenant_manager.clone(), pg_auth, {
-        let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
-        pageserver_listener
-            .set_nonblocking(true)
-            .context("set listener to nonblocking")?;
-        tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")?
-    });
+    let perf_trace_dispatch = otel_guard.as_ref().map(|g| g.dispatch.clone());
+    let page_service = page_service::spawn(
+        conf,
+        tenant_manager.clone(),
+        pg_auth,
+        perf_trace_dispatch,
+        {
+            let _entered = COMPUTE_REQUEST_RUNTIME.enter(); // TcpListener::from_std requires it
+            pageserver_listener
+                .set_nonblocking(true)
+                .context("set listener to nonblocking")?;
+            tokio::net::TcpListener::from_std(pageserver_listener)
+                .context("create tokio listener")?
+        },
+    );

    // All started up! Now just sit and wait for shutdown signal.
    BACKGROUND_RUNTIME.block_on(async move {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -4,6 +4,8 @@
 //! file, or on the command line.
 //! See also `settings.md` for better description on every parameter.

+pub mod ignored_fields;
+
 use std::env;
 use std::num::NonZeroUsize;
 use std::sync::Arc;
@@ -215,6 +217,8 @@ pub struct PageServerConf {

    /// When set, include visible layers in the next uploaded heatmaps of an unarchived timeline.
    pub generate_unarchival_heatmap: bool,
+
+    pub tracing: Option<pageserver_api::config::Tracing>,
 }

 /// Token for authentication to safekeepers
@@ -386,6 +390,7 @@ impl PageServerConf {
            validate_wal_contiguity,
            load_previous_heatmap,
            generate_unarchival_heatmap,
+            tracing,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -435,6 +440,7 @@ impl PageServerConf {
            wal_receiver_protocol,
            page_service_pipelining,
            get_vectored_concurrent_io,
+            tracing,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
@@ -506,6 +512,17 @@ impl PageServerConf {
            );
        }

+        if let Some(tracing_config) = conf.tracing.as_ref() {
+            let ratio = &tracing_config.sampling_ratio;
+            ensure!(
+                ratio.denominator != 0 && ratio.denominator >= ratio.numerator,
+                format!(
+                    "Invalid sampling ratio: {}/{}",
+                    ratio.numerator, ratio.denominator
+                )
+            );
+        }
+
        IndexEntry::validate_checkpoint_distance(conf.default_tenant_conf.checkpoint_distance)
            .map_err(anyhow::Error::msg)
            .with_context(|| {
@@ -545,7 +562,6 @@ impl PageServerConf {
 }

 #[derive(serde::Deserialize, serde::Serialize)]
-#[serde(deny_unknown_fields)]
 pub struct PageserverIdentity {
    pub id: NodeId,
 }
@@ -617,82 +633,4 @@ mod tests {
        PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir)
            .expect("parse_and_validate");
    }
-
-    /// If there's a typo in the pageserver config, we'd rather catch that typo
-    /// and fail pageserver startup than silently ignoring the typo, leaving whoever
-    /// made it in the believe that their config change is effective.
-    ///
-    /// The default in serde is to allow unknown fields, so, we rely
-    /// on developer+review discipline to add `deny_unknown_fields` when adding
-    /// new structs to the config, and these tests here as a regression test.
-    ///
-    /// The alternative to all of this would be to allow unknown fields in the config.
-    /// To catch them, we could have a config check tool or mgmt API endpoint that
-    /// compares the effective config with the TOML on disk and makes sure that
-    /// the on-disk TOML is a strict subset of the effective config.
-    mod unknown_fields_handling {
-        macro_rules! test {
-            ($short_name:ident, $input:expr) => {
-                #[test]
-                fn $short_name() {
-                    let input = $input;
-                    let err = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(&input)
-                        .expect_err("some_invalid_field is an invalid field");
-                    dbg!(&err);
-                    assert!(err.to_string().contains("some_invalid_field"));
-                }
-            };
-        }
-        use indoc::indoc;
-
-        test!(
-            toplevel,
-            indoc! {r#"
-                some_invalid_field = 23
-            "#}
-        );
-
-        test!(
-            toplevel_nested,
-            indoc! {r#"
-                [some_invalid_field]
-                foo = 23
-            "#}
-        );
-
-        test!(
-            disk_usage_based_eviction,
-            indoc! {r#"
-                [disk_usage_based_eviction]
-                some_invalid_field = 23
-            "#}
-        );
-
-        test!(
-            tenant_config,
-            indoc! {r#"
-                [tenant_config]
-                some_invalid_field = 23
-            "#}
-        );
-
-        test!(
-            l0_flush,
-            indoc! {r#"
-                [l0_flush]
-                mode = "direct"
-                some_invalid_field = 23
-            "#}
-        );
-
-        // TODO: fix this => https://github.com/neondatabase/neon/issues/8915
-        // test!(
-        //     remote_storage_config,
-        //     indoc! {r#"
-        //         [remote_storage_config]
-        //         local_path = "/nonexistent"
-        //         some_invalid_field = 23
-        //     "#}
-        // );
-    }
 }
--- a/pageserver/src/config/ignored_fields.rs
+++ b/pageserver/src/config/ignored_fields.rs
@@ -0,0 +1,179 @@
+//! Check for fields in the on-disk config file that were ignored when
+//! deserializing [`pageserver_api::config::ConfigToml`].
+//!
+//! This could have been part of the [`pageserver_api::config`] module,
+//! but the way we identify unused fields in this module
+//! is specific to the format (TOML) and the implementation of the
+//! deserialization for that format ([`toml_edit`]).
+
+use std::collections::HashSet;
+
+use itertools::Itertools;
+
+/// Pass in the user-specified config and the re-serialized [`pageserver_api::config::ConfigToml`].
+/// The returned [`Paths`] contains the paths to the fields that were ignored by deserialization
+/// of the [`pageserver_api::config::ConfigToml`].
+pub fn find(user_specified: toml_edit::DocumentMut, reserialized: toml_edit::DocumentMut) -> Paths {
+    let user_specified = paths(user_specified);
+    let reserialized = paths(reserialized);
+    fn paths(doc: toml_edit::DocumentMut) -> HashSet<String> {
+        let mut out = Vec::new();
+        let mut visitor = PathsVisitor::new(&mut out);
+        visitor.visit_table_like(doc.as_table());
+        HashSet::from_iter(out)
+    }
+
+    let mut ignored = HashSet::new();
+
+    // O(n) because of HashSet
+    for path in user_specified {
+        if !reserialized.contains(&path) {
+            ignored.insert(path);
+        }
+    }
+
+    Paths {
+        paths: ignored
+            .into_iter()
+            // sort lexicographically for deterministic output
+            .sorted()
+            .collect(),
+    }
+}
+
+pub struct Paths {
+    pub paths: Vec<String>,
+}
+
+struct PathsVisitor<'a> {
+    stack: Vec<String>,
+    out: &'a mut Vec<String>,
+}
+
+impl<'a> PathsVisitor<'a> {
+    fn new(out: &'a mut Vec<String>) -> Self {
+        Self {
+            stack: Vec::new(),
+            out,
+        }
+    }
+
+    fn visit_table_like(&mut self, table_like: &dyn toml_edit::TableLike) {
+        for (entry, item) in table_like.iter() {
+            self.stack.push(entry.to_string());
+            self.visit_item(item);
+            self.stack.pop();
+        }
+    }
+
+    fn visit_item(&mut self, item: &toml_edit::Item) {
+        match item {
+            toml_edit::Item::None => (),
+            toml_edit::Item::Value(value) => self.visit_value(value),
+            toml_edit::Item::Table(table) => {
+                self.visit_table_like(table);
+            }
+            toml_edit::Item::ArrayOfTables(array_of_tables) => {
+                for (i, table) in array_of_tables.iter().enumerate() {
+                    self.stack.push(format!("[{i}]"));
+                    self.visit_table_like(table);
+                    self.stack.pop();
+                }
+            }
+        }
+    }
+
+    fn visit_value(&mut self, value: &toml_edit::Value) {
+        match value {
+            toml_edit::Value::String(_)
+            | toml_edit::Value::Integer(_)
+            | toml_edit::Value::Float(_)
+            | toml_edit::Value::Boolean(_)
+            | toml_edit::Value::Datetime(_) => self.out.push(self.stack.join(".")),
+            toml_edit::Value::Array(array) => {
+                for (i, value) in array.iter().enumerate() {
+                    self.stack.push(format!("[{i}]"));
+                    self.visit_value(value);
+                    self.stack.pop();
+                }
+            }
+            toml_edit::Value::InlineTable(inline_table) => self.visit_table_like(inline_table),
+        }
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+
+    fn test_impl(original: &str, parsed: &str, expect: [&str; 1]) {
+        let original: toml_edit::DocumentMut = original.parse().expect("parse original config");
+        let parsed: toml_edit::DocumentMut = parsed.parse().expect("parse re-serialized config");
+
+        let super::Paths { paths: actual } = super::find(original, parsed);
+        assert_eq!(actual, &expect);
+    }
+
+    #[test]
+    fn top_level() {
+        test_impl(
+            r#"
+                [a]
+                b = 1
+                c = 2
+                d = 3
+            "#,
+            r#"
+                [a]
+                b = 1
+                c = 2
+            "#,
+            ["a.d"],
+        );
+    }
+
+    #[test]
+    fn nested() {
+        test_impl(
+            r#"
+                [a.b.c]
+                d = 23
+            "#,
+            r#"
+                [a]
+                e = 42
+            "#,
+            ["a.b.c.d"],
+        );
+    }
+
+    #[test]
+    fn array_of_tables() {
+        test_impl(
+            r#"
+                [[a]]
+                b = 1
+                c = 2
+                d = 3
+            "#,
+            r#"
+                [[a]]
+                b = 1
+                c = 2
+            "#,
+            ["a.[0].d"],
+        );
+    }
+
+    #[test]
+    fn array() {
+        test_impl(
+            r#"
+            foo = [ {bar = 23} ]
+            "#,
+            r#"
+            foo = [ { blup = 42 }]
+            "#,
+            ["foo.[0].bar"],
+        );
+    }
+}
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -89,7 +89,7 @@
 //! [`RequestContext`] argument. Functions in the middle of the call chain
 //! only need to pass it on.

-use std::sync::Arc;
+use std::{sync::Arc, time::Duration};

 use once_cell::sync::Lazy;
 use tracing::warn;
@@ -100,6 +100,12 @@ use crate::{
    task_mgr::TaskKind,
    tenant::Timeline,
 };
+use futures::FutureExt;
+use futures::future::BoxFuture;
+use std::future::Future;
+use tracing_utils::perf_span::{PerfInstrument, PerfSpan};
+
+use tracing::{Dispatch, Span};

 // The main structure of this module, see module-level comment.
 pub struct RequestContext {
@@ -109,6 +115,8 @@ pub struct RequestContext {
    page_content_kind: PageContentKind,
    read_path_debug: bool,
    scope: Scope,
+    perf_span: Option<PerfSpan>,
+    perf_span_dispatch: Option<Dispatch>,
 }

 #[derive(Clone)]
@@ -263,22 +271,15 @@ impl RequestContextBuilder {
                page_content_kind: PageContentKind::Unknown,
                read_path_debug: false,
                scope: Scope::new_global(),
+                perf_span: None,
+                perf_span_dispatch: None,
            },
        }
    }

-    pub fn extend(original: &RequestContext) -> Self {
+    pub fn from(original: &RequestContext) -> Self {
        Self {
-            // This is like a Copy, but avoid implementing Copy because ordinary users of
-            // RequestContext should always move or ref it.
-            inner: RequestContext {
-                task_kind: original.task_kind,
-                download_behavior: original.download_behavior,
-                access_stats_behavior: original.access_stats_behavior,
-                page_content_kind: original.page_content_kind,
-                read_path_debug: original.read_path_debug,
-                scope: original.scope.clone(),
-            },
+            inner: original.clone(),
        }
    }

@@ -316,12 +317,74 @@ impl RequestContextBuilder {
        self
    }

-    pub fn build(self) -> RequestContext {
+    pub(crate) fn perf_span_dispatch(mut self, dispatch: Option<Dispatch>) -> Self {
+        self.inner.perf_span_dispatch = dispatch;
+        self
+    }
+
+    pub fn root_perf_span<Fn>(mut self, make_span: Fn) -> Self
+    where
+        Fn: FnOnce() -> Span,
+    {
+        assert!(self.inner.perf_span.is_none());
+        assert!(self.inner.perf_span_dispatch.is_some());
+
+        let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
+        let new_span = tracing::dispatcher::with_default(dispatcher, make_span);
+
+        self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
+
+        self
+    }
+
+    pub fn perf_span<Fn>(mut self, make_span: Fn) -> Self
+    where
+        Fn: FnOnce(&Span) -> Span,
+    {
+        if let Some(ref perf_span) = self.inner.perf_span {
+            assert!(self.inner.perf_span_dispatch.is_some());
+            let dispatcher = self.inner.perf_span_dispatch.as_ref().unwrap();
+
+            let new_span =
+                tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));
+
+            self.inner.perf_span = Some(PerfSpan::new(new_span, dispatcher.clone()));
+        }
+
+        self
+    }
+
+    pub fn root(self) -> RequestContext {
+        self.inner
+    }
+
+    pub fn attached_child(self) -> RequestContext {
+        self.inner
+    }
+
+    pub fn detached_child(self) -> RequestContext {
        self.inner
    }
 }

 impl RequestContext {
+    /// Private clone implementation
+    ///
+    /// Callers should use the [`RequestContextBuilder`] or child spawning APIs of
+    /// [`RequestContext`].
+    fn clone(&self) -> Self {
+        Self {
+            task_kind: self.task_kind,
+            download_behavior: self.download_behavior,
+            access_stats_behavior: self.access_stats_behavior,
+            page_content_kind: self.page_content_kind,
+            read_path_debug: self.read_path_debug,
+            scope: self.scope.clone(),
+            perf_span: self.perf_span.clone(),
+            perf_span_dispatch: self.perf_span_dispatch.clone(),
+        }
+    }
+
    /// Create a new RequestContext that has no parent.
    ///
    /// The function is called `new` because, once we add children
@@ -337,7 +400,7 @@ impl RequestContext {
    pub fn new(task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
        RequestContextBuilder::new(task_kind)
            .download_behavior(download_behavior)
-            .build()
+            .root()
    }

    /// Create a detached child context for a task that may outlive `self`.
@@ -358,7 +421,10 @@ impl RequestContext {
    ///
    /// We could make new calls to this function fail if `self` is already canceled.
    pub fn detached_child(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        self.child_impl(task_kind, download_behavior)
+        RequestContextBuilder::from(self)
+            .task_kind(task_kind)
+            .download_behavior(download_behavior)
+            .detached_child()
    }

    /// Create a child of context `self` for a task that shall not outlive `self`.
@@ -382,7 +448,7 @@ impl RequestContext {
    /// The method to wait for child tasks would return an error, indicating
    /// that the child task was not started because the context was canceled.
    pub fn attached_child(&self) -> Self {
-        self.child_impl(self.task_kind(), self.download_behavior())
+        RequestContextBuilder::from(self).attached_child()
    }

    /// Use this function when you should be creating a child context using
@@ -397,17 +463,10 @@ impl RequestContext {
        Self::new(task_kind, download_behavior)
    }

-    fn child_impl(&self, task_kind: TaskKind, download_behavior: DownloadBehavior) -> Self {
-        RequestContextBuilder::extend(self)
-            .task_kind(task_kind)
-            .download_behavior(download_behavior)
-            .build()
-    }
-
    pub fn with_scope_timeline(&self, timeline: &Arc<Timeline>) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
            .scope(Scope::new_timeline(timeline))
-            .build()
+            .attached_child()
    }

    pub(crate) fn with_scope_page_service_pagestream(
@@ -416,9 +475,9 @@ impl RequestContext {
            crate::page_service::TenantManagerTypes,
        >,
    ) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
            .scope(Scope::new_page_service_pagestream(timeline_handle))
-            .build()
+            .attached_child()
    }

    pub fn with_scope_secondary_timeline(
@@ -426,28 +485,30 @@ impl RequestContext {
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
    ) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
            .scope(Scope::new_secondary_timeline(tenant_shard_id, timeline_id))
-            .build()
+            .attached_child()
    }

    pub fn with_scope_secondary_tenant(&self, tenant_shard_id: &TenantShardId) -> Self {
-        RequestContextBuilder::extend(self)
+        RequestContextBuilder::from(self)
            .scope(Scope::new_secondary_tenant(tenant_shard_id))
-            .build()
+            .attached_child()
    }

    #[cfg(test)]
    pub fn with_scope_unit_test(&self) -> Self {
-        RequestContextBuilder::new(TaskKind::UnitTest)
+        RequestContextBuilder::from(self)
+            .task_kind(TaskKind::UnitTest)
            .scope(Scope::new_unit_test())
-            .build()
+            .attached_child()
    }

    pub fn with_scope_debug_tools(&self) -> Self {
-        RequestContextBuilder::new(TaskKind::DebugTool)
+        RequestContextBuilder::from(self)
+            .task_kind(TaskKind::DebugTool)
            .scope(Scope::new_debug_tools())
-            .build()
+            .attached_child()
    }

    pub fn task_kind(&self) -> TaskKind {
@@ -504,4 +565,76 @@ impl RequestContext {
            Scope::DebugTools { io_size_metrics } => io_size_metrics,
        }
    }
+
+    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
+        if duration == Duration::ZERO {
+            return;
+        }
+
+        match &self.scope {
+            Scope::Timeline { arc_arc } => arc_arc
+                .wait_ondemand_download_time
+                .observe(self.task_kind, duration),
+            _ => {
+                use once_cell::sync::Lazy;
+                use std::sync::Mutex;
+                use std::time::Duration;
+                use utils::rate_limit::RateLimit;
+                static LIMIT: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+                let mut guard = LIMIT.lock().unwrap();
+                guard.call2(|rate_limit_stats| {
+                    warn!(
+                        %rate_limit_stats,
+                        backtrace=%std::backtrace::Backtrace::force_capture(),
+                        "ondemand downloads should always happen within timeline scope",
+                    );
+                });
+            }
+        }
+    }
+
+    pub(crate) fn perf_follows_from(&self, from: &RequestContext) {
+        if let (Some(span), Some(from_span)) = (&self.perf_span, &from.perf_span) {
+            span.inner().follows_from(from_span.inner());
+        }
+    }
+
+    pub(crate) fn has_perf_span(&self) -> bool {
+        self.perf_span.is_some()
+    }
 }
+
+/// [`Future`] extension trait that allows for creating performance
+/// spans on sampled requests
+pub(crate) trait PerfInstrumentFutureExt<'a>: Future + Send {
+    /// Instrument this future with a new performance span when the
+    /// provided request context indicates the originator request
+    /// was sampled. Otherwise, just box the future and return it as is.
+    fn maybe_perf_instrument<Fn>(
+        self,
+        ctx: &RequestContext,
+        make_span: Fn,
+    ) -> BoxFuture<'a, Self::Output>
+    where
+        Self: Sized + 'a,
+        Fn: FnOnce(&Span) -> Span,
+    {
+        match &ctx.perf_span {
+            Some(perf_span) => {
+                assert!(ctx.perf_span_dispatch.is_some());
+                let dispatcher = ctx.perf_span_dispatch.as_ref().unwrap();
+
+                let new_span =
+                    tracing::dispatcher::with_default(dispatcher, || make_span(perf_span.inner()));
+
+                let new_perf_span = PerfSpan::new(new_span, dispatcher.clone());
+                self.instrument(new_perf_span).boxed()
+            }
+            None => self.boxed(),
+        }
+    }
+}
+
+// Implement the trait for all types that satisfy the trait bounds
+impl<'a, T: Future + Send + 'a> PerfInstrumentFutureExt<'a> for T {}
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1133,6 +1133,40 @@ components:
        applied_gc_cutoff_lsn:
          type: string
          format: hex
+        safekeepers:
+          $ref: "#/components/schemas/TimelineSafekeepersInfo"
+
+    TimelineSafekeepersInfo:
+      type: object
+      required:
+        - tenant_id
+        - timeline_id
+        - generation
+        - safekeepers
+      properties:
+        tenant_id:
+          type: string
+          format: hex
+        timeline_id:
+          type: string
+          format: hex
+        generation:
+          type: integer
+        safekeepers:
+          type: array
+          items:
+            $ref: "#/components/schemas/TimelineSafekeeperInfo"
+
+    TimelineSafekeeperInfo:
+      type: object
+      required:
+        - id
+        - hostname
+      properties:
+        id:
+          type: integer
+        hostname:
+          type: string

    SyntheticSizeResponse:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,8 +74,8 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout,
-    WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
+    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
    GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -445,6 +445,9 @@ async fn build_timeline_info_common(

    let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();

+    // Externally, expose the lowest LSN that can be used to create a branch.
+    // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
+    // actually trimmed data to), which can pass each other when PITR is changed.
    let min_readable_lsn = std::cmp::max(
        timeline.get_gc_cutoff_lsn(),
        *timeline.get_applied_gc_cutoff_lsn(),
@@ -461,7 +464,6 @@ async fn build_timeline_info_common(
        initdb_lsn,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
-        _unused: Default::default(), // Unused, for legacy decode only
        min_readable_lsn,
        applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
        current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
@@ -2256,7 +2258,6 @@ async fn timeline_compact_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
-    flags |= CompactFlags::NoYield; // run compaction to completion

    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
        flags |= CompactFlags::ForceL0Compaction;
@@ -2336,21 +2337,31 @@ async fn timeline_compact_handler(
 }

 async fn timeline_mark_invisible_handler(
-    request: Request<Body>,
+    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let compact_request = json_request_maybe::<Option<MarkInvisibleRequest>>(&mut request).await?;
+
    let state = get_state(&request);

+    let visibility = match compact_request {
+        Some(req) => match req.is_visible {
+            Some(true) => TimelineVisibilityState::Visible,
+            Some(false) | None => TimelineVisibilityState::Invisible,
+        },
+        None => TimelineVisibilityState::Invisible,
+    };
+
    async {
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        let timeline = tenant.get_timeline(timeline_id, true)?;
-        timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(TimelineVisibilityState::Invisible).map_err(ApiError::InternalServerError)?;
+        timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -2417,7 +2428,6 @@ async fn timeline_checkpoint_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
-    flags |= CompactFlags::NoYield; // run compaction to completion
    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
        flags |= CompactFlags::ForceL0Compaction;
    }
@@ -2687,11 +2697,12 @@ async fn getpage_at_lsn_handler_inner(
    let lsn: Option<Lsn> = parse_query_param(&request, "lsn")?;

    async {
-        let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
-        // Enable read path debugging
        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?;
-        let ctx = RequestContextBuilder::extend(&ctx).read_path_debug(true)
-        .scope(context::Scope::new_timeline(&timeline)).build();
+        let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest)
+            .download_behavior(DownloadBehavior::Download)
+            .scope(context::Scope::new_timeline(&timeline))
+            .read_path_debug(true)
+            .root();

        // Use last_record_lsn if no lsn is provided
        let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
@@ -3178,7 +3189,8 @@ async fn list_aux_files(
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let files = timeline
        .list_aux_files(body.lsn, &ctx, io_concurrency)
        .await?;
@@ -3422,14 +3434,15 @@ async fn put_tenant_timeline_import_wal(

    check_permission(&request, Some(tenant_id))?;

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-
    let span = info_span!("import_wal", tenant_id=%tenant_id, timeline_id=%timeline_id, start_lsn=%start_lsn, end_lsn=%end_lsn);
    async move {
        let state = get_state(&request);

        let timeline = active_timeline_of_active_tenant(&state.tenant_manager, TenantShardId::unsharded(tenant_id), timeline_id).await?;
-        let ctx = RequestContextBuilder::extend(&ctx).scope(context::Scope::new_timeline(&timeline)).build();
+        let ctx = RequestContextBuilder::new(TaskKind::MgmtRequest)
+            .download_behavior(DownloadBehavior::Warn)
+            .scope(context::Scope::new_timeline(&timeline))
+            .root();

        let mut body = StreamReader::new(request.into_body().map(|res| {
            res.map_err(|error| {
@@ -3776,7 +3789,7 @@ pub fn make_router(
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/mark_invisible",
-            |r| testing_api_handler("mark timeline invisible", r, timeline_mark_invisible_handler),
+            |r| api_handler( r, timeline_mark_invisible_handler),
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -55,6 +55,9 @@ pub const DEFAULT_PG_VERSION: u32 = 16;
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
 pub const DELTA_FILE_MAGIC: u16 = 0x5A61;

+// Target used for performance traces.
+pub const PERF_TRACE_TARGET: &str = "P";
+
 static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);

 pub use crate::metrics::preinitialize_metrics;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,16 +1,14 @@
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::os::fd::RawFd;
-use std::pin::Pin;
 use std::sync::atomic::AtomicU64;
 use std::sync::{Arc, Mutex};
-use std::task::{Context, Poll};
 use std::time::{Duration, Instant};

 use enum_map::{Enum as _, EnumMap};
 use futures::Future;
 use metrics::{
-    Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -23,13 +21,13 @@ use pageserver_api::config::{
 };
 use pageserver_api::models::InMemoryLayerInfo;
 use pageserver_api::shard::TenantShardId;
-use pin_project_lite::pin_project;
 use postgres_backend::{QueryError, is_expected_io_error};
 use pq_proto::framed::ConnectionError;
 use strum::{EnumCount, IntoEnumIterator as _, VariantNames};
 use strum_macros::{IntoStaticStr, VariantNames};
 use utils::id::TimelineId;

+use crate::config;
 use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext};
 use crate::pgdatadir_mapping::DatadirModificationStats;
@@ -499,14 +497,99 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
-    register_gauge_vec!(
-        "pageserver_flush_wait_upload_seconds",
-        "Time spent waiting for preceding uploads during layer flush",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
+pub(crate) mod wait_ondemand_download_time {
+    use super::*;
+    const WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS: &[f64] = &[
+        0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, // 10 ms - 100ms
+        0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, // 100ms to 1s
+        1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, // 1s to 10s
+        10.0, 20.0, 30.0, 40.0, 50.0, 60.0, // 10s to 1m
+    ];
+
+    /// The task kinds for which we want to track wait times for on-demand downloads.
+    /// Wait times of any other task kind are not recorded: `observe` returns early for them.
+    pub(crate) const WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS: [TaskKind; 2] = [
+        TaskKind::PageRequestHandler,
+        TaskKind::WalReceiverConnectionHandler,
+    ];
+
+    pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL: Lazy<Vec<Histogram>> = Lazy::new(|| {
+        let histo = register_histogram_vec!(
+            "pageserver_wait_ondemand_download_seconds_global",
+            "Observations are individual tasks' wait times for on-demand downloads. \
+         If N tasks coalesce on an on-demand download, and it takes 10s, than we observe N * 10s.",
+            &["task_kind"],
+            WAIT_ONDEMAND_DOWNLOAD_TIME_BUCKETS.into(),
+        )
+        .expect("failed to define a metric");
+        WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
+            .iter()
+            .map(|task_kind| histo.with_label_values(&[task_kind.into()]))
+            .collect::<Vec<_>>()
+    });
+
+    pub(crate) static WAIT_ONDEMAND_DOWNLOAD_TIME_SUM: Lazy<CounterVec> = Lazy::new(|| {
+        register_counter_vec!(
+            // use a name that _could_ be evolved into a per-timeline histogram later
+            "pageserver_wait_ondemand_download_seconds_sum",
+            "Like `pageserver_wait_ondemand_download_seconds_global` but per timeline",
+            &["tenant_id", "shard_id", "timeline_id", "task_kind"],
+        )
+        .unwrap()
+    });
+
+    pub struct WaitOndemandDownloadTimeSum {
+        counters: [Counter; WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS.len()],
+    }
+
+    impl WaitOndemandDownloadTimeSum {
+        pub(crate) fn new(tenant_id: &str, shard_id: &str, timeline_id: &str) -> Self {
+            let counters = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
+                .iter()
+                .map(|task_kind| {
+                    WAIT_ONDEMAND_DOWNLOAD_TIME_SUM
+                        .get_metric_with_label_values(&[
+                            tenant_id,
+                            shard_id,
+                            timeline_id,
+                            task_kind.into(),
+                        ])
+                        .unwrap()
+                })
+                .collect::<Vec<_>>();
+            Self {
+                counters: counters.try_into().unwrap(),
+            }
+        }
+        pub(crate) fn observe(&self, task_kind: TaskKind, duration: Duration) {
+            let maybe = WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS
+                .iter()
+                .enumerate()
+                .find(|(_, kind)| **kind == task_kind);
+            let Some((idx, _)) = maybe else {
+                return;
+            };
+            WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL[idx].observe(duration.as_secs_f64());
+            let counter = &self.counters[idx];
+            counter.inc_by(duration.as_secs_f64());
+        }
+    }
+
+    pub(crate) fn shutdown_timeline(tenant_id: &str, shard_id: &str, timeline_id: &str) {
+        for task_kind in WAIT_ONDEMAND_DOWNLOAD_METRIC_TASK_KINDS {
+            let _ = WAIT_ONDEMAND_DOWNLOAD_TIME_SUM.remove_label_values(&[
+                tenant_id,
+                shard_id,
+                timeline_id,
+                task_kind.into(),
+            ]);
+        }
+    }
+
+    pub(crate) fn preinitialize_global_metrics() {
+        Lazy::force(&WAIT_ONDEMAND_DOWNLOAD_TIME_GLOBAL);
+    }
+}

 static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
@@ -1257,13 +1340,13 @@ pub(crate) static STORAGE_IO_TIME_METRIC: Lazy<StorageIoTime> = Lazy::new(Storag

 #[derive(Clone, Copy)]
 #[repr(usize)]
-enum StorageIoSizeOperation {
+pub(crate) enum StorageIoSizeOperation {
    Read,
    Write,
 }

 impl StorageIoSizeOperation {
-    const VARIANTS: &'static [&'static str] = &["read", "write"];
+    pub(crate) const VARIANTS: &'static [&'static str] = &["read", "write"];

    fn as_str(&self) -> &'static str {
        Self::VARIANTS[*self as usize]
@@ -1271,7 +1354,7 @@ impl StorageIoSizeOperation {
 }

 // Needed for the https://neonprod.grafana.net/d/5uK9tHL4k/picking-tenant-for-relocation?orgId=1
-static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+pub(crate) static STORAGE_IO_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
@@ -2323,13 +2406,18 @@ impl RemoteOpFileKind {
    }
 }

-pub(crate) static REMOTE_OPERATION_TIME: Lazy<HistogramVec> = Lazy::new(|| {
+pub(crate) static REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
-        "pageserver_remote_operation_seconds",
-        "Time spent on remote storage operations. \
-        Grouped by tenant, timeline, operation_kind and status. \
+        "pageserver_remote_timeline_client_seconds_global",
+        "Time spent on remote timeline client operations. \
+        Grouped by task_kind, file_kind, operation_kind and status. \
+        The task_kind is \
+          - for layer downloads, populated from RequestContext (primary objective of having the label) \
+          - for index downloads, set to 'unknown' \
+          - for any upload operation, set to 'RemoteUploadTask' \
+        This keeps dimensionality at bay. \
        Does not account for time spent waiting in remote timeline client's queues.",
-        &["file_kind", "op_kind", "status"]
+        &["task_kind", "file_kind", "op_kind", "status"]
    )
    .expect("failed to define a metric")
 });
@@ -2864,7 +2952,6 @@ pub(crate) struct TimelineMetrics {
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
    pub flush_delay_histo: StorageTimeMetrics,
-    pub flush_wait_upload_time_gauge: Gauge,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
@@ -2892,6 +2979,7 @@ pub(crate) struct TimelineMetrics {
    pub storage_io_size: StorageIoSizeMetrics,
    pub wait_lsn_in_progress_micros: GlobalAndPerTenantIntCounter,
    pub wait_lsn_start_finish_counterpair: IntCounterPair,
+    pub wait_ondemand_download_time: wait_ondemand_download_time::WaitOndemandDownloadTimeSum,
    shutdown: std::sync::atomic::AtomicBool,
 }

@@ -2916,9 +3004,6 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let compact_time_histo = StorageTimeMetrics::new(
            StorageTimeOperation::Compact,
            &tenant_id,
@@ -3040,13 +3125,19 @@ impl TimelineMetrics {
            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();

+        let wait_ondemand_download_time =
+            wait_ondemand_download_time::WaitOndemandDownloadTimeSum::new(
+                &tenant_id,
+                &shard_id,
+                &timeline_id,
+            );
+
        TimelineMetrics {
            tenant_id,
            shard_id,
            timeline_id,
            flush_time_histo,
            flush_delay_histo,
-            flush_wait_upload_time_gauge,
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
@@ -3074,6 +3165,7 @@ impl TimelineMetrics {
            wal_records_received,
            wait_lsn_in_progress_micros,
            wait_lsn_start_finish_counterpair,
+            wait_ondemand_download_time,
            shutdown: std::sync::atomic::AtomicBool::default(),
        }
    }
@@ -3096,14 +3188,6 @@ impl TimelineMetrics {
        self.resident_physical_size_gauge.get()
    }

-    pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
-        self.flush_wait_upload_time_gauge.add(duration);
-        crate::metrics::FLUSH_WAIT_UPLOAD_TIME
-            .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
-            .unwrap()
-            .add(duration);
-    }
-
    /// Generates TIMELINE_LAYER labels for a persistent layer.
    fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
        let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
@@ -3207,7 +3291,6 @@ impl TimelineMetrics {
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
@@ -3275,6 +3358,8 @@ impl TimelineMetrics {
                .remove_label_values(&mut res, &[tenant_id, shard_id, timeline_id]);
        }

+        wait_ondemand_download_time::shutdown_timeline(tenant_id, shard_id, timeline_id);
+
        let _ = SMGR_QUERY_STARTED_PER_TENANT_TIMELINE.remove_label_values(&[
            SmgrQueryType::GetPageAtLsn.into(),
            tenant_id,
@@ -3396,13 +3481,18 @@ impl RemoteTimelineClientMetrics {

    pub fn remote_operation_time(
        &self,
+        task_kind: Option<TaskKind>,
        file_kind: &RemoteOpFileKind,
        op_kind: &RemoteOpKind,
        status: &'static str,
    ) -> Histogram {
-        let key = (file_kind.as_str(), op_kind.as_str(), status);
-        REMOTE_OPERATION_TIME
-            .get_metric_with_label_values(&[key.0, key.1, key.2])
+        REMOTE_TIMELINE_CLIENT_COMPLETION_LATENCY
+            .get_metric_with_label_values(&[
+                task_kind.as_ref().map(|tk| tk.into()).unwrap_or("unknown"),
+                file_kind.as_str(),
+                op_kind.as_str(),
+                status,
+            ])
            .unwrap()
    }

@@ -3647,54 +3737,26 @@ impl Drop for RemoteTimelineClientMetrics {

 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub(crate) trait MeasureRemoteOp: Sized {
-    fn measure_remote_op(
+pub(crate) trait MeasureRemoteOp<O, E>: Sized + Future<Output = Result<O, E>> {
+    async fn measure_remote_op(
        self,
+        task_kind: Option<TaskKind>, // not all caller contexts have a RequestContext / TaskKind handy
        file_kind: RemoteOpFileKind,
        op: RemoteOpKind,
        metrics: Arc<RemoteTimelineClientMetrics>,
-    ) -> MeasuredRemoteOp<Self> {
+    ) -> Result<O, E> {
        let start = Instant::now();
-        MeasuredRemoteOp {
-            inner: self,
-            file_kind,
-            op,
-            start,
-            metrics,
-        }
+        let res = self.await;
+        let duration = start.elapsed();
+        let status = if res.is_ok() { &"success" } else { &"failure" };
+        metrics
+            .remote_operation_time(task_kind, &file_kind, &op, status)
+            .observe(duration.as_secs_f64());
+        res
    }
 }

-impl<T: Sized> MeasureRemoteOp for T {}
-
-pin_project! {
-    pub(crate) struct MeasuredRemoteOp<F>
-    {
-        #[pin]
-        inner: F,
-        file_kind: RemoteOpFileKind,
-        op: RemoteOpKind,
-        start: Instant,
-        metrics: Arc<RemoteTimelineClientMetrics>,
-    }
-}
-
-impl<F: Future<Output = Result<O, E>>, O, E> Future for MeasuredRemoteOp<F> {
-    type Output = Result<O, E>;
-
-    fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-        let this = self.project();
-        let poll_result = this.inner.poll(cx);
-        if let Poll::Ready(ref res) = poll_result {
-            let duration = this.start.elapsed();
-            let status = if res.is_ok() { &"success" } else { &"failure" };
-            this.metrics
-                .remote_operation_time(this.file_kind, this.op, status)
-                .observe(duration.as_secs_f64());
-        }
-        poll_result
-    }
-}
+impl<Fut, O, E> MeasureRemoteOp<O, E> for Fut where Fut: Sized + Future<Output = Result<O, E>> {}

 pub mod tokio_epoll_uring {
    use std::collections::HashMap;
@@ -4130,9 +4192,33 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) {
        .set(u64::try_from(num_threads.get()).unwrap());
 }

-pub fn preinitialize_metrics(conf: &'static PageServerConf) {
+static PAGESERVER_CONFIG_IGNORED_ITEMS: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_config_ignored_items",
+        "TOML items present in the on-disk configuration file but ignored by the pageserver config parser.\
+         The `item` label is the dot-separated path of the ignored item in the on-disk configuration file.\
+         The value for an unknown config item is always 1.\
+         There is a special label value \"\", which is 0, so that there is always a metric exposed (simplifies dashboards).",
+        &["item"]
+    )
+    .unwrap()
+});
+
+pub fn preinitialize_metrics(
+    conf: &'static PageServerConf,
+    ignored: config::ignored_fields::Paths,
+) {
    set_page_service_config_max_batch_size(&conf.page_service_pipelining);

+    PAGESERVER_CONFIG_IGNORED_ITEMS
+        .with_label_values(&[""])
+        .set(0);
+    for path in &ignored.paths {
+        PAGESERVER_CONFIG_IGNORED_ITEMS
+            .with_label_values(&[path])
+            .set(1);
+    }
+
    // Python tests need these and on some we do alerting.
    //
    // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of
@@ -4218,4 +4304,5 @@ pub fn preinitialize_metrics(conf: &'static PageServerConf) {
    Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE);

    tenant_throttling::preinitialize_global_metrics();
+    wait_ondemand_download_time::preinitialize_global_metrics();
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -9,6 +9,7 @@ use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};
 use std::{io, str};

+use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, bail};
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
@@ -53,7 +54,9 @@ use utils::sync::spsc_fold;
 use crate::auth::check_permission;
 use crate::basebackup::BasebackupError;
 use crate::config::PageServerConf;
-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
+};
 use crate::metrics::{
    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, LIVE_CONNECTIONS, SmgrOpTimer,
    TimelineMetrics,
@@ -100,6 +103,7 @@ pub fn spawn(
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
    pg_auth: Option<Arc<SwappableJwtAuth>>,
+    perf_trace_dispatch: Option<Dispatch>,
    tcp_listener: tokio::net::TcpListener,
 ) -> Listener {
    let cancel = CancellationToken::new();
@@ -117,6 +121,7 @@ pub fn spawn(
            conf,
            tenant_manager,
            pg_auth,
+            perf_trace_dispatch,
            tcp_listener,
            conf.pg_auth_type,
            conf.page_service_pipelining.clone(),
@@ -173,6 +178,7 @@ pub async fn libpq_listener_main(
    conf: &'static PageServerConf,
    tenant_manager: Arc<TenantManager>,
    auth: Option<Arc<SwappableJwtAuth>>,
+    perf_trace_dispatch: Option<Dispatch>,
    listener: tokio::net::TcpListener,
    auth_type: AuthType,
    pipelining_config: PageServicePipeliningConfig,
@@ -205,8 +211,12 @@ pub async fn libpq_listener_main(
                // Connection established. Spawn a new task to handle it.
                debug!("accepted connection from {}", peer_addr);
                let local_auth = auth.clone();
-                let connection_ctx = listener_ctx
-                    .detached_child(TaskKind::PageRequestHandler, DownloadBehavior::Download);
+                let connection_ctx = RequestContextBuilder::from(&listener_ctx)
+                    .task_kind(TaskKind::PageRequestHandler)
+                    .download_behavior(DownloadBehavior::Download)
+                    .perf_span_dispatch(perf_trace_dispatch.clone())
+                    .detached_child();
+
                connection_handler_tasks.spawn(page_service_conn_main(
                    conf,
                    tenant_manager.clone(),
@@ -237,6 +247,15 @@ pub async fn libpq_listener_main(

 type ConnectionHandlerResult = anyhow::Result<()>;

+/// Perf root spans start at the per-request level, after shard routing.
+/// This struct carries connection-level information to the root perf span definition.
+#[derive(Clone)]
+struct ConnectionPerfSpanFields {
+    peer_addr: String,
+    application_name: Option<String>,
+    compute_mode: Option<String>,
+}
+
 #[instrument(skip_all, fields(peer_addr, application_name, compute_mode))]
 #[allow(clippy::too_many_arguments)]
 async fn page_service_conn_main(
@@ -261,6 +280,12 @@ async fn page_service_conn_main(
    let socket_fd = socket.as_raw_fd();

    let peer_addr = socket.peer_addr().context("get peer address")?;
+
+    let perf_span_fields = ConnectionPerfSpanFields {
+        peer_addr: peer_addr.to_string(),
+        application_name: None, // filled in later
+        compute_mode: None,     // filled in later
+    };
    tracing::Span::current().record("peer_addr", field::display(peer_addr));

    // setup read timeout of 10 minutes. the timeout is rather arbitrary for requirements:
@@ -304,6 +329,7 @@ async fn page_service_conn_main(
        tenant_manager,
        auth,
        pipelining_config,
+        perf_span_fields,
        connection_ctx,
        cancel.clone(),
        gate_guard,
@@ -348,6 +374,8 @@ struct PageServerHandler {
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,

+    perf_span_fields: ConnectionPerfSpanFields,
+
    cancel: CancellationToken,

    /// None only while pagestream protocol is being processed.
@@ -607,6 +635,7 @@ impl std::fmt::Display for BatchedPageStreamError {
 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
    timer: SmgrOpTimer,
+    ctx: RequestContext,
 }

 #[cfg(feature = "testing")]
@@ -692,11 +721,13 @@ impl BatchedFeMessage {
 }

 impl PageServerHandler {
+    #[allow(clippy::too_many_arguments)]
    pub fn new(
        conf: &'static PageServerConf,
        tenant_manager: Arc<TenantManager>,
        auth: Option<Arc<SwappableJwtAuth>>,
        pipelining_config: PageServicePipeliningConfig,
+        perf_span_fields: ConnectionPerfSpanFields,
        connection_ctx: RequestContext,
        cancel: CancellationToken,
        gate_guard: GateGuard,
@@ -706,6 +737,7 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            perf_span_fields,
            timeline_handles: Some(TimelineHandles::new(tenant_manager)),
            cancel,
            pipelining_config,
@@ -743,6 +775,7 @@ impl PageServerHandler {
        tenant_id: TenantId,
        timeline_id: TimelineId,
        timeline_handles: &mut TimelineHandles,
+        conn_perf_span_fields: &ConnectionPerfSpanFields,
        cancel: &CancellationToken,
        ctx: &RequestContext,
        protocol_version: PagestreamProtocolVersion,
@@ -902,10 +935,12 @@ impl PageServerHandler {
                }

                let key = rel_block_to_key(req.rel, req.blkno);
-                let shard = match timeline_handles
+
+                let res = timeline_handles
                    .get(tenant_id, timeline_id, ShardSelector::Page(key))
-                    .await
-                {
+                    .await;
+
+                let shard = match res {
                    Ok(tl) => tl,
                    Err(e) => {
                        let span = mkspan!(before shard routing);
@@ -932,6 +967,41 @@ impl PageServerHandler {
                        }
                    }
                };
+
+                let ctx = if shard.is_get_page_request_sampled() {
+                    RequestContextBuilder::from(ctx)
+                        .root_perf_span(|| {
+                            info_span!(
+                            target: PERF_TRACE_TARGET,
+                            "GET_PAGE",
+                            peer_addr = conn_perf_span_fields.peer_addr,
+                            application_name = conn_perf_span_fields.application_name,
+                            compute_mode = conn_perf_span_fields.compute_mode,
+                            tenant_id = %tenant_id,
+                            shard_id = %shard.get_shard_identity().shard_slug(),
+                            timeline_id = %timeline_id,
+                            lsn = %req.hdr.request_lsn,
+                            request_id = %req.hdr.reqid,
+                            key = %key,
+                            )
+                        })
+                        .attached_child()
+                } else {
+                    ctx.attached_child()
+                };
+
+                // This ctx travels as part of the BatchedFeMessage through
+                // batching into the request handler.
+                // The request handler needs to do some per-request work
+                // (relsize check) before dispatching the batch as a single
+                // get_vectored call to the Timeline.
+                // This ctx will be used for the relsize check, whereas the
+                // get_vectored call will use a different ctx with a separate
+                // perf span.
+                let ctx = ctx.with_scope_page_service_pagestream(&shard);
+
+                // Similar game for this `span`: we funnel it through so that
+                // request handler log messages contain the request-specific fields.
                let span = mkspan!(shard.tenant_shard_id.shard_slug());

                let timer = record_op_start_and_throttle(
@@ -939,19 +1009,34 @@ impl PageServerHandler {
                    metrics::SmgrQueryType::GetPageAtLsn,
                    received_at,
                )
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "THROTTLE",
+                    )
+                })
                .await?;

                // We're holding the Handle
-                let effective_request_lsn = match Self::wait_or_get_last_lsn(
+                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
+                let res = Self::wait_or_get_last_lsn(
                    &shard,
                    req.hdr.request_lsn,
                    req.hdr.not_modified_since,
                    &shard.get_applied_gc_cutoff_lsn(),
-                    ctx,
+                    &ctx,
                )
-                // TODO: if we actually need to wait for lsn here, it delays the entire batch which doesn't need to wait
-                .await
-                {
+                .maybe_perf_instrument(&ctx, |current_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: current_perf_span,
+                        "WAIT_LSN",
+                    )
+                })
+                .await;
+
+                let effective_request_lsn = match res {
                    Ok(lsn) => lsn,
                    Err(e) => {
                        return respond_error!(span, e);
@@ -961,7 +1046,7 @@ impl PageServerHandler {
                    span,
                    shard: shard.downgrade(),
                    effective_request_lsn,
-                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }],
+                    pages: smallvec::smallvec![BatchedGetPageRequest { req, timer, ctx }],
                }
            }
            #[cfg(feature = "testing")]
@@ -1514,12 +1599,14 @@ impl PageServerHandler {
        IO: AsyncRead + AsyncWrite + Send + Sync + Unpin + 'static,
    {
        let cancel = self.cancel.clone();
+
        let err = loop {
            let msg = Self::pagestream_read_message(
                &mut pgb_reader,
                tenant_id,
                timeline_id,
                &mut timeline_handles,
+                &self.perf_span_fields,
                &cancel,
                ctx,
                protocol_version,
@@ -1653,6 +1740,8 @@ impl PageServerHandler {
        // Batcher
        //

+        let perf_span_fields = self.perf_span_fields.clone();
+
        let cancel_batcher = self.cancel.child_token();
        let (mut batch_tx, mut batch_rx) = spsc_fold::channel();
        let batcher = pipeline_stage!("batcher", cancel_batcher.clone(), move |cancel_batcher| {
@@ -1666,6 +1755,7 @@ impl PageServerHandler {
                        tenant_id,
                        timeline_id,
                        &mut timeline_handles,
+                        &perf_span_fields,
                        &cancel_batcher,
                        &ctx,
                        protocol_version,
@@ -2004,7 +2094,9 @@ impl PageServerHandler {

        let results = timeline
            .get_rel_page_at_lsn_batched(
-                requests.iter().map(|p| (&p.req.rel, &p.req.blkno)),
+                requests
+                    .iter()
+                    .map(|p| (&p.req.rel, &p.req.blkno, p.ctx.attached_child())),
                effective_lsn,
                io_concurrency,
                ctx,
@@ -2606,12 +2698,14 @@ where

        if let FeStartupPacket::StartupMessage { params, .. } = sm {
            if let Some(app_name) = params.get("application_name") {
+                self.perf_span_fields.application_name = Some(app_name.to_string());
                Span::current().record("application_name", field::display(app_name));
            }
            if let Some(options) = params.get("options") {
                let (config, _) = parse_options(options);
                for (key, value) in config {
                    if key == "neon.compute_mode" {
+                        self.perf_span_fields.compute_mode = Some(value.clone());
                        Span::current().record("compute_mode", field::display(value));
                    }
                }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -9,6 +9,7 @@
 use std::collections::{BTreeMap, HashMap, HashSet, hash_map};
 use std::ops::{ControlFlow, Range};

+use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, ensure};
 use bytes::{Buf, Bytes, BytesMut};
 use enum_map::Enum;
@@ -31,7 +32,7 @@ use postgres_ffi::{BLCKSZ, Oid, RepOriginId, TimestampTz, TransactionId};
 use serde::{Deserialize, Serialize};
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, info, trace, warn};
+use tracing::{debug, info, info_span, trace, warn};
 use utils::bin_ser::{BeSer, DeserializeError};
 use utils::lsn::Lsn;
 use utils::pausable_failpoint;
@@ -39,7 +40,7 @@ use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
-use crate::context::RequestContext;
+use crate::context::{PerfInstrumentFutureExt, RequestContext, RequestContextBuilder};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::metrics::{
    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
@@ -209,7 +210,9 @@ impl Timeline {
                let pages: smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)];
                let res = self
                    .get_rel_page_at_lsn_batched(
-                        pages.iter().map(|(tag, blknum)| (tag, blknum)),
+                        pages
+                            .iter()
+                            .map(|(tag, blknum)| (tag, blknum, ctx.attached_child())),
                        effective_lsn,
                        io_concurrency.clone(),
                        ctx,
@@ -248,7 +251,7 @@ impl Timeline {
    /// The ordering of the returned vec corresponds to the ordering of `pages`.
    pub(crate) async fn get_rel_page_at_lsn_batched(
        &self,
-        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber)>,
+        pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, RequestContext)>,
        effective_lsn: Lsn,
        io_concurrency: IoConcurrency,
        ctx: &RequestContext,
@@ -262,8 +265,11 @@ impl Timeline {
        let mut result = Vec::with_capacity(pages.len());
        let result_slots = result.spare_capacity_mut();

-        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[usize; 1]>> = BTreeMap::default();
-        for (response_slot_idx, (tag, blknum)) in pages.enumerate() {
+        let mut keys_slots: BTreeMap<Key, smallvec::SmallVec<[(usize, RequestContext); 1]>> =
+            BTreeMap::default();
+
+        let mut perf_instrument = false;
+        for (response_slot_idx, (tag, blknum, ctx)) in pages.enumerate() {
            if tag.relnode == 0 {
                result_slots[response_slot_idx].write(Err(PageReconstructError::Other(
                    RelationError::InvalidRelnode.into(),
@@ -274,7 +280,16 @@ impl Timeline {
            }

            let nblocks = match self
-                .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx)
+                .get_rel_size(*tag, Version::Lsn(effective_lsn), &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "GET_REL_SIZE",
+                        reltag=%tag,
+                        lsn=%effective_lsn,
+                    )
+                })
                .await
            {
                Ok(nblocks) => nblocks,
@@ -297,8 +312,12 @@ impl Timeline {

            let key = rel_block_to_key(*tag, *blknum);

+            if ctx.has_perf_span() {
+                perf_instrument = true;
+            }
+
            let key_slots = keys_slots.entry(key).or_default();
-            key_slots.push(response_slot_idx);
+            key_slots.push((response_slot_idx, ctx));
        }

        let keyspace = {
@@ -314,16 +333,34 @@ impl Timeline {
            acc.to_keyspace()
        };

-        match self
-            .get_vectored(keyspace, effective_lsn, io_concurrency, ctx)
-            .await
-        {
+        let ctx = match perf_instrument {
+            true => RequestContextBuilder::from(ctx)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "GET_VECTORED",
+                        tenant_id = %self.tenant_shard_id.tenant_id,
+                        timeline_id = %self.timeline_id,
+                        lsn = %effective_lsn,
+                        shard = %self.tenant_shard_id.shard_slug(),
+                    )
+                })
+                .attached_child(),
+            false => ctx.attached_child(),
+        };
+
+        let res = self
+            .get_vectored(keyspace, effective_lsn, io_concurrency, &ctx)
+            .maybe_perf_instrument(&ctx, |current_perf_span| current_perf_span.clone())
+            .await;
+
+        match res {
            Ok(results) => {
                for (key, res) in results {
                    let mut key_slots = keys_slots.remove(&key).unwrap().into_iter();
-                    let first_slot = key_slots.next().unwrap();
+                    let (first_slot, first_req_ctx) = key_slots.next().unwrap();

-                    for slot in key_slots {
+                    for (slot, req_ctx) in key_slots {
                        let clone = match &res {
                            Ok(buf) => Ok(buf.clone()),
                            Err(err) => Err(match err {
@@ -341,17 +378,22 @@ impl Timeline {
                        };

                        result_slots[slot].write(clone);
+                        // There is no standardized way to express that the batched span followed from N request spans.
+                        // So, abuse the system and mark the request contexts as follows_from the batch span, so we get
+                        // some linkage in our trace viewer. It allows us to answer: which GET_VECTORED did this GET_PAGE wait for.
+                        req_ctx.perf_follows_from(&ctx);
                        slots_filled += 1;
                    }

                    result_slots[first_slot].write(res);
+                    first_req_ctx.perf_follows_from(&ctx);
                    slots_filled += 1;
                }
            }
            Err(err) => {
                // this cannot really happen because get_vectored only errors globally on invalid LSN or too large batch size
                // (We enforce the max batch size outside of this function, in the code that constructs the batch request.)
-                for slot in keys_slots.values().flatten() {
+                for (slot, req_ctx) in keys_slots.values().flatten() {
                    // this whole `match` is a lot like `From<GetVectoredError> for PageReconstructError`
                    // but without taking ownership of the GetVectoredError
                    let err = match &err {
@@ -383,6 +425,7 @@ impl Timeline {
                        }
                    };

+                    req_ctx.perf_follows_from(&ctx);
                    result_slots[*slot].write(err);
                }

--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -219,8 +219,7 @@ pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
 pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
 pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
 // Bump this number when adding a new pageserver_runtime!
-// SAFETY: it's obviously correct
-const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
+const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = NonZeroUsize::new(4).unwrap();

 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -45,6 +45,7 @@ use remote_timeline_client::manifest::{
 };
 use remote_timeline_client::{
    FAILED_REMOTE_OP_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD, UploadQueueNotReadyError,
+    download_tenant_manifest,
 };
 use secondary::heatmap::{HeatMapTenant, HeatMapTimeline};
 use storage_broker::BrokerClientChannel;
@@ -226,7 +227,8 @@ struct TimelinePreload {
 }

 pub(crate) struct TenantPreload {
-    tenant_manifest: TenantManifest,
+    /// The tenant manifest from remote storage, or None if no manifest was found.
+    tenant_manifest: Option<TenantManifest>,
    /// Map from timeline ID to a possible timeline preload. It is None iff the timeline is offloaded according to the manifest.
    timelines: HashMap<TimelineId, Option<TimelinePreload>>,
 }
@@ -282,12 +284,15 @@ pub struct Tenant {
    /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating`
    timelines_offloaded: Mutex<HashMap<TimelineId, Arc<OffloadedTimeline>>>,

-    /// Serialize writes of the tenant manifest to remote storage.  If there are concurrent operations
-    /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for
-    /// each other (this could be optimized to coalesce writes if necessary).
+    /// The last tenant manifest known to be in remote storage. None if the manifest has not yet
+    /// been either downloaded or uploaded. Always Some after tenant attach.
    ///
-    /// The contents of the Mutex are the last manifest we successfully uploaded
-    tenant_manifest_upload: tokio::sync::Mutex<Option<TenantManifest>>,
+    /// Initially populated during tenant attach, updated via `maybe_upload_tenant_manifest`.
+    ///
+    /// Do not modify this directly. It is used to check whether a new manifest needs to be
+    /// uploaded. The manifest is constructed in `build_tenant_manifest`, and uploaded via
+    /// `maybe_upload_tenant_manifest`.
+    remote_tenant_manifest: tokio::sync::Mutex<Option<TenantManifest>>,

    // This mutex prevents creation of new timelines during GC.
    // Adding yet another mutex (in addition to `timelines`) is needed because holding
@@ -1354,36 +1359,41 @@ impl Tenant {
                    }
                }

-                // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
-                enum BrokenVerbosity {
-                    Error,
-                    Info
-                }
-                let make_broken =
-                    |t: &Tenant, err: anyhow::Error, verbosity: BrokenVerbosity| {
-                        match verbosity {
-                            BrokenVerbosity::Info => {
-                                info!("attach cancelled, setting tenant state to Broken: {err}");
-                            },
-                            BrokenVerbosity::Error => {
-                                error!("attach failed, setting tenant state to Broken: {err:?}");
-                            }
+                fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
+                    t.state.send_modify(|state| match state {
+                        // TODO: the old code alluded to DeleteTenantFlow sometimes setting
+                        // TenantState::Stopping before we get here, but this may be outdated.
+                        // Let's find out with a testing assertion. If this doesn't fire, and the
+                        // logs don't show this happening in production, remove the Stopping cases.
+                        TenantState::Stopping{..} if cfg!(any(test, feature = "testing")) => {
+                            panic!("unexpected TenantState::Stopping during attach")
                        }
-                        t.state.send_modify(|state| {
-                            // The Stopping case is for when we have passed control on to DeleteTenantFlow:
-                            // if it errors, we will call make_broken when tenant is already in Stopping.
-                            assert!(
-                                matches!(*state, TenantState::Attaching | TenantState::Stopping { .. }),
-                                "the attach task owns the tenant state until activation is complete"
-                            );
-
-                            *state = TenantState::broken_from_reason(err.to_string());
-                        });
-                    };
+                        // If the tenant is cancelled, assume the error was caused by cancellation.
+                        TenantState::Attaching if t.cancel.is_cancelled() => {
+                            info!("attach cancelled, setting tenant state to Stopping: {err}");
+                            // NB: progress None tells `set_stopping` that attach was cancelled.
+                            *state = TenantState::Stopping { progress: None };
+                        }
+                        // According to the old code, DeleteTenantFlow may already have set this to
+                        // Stopping. Retain its progress.
+                        // TODO: there is no DeleteTenantFlow. Is this still needed? See above.
+                        TenantState::Stopping { progress } if t.cancel.is_cancelled() => {
+                            assert!(progress.is_some(), "concurrent attach cancellation");
+                            info!("attach cancelled, already Stopping: {err}");
+                        }
+                        // Mark the tenant as broken.
+                        TenantState::Attaching | TenantState::Stopping { .. } => {
+                            error!("attach failed, setting tenant state to Broken (was {state}): {err:?}");
+                            *state = TenantState::broken_from_reason(err.to_string())
+                        }
+                        // The attach task owns the tenant state until activated.
+                        state => panic!("invalid tenant state {state} during attach: {err:?}"),
+                    });
+                }

                // TODO: should also be rejecting tenant conf changes that violate this check.
                if let Err(e) = crate::tenant::storage_layer::inmemory_layer::IndexEntry::validate_checkpoint_distance(tenant_clone.get_checkpoint_distance()) {
-                    make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                    make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
                    return Ok(());
                }

@@ -1435,10 +1445,8 @@ impl Tenant {
                            // stayed in Activating for such a long time that shutdown found it in
                            // that state.
                            tracing::info!(state=%tenant_clone.current_state(), "Tenant shut down before activation");
-                            // Make the tenant broken so that set_stopping will not hang waiting for it to leave
-                            // the Attaching state.  This is an over-reaction (nothing really broke, the tenant is
-                            // just shutting down), but ensures progress.
-                            make_broken(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"), BrokenVerbosity::Info);
+                            // Set the tenant to Stopping to signal `set_stopping` that we're done.
+                            make_broken_or_stopping(&tenant_clone, anyhow::anyhow!("Shut down while Attaching"));
                            return Ok(());
                        },
                    )
@@ -1457,7 +1465,7 @@ impl Tenant {
                        match res {
                            Ok(p) => Some(p),
                            Err(e) => {
-                                make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
+                                make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e));
                                return Ok(());
                            }
                        }
@@ -1483,9 +1491,7 @@ impl Tenant {
                        info!("attach finished, activating");
                        tenant_clone.activate(broker_client, None, &ctx);
                    }
-                    Err(e) => {
-                        make_broken(&tenant_clone, anyhow::anyhow!(e), BrokenVerbosity::Error);
-                    }
+                    Err(e) => make_broken_or_stopping(&tenant_clone, anyhow::anyhow!(e)),
                }

                // If we are doing an opportunistic warmup attachment at startup, initialize
@@ -1525,28 +1531,27 @@ impl Tenant {
            cancel.clone(),
        )
        .await?;
-        let (offloaded_add, tenant_manifest) =
-            match remote_timeline_client::download_tenant_manifest(
-                remote_storage,
-                &self.tenant_shard_id,
-                self.generation,
-                &cancel,
-            )
-            .await
-            {
-                Ok((tenant_manifest, _generation, _manifest_mtime)) => (
-                    format!("{} offloaded", tenant_manifest.offloaded_timelines.len()),
-                    tenant_manifest,
-                ),
-                Err(DownloadError::NotFound) => {
-                    ("no manifest".to_string(), TenantManifest::empty())
-                }
-                Err(e) => Err(e)?,
-            };
+
+        let tenant_manifest = match download_tenant_manifest(
+            remote_storage,
+            &self.tenant_shard_id,
+            self.generation,
+            &cancel,
+        )
+        .await
+        {
+            Ok((tenant_manifest, _, _)) => Some(tenant_manifest),
+            Err(DownloadError::NotFound) => None,
+            Err(err) => return Err(err.into()),
+        };

        info!(
-            "found {} timelines, and {offloaded_add}",
-            remote_timeline_ids.len()
+            "found {} timelines ({} offloaded timelines)",
+            remote_timeline_ids.len(),
+            tenant_manifest
+                .as_ref()
+                .map(|m| m.offloaded_timelines.len())
+                .unwrap_or(0)
        );

        for k in other_keys {
@@ -1555,11 +1560,13 @@ impl Tenant {

        // Avoid downloading IndexPart of offloaded timelines.
        let mut offloaded_with_prefix = HashSet::new();
-        for offloaded in tenant_manifest.offloaded_timelines.iter() {
-            if remote_timeline_ids.remove(&offloaded.timeline_id) {
-                offloaded_with_prefix.insert(offloaded.timeline_id);
-            } else {
-                // We'll take care later of timelines in the manifest without a prefix
+        if let Some(tenant_manifest) = &tenant_manifest {
+            for offloaded in tenant_manifest.offloaded_timelines.iter() {
+                if remote_timeline_ids.remove(&offloaded.timeline_id) {
+                    offloaded_with_prefix.insert(offloaded.timeline_id);
+                } else {
+                    // We'll take care later of timelines in the manifest without a prefix
+                }
            }
        }

@@ -1633,12 +1640,14 @@ impl Tenant {

        let mut offloaded_timeline_ids = HashSet::new();
        let mut offloaded_timelines_list = Vec::new();
-        for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() {
-            let timeline_id = timeline_manifest.timeline_id;
-            let offloaded_timeline =
-                OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
-            offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
-            offloaded_timeline_ids.insert(timeline_id);
+        if let Some(tenant_manifest) = &preload.tenant_manifest {
+            for timeline_manifest in tenant_manifest.offloaded_timelines.iter() {
+                let timeline_id = timeline_manifest.timeline_id;
+                let offloaded_timeline =
+                    OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest);
+                offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline)));
+                offloaded_timeline_ids.insert(timeline_id);
+            }
        }
        // Complete deletions for offloaded timeline id's from manifest.
        // The manifest will be uploaded later in this function.
@@ -1796,15 +1805,21 @@ impl Tenant {
            .context("resume_deletion")
            .map_err(LoadLocalTimelineError::ResumeDeletion)?;
        }
-        let needs_manifest_upload =
-            offloaded_timelines_list.len() != preload.tenant_manifest.offloaded_timelines.len();
        {
            let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap();
            offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter());
        }
-        if needs_manifest_upload {
-            self.store_tenant_manifest().await?;
+
+        // Stash the preloaded tenant manifest, and upload a new manifest if changed.
+        //
+        // NB: this must happen after the tenant is fully populated above. In particular the
+        // offloaded timelines, which are included in the manifest.
+        {
+            let mut guard = self.remote_tenant_manifest.lock().await;
+            assert!(guard.is_none(), "tenant manifest set before preload"); // first populated here
+            *guard = preload.tenant_manifest;
        }
+        self.maybe_upload_tenant_manifest().await?;

        // The local filesystem contents are a cache of what's in the remote IndexPart;
        // IndexPart is the source of truth.
@@ -2218,7 +2233,7 @@ impl Tenant {
        };

        // Upload new list of offloaded timelines to S3
-        self.store_tenant_manifest().await?;
+        self.maybe_upload_tenant_manifest().await?;

        // Activate the timeline (if it makes sense)
        if !(timeline.is_broken() || timeline.is_stopping()) {
@@ -3080,6 +3095,7 @@ impl Tenant {
            let mut has_pending_l0 = false;
            for timeline in compact_l0 {
                let ctx = &ctx.with_scope_timeline(&timeline);
+                // NB: don't set CompactFlags::YieldForL0, since this is an L0-only compaction pass.
                let outcome = timeline
                    .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
@@ -3097,14 +3113,9 @@ impl Tenant {
            }
        }

-        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated
-        // more L0 layers, they may also be compacted here.
-        //
-        // NB: image compaction may yield if there is pending L0 compaction.
-        //
-        // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
-        // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
-        // We leave this for a later PR.
+        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated more
+        // L0 layers, they may also be compacted here. Image compaction will yield if there is
+        // pending L0 compaction on any tenant timeline.
        //
        // TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
        // amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
@@ -3115,8 +3126,14 @@ impl Tenant {
            }
            let ctx = &ctx.with_scope_timeline(&timeline);

+            // Yield for L0 if the separate L0 pass is enabled (otherwise there's no point).
+            let mut flags = EnumSet::default();
+            if self.get_compaction_l0_first() {
+                flags |= CompactFlags::YieldForL0;
+            }
+
            let mut outcome = timeline
-                .compact(cancel, EnumSet::default(), ctx)
+                .compact(cancel, flags, ctx)
                .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
                .await
                .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
@@ -3246,17 +3263,23 @@ impl Tenant {
    async fn housekeeping(&self) {
        // Call through to all timelines to freeze ephemeral layers as needed. This usually happens
        // during ingest, but we don't want idle timelines to hold open layers for too long.
-        let timelines = self
-            .timelines
-            .lock()
-            .unwrap()
-            .values()
-            .filter(|tli| tli.is_active())
-            .cloned()
-            .collect_vec();
+        //
+        // We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode).
+        // We don't run compaction in this case either, and don't want to keep flushing tiny L0
+        // layers that won't be compacted down.
+        if self.tenant_conf.load().location.may_upload_layers_hint() {
+            let timelines = self
+                .timelines
+                .lock()
+                .unwrap()
+                .values()
+                .filter(|tli| tli.is_active())
+                .cloned()
+                .collect_vec();

-        for timeline in timelines {
-            timeline.maybe_freeze_ephemeral_layer().await;
+            for timeline in timelines {
+                timeline.maybe_freeze_ephemeral_layer().await;
+            }
        }

        // Shut down walredo if idle.
@@ -3421,7 +3444,7 @@ impl Tenant {
            shutdown_mode
        };

-        match self.set_stopping(shutdown_progress, false, false).await {
+        match self.set_stopping(shutdown_progress).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
@@ -3501,25 +3524,13 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    ///
-    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
-    /// `allow_transition_from_attaching` is needed for the special case of attaching deleted tenant.
-    async fn set_stopping(
-        &self,
-        progress: completion::Barrier,
-        _allow_transition_from_loading: bool,
-        allow_transition_from_attaching: bool,
-    ) -> Result<(), SetStoppingError> {
+    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
        rx.wait_for(|state| match state {
-            TenantState::Attaching if allow_transition_from_attaching => true,
            TenantState::Activating(_) | TenantState::Attaching => {
-                info!(
-                    "waiting for {} to turn Active|Broken|Stopping",
-                    <&'static str>::from(state)
-                );
+                info!("waiting for {state} to turn Active|Broken|Stopping");
                false
            }
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
@@ -3530,25 +3541,24 @@ impl Tenant {
        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
        let mut err = None;
        let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) => {
-                unreachable!("1we ensured above that we're done with activation, and, there is no re-activation")
-            }
-            TenantState::Attaching => {
-                if !allow_transition_from_attaching {
-                    unreachable!("2we ensured above that we're done with activation, and, there is no re-activation")
-                };
-                *current_state = TenantState::Stopping { progress };
-                true
+            TenantState::Activating(_) | TenantState::Attaching => {
+                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
                // won't be accessible to anyone afterwards, because the Tenant is in Stopping state.
-                *current_state = TenantState::Stopping { progress };
+                *current_state = TenantState::Stopping { progress: Some(progress) };
                // Continue stopping outside the closure. We need to grab timelines.lock()
                // and we plan to turn it into a tokio::sync::Mutex in a future patch.
                true
            }
+            TenantState::Stopping { progress: None } => {
+                // An attach was cancelled, and the attach transitioned the tenant from Attaching to
+                // Stopping(None) to let us know it exited. Register our progress and continue.
+                *current_state = TenantState::Stopping { progress: Some(progress) };
+                true
+            }
            TenantState::Broken { reason, .. } => {
                info!(
                    "Cannot set tenant to Stopping state, it is in Broken state due to: {reason}"
@@ -3556,7 +3566,7 @@ impl Tenant {
                err = Some(SetStoppingError::Broken);
                false
            }
-            TenantState::Stopping { progress } => {
+            TenantState::Stopping { progress: Some(progress) } => {
                info!("Tenant is already in Stopping state");
                err = Some(SetStoppingError::AlreadyStopping(progress.clone()));
                false
@@ -3681,7 +3691,7 @@ impl Tenant {
                        }
                    }
                }
-                TenantState::Active { .. } => {
+                TenantState::Active => {
                    return Ok(());
                }
                TenantState::Broken { reason, .. } => {
@@ -4057,18 +4067,19 @@ impl Tenant {

    /// Generate an up-to-date TenantManifest based on the state of this Tenant.
    fn build_tenant_manifest(&self) -> TenantManifest {
-        let timelines_offloaded = self.timelines_offloaded.lock().unwrap();
-
-        let mut timeline_manifests = timelines_offloaded
-            .iter()
-            .map(|(_timeline_id, offloaded)| offloaded.manifest())
-            .collect::<Vec<_>>();
-        // Sort the manifests so that our output is deterministic
-        timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id);
+        // Collect the offloaded timelines, and sort them for deterministic output.
+        let offloaded_timelines = self
+            .timelines_offloaded
+            .lock()
+            .unwrap()
+            .values()
+            .map(|tli| tli.manifest())
+            .sorted_by_key(|m| m.timeline_id)
+            .collect_vec();

        TenantManifest {
            version: LATEST_TENANT_MANIFEST_VERSION,
-            offloaded_timelines: timeline_manifests,
+            offloaded_timelines,
        }
    }

@@ -4197,9 +4208,9 @@ impl Tenant {
            self.cancel.child_token(),
        );

-        let timeline_ctx = RequestContextBuilder::extend(ctx)
+        let timeline_ctx = RequestContextBuilder::from(ctx)
            .scope(context::Scope::new_timeline(&timeline))
-            .build();
+            .detached_child();

        Ok((timeline, timeline_ctx))
    }
@@ -4291,7 +4302,7 @@ impl Tenant {
            timelines: Mutex::new(HashMap::new()),
            timelines_creating: Mutex::new(HashSet::new()),
            timelines_offloaded: Mutex::new(HashMap::new()),
-            tenant_manifest_upload: Default::default(),
+            remote_tenant_manifest: Default::default(),
            gc_cs: tokio::sync::Mutex::new(()),
            walredo_mgr,
            remote_storage,
@@ -5524,27 +5535,35 @@ impl Tenant {
            .unwrap_or(0)
    }

-    /// Serialize and write the latest TenantManifest to remote storage.
-    pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> {
-        // Only one manifest write may be done at at time, and the contents of the manifest
-        // must be loaded while holding this lock. This makes it safe to call this function
-        // from anywhere without worrying about colliding updates.
+    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
+    /// manifest in `Self::remote_tenant_manifest`.
+    ///
+    /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
+    /// changing any `Tenant` state that's included in the manifest, consider making the manifest
+    /// the authoritative source of data with an API that automatically uploads on changes. Revisit
+    /// this when the manifest is more widely used and we have a better idea of the data model.
+    pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
+        // Multiple tasks may call this function concurrently after mutating the Tenant runtime
+        // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
+        // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
+        // simple coalescing mechanism.
        let mut guard = tokio::select! {
-            g = self.tenant_manifest_upload.lock() => {
-                g
-            },
-            _ = self.cancel.cancelled() => {
-                return Err(TenantManifestError::Cancelled);
-            }
+            guard = self.remote_tenant_manifest.lock() => guard,
+            _ = self.cancel.cancelled() => return Err(TenantManifestError::Cancelled),
        };

+        // Build a new manifest.
        let manifest = self.build_tenant_manifest();
-        if Some(&manifest) == (*guard).as_ref() {
-            // Optimisation: skip uploads that don't change anything.
-            return Ok(());
+
+        // Check if the manifest has changed. We ignore the version number here, to avoid
+        // uploading every manifest on version number bumps.
+        if let Some(old) = guard.as_ref() {
+            if manifest.eq_ignoring_version(old) {
+                return Ok(());
+            }
        }

-        // Remote storage does no retries internally, so wrap it
+        // Upload the manifest. Remote storage does no retries internally, so retry here.
        match backoff::retry(
            || async {
                upload_tenant_manifest(
@@ -5556,7 +5575,7 @@ impl Tenant {
                )
                .await
            },
-            |_e| self.cancel.is_cancelled(),
+            |_| self.cancel.is_cancelled(),
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "uploading tenant manifest",
@@ -6516,11 +6535,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        let mut writer = tline.writer().await;
@@ -6537,11 +6552,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        let mut writer = tline.writer().await;
@@ -6558,11 +6569,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        let mut writer = tline.writer().await;
@@ -6579,11 +6586,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        assert_eq!(
@@ -6666,9 +6669,7 @@ mod tests {
            timeline.freeze_and_flush().await?;
            if compact {
                // this requires timeline to be &Arc<Timeline>
-                timeline
-                    .compact(&cancel, CompactFlags::NoYield.into(), ctx)
-                    .await?;
+                timeline.compact(&cancel, EnumSet::default(), ctx).await?;
            }

            // this doesn't really need to use the timeline_id target, but it is closer to what it
@@ -6995,7 +6996,6 @@ mod tests {
        child_timeline.freeze_and_flush().await?;
        let mut flags = EnumSet::new();
        flags.insert(CompactFlags::ForceRepartition);
-        flags.insert(CompactFlags::NoYield);
        child_timeline
            .compact(&CancellationToken::new(), flags, &ctx)
            .await?;
@@ -7374,9 +7374,7 @@ mod tests {

            // Perform a cycle of flush, compact, and GC
            tline.freeze_and_flush().await?;
-            tline
-                .compact(&cancel, CompactFlags::NoYield.into(), &ctx)
-                .await?;
+            tline.compact(&cancel, EnumSet::default(), &ctx).await?;
            tenant
                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
                .await?;
@@ -7705,7 +7703,6 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::NoYield);
                            flags
                        } else {
                            EnumSet::empty()
@@ -7756,9 +7753,7 @@ mod tests {
        let before_num_l0_delta_files =
            tline.layers.read().await.layer_map()?.level0_deltas().len();

-        tline
-            .compact(&cancel, CompactFlags::NoYield.into(), &ctx)
-            .await?;
+        tline.compact(&cancel, EnumSet::default(), &ctx).await?;

        let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();

@@ -7923,7 +7918,6 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::NoYield);
                            flags
                        },
                        &ctx,
@@ -8386,7 +8380,6 @@ mod tests {
                    let mut flags = EnumSet::new();
                    flags.insert(CompactFlags::ForceImageLayerCreation);
                    flags.insert(CompactFlags::ForceRepartition);
-                    flags.insert(CompactFlags::NoYield);
                    flags
                },
                &ctx,
@@ -8454,7 +8447,6 @@ mod tests {
                    let mut flags = EnumSet::new();
                    flags.insert(CompactFlags::ForceImageLayerCreation);
                    flags.insert(CompactFlags::ForceRepartition);
-                    flags.insert(CompactFlags::NoYield);
                    flags
                },
                &ctx,
@@ -11551,4 +11543,255 @@ mod tests {

        Ok(())
    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
+        use pageserver_api::models::TimelineVisibilityState;
+
+        use crate::tenant::size::gather_inputs;
+
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            // Ensure that we don't compute gc_cutoffs (which needs reading the layer files)
+            pitr_interval: Some(Duration::ZERO),
+            ..Default::default()
+        };
+        let harness = TenantHarness::create_custom(
+            "test_synthetic_size_calculation_with_invisible_branches",
+            tenant_conf,
+            TenantId::generate(),
+            ShardIdentity::unsharded(),
+            Generation::new(0xdeadbeef),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let main_tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![],
+                vec![],
+                vec![],
+                Lsn(0x100),
+            )
+            .await?;
+
+        let snapshot1 = TimelineId::from_array(hex!("11223344556677881122334455667790"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot1,
+                Some(Lsn(0x20)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let snapshot2 = TimelineId::from_array(hex!("11223344556677881122334455667791"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot2,
+                Some(Lsn(0x30)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let snapshot3 = TimelineId::from_array(hex!("11223344556677881122334455667792"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot3,
+                Some(Lsn(0x40)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let limit = Arc::new(Semaphore::new(1));
+        let max_retention_period = None;
+        let mut logical_size_cache = HashMap::new();
+        let cause = LogicalSizeCalculationCause::EvictionTaskImitation;
+        let cancel = CancellationToken::new();
+
+        let inputs = gather_inputs(
+            &tenant,
+            &limit,
+            max_retention_period,
+            &mut logical_size_cache,
+            cause,
+            &cancel,
+            &ctx,
+        )
+        .instrument(info_span!(
+            "gather_inputs",
+            tenant_id = "unknown",
+            shard_id = "unknown",
+        ))
+        .await?;
+        use crate::tenant::size::{LsnKind, ModelInputs, SegmentMeta};
+        use LsnKind::*;
+        use tenant_size_model::Segment;
+        let ModelInputs { mut segments, .. } = inputs;
+        segments.retain(|s| s.timeline_id == TIMELINE_ID);
+        for segment in segments.iter_mut() {
+            segment.segment.parent = None; // We don't care about the parent for the test
+            segment.segment.size = None; // We don't care about the size for the test
+        }
+        assert_eq!(
+            segments,
+            [
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x10,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchStart,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x20,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x30,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x100,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: GcCutOff,
+                }, // we need to retain everything above the last branch point
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x100,
+                        size: None,
+                        needed: true,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchEnd,
+                },
+            ]
+        );
+
+        main_tline
+            .remote_client
+            .schedule_index_upload_for_timeline_invisible_state(
+                TimelineVisibilityState::Invisible,
+            )?;
+        main_tline.remote_client.wait_completion().await?;
+        let inputs = gather_inputs(
+            &tenant,
+            &limit,
+            max_retention_period,
+            &mut logical_size_cache,
+            cause,
+            &cancel,
+            &ctx,
+        )
+        .instrument(info_span!(
+            "gather_inputs",
+            tenant_id = "unknown",
+            shard_id = "unknown",
+        ))
+        .await?;
+        let ModelInputs { mut segments, .. } = inputs;
+        segments.retain(|s| s.timeline_id == TIMELINE_ID);
+        for segment in segments.iter_mut() {
+            segment.segment.parent = None; // We don't care about the parent for the test
+            segment.segment.size = None; // We don't care about the size for the test
+        }
+        assert_eq!(
+            segments,
+            [
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x10,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchStart,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x20,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x30,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40, // Branch end LSN == last branch point LSN
+                        size: None,
+                        needed: true,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchEnd,
+                },
+            ]
+        );
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -53,7 +53,7 @@ impl<Value: Clone> LayerCoverage<Value> {
    ///
    /// Complexity: O(log N)
    fn add_node(&mut self, key: i128) {
-        let value = match self.nodes.range(..=key).last() {
+        let value = match self.nodes.range(..=key).next_back() {
            Some((_, Some(v))) => Some(v.clone()),
            Some((_, None)) => None,
            None => None,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -58,7 +58,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
-///    reads and ingest WAL.
+///   reads and ingest WAL.
 /// - `Secondary`: is only keeping a local cache warm.
 ///
 /// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -642,6 +642,7 @@ impl RemoteTimelineClient {
            cancel,
        )
        .measure_remote_op(
+            Option::<TaskKind>::None,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
            Arc::clone(&self.metrics),
@@ -739,6 +740,7 @@ impl RemoteTimelineClient {
                ctx,
            )
            .measure_remote_op(
+                Some(ctx.task_kind()),
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
                Arc::clone(&self.metrics),
@@ -1968,9 +1970,7 @@ impl RemoteTimelineClient {
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
    ///
-    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
-    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
-    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
+    /// The number of in-progress tasks is limited by `Self::inprogress_tasks`; see `next_ready`.
    fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
        while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
            debug!("starting op: {next_op}");
@@ -2177,6 +2177,7 @@ impl RemoteTimelineClient {
                        &self.cancel,
                    )
                    .measure_remote_op(
+                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
                        Arc::clone(&self.metrics),
@@ -2193,6 +2194,7 @@ impl RemoteTimelineClient {
                        &self.cancel,
                    )
                    .measure_remote_op(
+                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
                        Arc::clone(&self.metrics),
@@ -2218,6 +2220,11 @@ impl RemoteTimelineClient {
                    }
                    res
                }
+                // TODO: this should wait for the deletion to be executed by the deletion queue.
+                // Otherwise, the deletion may race with an upload and wrongfully delete a newer
+                // file. Some of the above logic attempts to work around this; it should be replaced
+                // by the upload queue ordering guarantees (see `can_bypass`). See:
+                // <https://github.com/neondatabase/neon/issues/10283>.
                UploadOp::Delete(delete) => {
                    if self.config.read().unwrap().block_deletions {
                        let mut queue_locked = self.upload_queue.lock().unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -130,7 +130,7 @@ impl IndexPart {
    /// Version history
    /// - 2: added `deleted_at`
    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
-    ///      is always generated from the keys of `layer_metadata`)
+    ///   is always generated from the keys of `layer_metadata`)
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
--- a/pageserver/src/tenant/remote_timeline_client/manifest.rs
+++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs
@@ -3,11 +3,15 @@ use serde::{Deserialize, Serialize};
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

-/// Tenant-shard scoped manifest
-#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)]
+/// Tenant shard manifest, stored in remote storage. Contains offloaded timelines and other tenant
+/// shard-wide information that must be persisted in remote storage.
+///
+/// The manifest is always updated on tenant attach, and as needed.
+#[derive(Clone, Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct TenantManifest {
-    /// Debugging aid describing the version of this manifest.
-    /// Can also be used for distinguishing breaking changes later on.
+    /// The manifest version. Incremented on manifest format changes, even non-breaking ones.
+    /// Manifests must generally always be backwards and forwards compatible for one release, to
+    /// allow release rollbacks.
    pub version: usize,

    /// The list of offloaded timelines together with enough information
@@ -16,6 +20,7 @@ pub struct TenantManifest {
    /// Note: the timelines mentioned in this list might be deleted, i.e.
    /// we don't hold an invariant that the references aren't dangling.
    /// Existence of index-part.json is the actual indicator of timeline existence.
+    #[serde(default)]
    pub offloaded_timelines: Vec<OffloadedTimelineManifest>,
 }

@@ -24,7 +29,7 @@ pub struct TenantManifest {
 /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`],
 /// but the two datastructures serve different needs, this is for a persistent disk format
 /// that must be backwards compatible, while the other is only for informative purposes.
-#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)]
+#[derive(Clone, Debug, Serialize, Deserialize, Copy, PartialEq, Eq)]
 pub struct OffloadedTimelineManifest {
    pub timeline_id: TimelineId,
    /// Whether the timeline has a parent it has been branched off from or not
@@ -35,20 +40,114 @@ pub struct OffloadedTimelineManifest {
    pub archived_at: NaiveDateTime,
 }

+/// The newest manifest version. This should be incremented on changes, even non-breaking ones. We
+/// do not use deny_unknown_fields, so new fields are not breaking.
 pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1;

 impl TenantManifest {
-    pub(crate) fn empty() -> Self {
-        Self {
-            version: LATEST_TENANT_MANIFEST_VERSION,
-            offloaded_timelines: vec![],
+    /// Returns true if the manifests are equal, ignoring the version number. This avoids
+    /// re-uploading all manifests just because the version number is bumped.
+    pub fn eq_ignoring_version(&self, other: &Self) -> bool {
+        // Fast path: if the version is equal, just compare directly.
+        if self.version == other.version {
+            return self == other;
        }
-    }
-    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
-        serde_json::from_slice::<Self>(bytes)
+
+        // We could alternatively just clone and modify the version here.
+        let Self {
+            version: _, // ignore version
+            offloaded_timelines,
+        } = self;
+
+        offloaded_timelines == &other.offloaded_timelines
    }

-    pub(crate) fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
+    /// Decodes a manifest from JSON.
+    pub fn from_json_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+        serde_json::from_slice(bytes)
+    }
+
+    /// Encodes a manifest as JSON.
+    pub fn to_json_bytes(&self) -> serde_json::Result<Vec<u8>> {
        serde_json::to_vec(self)
    }
 }
+
+#[cfg(test)]
+mod tests {
+    use std::str::FromStr;
+
+    use utils::id::TimelineId;
+
+    use super::*;
+
+    /// Empty manifests should be parsed. Version is required.
+    #[test]
+    fn parse_empty() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 0
+         }"#;
+        let expected = TenantManifest {
+            version: 0,
+            offloaded_timelines: Vec::new(),
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// Unknown fields should be ignored, for forwards compatibility.
+    #[test]
+    fn parse_unknown_fields() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 1,
+             "foo": "bar"
+         }"#;
+        let expected = TenantManifest {
+            version: 1,
+            offloaded_timelines: Vec::new(),
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+
+    /// v1 manifests should be parsed, for backwards compatibility.
+    #[test]
+    fn parse_v1() -> anyhow::Result<()> {
+        let json = r#"{
+             "version": 1,
+             "offloaded_timelines": [
+                 {
+                     "timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "archived_at": "2025-03-07T11:07:11.373105434"
+                 },
+                 {
+                     "timeline_id": "f3def5823ad7080d2ea538d8e12163fa",
+                     "ancestor_timeline_id": "5c4df612fd159e63c1b7853fe94d97da",
+                     "ancestor_retain_lsn": "0/1F79038",
+                     "archived_at": "2025-03-05T11:10:22.257901390"
+                 }
+             ]
+         }"#;
+        let expected = TenantManifest {
+            version: 1,
+            offloaded_timelines: vec![
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("5c4df612fd159e63c1b7853fe94d97da")?,
+                    ancestor_timeline_id: None,
+                    ancestor_retain_lsn: None,
+                    archived_at: NaiveDateTime::from_str("2025-03-07T11:07:11.373105434")?,
+                },
+                OffloadedTimelineManifest {
+                    timeline_id: TimelineId::from_str("f3def5823ad7080d2ea538d8e12163fa")?,
+                    ancestor_timeline_id: Some(TimelineId::from_str(
+                        "5c4df612fd159e63c1b7853fe94d97da",
+                    )?),
+                    ancestor_retain_lsn: Some(Lsn::from_str("0/1F79038")?),
+                    archived_at: NaiveDateTime::from_str("2025-03-05T11:10:22.257901390")?,
+                },
+            ],
+        };
+        assert_eq!(expected, TenantManifest::from_json_bytes(json.as_bytes())?);
+        Ok(())
+    }
+}
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -61,6 +61,7 @@ pub(crate) async fn upload_index_part(
        .await
        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
+
 /// Serializes and uploads the given tenant manifest data to the remote storage.
 pub(crate) async fn upload_tenant_manifest(
    storage: &GenericRemoteStorage,
@@ -76,16 +77,14 @@ pub(crate) async fn upload_tenant_manifest(
    });
    pausable_failpoint!("before-upload-manifest-pausable");

-    let serialized = tenant_manifest.to_json_bytes()?;
-    let serialized = Bytes::from(serialized);
-
-    let tenant_manifest_site = serialized.len();
-
+    let serialized = Bytes::from(tenant_manifest.to_json_bytes()?);
+    let tenant_manifest_size = serialized.len();
    let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation);
+
    storage
        .upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(serialized))),
-            tenant_manifest_site,
+            tenant_manifest_size,
            &remote_path,
            cancel,
        )
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -167,10 +167,17 @@ impl SecondaryTenant {

        self.validate_metrics();

+        // Metrics are subtracted from and/or removed eagerly.
+        // Deletions are done in the background via [`BackgroundPurges::spawn`].
        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+
+        self.detail
+            .lock()
+            .unwrap()
+            .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric);
    }

    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -4,6 +4,7 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};

+use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
@@ -124,15 +125,53 @@ impl OnDiskState {
    }
 }

-#[derive(Debug, Clone, Default)]
 pub(super) struct SecondaryDetailTimeline {
    on_disk_layers: HashMap<LayerName, OnDiskState>,

    /// We remember when layers were evicted, to prevent re-downloading them.
    pub(super) evicted_at: HashMap<LayerName, SystemTime>,
+
+    ctx: RequestContext,
+}
+
+impl Clone for SecondaryDetailTimeline {
+    fn clone(&self) -> Self {
+        Self {
+            on_disk_layers: self.on_disk_layers.clone(),
+            evicted_at: self.evicted_at.clone(),
+            // This is a bit awkward. The downloader code operates on a snapshot
+            // of the secondary list to avoid locking it for extended periods of time.
+            // No particularly strong reason to choose [`RequestContext::detached_child`],
+            // but makes more sense than [`RequestContext::attached_child`].
+            ctx: self
+                .ctx
+                .detached_child(self.ctx.task_kind(), self.ctx.download_behavior()),
+        }
+    }
+}
+
+impl std::fmt::Debug for SecondaryDetailTimeline {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("SecondaryDetailTimeline")
+            .field("on_disk_layers", &self.on_disk_layers)
+            .field("evicted_at", &self.evicted_at)
+            .finish()
+    }
 }

 impl SecondaryDetailTimeline {
+    pub(super) fn empty(ctx: RequestContext) -> Self {
+        SecondaryDetailTimeline {
+            on_disk_layers: Default::default(),
+            evicted_at: Default::default(),
+            ctx,
+        }
+    }
+
+    pub(super) fn context(&self) -> &RequestContext {
+        &self.ctx
+    }
+
    pub(super) fn remove_layer(
        &mut self,
        name: &LayerName,
@@ -258,18 +297,50 @@ impl SecondaryDetail {

    pub(super) fn remove_timeline(
        &mut self,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        resident_metric: &UIntGauge,
    ) {
        let removed = self.timelines.remove(timeline_id);
        if let Some(removed) = removed {
-            resident_metric.sub(
-                removed
-                    .on_disk_layers
-                    .values()
-                    .map(|l| l.metadata.file_size)
-                    .sum(),
-            );
+            Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric);
+        }
+    }
+
+    pub(super) fn drain_timelines(
+        &mut self,
+        tenant_shard_id: &TenantShardId,
+        resident_metric: &UIntGauge,
+    ) {
+        for (timeline_id, removed) in self.timelines.drain() {
+            Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric);
+        }
+    }
+
+    fn clear_timeline_metrics(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        detail: SecondaryDetailTimeline,
+        resident_metric: &UIntGauge,
+    ) {
+        resident_metric.sub(
+            detail
+                .on_disk_layers
+                .values()
+                .map(|l| l.metadata.file_size)
+                .sum(),
+        );
+
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        for op in StorageIoSizeOperation::VARIANTS {
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[
+                op,
+                tenant_id.as_str(),
+                shard_id.as_str(),
+                timeline_id.as_str(),
+            ]);
        }
    }

@@ -727,6 +798,7 @@ impl<'a> TenantDownloader<'a> {
                        last_heatmap,
                        timeline,
                        &self.secondary_state.resident_size_metric,
+                        ctx,
                    )
                    .await;

@@ -774,7 +846,6 @@ impl<'a> TenantDownloader<'a> {

        // Download the layers in the heatmap
        for timeline in heatmap.timelines {
-            let ctx = &ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline.timeline_id);
            let timeline_state = timeline_states
                .remove(&timeline.timeline_id)
                .expect("Just populated above");
@@ -917,7 +988,11 @@ impl<'a> TenantDownloader<'a> {
            for delete_timeline in &delete_timelines {
                // We haven't removed from disk yet, but optimistically remove from in-memory state: if removal
                // from disk fails that will be a fatal error.
-                detail.remove_timeline(delete_timeline, &self.secondary_state.resident_size_metric);
+                detail.remove_timeline(
+                    self.secondary_state.get_tenant_shard_id(),
+                    delete_timeline,
+                    &self.secondary_state.resident_size_metric,
+                );
            }
        }

@@ -1013,7 +1088,6 @@ impl<'a> TenantDownloader<'a> {
        timeline: HeatMapTimeline,
        timeline_state: SecondaryDetailTimeline,
        deadline: Instant,
-        ctx: &RequestContext,
    ) -> (Result<(), UpdateError>, Vec<HeatMapLayer>) {
        // Accumulate updates to the state
        let mut touched = Vec::new();
@@ -1044,7 +1118,12 @@ impl<'a> TenantDownloader<'a> {
            }

            match self
-                .download_layer(tenant_shard_id, &timeline_id, layer, ctx)
+                .download_layer(
+                    tenant_shard_id,
+                    &timeline_id,
+                    layer,
+                    timeline_state.context(),
+                )
                .await
            {
                Ok(Some(layer)) => touched.push(layer),
@@ -1155,13 +1234,16 @@ impl<'a> TenantDownloader<'a> {
        tracing::debug!(timeline_id=%timeline_id, "Downloading layers, {} in heatmap", timeline.hot_layers().count());

        let (result, touched) = self
-            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline, ctx)
+            .download_timeline_layers(tenant_shard_id, timeline, timeline_state, deadline)
            .await;

        // Write updates to state to record layers we just downloaded or touched, irrespective of whether the overall result was successful
        {
            let mut detail = self.secondary_state.detail.lock().unwrap();
-            let timeline_detail = detail.timelines.entry(timeline_id).or_default();
+            let timeline_detail = detail.timelines.entry(timeline_id).or_insert_with(|| {
+                let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &timeline_id);
+                SecondaryDetailTimeline::empty(ctx)
+            });

            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
            touched.into_iter().for_each(|t| {
@@ -1295,10 +1377,12 @@ async fn init_timeline_state(
    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
    resident_metric: &UIntGauge,
+    ctx: &RequestContext,
 ) -> SecondaryDetailTimeline {
-    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
-    let mut detail = SecondaryDetailTimeline::default();
+    let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
+    let mut detail = SecondaryDetailTimeline::empty(ctx);

+    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
        Ok(d) => d,
        Err(e) => {
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -33,7 +33,7 @@ pub struct ModelInputs {
 }

 /// A [`Segment`], with some extra information for display purposes
-#[derive(Debug, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
 pub struct SegmentMeta {
    pub segment: Segment,
    pub timeline_id: TimelineId,
@@ -248,6 +248,8 @@ pub(super) async fn gather_inputs(
            None
        };

+        let branch_is_invisible = timeline.is_invisible() == Some(true);
+
        let lease_points = gc_info
            .leases
            .keys()
@@ -271,7 +273,10 @@ pub(super) async fn gather_inputs(
            .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint))
            .collect::<Vec<_>>();

-        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+        if !branch_is_invisible {
+            // Do not count lease points for invisible branches.
+            lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+        }

        drop(gc_info);

@@ -287,7 +292,9 @@ pub(super) async fn gather_inputs(

        // Add a point for the PITR cutoff
        let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
-        if !branch_start_needed {
+        if !branch_start_needed && !branch_is_invisible {
+            // Only add the GcCutOff point when the timeline is visible; otherwise, do not compute the size for the LSN
+            // range from the last branch point to the latest data.
            lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
        }

@@ -373,11 +380,19 @@ pub(super) async fn gather_inputs(
            }
        }

+        let branch_end_lsn = if branch_is_invisible {
+            // If the branch is invisible, the branch end is the last requested LSN (likely a branch cutoff point).
+            segments.last().unwrap().segment.lsn
+        } else {
+            // Otherwise, the branch end is the last record LSN.
+            last_record_lsn.0
+        };
+
        // Current end of the timeline
        segments.push(SegmentMeta {
            segment: Segment {
                parent: Some(parent),
-                lsn: last_record_lsn.0,
+                lsn: branch_end_lsn,
                size: None, // Filled in later, if necessary
                needed: true,
            },
@@ -609,6 +624,7 @@ async fn calculate_logical_size(
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }

+#[cfg(test)]
 #[test]
 fn verify_size_for_multiple_branches() {
    // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
@@ -766,6 +782,7 @@ fn verify_size_for_multiple_branches() {
    assert_eq!(inputs.calculate(), 37_851_408);
 }

+#[cfg(test)]
 #[test]
 fn verify_size_for_one_branch() {
    let doc = r#"
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -13,13 +13,13 @@ pub mod merge_iterator;
 use std::cmp::Ordering;
 use std::collections::hash_map::Entry;
 use std::collections::{BinaryHeap, HashMap};
-use std::future::Future;
 use std::ops::Range;
 use std::pin::Pin;
 use std::sync::Arc;
 use std::sync::atomic::AtomicUsize;
 use std::time::{Duration, SystemTime, UNIX_EPOCH};

+use crate::PERF_TRACE_TARGET;
 pub use batch_split_writer::{BatchLayerWriter, SplitDeltaLayerWriter, SplitImageLayerWriter};
 use bytes::Bytes;
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
@@ -34,7 +34,7 @@ use pageserver_api::key::Key;
 use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::value::Value;
-use tracing::{Instrument, trace};
+use tracing::{Instrument, info_span, trace};
 use utils::lsn::Lsn;
 use utils::sync::gate::GateGuard;

@@ -43,7 +43,9 @@ use super::PageReconstructError;
 use super::layer_map::InMemoryLayerDesc;
 use super::timeline::{GetVectoredError, ReadPath};
 use crate::config::PageServerConf;
-use crate::context::{AccessStatsBehavior, RequestContext};
+use crate::context::{
+    AccessStatsBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
+};

 pub fn range_overlaps<T>(a: &Range<T>, b: &Range<T>) -> bool
 where
@@ -874,13 +876,37 @@ impl ReadableLayer {
    ) -> Result<(), GetVectoredError> {
        match self {
            ReadableLayer::PersistentLayer(layer) => {
+                let ctx = RequestContextBuilder::from(ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "PLAN_LAYER",
+                            layer = %layer
+                        )
+                    })
+                    .attached_child();
+
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx)
+                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                    .await
            }
            ReadableLayer::InMemoryLayer(layer) => {
+                let ctx = RequestContextBuilder::from(ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "PLAN_LAYER",
+                            layer = %layer
+                        )
+                    })
+                    .attached_child();
+
                layer
-                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, ctx)
+                    .get_values_reconstruct_data(keyspace, lsn_range, reconstruct_state, &ctx)
+                    .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                    .await
            }
        }
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -896,9 +896,9 @@ impl DeltaLayerInner {
    where
        Reader: BlockReader + Clone,
    {
-        let ctx = RequestContextBuilder::extend(ctx)
+        let ctx = RequestContextBuilder::from(ctx)
            .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-            .build();
+            .attached_child();

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
@@ -1105,9 +1105,9 @@ impl DeltaLayerInner {
                    all_keys.push(entry);
                    true
                },
-                &RequestContextBuilder::extend(ctx)
+                &RequestContextBuilder::from(ctx)
                    .page_content_kind(PageContentKind::DeltaLayerBtreeNode)
-                    .build(),
+                    .attached_child(),
            )
            .await?;
        if let Some(last) = all_keys.last_mut() {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -481,9 +481,9 @@ impl ImageLayerInner {
        let tree_reader =
            DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);

-        let ctx = RequestContextBuilder::extend(ctx)
+        let ctx = RequestContextBuilder::from(ctx)
            .page_content_kind(PageContentKind::ImageLayerBtreeNode)
-            .build();
+            .attached_child();

        for range in keyspace.ranges.iter() {
            let mut range_end_handled = false;
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -421,9 +421,9 @@ impl InMemoryLayer {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let ctx = RequestContextBuilder::extend(ctx)
+        let ctx = RequestContextBuilder::from(ctx)
            .page_content_kind(PageContentKind::InMemoryLayer)
-            .build();
+            .attached_child();

        let inner = self.inner.read().await;

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -3,12 +3,13 @@ use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
 use std::time::{Duration, SystemTime};

+use crate::PERF_TRACE_TARGET;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::HistoricLayerInfo;
 use pageserver_api::shard::{ShardIdentity, ShardIndex, TenantShardId};
-use tracing::Instrument;
+use tracing::{Instrument, info_span};
 use utils::generation::Generation;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -18,7 +19,7 @@ use super::delta_layer::{self};
 use super::image_layer::{self};
 use super::{
    AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName,
-    LayerVisibilityHint, PersistentLayerDesc, ValuesReconstructState,
+    LayerVisibilityHint, PerfInstrumentFutureExt, PersistentLayerDesc, ValuesReconstructState,
 };
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
@@ -324,16 +325,29 @@ impl Layer {
        reconstruct_data: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<(), GetVectoredError> {
-        let downloaded =
+        let downloaded = {
+            let ctx = RequestContextBuilder::from(ctx)
+                .perf_span(|crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "GET_LAYER",
+                    )
+                })
+                .attached_child();
+
            self.0
-                .get_or_maybe_download(true, ctx)
+                .get_or_maybe_download(true, &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_context| crnt_perf_context.clone())
                .await
                .map_err(|err| match err {
                    DownloadError::TimelineShutdown | DownloadError::DownloadCancelled => {
                        GetVectoredError::Cancelled
                    }
                    other => GetVectoredError::Other(anyhow::anyhow!(other)),
-                })?;
+                })?
+        };
+
        let this = ResidentLayer {
            downloaded: downloaded.clone(),
            owner: self.clone(),
@@ -341,9 +355,20 @@ impl Layer {

        self.record_access(ctx);

+        let ctx = RequestContextBuilder::from(ctx)
+            .perf_span(|crnt_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: crnt_perf_span,
+                    "VISIT_LAYER",
+                )
+            })
+            .attached_child();
+
        downloaded
-            .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, ctx)
+            .get_values_reconstruct_data(this, keyspace, lsn_range, reconstruct_data, &ctx)
            .instrument(tracing::debug_span!("get_values_reconstruct_data", layer=%self))
+            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
            .await
            .map_err(|err| match err {
                GetVectoredError::Other(err) => GetVectoredError::Other(
@@ -950,6 +975,10 @@ impl LayerInner {
        allow_download: bool,
        ctx: &RequestContext,
    ) -> Result<Arc<DownloadedLayer>, DownloadError> {
+        let mut wait_for_download_recorder =
+            scopeguard::guard(utils::elapsed_accum::ElapsedAccum::default(), |accum| {
+                ctx.ondemand_download_wait_observe(accum.get());
+            });
        let (weak, permit) = {
            // get_or_init_detached can:
            // - be fast (mutex lock) OR uncontested semaphore permit acquire
@@ -958,7 +987,7 @@ impl LayerInner {

            let locked = self
                .inner
-                .get_or_init_detached()
+                .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
                .await
                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));

@@ -988,6 +1017,7 @@ impl LayerInner {
                Err(permit) => (None, permit),
            }
        };
+        let _guard = wait_for_download_recorder.guard();

        if let Some(weak) = weak {
            // only drop the weak after dropping the heavier_once_cell guard
@@ -1045,15 +1075,34 @@ impl LayerInner {
            return Err(DownloadError::DownloadRequired);
        }

-        let download_ctx = ctx.detached_child(TaskKind::LayerDownload, DownloadBehavior::Download);
+        let ctx = if ctx.has_perf_span() {
+            let dl_ctx = RequestContextBuilder::from(ctx)
+                .task_kind(TaskKind::LayerDownload)
+                .download_behavior(DownloadBehavior::Download)
+                .root_perf_span(|| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        "DOWNLOAD_LAYER",
+                        layer = %self,
+                        reason = %reason
+                    )
+                })
+                .detached_child();
+            ctx.perf_follows_from(&dl_ctx);
+            dl_ctx
+        } else {
+            ctx.attached_child()
+        };

        async move {
            tracing::info!(%reason, "downloading on-demand");

            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
            let res = self
-                .download_init_and_wait(timeline, permit, download_ctx)
+                .download_init_and_wait(timeline, permit, ctx.attached_child())
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await?;
+
            scopeguard::ScopeGuard::into_inner(init_cancelled);
            Ok(res)
        }
@@ -1158,6 +1207,7 @@ impl LayerInner {
        permit: heavier_once_cell::InitPermit,
        ctx: &RequestContext,
    ) -> Result<Arc<DownloadedLayer>, remote_storage::DownloadError> {
+        let start = std::time::Instant::now();
        let result = timeline
            .remote_client
            .download_layer_file(
@@ -1169,7 +1219,8 @@ impl LayerInner {
                ctx,
            )
            .await;
-
+        let latency = start.elapsed();
+        let latency_millis = u64::try_from(latency.as_millis()).unwrap();
        match result {
            Ok(size) => {
                assert_eq!(size, self.desc.file_size);
@@ -1185,9 +1236,8 @@ impl LayerInner {
                    Err(e) => {
                        panic!("post-condition failed: needs_download errored: {e:?}");
                    }
-                }
-
-                tracing::info!(size=%self.desc.file_size, "on-demand download successful");
+                };
+                tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
                timeline
                    .metrics
                    .resident_physical_size_add(self.desc.file_size);
@@ -1216,7 +1266,7 @@ impl LayerInner {
                    return Err(e);
                }

-                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+                tracing::error!(consecutive_failures, %latency_millis, "layer file download failed: {e:#}");

                let backoff = utils::backoff::exponential_backoff_duration_seconds(
                    consecutive_failures.min(u32::MAX as usize) as u32,
@@ -1720,9 +1770,9 @@ impl DownloadedLayer {
            );

            let res = if owner.desc.is_delta {
-                let ctx = RequestContextBuilder::extend(ctx)
+                let ctx = RequestContextBuilder::from(ctx)
                    .page_content_kind(crate::context::PageContentKind::DeltaLayerSummary)
-                    .build();
+                    .attached_child();
                let summary = Some(delta_layer::Summary::expected(
                    owner.desc.tenant_shard_id.tenant_id,
                    owner.desc.timeline_id,
@@ -1738,9 +1788,9 @@ impl DownloadedLayer {
                .await
                .map(LayerKind::Delta)
            } else {
-                let ctx = RequestContextBuilder::extend(ctx)
+                let ctx = RequestContextBuilder::from(ctx)
                    .page_content_kind(crate::context::PageContentKind::ImageLayerSummary)
-                    .build();
+                    .attached_child();
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
                    owner.desc.tenant_shard_id.tenant_id,
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -119,6 +119,10 @@ async fn smoke_test() {
    let e = layer.evict_and_wait(FOREVER).await.unwrap_err();
    assert!(matches!(e, EvictionError::NotFound));

+    let dl_ctx = RequestContextBuilder::from(ctx)
+        .download_behavior(DownloadBehavior::Download)
+        .attached_child();
+
    // on accesses when the layer is evicted, it will automatically be downloaded.
    let img_after = {
        let mut data = ValuesReconstructState::new(io_concurrency.clone());
@@ -127,7 +131,7 @@ async fn smoke_test() {
                controlfile_keyspace.clone(),
                Lsn(0x10)..Lsn(0x11),
                &mut data,
-                ctx,
+                &dl_ctx,
            )
            .instrument(download_span.clone())
            .await
@@ -177,7 +181,7 @@ async fn smoke_test() {

    // plain downloading is rarely needed
    layer
-        .download_and_keep_resident(ctx)
+        .download_and_keep_resident(&dl_ctx)
        .instrument(download_span)
        .await
        .unwrap();
@@ -645,9 +649,10 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
    let ctx = ctx.with_scope_timeline(&timeline);

    // This test does downloads
-    let ctx = RequestContextBuilder::extend(&ctx)
+    let ctx = RequestContextBuilder::from(&ctx)
        .download_behavior(DownloadBehavior::Download)
-        .build();
+        .attached_child();
+
    let layer = {
        let mut layers = {
            let layers = timeline.layers.read().await;
@@ -730,9 +735,9 @@ async fn evict_and_wait_does_not_wait_for_download() {
    let ctx = ctx.with_scope_timeline(&timeline);

    // This test does downloads
-    let ctx = RequestContextBuilder::extend(&ctx)
+    let ctx = RequestContextBuilder::from(&ctx)
        .download_behavior(DownloadBehavior::Download)
-        .build();
+        .attached_child();

    let layer = {
        let mut layers = {
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -268,7 +268,12 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                error_run += 1;
                let backoff =
                    exponential_backoff_duration(error_run, BASE_BACKOFF_SECS, MAX_BACKOFF_SECS);
-                log_compaction_error(&err, Some((error_run, backoff)), cancel.is_cancelled());
+                log_compaction_error(
+                    &err,
+                    Some((error_run, backoff)),
+                    cancel.is_cancelled(),
+                    false,
+                );
                continue;
            }
        }
@@ -285,6 +290,7 @@ pub(crate) fn log_compaction_error(
    err: &CompactionError,
    retry_info: Option<(u32, Duration)>,
    task_cancelled: bool,
+    degrade_to_warning: bool,
 ) {
    use CompactionError::*;

@@ -333,6 +339,7 @@ pub(crate) fn log_compaction_error(
        }
    } else {
        match level {
+            Level::ERROR if degrade_to_warning => warn!("Compaction failed and discarded: {err:#}"),
            Level::ERROR => error!("Compaction failed: {err:#}"),
            Level::INFO => info!("Compaction failed: {err:#}"),
            level => unimplemented!("unexpected level {level:?}"),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -23,6 +23,7 @@ use std::sync::atomic::{AtomicBool, AtomicU64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, OnceLock, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

+use crate::PERF_TRACE_TARGET;
 use anyhow::{Context, Result, anyhow, bail, ensure};
 use arc_swap::{ArcSwap, ArcSwapOption};
 use bytes::Bytes;
@@ -84,8 +85,8 @@ use self::eviction_task::EvictionTaskTimelineState;
 use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
+use super::remote_timeline_client::RemoteTimelineClient;
 use super::remote_timeline_client::index::{GcCompactionState, IndexPart};
-use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError};
 use super::secondary::heatmap::HeatMapLayer;
 use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
@@ -96,7 +97,9 @@ use super::{
 };
 use crate::aux_file::AuxFileSizeEstimator;
 use crate::config::PageServerConf;
-use crate::context::{DownloadBehavior, RequestContext};
+use crate::context::{
+    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
+};
 use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
@@ -870,9 +873,14 @@ pub(crate) enum CompactFlags {
    OnlyL0Compaction,
    EnhancedGcBottomMostCompaction,
    DryRun,
-    /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting
-    /// compaction via HTTP API.
-    NoYield,
+    /// Makes image compaction yield if there's pending L0 compaction. This should always be used in
+    /// the background compaction task, since we want to aggressively compact down L0 to bound
+    /// read amplification.
+    ///
+    /// It only makes sense to use this when `compaction_l0_first` is enabled (such that we yield to
+    /// an L0 compaction pass), and without `OnlyL0Compaction` (L0 compaction shouldn't yield for L0
+    /// compaction).
+    YieldForL0,
 }

 #[serde_with::serde_as]
@@ -890,6 +898,12 @@ pub(crate) struct CompactRequest {
    pub sub_compaction_max_job_size_mb: Option<u64>,
 }

+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct MarkInvisibleRequest {
+    #[serde(default)]
+    pub is_visible: Option<bool>,
+}
+
 #[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
@@ -1278,9 +1292,22 @@ impl Timeline {
        };
        reconstruct_state.read_path = read_path;

-        let traversal_res: Result<(), _> = self
-            .get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, ctx)
-            .await;
+        let traversal_res: Result<(), _> = {
+            let ctx = RequestContextBuilder::from(ctx)
+                .perf_span(|crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "PLAN_IO",
+                    )
+                })
+                .attached_child();
+
+            self.get_vectored_reconstruct_data(keyspace.clone(), lsn, reconstruct_state, &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
+                .await
+        };
+
        if let Err(err) = traversal_res {
            // Wait for all the spawned IOs to complete.
            // See comments on `spawn_io` inside `storage_layer` for more details.
@@ -1294,14 +1321,46 @@ impl Timeline {

        let layers_visited = reconstruct_state.get_layers_visited();

+        let ctx = RequestContextBuilder::from(ctx)
+            .perf_span(|crnt_perf_span| {
+                info_span!(
+                    target: PERF_TRACE_TARGET,
+                    parent: crnt_perf_span,
+                    "RECONSTRUCT",
+                )
+            })
+            .attached_child();
+
        let futs = FuturesUnordered::new();
        for (key, state) in std::mem::take(&mut reconstruct_state.keys) {
            futs.push({
                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                let ctx = RequestContextBuilder::from(&ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "RECONSTRUCT_KEY",
+                            key = %key,
+                        )
+                    })
+                    .attached_child();
+
                async move {
                    assert_eq!(state.situation, ValueReconstructSituation::Complete);

-                    let converted = match state.collect_pending_ios().await {
+                    let res = state
+                        .collect_pending_ios()
+                        .maybe_perf_instrument(&ctx, |crnt_perf_span| {
+                            info_span!(
+                                target: PERF_TRACE_TARGET,
+                                parent: crnt_perf_span,
+                                "WAIT_FOR_IO_COMPLETIONS",
+                            )
+                        })
+                        .await;
+
+                    let converted = match res {
                        Ok(ok) => ok,
                        Err(err) => {
                            return (key, Err(err));
@@ -1318,16 +1377,27 @@ impl Timeline {
                        "{converted:?}"
                    );

-                    (
-                        key,
-                        walredo_self.reconstruct_value(key, lsn, converted).await,
-                    )
+                    let walredo_deltas = converted.num_deltas();
+                    let walredo_res = walredo_self
+                        .reconstruct_value(key, lsn, converted)
+                        .maybe_perf_instrument(&ctx, |crnt_perf_span| {
+                            info_span!(
+                                target: PERF_TRACE_TARGET,
+                                parent: crnt_perf_span,
+                                "WALREDO",
+                                deltas = %walredo_deltas,
+                            )
+                        })
+                        .await;
+
+                    (key, walredo_res)
                }
            });
        }

        let results = futs
            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
            .await;

        // For aux file keys (v1 or v2) the vectored read path does not return an error
@@ -1870,7 +1940,7 @@ impl Timeline {
            )
            .await;
        if let Err(err) = &res {
-            log_compaction_error(err, None, cancel.is_cancelled());
+            log_compaction_error(err, None, cancel.is_cancelled(), false);
        }
        res
    }
@@ -1891,18 +1961,19 @@ impl Timeline {
        // out by other background tasks (including image compaction). We request this via
        // `BackgroundLoopKind::L0Compaction`.
        //
-        // If this is a regular compaction pass, and L0-only compaction is enabled in the config,
-        // then we should yield for immediate L0 compaction if necessary while we're waiting for the
-        // background task semaphore. There's no point yielding otherwise, since we'd just end up
-        // right back here.
+        // Yield for pending L0 compaction while waiting for the semaphore.
        let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction);
        let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() {
            true => BackgroundLoopKind::L0Compaction,
            false => BackgroundLoopKind::Compaction,
        };
-        let yield_for_l0 = !is_l0_only
-            && self.get_compaction_l0_first()
-            && !options.flags.contains(CompactFlags::NoYield);
+        let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0);
+        if yield_for_l0 {
+            // If this is an L0 pass, it doesn't make sense to yield for L0.
+            debug_assert!(!is_l0_only, "YieldForL0 during L0 pass");
+            // If `compaction_l0_first` is disabled, there's no point yielding.
+            debug_assert!(self.get_compaction_l0_first(), "YieldForL0 without L0 pass");
+        }

        let acquire = async move {
            let guard = self.compaction_lock.lock().await;
@@ -2209,6 +2280,10 @@ impl Timeline {
        self.remote_client.is_archived()
    }

+    pub(crate) fn is_invisible(&self) -> Option<bool> {
+        self.remote_client.is_invisible()
+    }
+
    pub(crate) fn is_stopping(&self) -> bool {
        self.current_state() == TimelineState::Stopping
    }
@@ -2231,7 +2306,7 @@ impl Timeline {
                        .await
                        .expect("holding a reference to self");
                }
-                TimelineState::Active { .. } => {
+                TimelineState::Active => {
                    return Ok(());
                }
                TimelineState::Broken { .. } | TimelineState::Stopping => {
@@ -2401,6 +2476,31 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
    }

+    /// Checks if a get page request should get perf tracing
+    ///
+    /// The configuration priority is: tenant config override, default tenant config,
+    /// pageserver config.
+    pub(crate) fn is_get_page_request_sampled(&self) -> bool {
+        let tenant_conf = self.tenant_conf.load();
+        let ratio = tenant_conf
+            .tenant_conf
+            .sampling_ratio
+            .flatten()
+            .or(self.conf.default_tenant_conf.sampling_ratio)
+            .or(self.conf.tracing.as_ref().map(|t| t.sampling_ratio));
+
+        match ratio {
+            Some(r) => {
+                if r.numerator == 0 {
+                    false
+                } else {
+                    rand::thread_rng().gen_range(0..r.denominator) < r.numerator
+                }
+            }
+            None => false,
+        }
+    }
+
    fn get_checkpoint_distance(&self) -> u64 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2562,14 +2662,6 @@ impl Timeline {
        Some(max(l0_flush_stall_threshold, compaction_threshold))
    }

-    fn get_l0_flush_wait_upload(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .l0_flush_wait_upload
-            .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload)
-    }
-
    fn get_image_creation_threshold(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -3867,15 +3959,30 @@ impl Timeline {
            let TimelineVisitOutcome {
                completed_keyspace: completed,
                image_covered_keyspace,
-            } = Self::get_vectored_reconstruct_data_timeline(
-                timeline,
-                keyspace.clone(),
-                cont_lsn,
-                reconstruct_state,
-                &self.cancel,
-                ctx,
-            )
-            .await?;
+            } = {
+                let ctx = RequestContextBuilder::from(ctx)
+                    .perf_span(|crnt_perf_span| {
+                        info_span!(
+                            target: PERF_TRACE_TARGET,
+                            parent: crnt_perf_span,
+                            "PLAN_IO_TIMELINE",
+                            timeline = %timeline.timeline_id,
+                            lsn = %cont_lsn,
+                        )
+                    })
+                    .attached_child();
+
+                Self::get_vectored_reconstruct_data_timeline(
+                    timeline,
+                    keyspace.clone(),
+                    cont_lsn,
+                    reconstruct_state,
+                    &self.cancel,
+                    &ctx,
+                )
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
+                .await?
+            };

            keyspace.remove_overlapping_with(&completed);

@@ -3919,8 +4026,24 @@ impl Timeline {

            // Take the min to avoid reconstructing a page with data newer than request Lsn.
            cont_lsn = std::cmp::min(Lsn(request_lsn.0 + 1), Lsn(timeline.ancestor_lsn.0 + 1));
+
+            let ctx = RequestContextBuilder::from(ctx)
+                .perf_span(|crnt_perf_span| {
+                    info_span!(
+                        target: PERF_TRACE_TARGET,
+                        parent: crnt_perf_span,
+                        "GET_ANCESTOR",
+                        timeline = %timeline.timeline_id,
+                        lsn = %cont_lsn,
+                        ancestor = %ancestor_timeline.timeline_id,
+                        ancestor_lsn = %timeline.ancestor_lsn
+                    )
+                })
+                .attached_child();
+
            timeline_owned = timeline
-                .get_ready_ancestor_timeline(ancestor_timeline, ctx)
+                .get_ready_ancestor_timeline(ancestor_timeline, &ctx)
+                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await?;
            timeline = &*timeline_owned;
        };
@@ -4591,27 +4714,6 @@ impl Timeline {
            // release lock on 'layers'
        };

-        // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
-        // This makes us refuse ingest until the new layers have been persisted to the remote
-        // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead.
-        if self.get_l0_flush_wait_upload() {
-            let start = Instant::now();
-            self.remote_client
-                .wait_completion()
-                .await
-                .map_err(|e| match e {
-                    WaitCompletionError::UploadQueueShutDownOrStopped
-                    | WaitCompletionError::NotInitialized(
-                        NotInitialized::ShuttingDown | NotInitialized::Stopped,
-                    ) => FlushLayerError::Cancelled,
-                    WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
-                        FlushLayerError::Other(anyhow!(e).into())
-                    }
-                })?;
-            let duration = start.elapsed().as_secs_f64();
-            self.metrics.flush_wait_upload_time_gauge_add(duration);
-        }
-
        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
        // a compaction can delete the file and then it won't be available for uploads any more.
        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
@@ -6251,10 +6353,33 @@ impl Timeline {

    /// Reconstruct a value, using the given base image and WAL records in 'data'.
    async fn reconstruct_value(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        data: ValueReconstructState,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.reconstruct_value_inner(key, request_lsn, data, false)
+            .await
+    }
+
+    /// Reconstruct a value, using the given base image and WAL records in 'data'. It does not fire critical errors because
+    /// sometimes it is expected to fail due to unreplayable history described in <https://github.com/neondatabase/neon/issues/10395>.
+    async fn reconstruct_value_wo_critical_error(
+        &self,
+        key: Key,
+        request_lsn: Lsn,
+        data: ValueReconstructState,
+    ) -> Result<Bytes, PageReconstructError> {
+        self.reconstruct_value_inner(key, request_lsn, data, true)
+            .await
+    }
+
+    async fn reconstruct_value_inner(
        &self,
        key: Key,
        request_lsn: Lsn,
        mut data: ValueReconstructState,
+        no_critical_error: bool,
    ) -> Result<Bytes, PageReconstructError> {
        // Perform WAL redo if needed
        data.records.reverse();
@@ -6311,7 +6436,9 @@ impl Timeline {
                    Ok(img) => img,
                    Err(walredo::Error::Cancelled) => return Err(PageReconstructError::Cancelled),
                    Err(walredo::Error::Other(err)) => {
-                        critical!("walredo failure during page reconstruction: {err:?}");
+                        if !no_critical_error {
+                            critical!("walredo failure during page reconstruction: {err:?}");
+                        }
                        return Err(PageReconstructError::WalRedo(
                            err.context("reconstruct a page image"),
                        ));
@@ -7272,9 +7399,9 @@ mod tests {

            eprintln!("Downloading {layer} and re-generating heatmap");

-            let ctx = &RequestContextBuilder::extend(ctx)
+            let ctx = &RequestContextBuilder::from(ctx)
                .download_behavior(crate::context::DownloadBehavior::Download)
-                .build();
+                .attached_child();

            let _resident = layer
                .download_and_keep_resident(ctx)
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -26,7 +26,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_CHECKPOINT_DISTANCE;
 use pageserver_api::key::{KEY_SIZE, Key};
 use pageserver_api::keyspace::{KeySpace, ShardedRange};
-use pageserver_api::models::CompactInfoResponse;
+use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
 use pageserver_api::record::NeonWalRecord;
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use pageserver_api::value::Value;
@@ -61,7 +61,7 @@ use crate::tenant::timeline::{
    DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
    ResidentLayer, drop_rlock,
 };
-use crate::tenant::{DeltaLayer, MaybeOffloaded, gc_block};
+use crate::tenant::{DeltaLayer, MaybeOffloaded};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
@@ -123,7 +123,6 @@ impl GcCompactionQueueItem {
 #[derive(Default)]
 struct GcCompactionGuardItems {
    notify: Option<tokio::sync::oneshot::Sender<()>>,
-    gc_guard: Option<gc_block::Guard>,
    permit: Option<OwnedSemaphorePermit>,
 }

@@ -279,7 +278,7 @@ impl GcCompactionQueue {
            gc_compaction_ratio_percent: u64,
        ) -> bool {
            const AUTO_TRIGGER_LIMIT: u64 = 150 * 1024 * 1024 * 1024; // 150GB
-            if l1_size >= AUTO_TRIGGER_LIMIT || l2_size >= AUTO_TRIGGER_LIMIT {
+            if l1_size + l2_size >= AUTO_TRIGGER_LIMIT {
                // Do not auto-trigger when physical size >= 150GB
                return false;
            }
@@ -319,7 +318,12 @@ impl GcCompactionQueue {
                        flags
                    },
                    sub_compaction: true,
-                    compact_key_range: None,
+                    // Only auto-trigger gc-compaction over the data keyspace due to concerns in
+                    // https://github.com/neondatabase/neon/issues/11318.
+                    compact_key_range: Some(CompactKeyRange {
+                        start: Key::MIN,
+                        end: Key::metadata_key_range().start,
+                    }),
                    compact_lsn_range: None,
                    sub_compaction_max_job_size_mb: None,
                },
@@ -343,44 +347,45 @@ impl GcCompactionQueue {
        info!("compaction job id={} finished", id);
        let mut guard = self.inner.lock().unwrap();
        if let Some(items) = guard.guards.remove(&id) {
-            drop(items.gc_guard);
            if let Some(tx) = items.notify {
                let _ = tx.send(());
            }
        }
    }

+    fn clear_running_job(&self) {
+        let mut guard = self.inner.lock().unwrap();
+        guard.running = None;
+    }
+
    async fn handle_sub_compaction(
        &self,
        id: GcCompactionJobId,
        options: CompactOptions,
        timeline: &Arc<Timeline>,
-        gc_block: &GcBlock,
        auto: bool,
    ) -> Result<(), CompactionError> {
        info!(
            "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
        );
-        let jobs = timeline
+        let res = timeline
            .gc_compaction_split_jobs(
                GcCompactJob::from_compact_options(options.clone()),
                options.sub_compaction_max_job_size_mb,
            )
-            .await?;
+            .await;
+        let jobs = match res {
+            Ok(jobs) => jobs,
+            Err(err) => {
+                warn!("cannot split gc-compaction jobs: {}, unblocked gc", err);
+                self.notify_and_unblock(id);
+                return Err(err);
+            }
+        };
        if jobs.is_empty() {
            info!("no jobs to run, skipping scheduled compaction task");
            self.notify_and_unblock(id);
        } else {
-            let gc_guard = match gc_block.start().await {
-                Ok(guard) => guard,
-                Err(e) => {
-                    return Err(CompactionError::Other(anyhow!(
-                        "cannot run gc-compaction because gc is blocked: {}",
-                        e
-                    )));
-                }
-            };
-
            let jobs_len = jobs.len();
            let mut pending_tasks = Vec::new();
            // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate.
@@ -394,8 +399,8 @@ impl GcCompactionQueue {
                if job.dry_run {
                    flags |= CompactFlags::DryRun;
                }
-                if options.flags.contains(CompactFlags::NoYield) {
-                    flags |= CompactFlags::NoYield;
+                if options.flags.contains(CompactFlags::YieldForL0) {
+                    flags |= CompactFlags::YieldForL0;
                }
                let options = CompactOptions {
                    flags,
@@ -415,7 +420,6 @@ impl GcCompactionQueue {

            {
                let mut guard = self.inner.lock().unwrap();
-                guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
                let mut tasks = Vec::new();
                for task in pending_tasks {
                    let id = guard.next_id();
@@ -444,9 +448,20 @@ impl GcCompactionQueue {
    ) -> Result<CompactionOutcome, CompactionError> {
        let res = self.iteration_inner(cancel, ctx, gc_block, timeline).await;
        if let Err(err) = &res {
-            log_compaction_error(err, None, cancel.is_cancelled());
+            log_compaction_error(err, None, cancel.is_cancelled(), true);
+        }
+        match res {
+            Ok(res) => Ok(res),
+            Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
+            Err(_) => {
+                // There are some cases where traditional gc might collect some layer
+                // files, leaving gc-compaction unable to read the full history of a key.
+                // This needs to be resolved in the long term by improving the compaction
+                // process. For now, simply avoid letting such errors trigger the
+                // circuit breaker.
+                Ok(CompactionOutcome::Skipped)
+            }
        }
-        res
    }

    async fn iteration_inner(
@@ -494,27 +509,32 @@ impl GcCompactionQueue {
                    info!(
                        "running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
                    );
-                    self.handle_sub_compaction(id, options, timeline, gc_block, auto)
+                    self.handle_sub_compaction(id, options, timeline, auto)
                        .await?;
                } else {
                    // Auto compaction always enables sub-compaction so we don't need to handle update_l2_lsn
                    // in this branch.
-                    let gc_guard = match gc_block.start().await {
+                    let _gc_guard = match gc_block.start().await {
                        Ok(guard) => guard,
                        Err(e) => {
+                            self.notify_and_unblock(id);
+                            self.clear_running_job();
                            return Err(CompactionError::Other(anyhow!(
                                "cannot run gc-compaction because gc is blocked: {}",
                                e
                            )));
                        }
                    };
-                    {
-                        let mut guard = self.inner.lock().unwrap();
-                        guard.guards.entry(id).or_default().gc_guard = Some(gc_guard);
-                    }
-                    let compaction_result =
-                        timeline.compact_with_options(cancel, options, ctx).await?;
-                    self.notify_and_unblock(id);
+                    let res = timeline.compact_with_options(cancel, options, ctx).await;
+                    let compaction_result = match res {
+                        Ok(res) => res,
+                        Err(err) => {
+                            warn!(%err, "failed to run gc-compaction");
+                            self.notify_and_unblock(id);
+                            self.clear_running_job();
+                            return Err(err);
+                        }
+                    };
                    if compaction_result == CompactionOutcome::YieldForL0 {
                        yield_for_l0 = true;
                    }
@@ -522,7 +542,25 @@ impl GcCompactionQueue {
            }
            GcCompactionQueueItem::SubCompactionJob(options) => {
                // TODO: error handling, clear the queue if any task fails?
-                let compaction_result = timeline.compact_with_options(cancel, options, ctx).await?;
+                let _gc_guard = match gc_block.start().await {
+                    Ok(guard) => guard,
+                    Err(e) => {
+                        self.clear_running_job();
+                        return Err(CompactionError::Other(anyhow!(
+                            "cannot run gc-compaction because gc is blocked: {}",
+                            e
+                        )));
+                    }
+                };
+                let res = timeline.compact_with_options(cancel, options, ctx).await;
+                let compaction_result = match res {
+                    Ok(res) => res,
+                    Err(err) => {
+                        warn!(%err, "failed to run gc-compaction subcompaction job");
+                        self.clear_running_job();
+                        return Err(err);
+                    }
+                };
                if compaction_result == CompactionOutcome::YieldForL0 {
                    // We will permenantly give up a task if we yield for L0 compaction: the preempted subcompaction job won't be running
                    // again. This ensures that we don't keep doing duplicated work within gc-compaction. Not directly returning here because
@@ -553,10 +591,7 @@ impl GcCompactionQueue {
                }
            }
        }
-        {
-            let mut guard = self.inner.lock().unwrap();
-            guard.running = None;
-        }
+        self.clear_running_job();
        Ok(if yield_for_l0 {
            tracing::info!("give up gc-compaction: yield for L0 compaction");
            CompactionOutcome::YieldForL0
@@ -983,7 +1018,7 @@ impl Timeline {

        // Yield if we have pending L0 compaction. The scheduler will do another pass.
        if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
-            && !options.flags.contains(CompactFlags::NoYield)
+            && options.flags.contains(CompactFlags::YieldForL0)
        {
            info!("image/ancestor compaction yielding for L0 compaction");
            return Ok(CompactionOutcome::YieldForL0);
@@ -1001,9 +1036,9 @@ impl Timeline {
        {
            Ok(((dense_partitioning, sparse_partitioning), lsn)) => {
                // Disables access_stats updates, so that the files we read remain candidates for eviction after we're done with them
-                let image_ctx = RequestContextBuilder::extend(ctx)
+                let image_ctx = RequestContextBuilder::from(ctx)
                    .access_stats_behavior(AccessStatsBehavior::Skip)
-                    .build();
+                    .attached_child();

                let mut partitioning = dense_partitioning;
                partitioning
@@ -1028,7 +1063,7 @@ impl Timeline {
                            .load()
                            .as_ref()
                            .clone(),
-                        !options.flags.contains(CompactFlags::NoYield),
+                        options.flags.contains(CompactFlags::YieldForL0),
                    )
                    .await
                    .inspect_err(|err| {
@@ -1209,6 +1244,10 @@ impl Timeline {
        let mut replace_image_layers = Vec::new();

        for layer in layers_to_rewrite {
+            if self.cancel.is_cancelled() {
+                return Err(CompactionError::ShuttingDown);
+            }
+
            tracing::info!(layer=%layer, "Rewriting layer after shard split...");
            let mut image_layer_writer = ImageLayerWriter::new(
                self.conf,
@@ -2371,7 +2410,9 @@ impl Timeline {
                } else {
                    lsn_split_points[i]
                };
-                let img = self.reconstruct_value(key, request_lsn, state).await?;
+                let img = self
+                    .reconstruct_value_wo_critical_error(key, request_lsn, state)
+                    .await?;
                Some((request_lsn, img))
            } else {
                None
@@ -2635,7 +2676,7 @@ impl Timeline {
    ) -> Result<CompactionOutcome, CompactionError> {
        let sub_compaction = options.sub_compaction;
        let job = GcCompactJob::from_compact_options(options.clone());
-        let no_yield = options.flags.contains(CompactFlags::NoYield);
+        let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0);
        if sub_compaction {
            info!(
                "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
@@ -2650,7 +2691,7 @@ impl Timeline {
                    idx + 1,
                    jobs_len
                );
-                self.compact_with_gc_inner(cancel, job, ctx, no_yield)
+                self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
                    .await?;
            }
            if jobs_len == 0 {
@@ -2658,7 +2699,8 @@ impl Timeline {
            }
            return Ok(CompactionOutcome::Done);
        }
-        self.compact_with_gc_inner(cancel, job, ctx, no_yield).await
+        self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
+            .await
    }

    async fn compact_with_gc_inner(
@@ -2666,7 +2708,7 @@ impl Timeline {
        cancel: &CancellationToken,
        job: GcCompactJob,
        ctx: &RequestContext,
-        no_yield: bool,
+        yield_for_l0: bool,
    ) -> Result<CompactionOutcome, CompactionError> {
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
@@ -2936,18 +2978,15 @@ impl Timeline {
            if cancel.is_cancelled() {
                return Err(CompactionError::ShuttingDown);
            }
-            if !no_yield {
-                let should_yield = self
+            let should_yield = yield_for_l0
+                && self
                    .l0_compaction_trigger
                    .notified()
                    .now_or_never()
                    .is_some();
-                if should_yield {
-                    tracing::info!(
-                        "preempt gc-compaction when downloading layers: too many L0 layers"
-                    );
-                    return Ok(CompactionOutcome::YieldForL0);
-                }
+            if should_yield {
+                tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
            }
            let resident_layer = layer
                .download_and_keep_resident(ctx)
@@ -3069,8 +3108,6 @@ impl Timeline {
        // the key and LSN range are determined. However, to keep things simple here, we still
        // create this writer, and discard the writer in the end.

-        let mut keys_processed = 0;
-
        while let Some(((key, lsn, val), desc)) = merge_iter
            .next_with_trace()
            .await
@@ -3081,21 +3118,15 @@ impl Timeline {
                return Err(CompactionError::ShuttingDown);
            }

-            if !no_yield {
-                keys_processed += 1;
-                if keys_processed % 1000 == 0 {
-                    let should_yield = self
-                        .l0_compaction_trigger
-                        .notified()
-                        .now_or_never()
-                        .is_some();
-                    if should_yield {
-                        tracing::info!(
-                            "preempt gc-compaction in the main loop: too many L0 layers"
-                        );
-                        return Ok(CompactionOutcome::YieldForL0);
-                    }
-                }
+            let should_yield = yield_for_l0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some();
+            if should_yield {
+                tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
            }
            if self.shard_identity.is_key_disposable(&key) {
                // If this shard does not need to store this key, simply skip it.
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -410,10 +410,13 @@ impl DeleteTimelineFlow {
        // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted.
        // However, we handle this case in tenant loading code so the next time we attach, the issue is
        // resolved.
-        tenant.store_tenant_manifest().await.map_err(|e| match e {
-            TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
-            _ => DeleteTimelineError::Other(e.into()),
-        })?;
+        tenant
+            .maybe_upload_tenant_manifest()
+            .await
+            .map_err(|err| match err {
+                TenantManifestError::Cancelled => DeleteTimelineError::Cancelled,
+                err => DeleteTimelineError::Other(err.into()),
+            })?;

        *guard = Self::Finished;

--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -2,10 +2,14 @@ use std::collections::HashSet;
 use std::sync::Arc;

 use anyhow::Context;
+use bytes::Bytes;
 use http_utils::error::ApiError;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::DetachBehavior;
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::shard::ShardIdentity;
+use pageserver_compaction::helpers::overlaps_with;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::Tenant;
 use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
 use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
+use crate::tenant::storage_layer::{
+    AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
+    ValuesReconstructState,
+};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 #[derive(Debug, thiserror::Error)]
@@ -170,6 +177,92 @@ impl Attempt {
    }
 }

+async fn generate_tombstone_image_layer(
+    detached: &Arc<Timeline>,
+    ancestor: &Arc<Timeline>,
+    ancestor_lsn: Lsn,
+    ctx: &RequestContext,
+) -> Result<Option<ResidentLayer>, Error> {
+    tracing::info!(
+        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
+    );
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        detached.conf,
+        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
+    );
+    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+    // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
+    // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
+    let key_range = Key::sparse_non_inherited_keyspace();
+    // avoid generating a "future layer" which will then be removed
+    let image_lsn = ancestor_lsn;
+
+    {
+        let layers = detached.layers.read().await;
+        for layer in layers.all_persistent_layers() {
+            if !layer.is_delta
+                && layer.lsn_range.start == image_lsn
+                && overlaps_with(&key_range, &layer.key_range)
+            {
+                tracing::warn!(
+                    layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
+                );
+                return Ok(None);
+            }
+        }
+    }
+
+    let data = ancestor
+        .get_vectored_impl(
+            KeySpace::single(key_range.clone()),
+            image_lsn,
+            &mut reconstruct_state,
+            ctx,
+        )
+        .await
+        .context("failed to retrieve aux keys")
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
+    if !data.is_empty() {
+        // TODO: is it possible that we can have an image at `image_lsn`? Unlikely because image layers are only generated
+        // upon compaction but theoretically possible.
+        let mut image_layer_writer = ImageLayerWriter::new(
+            detached.conf,
+            detached.timeline_id,
+            detached.tenant_shard_id,
+            &key_range,
+            image_lsn,
+            ctx,
+        )
+        .await
+        .context("failed to create image layer writer")
+        .map_err(Error::Prepare)?;
+        for key in data.keys() {
+            image_layer_writer
+                .put_image(*key, Bytes::new(), ctx)
+                .await
+                .context("failed to write key")
+                .map_err(|e| Error::launder(e, Error::Prepare))?;
+        }
+        let (desc, path) = image_layer_writer
+            .finish(ctx)
+            .await
+            .context("failed to finish image layer writer for removing the metadata keys")
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        detached
+            .remote_client
+            .upload_layer_file(&generated, &detached.cancel)
+            .await
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        tracing::info!(layer=%generated, "wrote image layer");
+        Ok(Some(generated))
+    } else {
+        tracing::info!("no aux keys found in ancestor");
+        Ok(None)
+    }
+}
+
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
    detached: &Arc<Timeline>,
@@ -235,7 +328,7 @@ pub(super) async fn prepare(
        return Err(NoAncestor);
    }

-    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn, behavior)?;

    if let DetachBehavior::MultiLevelAndNoReparent = behavior {
        // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline.
@@ -249,7 +342,13 @@ pub(super) async fn prepare(
            ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable
            ancestor = ancestor_of_ancestor;
            // TODO: do we still need to check if we don't want to reparent?
-            check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+            check_no_archived_children_of_ancestor(
+                tenant,
+                detached,
+                &ancestor,
+                ancestor_lsn,
+                behavior,
+            )?;
        }
    } else if ancestor.ancestor_timeline.is_some() {
        // non-technical requirement; we could flatten N ancestors just as easily but we chose
@@ -346,10 +445,16 @@ pub(super) async fn prepare(

    // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
    let mut new_layers: Vec<Layer> =
-        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
+        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
+
+    if let Some(tombstone_layer) =
+        generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
+    {
+        new_layers.push(tombstone_layer.into());
+    }

    {
-        tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
+        tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");

        let mut tasks = tokio::task::JoinSet::new();

@@ -1156,31 +1261,44 @@ fn check_no_archived_children_of_ancestor(
    detached: &Arc<Timeline>,
    ancestor: &Arc<Timeline>,
    ancestor_lsn: Lsn,
+    detach_behavior: DetachBehavior,
 ) -> Result<(), Error> {
-    let timelines = tenant.timelines.lock().unwrap();
-    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
-    for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
-        if timeline.is_archived() == Some(true) {
-            return Err(Error::Archived(timeline.timeline_id));
-        }
-    }
-    for timeline_offloaded in timelines_offloaded.values() {
-        if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
-            continue;
-        }
-        // This forbids the detach ancestor feature if flattened timelines are present,
-        // even if the ancestor_lsn is from after the branchpoint of the detached timeline.
-        // But as per current design, we don't record the ancestor_lsn of flattened timelines.
-        // This is a bit unfortunate, but as of writing this we don't support flattening
-        // anyway. Maybe we can evolve the data model in the future.
-        if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
-            let is_earlier = retain_lsn <= ancestor_lsn;
-            if !is_earlier {
-                continue;
+    match detach_behavior {
+        DetachBehavior::NoAncestorAndReparent => {
+            let timelines = tenant.timelines.lock().unwrap();
+            let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
+
+            for timeline in
+                reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn)
+            {
+                if timeline.is_archived() == Some(true) {
+                    return Err(Error::Archived(timeline.timeline_id));
+                }
+            }
+
+            for timeline_offloaded in timelines_offloaded.values() {
+                if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
+                    continue;
+                }
+                // This forbids the detach ancestor feature if flattened timelines are present,
+                // even if the ancestor_lsn is from after the branchpoint of the detached timeline.
+                // But as per current design, we don't record the ancestor_lsn of flattened timelines.
+                // This is a bit unfortunate, but as of writing this we don't support flattening
+                // anyway. Maybe we can evolve the data model in the future.
+                if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
+                    let is_earlier = retain_lsn <= ancestor_lsn;
+                    if !is_earlier {
+                        continue;
+                    }
+                }
+                return Err(Error::Archived(timeline_offloaded.timeline_id));
            }
        }
-        return Err(Error::Archived(timeline_offloaded.timeline_id));
+        DetachBehavior::MultiLevelAndNoReparent => {
+            // We don't need to check anything if the user requested to not reparent.
+        }
    }
+
    Ok(())
 }

--- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
@@ -32,9 +32,15 @@ impl Client {
        let Some(ref base_url) = conf.import_pgdata_upcall_api else {
            anyhow::bail!("import_pgdata_upcall_api is not configured")
        };
+        let mut http_client = reqwest::Client::builder();
+        for cert in &conf.ssl_ca_certs {
+            http_client = http_client.add_root_certificate(cert.clone());
+        }
+        let http_client = http_client.build()?;
+
        Ok(Self {
            base_url: base_url.to_string(),
-            client: reqwest::Client::new(),
+            client: http_client,
            cancel,
            authorization_header: conf
                .import_pgdata_upcall_api_token
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -111,7 +111,7 @@ pub(crate) async fn offload_timeline(
    // at the next restart attach it again.
    // For that to happen, we'd need to make the manifest reflect our *intended* state,
    // not our actual state of offloaded timelines.
-    tenant.store_tenant_manifest().await?;
+    tenant.maybe_upload_tenant_manifest().await?;

    tracing::info!("Timeline offload complete (remaining arc refcount: {remaining_refcount})");

--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -25,8 +25,8 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
    /// * `align` must be a power of two,
    ///
    /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
+    ///   must not overflow isize (i.e., the rounded value must be
+    ///   less than or equal to `isize::MAX`).
    pub fn with_capacity(capacity: usize) -> Self {
        AlignedBufferMut {
            raw: RawAlignedBuffer::with_capacity(capacity),
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
@@ -37,8 +37,8 @@ impl<const A: usize> RawAlignedBuffer<ConstAlign<A>> {
    /// * `align` must be a power of two,
    ///
    /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
+    ///   must not overflow isize (i.e., the rounded value must be
+    ///   less than or equal to `isize::MAX`).
    pub fn with_capacity(capacity: usize) -> Self {
        let align = ConstAlign::<A>;
        let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout");
--- a/pgxn/neon/bitmap.h
+++ b/pgxn/neon/bitmap.h
@@ -9,4 +9,4 @@
 #define BITMAP_SET(bm, bit) (bm)[(bit) >> 3] |= (1 << ((bit) & 7))
 #define BITMAP_CLR(bm, bit) (bm)[(bit) >> 3] &= ~(1 << ((bit) & 7))

-#endif //NEON_BITMAP_H
+#endif							/* NEON_BITMAP_H */
--- a/pgxn/neon/control_plane_connector.c
+++ b/pgxn/neon/control_plane_connector.c
@@ -13,9 +13,6 @@
 *        accumulate changes. On subtransaction commit, the top of the stack
 *        is merged with the table below it.
 *
- * IDENTIFICATION
- *	 contrib/neon/control_plane_connector.c
- *
 *-------------------------------------------------------------------------
 */

--- a/pgxn/neon/extension_server.c
+++ b/pgxn/neon/extension_server.c
@@ -3,9 +3,6 @@
 * extension_server.c
 *	  Request compute_ctl to download extension files.
 *
- * IDENTIFICATION
- *	 contrib/neon/extension_server.c
- *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"
--- a/pgxn/neon/extension_server.h
+++ b/pgxn/neon/extension_server.h
@@ -3,9 +3,6 @@
 * extension_server.h
 *	  Request compute_ctl to download extension files.
 *
- * IDENTIFICATION
- *	 contrib/neon/extension_server.h
- *
 *-------------------------------------------------------------------------
 */

--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -1,4 +1,4 @@
-/*
+/*-------------------------------------------------------------------------
 *
 * file_cache.c
 *
@@ -6,10 +6,6 @@
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- *
- * IDENTIFICATION
- *	  pgxn/neon/file_cache.c
- *
 *-------------------------------------------------------------------------
 */

@@ -647,18 +643,25 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return found;
 }

+#if PG_MAJORVERSION_NUM >= 16
+static PGIOAlignedBlock voidblock = {0};
+#else
+static PGAlignedBlock voidblock = {0};
+#endif
+#define SCRIBBLEPAGE (&voidblock.data)
+
 /*
 * Try to read pages from local cache.
 * Returns the number of pages read from the local cache, and sets bits in
- * 'read' for the pages which were read. This may scribble over buffers not
- * marked in 'read', so be careful with operation ordering.
+ * 'mask' for the pages which were read. This may scribble over buffers not
+ * marked in 'mask', so be careful with operation ordering.
 *
 * In case of error local file cache is disabled (lfc->limit is set to zero),
- * and -1 is returned. Note that 'read' and the buffers may be touched and in
- * an otherwise invalid state.
+ * and -1 is returned.
 *
- * If the mask argument is supplied, bits will be set at the offsets of pages
- * that were present and read from the LFC.
+ * If the mask argument is supplied, we'll only try to read those pages which
+ * don't have their bits set on entry. At exit, pages which were successfully
+ * read from LFC will have their bits set.
 */
 int
 lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
@@ -693,23 +696,43 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	while (nblocks > 0)
 	{
 		struct iovec iov[PG_IOV_MAX];
-		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
+		int8	chunk_mask[BLOCKS_PER_CHUNK / 8] = {0};
+		int		chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1));
 		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
 		int		iteration_hits = 0;
 		int		iteration_misses = 0;
 		uint64	io_time_us = 0;
-		int     n_blocks_to_read = 0;
+		int		n_blocks_to_read = 0;
+		int		iov_last_used = 0;
+		int		first_block_in_chunk_read = -1;
 		ConditionVariable* cv;

 		Assert(blocks_in_chunk > 0);

 		for (int i = 0; i < blocks_in_chunk; i++)
 		{
-			n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0);
-			iov[i].iov_base = buffers[buf_offset + i];
 			iov[i].iov_len = BLCKSZ;
-			BITMAP_CLR(mask,  buf_offset + i);
+			/* mask not set = we must do work */
+			if (!BITMAP_ISSET(mask, buf_offset + i))
+			{
+				iov[i].iov_base = buffers[buf_offset + i];
+				n_blocks_to_read++;
+				iov_last_used = i + 1;
+
+				if (first_block_in_chunk_read == -1)
+				{
+					first_block_in_chunk_read = i;
+				}
+			}
+			/* mask set = we must do no work */
+			else
+			{
+				/* don't scribble on pages we weren't requested to write to */
+				iov[i].iov_base = SCRIBBLEPAGE;
+			}
 		}
+
+		/* shortcut IO */
 		if (n_blocks_to_read == 0)
 		{
 			buf_offset += blocks_in_chunk;
@@ -718,6 +741,12 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			continue;
 		}

+		/*
+		 * The effective iov size must be >= the number of blocks we're about
+		 * to read.
+		 */
+		Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read);
+
 		tag.blockNum = blkno - chunk_offs;
 		hash = get_hash_value(lfc_hash, &tag);
 		cv = &lfc_ctl->cv[hash % N_COND_VARS];
@@ -762,10 +791,15 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		generation = lfc_ctl->generation;
 		entry_offset = entry->offset;

-		for (int i = 0; i < blocks_in_chunk; i++)
+		for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
 		{
 			FileCacheBlockState state = UNAVAILABLE;
 			bool sleeping = false;
+
+			/* no need to work on something we're not interested in */
+			if (BITMAP_ISSET(mask, buf_offset + i))
+				continue;
+
 			while (lfc_ctl->generation == generation)
 			{
 				state = GET_STATE(entry, chunk_offs + i);
@@ -789,7 +823,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			}
 			if (state == AVAILABLE)
 			{
-				BITMAP_SET(mask, buf_offset + i);
+				BITMAP_SET(chunk_mask, i);
 				iteration_hits++;
 			}
 			else
@@ -801,16 +835,34 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		if (iteration_hits != 0)
 		{
+			/* chunk offset (# of pages) into the LFC file */
+			off_t	first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK;
+			int		nwrite = iov_last_used - first_block_in_chunk_read;
+			/* offset of first IOV */
+			first_read_offset += chunk_offs + first_block_in_chunk_read;
+
 			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
-			rc = preadv(lfc_desc, iov, blocks_in_chunk,
-						((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
+
+			/* Read only the span from the first to the last block we're interested in */
+			rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
+						nwrite, first_read_offset * BLCKSZ);
 			pgstat_report_wait_end();

-			if (rc != (BLCKSZ * blocks_in_chunk))
+			if (rc != (BLCKSZ * nwrite))
 			{
 				lfc_disable("read");
 				return -1;
 			}
+
+			/*
+			 * We successfully read the pages we know were valid when we
+			 * started reading; now mark those pages as read
+			 */
+			for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+			{
+				if (BITMAP_ISSET(chunk_mask, i))
+					BITMAP_SET(mask, buf_offset + i);
+			}
 		}

 		/* Place entry to the head of LRU list */
@@ -1511,8 +1563,12 @@ local_cache_pages(PG_FUNCTION_ARGS)
 				hash_seq_init(&status, lfc_hash);
 				while ((entry = hash_seq_search(&status)) != NULL)
 				{
-					for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
-						n_pages += GET_STATE(entry, i) == AVAILABLE;
+					/* Skip hole tags */
+					if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
+					{
+						for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
+							n_pages += GET_STATE(entry, i) == AVAILABLE;
+					}
 				}
 			}
 		}
@@ -1540,16 +1596,19 @@ local_cache_pages(PG_FUNCTION_ARGS)
 			{
 				for (int i = 0; i < BLOCKS_PER_CHUNK; i++)
 				{
-					if (GET_STATE(entry, i) == AVAILABLE)
+					if (NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key)) != 0)
 					{
-						fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
-						fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
-						fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
-						fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
-						fctx->record[n].forknum = entry->key.forkNum;
-						fctx->record[n].blocknum = entry->key.blockNum + i;
-						fctx->record[n].accesscount = entry->access_count;
-						n += 1;
+						if (GET_STATE(entry, i) == AVAILABLE)
+						{
+							fctx->record[n].pageoffs = entry->offset * BLOCKS_PER_CHUNK + i;
+							fctx->record[n].relfilenode = NInfoGetRelNumber(BufTagGetNRelFileInfo(entry->key));
+							fctx->record[n].reltablespace = NInfoGetSpcOid(BufTagGetNRelFileInfo(entry->key));
+							fctx->record[n].reldatabase = NInfoGetDbOid(BufTagGetNRelFileInfo(entry->key));
+							fctx->record[n].forknum = entry->key.forkNum;
+							fctx->record[n].blocknum = entry->key.blockNum + i;
+							fctx->record[n].accesscount = entry->access_count;
+							n += 1;
+						}
 					}
 				}
 			}
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -6,10 +6,6 @@
 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
- *
- * IDENTIFICATION
- *	 contrib/neon/libpqpagestore.c
- *
 *-------------------------------------------------------------------------
 */
 #include "postgres.h"
@@ -34,6 +30,7 @@
 #include "storage/lwlock.h"
 #include "storage/pg_shmem.h"
 #include "utils/guc.h"
+#include "utils/memutils.h"

 #include "neon.h"
 #include "neon_perf_counters.h"
@@ -1142,37 +1139,23 @@ pageserver_try_receive(shardno_t shard_no)
 	NeonResponse *resp;
 	PageServer *shard = &page_servers[shard_no];
 	PGconn	   *pageserver_conn = shard->conn;
-	/* read response */
-	int			rc;
+	int	rc;

 	if (shard->state != PS_Connected)
 		return NULL;

 	Assert(pageserver_conn);

-	while (true)
+	rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */);
+	if (rc == 0)
 	{
-		if (PQisBusy(shard->conn))
+		if (!PQconsumeInput(shard->conn))
 		{
-			WaitEvent	event;
-			if (WaitEventSetWait(shard->wes_read, 0, &event, 1,
-								 WAIT_EVENT_NEON_PS_READ) != 1
-				|| (event.events & WL_SOCKET_READABLE) == 0)
-			{
-				return NULL;
-			}
+			return NULL;
 		}
 		rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async */);
-		if (rc == 0)
-		{
-			if (!PQconsumeInput(shard->conn))
-			{
-				return NULL;
-			}
-		}
-		else
-			break;
 	}
+
 	if (rc == 0)
 		return NULL;
 	else if (rc > 0)
--- a/pgxn/neon/logical_replication_monitor.c
+++ b/pgxn/neon/logical_replication_monitor.c
@@ -1,11 +1,11 @@
+#include "postgres.h"
+
 #include <dirent.h>
 #include <limits.h>
 #include <string.h>
 #include <signal.h>
 #include <sys/stat.h>

-#include "postgres.h"
-
 #include "miscadmin.h"
 #include "postmaster/bgworker.h"
 #include "postmaster/interrupt.h"
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -1,10 +1,7 @@
 /*-------------------------------------------------------------------------
 *
 * neon.c
- *	  Utility functions to expose neon specific information to user
- *
- * IDENTIFICATION
- *	 contrib/neon/neon.c
+ *	  Main entry point into the neon extension
 *
 *-------------------------------------------------------------------------
 */
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -3,15 +3,13 @@
 * neon.h
 *	  Functions used in the initialization of this extension.
 *
- * IDENTIFICATION
- *	 contrib/neon/neon.h
- *
 *-------------------------------------------------------------------------
 */

 #ifndef NEON_H
 #define NEON_H
-#include "access/xlogreader.h"
+
+#include "access/xlogdefs.h"
 #include "utils/wait_event.h"

 /* GUCs */
@@ -58,8 +56,8 @@ extern void SetNeonCurrentClusterSize(uint64 size);
 extern uint64 GetNeonCurrentClusterSize(void);
 extern void replication_feedback_get_lsns(XLogRecPtr *writeLsn, XLogRecPtr *flushLsn, XLogRecPtr *applyLsn);

-extern void PGDLLEXPORT WalProposerSync(int argc, char *argv[]);
-extern void PGDLLEXPORT WalProposerMain(Datum main_arg);
-PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
+extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]);
+extern PGDLLEXPORT void WalProposerMain(Datum main_arg);
+extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);

 #endif							/* NEON_H */
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages';`