tests: reproducer for https://github.com/neondatabase/neon/issues/11439

storcon: notify compute if correct observed state was refreshed (#11342)
## Problem

Previously, if the observed state was refreshed and matched the intent, we wouldn't send a compute notification. This is unsafe: there's no guarantee that the location landed on the pageserver _and_ that a compute notification for it was delivered. See https://github.com/neondatabase/neon/issues/11291#issuecomment-2743205411 for one such example.

## Summary of changes

Add a reproducer and notify the compute if the correct observed state required a refresh.

Closes https://github.com/neondatabase/neon/issues/11291
2026-05-16 04:30:38 +00:00 · 2025-04-03 19:51:18 +02:00 · 2025-04-03 16:35:55 +00:00 · 2025-04-03 15:55:22 +00:00 · 2025-04-03 15:26:35 +00:00 · 2025-04-03 14:57:44 +00:00
273 changed files with 3454 additions and 3199 deletions
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -8,6 +8,7 @@ self-hosted-runner:
    - small-arm64
    - us-east-2
 config-variables:
+  - AWS_ECR_REGION
  - AZURE_DEV_CLIENT_ID
  - AZURE_DEV_REGISTRY_NAME
  - AZURE_DEV_SUBSCRIPTION_ID
@@ -15,23 +16,25 @@ config-variables:
  - AZURE_PROD_REGISTRY_NAME
  - AZURE_PROD_SUBSCRIPTION_ID
  - AZURE_TENANT_ID
+  - BENCHMARK_INGEST_TARGET_PROJECTID
+  - BENCHMARK_LARGE_OLTP_PROJECTID
  - BENCHMARK_PROJECT_ID_PUB
  - BENCHMARK_PROJECT_ID_SUB
-  - REMOTE_STORAGE_AZURE_CONTAINER
-  - REMOTE_STORAGE_AZURE_REGION
-  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
  - DEV_AWS_OIDC_ROLE_ARN
-  - BENCHMARK_INGEST_TARGET_PROJECTID
-  - PGREGRESS_PG16_PROJECT_ID
-  - PGREGRESS_PG17_PROJECT_ID
-  - SLACK_ON_CALL_QA_STAGING_STREAM
  - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
-  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
-  - SLACK_CICD_CHANNEL_ID
-  - SLACK_STORAGE_CHANNEL_ID
+  - HETZNER_CACHE_BUCKET
+  - HETZNER_CACHE_ENDPOINT
+  - HETZNER_CACHE_REGION
  - NEON_DEV_AWS_ACCOUNT_ID
  - NEON_PROD_AWS_ACCOUNT_ID
-  - AWS_ECR_REGION
-  - BENCHMARK_LARGE_OLTP_PROJECTID
+  - PGREGRESS_PG16_PROJECT_ID
+  - PGREGRESS_PG17_PROJECT_ID
+  - REMOTE_STORAGE_AZURE_CONTAINER
+  - REMOTE_STORAGE_AZURE_REGION
+  - SLACK_CICD_CHANNEL_ID
  - SLACK_ON_CALL_DEVPROD_STREAM
+  - SLACK_ON_CALL_QA_STAGING_STREAM
+  - SLACK_ON_CALL_STORAGE_STAGING_STREAM
  - SLACK_RUST_CHANNEL_ID
+  - SLACK_STORAGE_CHANNEL_ID
+  - SLACK_UPCOMING_RELEASE_CHANNEL_ID
--- a/.github/scripts/generate_image_maps.py
+++ b/.github/scripts/generate_image_maps.py
@@ -39,12 +39,18 @@ registries = {
    ],
 }

+release_branches = ["release", "release-proxy", "release-compute"]
+
 outputs: dict[str, dict[str, list[str]]] = {}

-target_tags = [target_tag, "latest"] if branch == "main" else [target_tag]
-target_stages = (
-    ["dev", "prod"] if branch in ["release", "release-proxy", "release-compute"] else ["dev"]
+target_tags = (
+    [target_tag, "latest"]
+    if branch == "main"
+    else [target_tag, "released"]
+    if branch in release_branches
+    else [target_tag]
 )
+target_stages = ["dev", "prod"] if branch in release_branches else ["dev"]

 for component_name, component_images in components.items():
    for stage in target_stages:
--- a/.github/scripts/push_with_image_map.py
+++ b/.github/scripts/push_with_image_map.py
@@ -11,12 +11,27 @@ try:
 except json.JSONDecodeError as e:
    raise ValueError("Failed to parse IMAGE_MAP as JSON") from e

-for source, targets in parsed_image_map.items():
-    for target in targets:
-        cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
-        print(f"Running: {' '.join(cmd)}")
-        result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+failures = []

-        if result.returncode != 0:
-            print(f"Error: {result.stdout}")
-            raise RuntimeError(f"Command failed: {' '.join(cmd)}")
+pending = [(source, target) for source, targets in parsed_image_map.items() for target in targets]
+
+while len(pending) > 0:
+    if len(failures) > 10:
+        print("Error: more than 10 failures!")
+        for failure in failures:
+            print(f'"{failure[0]}" failed with the following output:')
+            print(failure[1])
+        raise RuntimeError("Retry limit reached.")
+
+    source, target = pending.pop(0)
+    cmd = ["docker", "buildx", "imagetools", "create", "-t", target, source]
+    print(f"Running: {' '.join(cmd)}")
+    result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+
+    if result.returncode != 0:
+        failures.append((" ".join(cmd), result.stdout))
+        pending.append((source, target))
+
+if len(failures) > 0 and (github_output := os.getenv("GITHUB_OUTPUT")):
+    with open(github_output, "a") as f:
+        f.write("slack_notify=true\n")
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -128,29 +128,49 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg_14
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Cache postgres v15 build
        id: cache_pg_15
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Cache postgres v16 build
        id: cache_pg_16
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

      - name: Cache postgres v17 build
        id: cache_pg_17
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}

--- a/.github/workflows/_check-codestyle-python.yml
+++ b/.github/workflows/_check-codestyle-python.yml
@@ -37,8 +37,14 @@ jobs:

      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

-      - uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+      - name: Cache poetry deps
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

--- a/.github/workflows/_check-codestyle-rust.yml
+++ b/.github/workflows/_check-codestyle-rust.yml
@@ -48,8 +48,13 @@ jobs:
          submodules: true

      - name: Cache cargo deps
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
--- a/.github/workflows/_meta.yml
+++ b/.github/workflows/_meta.yml
@@ -5,6 +5,9 @@ on:
      github-event-name:
        type: string
        required: true
+      github-event-json:
+        type: string
+        required: true
    outputs:
      build-tag:
        description: "Tag for the current workflow run"
@@ -27,6 +30,9 @@ on:
      release-pr-run-id:
        description: "Only available if `run-kind in [storage-release, proxy-release, compute-release]`. Contains the run ID of the `Build and Test` workflow, assuming one with the current commit can be found."
        value: ${{ jobs.tags.outputs.release-pr-run-id }}
+      sha:
+        description: "github.event.pull_request.head.sha on release PRs, github.sha otherwise"
+        value: ${{ jobs.tags.outputs.sha }}

 permissions: {}

@@ -45,6 +51,7 @@ jobs:
      storage: ${{ steps.previous-releases.outputs.storage }}
      run-kind: ${{ steps.run-kind.outputs.run-kind }}
      release-pr-run-id: ${{ steps.release-pr-run-id.outputs.release-pr-run-id }}
+      sha: ${{ steps.sha.outputs.sha }}
    permissions:
      contents: read
    steps:
@@ -54,10 +61,6 @@ jobs:
        with:
          egress-policy: audit

-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-        with:
-          fetch-depth: 0
-
      - name: Get run kind
        id: run-kind
        env:
@@ -78,6 +81,23 @@ jobs:
        run: |
          echo "run-kind=$RUN_KIND" | tee -a $GITHUB_OUTPUT

+      - name: Get the right SHA
+        id: sha
+        env:
+          SHA: >
+            ${{
+              contains(fromJSON('["storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), steps.run-kind.outputs.run-kind)
+              && fromJSON(inputs.github-event-json).pull_request.head.sha
+              || github.sha
+            }}
+        run: |
+          echo "sha=$SHA" | tee -a $GITHUB_OUTPUT
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+        with:
+          fetch-depth: 0
+          ref: ${{ steps.sha.outputs.sha }}
+
      - name: Get build tag
        id: build-tag
        env:
@@ -143,7 +163,7 @@ jobs:
        if: ${{ contains(fromJSON('["storage-release", "compute-release", "proxy-release"]'), steps.run-kind.outputs.run-kind) }}
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-          CURRENT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          CURRENT_SHA: ${{ github.sha }}
        run: |
          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
          echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/_push-to-container-registry.yml
+++ b/.github/workflows/_push-to-container-registry.yml
@@ -104,6 +104,18 @@ jobs:
          password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}

      - name: Copy docker images to target registries
+        id: push
        run: python3 .github/scripts/push_with_image_map.py
        env:
          IMAGE_MAP: ${{ inputs.image-map }}
+
+      - name: Notify Slack if container image pushing fails
+        if: steps.push.outputs.slack_notify == 'true' || failure()
+        uses: slackapi/slack-github-action@485a9d42d3a73031f12ec201c457e2162c45d02d # v2.0.0
+        with:
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: ${{ vars.SLACK_ON_CALL_DEVPROD_STREAM }}
+            text: |
+              Pushing container images failed in <${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}|GitHub Run>
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -63,8 +63,13 @@ jobs:

      - name: Cache postgres ${{ matrix.postgres-version }} build
        id: cache_pg
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/${{ matrix.postgres-version }}
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

@@ -129,15 +134,25 @@ jobs:

      - name: Cache postgres v17 build
        id: cache_pg
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache walproposer-lib
        id: cache_walproposer_lib
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

@@ -203,32 +218,57 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v15 build
        id: cache_pg_v15
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v16 build
        id: cache_pg_v16
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v17 build
        id: cache_pg_v17
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache cargo deps (only for v17)
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
@@ -238,8 +278,13 @@ jobs:

      - name: Cache walproposer-lib
        id: cache_walproposer_lib
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -80,6 +80,7 @@ jobs:
    uses: ./.github/workflows/_meta.yml
    with:
      github-event-name: ${{ github.event_name }}
+      github-event-json: ${{ toJSON(github.event) }}

  build-build-tools-image:
    needs: [ check-permissions ]
@@ -88,8 +89,8 @@ jobs:

  check-codestyle-python:
    needs: [ meta, check-permissions, build-build-tools-image ]
-    # No need to run on `main` because we this in the merge queue
-    if: ${{ needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we run this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    uses: ./.github/workflows/_check-codestyle-python.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -97,7 +98,8 @@ jobs:

  check-codestyle-jsonnet:
    needs: [ meta, check-permissions, build-build-tools-image ]
-    if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    runs-on: [ self-hosted, small ]
    container:
      image: ${{ needs.build-build-tools-image.outputs.image }}
@@ -180,8 +182,8 @@ jobs:

  check-codestyle-rust:
    needs: [ meta, check-permissions, build-build-tools-image ]
-    # No need to run on `main` because we this in the merge queue
-    if: ${{ needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we run this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    uses: ./.github/workflows/_check-codestyle-rust.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -190,7 +192,8 @@ jobs:

  check-dependencies-rust:
    needs: [ meta, files-changed, build-build-tools-image ]
-    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr' }}
+    # No need to run on `main` because we run this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    uses: ./.github/workflows/cargo-deny.yml
    with:
      build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
@@ -198,7 +201,8 @@ jobs:

  build-and-test-locally:
    needs: [ meta, build-build-tools-image ]
-    if: ${{ contains(fromJSON('["pr", "push-main"]'), needs.meta.outputs.run-kind) }}
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
    strategy:
      fail-fast: false
      matrix:
@@ -248,8 +252,13 @@ jobs:
        uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

      - name: Cache poetry deps
-        uses: actions/cache@d4323d4df104b026a6aa633fdb11d772146be0bf # v4.2.2
+        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
        with:
+          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
+          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
+          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
+          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
+          use-fallback: false
          path: ~/.cache/pypoetry/virtualenvs
          key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }}

@@ -540,6 +549,7 @@ jobs:
    uses: ./.github/workflows/trigger-e2e-tests.yml
    with:
      github-event-name: ${{ github.event_name }}
+      github-event-json: ${{ toJSON(github.event) }}
    secrets: inherit

  neon-image-arch:
@@ -563,6 +573,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          submodules: true
+          ref: ${{ needs.meta.outputs.sha }}

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
@@ -672,6 +683,7 @@ jobs:
      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
        with:
          submodules: true
+          ref: ${{ needs.meta.outputs.sha }}

      - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193
      - uses: docker/setup-buildx-action@b5ca514318bd6ebac0fb2aedd5d36ec1b5c232a2 # v3.10.0
@@ -1556,10 +1568,10 @@ jobs:
        if: |
          contains(needs.*.result, 'failure')
          || contains(needs.*.result, 'cancelled')
-          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.build-and-test-locally.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.check-codestyle-python.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
-          || (needs.check-codestyle-rust.result == 'skipped' && needs.meta.outputs.run-kind == 'pr')
+          || (needs.check-dependencies-rust.result == 'skipped' && needs.files-changed.outputs.check-rust-dependencies == 'true' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.build-and-test-locally.result == 'skipped' && contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.check-codestyle-python.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
+          || (needs.check-codestyle-rust.result == 'skipped' && contains(fromJSON('["pr", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
          || needs.files-changed.result == 'skipped'
          || (needs.push-compute-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "compute-release", "compute-rc-pr"]'), needs.meta.outputs.run-kind))
          || (needs.push-neon-image-dev.result == 'skipped' && contains(fromJSON('["push-main", "pr", "storage-release", "storage-rc-pr", "proxy-release", "proxy-rc-pr"]'), needs.meta.outputs.run-kind))
--- a/.github/workflows/force-test-extensions-upgrade.yml
+++ b/.github/workflows/force-test-extensions-upgrade.yml
@@ -55,7 +55,7 @@ jobs:
          echo tag=${tag} >> ${GITHUB_OUTPUT}

      - name: Test extension upgrade
-        timeout-minutes: 20
+        timeout-minutes: 60
        env:
          NEW_COMPUTE_TAG: latest
          OLD_COMPUTE_TAG: ${{ steps.get-last-compute-release-tag.outputs.tag }}
--- a/.github/workflows/report-workflow-stats-batch.yml
+++ b/.github/workflows/report-workflow-stats-batch.yml
@@ -23,7 +23,7 @@ jobs:
        egress-policy: audit

    - name: Export Workflow Run for the past 2 hours
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
      with:
        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
        db_table: "gh_workflow_stats_neon"
@@ -43,7 +43,7 @@ jobs:
        egress-policy: audit

    - name: Export Workflow Run for the past 48 hours
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
      with:
        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
        db_table: "gh_workflow_stats_neon"
@@ -63,7 +63,7 @@ jobs:
        egress-policy: audit

    - name: Export Workflow Run for the past 30 days
-      uses: neondatabase/gh-workflow-stats-action@4c998b25ab5cc6588b52a610b749531f6a566b6b # v0.2.1
+      uses: neondatabase/gh-workflow-stats-action@701b1f202666d0b82e67b4d387e909af2b920127 # v0.2.2
      with:
        db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }}
        db_table: "gh_workflow_stats_neon"
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -9,6 +9,9 @@ on:
      github-event-name:
        type: string
        required: true
+      github-event-json:
+        type: string
+        required: true

 defaults:
  run:
@@ -48,6 +51,7 @@ jobs:
    uses: ./.github/workflows/_meta.yml
    with:
      github-event-name: ${{ inputs.github-event-name || github.event_name }}
+      github-event-json: ${{ inputs.github-event-json || toJSON(github.event) }}

  trigger-e2e-tests:
    needs: [ meta ]
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -148,9 +148,9 @@ dependencies = [

 [[package]]
 name = "arc-swap"
-version = "1.6.0"
+version = "1.7.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "bddcadddf5e9015d310179a59bb28c4d4b9920ad0f11e8e14dbadf654890c9a6"
+checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457"

 [[package]]
 name = "archery"
@@ -3861,11 +3861,10 @@ dependencies = [

 [[package]]
 name = "num-bigint"
-version = "0.4.3"
+version = "0.4.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f93ab6289c7b344a8a9f60f88d80aa20032336fe78da341afc91c8a2341fc75f"
+checksum = "a5e44f723f1133c9deac646763579fdb3ac745e418f2a7af9cd0c431da1f20b9"
 dependencies = [
- "autocfg",
 "num-integer",
 "num-traits",
 ]
@@ -3914,11 +3913,10 @@ dependencies = [

 [[package]]
 name = "num-integer"
-version = "0.1.45"
+version = "0.1.46"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "225d3389fb3509a24c93f5c29eb6bde2586b98d9f016636dff58d7c6f7569cd9"
+checksum = "7969661fd2958a5cb096e56c8e1ad0444ac2bbcd0061bd28660485a44879858f"
 dependencies = [
- "autocfg",
 "num-traits",
 ]

@@ -3947,9 +3945,9 @@ dependencies = [

 [[package]]
 name = "num-traits"
-version = "0.2.15"
+version = "0.2.19"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
 dependencies = [
 "autocfg",
 "libm",
@@ -5362,26 +5360,25 @@ dependencies = [

 [[package]]
 name = "redis"
-version = "0.25.2"
+version = "0.29.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "71d64e978fd98a0e6b105d066ba4889a7301fca65aeac850a877d8797343feeb"
+checksum = "b110459d6e323b7cda23980c46c77157601199c9da6241552b284cd565a7a133"
 dependencies = [
- "async-trait",
+ "arc-swap",
 "bytes",
 "combine",
 "futures-util",
 "itoa",
+ "num-bigint",
 "percent-encoding",
 "pin-project-lite",
- "rustls 0.22.4",
- "rustls-native-certs 0.7.0",
- "rustls-pemfile 2.1.1",
- "rustls-pki-types",
+ "rustls 0.23.18",
+ "rustls-native-certs 0.8.0",
 "ryu",
 "sha1_smol",
 "socket2",
 "tokio",
- "tokio-rustls 0.25.0",
+ "tokio-rustls 0.26.0",
 "tokio-util",
 "url",
 ]
@@ -7217,15 +7214,14 @@ dependencies = [
 "bytes",
 "fallible-iterator",
 "futures-util",
- "log",
 "parking_lot 0.12.1",
- "phf",
 "pin-project-lite",
 "postgres-protocol2",
 "postgres-types2",
 "serde",
 "tokio",
 "tokio-util",
+ "tracing",
 ]

 [[package]]
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -50,7 +50,7 @@ license = "Apache-2.0"
 [workspace.dependencies]
 ahash = "0.8"
 anyhow = { version = "1.0", features = ["backtrace"] }
-arc-swap = "1.6"
+arc-swap = "1.7"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
 flate2 = "1.0.26"
@@ -130,7 +130,7 @@ nix = { version = "0.27", features = ["dir", "fs", "process", "socket", "signal"
 # on compute startup metrics (start_postgres_ms), >= 25% degradation.
 notify = "6.0.0"
 num_cpus = "1.15"
-num-traits = "0.2.15"
+num-traits = "0.2.19"
 once_cell = "1.13"
 opentelemetry = "0.27"
 opentelemetry_sdk = "0.27"
@@ -146,7 +146,7 @@ procfs = "0.16"
 prometheus = {version = "0.13", default-features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.13"
 rand = "0.8"
-redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.29.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_27"] }
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -292,7 +292,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.85.0
+ENV RUSTC_VERSION=1.86.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -369,7 +369,7 @@ FROM build-deps AS plv8-src
 ARG PG_VERSION
 WORKDIR /ext-src

-COPY compute/patches/plv8-3.1.10.patch .
+COPY compute/patches/plv8* .

 # plv8 3.2.3 supports v17
 # last release v3.2.3 - Sep 7, 2024
@@ -393,7 +393,7 @@ RUN case "${PG_VERSION:?}" in \
    git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \
    tar -czf plv8.tar.gz --exclude .git plv8-src && \
    cd plv8-src && \
-    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8-3.1.10.patch; fi
+    if [[ "${PG_VERSION:?}" < "v17" ]]; then patch -p1 < /ext-src/plv8_v3.1.10.patch; else patch -p1 < /ext-src/plv8_v3.2.3.patch; fi

 # Step 1: Build the vendored V8 engine. It doesn't depend on PostgreSQL, so use
 # 'build-deps' as the base. This enables caching and avoids unnecessary rebuilds.
@@ -1055,34 +1055,6 @@ RUN  if [ -d pg_embedding-src ]; then \
        make -j $(getconf _NPROCESSORS_ONLN) install; \
    fi

-#########################################################################################
-#
-# Layer "pg_anon-build"
-# compile anon extension
-#
-#########################################################################################
-FROM build-deps AS pg_anon-src
-ARG PG_VERSION
-
-# This is an experimental extension, never got to real production.
-# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found.
-WORKDIR /ext-src
-RUN case "${PG_VERSION:?}" in "v17") \
-    echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \
-    esac && \
-    wget  https://github.com/neondatabase/postgresql_anonymizer/archive/refs/tags/neon_1.1.1.tar.gz -O pg_anon.tar.gz && \
-    echo "321ea8d5c1648880aafde850a2c576e4a9e7b9933a34ce272efc839328999fa9  pg_anon.tar.gz" | sha256sum --check && \
-    mkdir pg_anon-src && cd pg_anon-src && tar xzf ../pg_anon.tar.gz --strip-components=1 -C .
-
-FROM pg-build AS pg_anon-build
-COPY --from=pg_anon-src /ext-src/ /ext-src/
-WORKDIR /ext-src
-RUN if [ -d pg_anon-src ]; then \
-        cd pg_anon-src && \
-        make -j $(getconf _NPROCESSORS_ONLN) install && \
-        echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control; \
-    fi
-
 #########################################################################################
 #
 # Layer "pg build with nonroot user and cargo installed"
@@ -1366,8 +1338,8 @@ ARG PG_VERSION
 # Do not update without approve from proxy team
 # Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs
 WORKDIR /ext-src
-RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \
-    echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \
+RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.3.0.tar.gz -O pg_session_jwt.tar.gz && \
+    echo "19be2dc0b3834d643706ed430af998bb4c2cdf24b3c45e7b102bb3a550e8660c pg_session_jwt.tar.gz" | sha256sum --check && \
    mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \
    sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
    sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \
@@ -1677,7 +1649,6 @@ COPY --from=pg_roaringbitmap-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_semver-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_embedding-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-build /usr/local/pgsql /usr/local/pgsql
-COPY --from=pg_anon-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_partman-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg_mooncake-build /usr/local/pgsql/ /usr/local/pgsql/
@@ -1916,26 +1887,30 @@ RUN apt update && \
      ;; \
    esac && \
    apt install --no-install-recommends -y \
+        ca-certificates \
        gdb \
-        liblz4-1 \
-        libreadline8 \
+        iproute2 \
        libboost-iostreams1.74.0 \
        libboost-regex1.74.0 \
        libboost-serialization1.74.0 \
        libboost-system1.74.0 \
-        libossp-uuid16 \
+        libcurl4 \
+        libevent-2.1-7 \
        libgeos-c1v5 \
+        liblz4-1 \
+        libossp-uuid16 \
        libprotobuf-c1 \
+        libreadline8 \
        libsfcgal1 \
        libxml2 \
        libxslt1.1 \
        libzstd1 \
-        libcurl4 \
-        libevent-2.1-7 \
        locales \
+        lsof \
        procps \
-        ca-certificates \
        rsyslog \
+        screen \
+        tcpdump \
        $VERSION_INSTALLS && \
    apt clean && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
    localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8
--- a/compute/etc/neon_collector.jsonnet
+++ b/compute/etc/neon_collector.jsonnet
@@ -33,6 +33,7 @@
    import 'sql_exporter/lfc_hits.libsonnet',
    import 'sql_exporter/lfc_misses.libsonnet',
    import 'sql_exporter/lfc_used.libsonnet',
+    import 'sql_exporter/lfc_used_pages.libsonnet',
    import 'sql_exporter/lfc_writes.libsonnet',
    import 'sql_exporter/logical_slot_restart_lsn.libsonnet',
    import 'sql_exporter/max_cluster_size.libsonnet',
--- a/compute/etc/sql_exporter/lfc_used_pages.libsonnet
+++ b/compute/etc/sql_exporter/lfc_used_pages.libsonnet
@@ -0,0 +1,10 @@
+{
+  metric_name: 'lfc_used_pages',
+  type: 'gauge',
+  help: 'LFC pages used',
+  key_labels: null,
+  values: [
+    'lfc_used_pages',
+  ],
+  query: importstr 'sql_exporter/lfc_used_pages.sql',
+}
--- a/compute/etc/sql_exporter/lfc_used_pages.sql
+++ b/compute/etc/sql_exporter/lfc_used_pages.sql
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages';
--- a/compute/patches/plv8_v3.1.10.patch
+++ b/compute/patches/plv8_v3.1.10.patch
@@ -1,12 +1,6 @@
-commit 46b38d3e46f9cd6c70d9b189dd6ff4abaa17cf5e
-Author: Alexander Bayandin <alexander@neon.tech>
-Date:   Sat Nov 30 18:29:32 2024 +0000
-
-    Fix v8 9.7.37 compilation on Debian 12
-
 diff --git a/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
 new file mode 100644
-index 0000000..f0a5dc7
+index 0000000..fae1cb3
 --- /dev/null
 +++ b/patches/code/84cf3230a9680aac3b73c410c2b758760b6d3066.patch
@@ -0,0 +1,30 @@
@@ -35,8 +29,21 @@ index 0000000..f0a5dc7
 +@@ -5,6 +5,7 @@
 + #ifndef V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
 + #define V8_HEAP_CPPGC_PREFINALIZER_HANDLER_H_
-+ 
+
 ++#include <utility>
 + #include <vector>
-+ 
+
 + #include "include/cppgc/prefinalizer.h"
+diff --git a/plv8.cc b/plv8.cc
+index c1ce883..6e47e94 100644
+--- a/plv8.cc
+++ b/plv8.cc
+@@ -379,7 +379,7 @@ _PG_init(void)
+ 							   NULL,
+ 							   &plv8_v8_flags,
+ 							   NULL,
+-							   PGC_USERSET, 0,
+							   PGC_SUSET, 0,
+ #if PG_VERSION_NUM >= 90100
+ 							   NULL,
+ #endif
--- a/compute/patches/plv8_v3.2.3.patch
+++ b/compute/patches/plv8_v3.2.3.patch
@@ -0,0 +1,13 @@
+diff --git a/plv8.cc b/plv8.cc
+index edfa2aa..623e7f2 100644
+--- a/plv8.cc
+++ b/plv8.cc
+@@ -385,7 +385,7 @@ _PG_init(void)
+                                    NULL,
+                                    &plv8_v8_flags,
+                                    NULL,
+-                                   PGC_USERSET, 0,
+                                   PGC_SUSET, 0,
+ #if PG_VERSION_NUM >= 90100
+                                    NULL,
+ #endif
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,9 @@ use anyhow::{Context, Result};
 use clap::Parser;
 use compute_api::responses::ComputeCtlConfig;
 use compute_api::spec::ComputeSpec;
-use compute_tools::compute::{ComputeNode, ComputeNodeParams, forward_termination_signal};
+use compute_tools::compute::{
+    BUILD_TAG, ComputeNode, ComputeNodeParams, forward_termination_signal,
+};
 use compute_tools::extension_server::get_pg_version_string;
 use compute_tools::logger::*;
 use compute_tools::params::*;
@@ -57,10 +59,6 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

-// this is an arbitrary build tag. Fine as a default / for testing purposes
-// in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "latest";
-
 // Compatibility hack: if the control plane specified any remote-ext-config
 // use the default value for extension storage proxy gateway.
 // Remove this once the control plane is updated to pass the gateway URL
@@ -147,7 +145,7 @@ fn main() -> Result<()> {
        .build()?;
    let _rt_guard = runtime.enter();

-    let build_tag = runtime.block_on(init())?;
+    runtime.block_on(init())?;

    // enable core dumping for all child processes
    setrlimit(Resource::CORE, rlimit::INFINITY, rlimit::INFINITY)?;
@@ -174,8 +172,6 @@ fn main() -> Result<()> {
            cgroup: cli.cgroup,
            #[cfg(target_os = "linux")]
            vm_monitor_addr: cli.vm_monitor_addr,
-            build_tag,
-
            live_config_allowed: cli_spec.live_config_allowed,
        },
        cli_spec.spec,
@@ -189,7 +185,7 @@ fn main() -> Result<()> {
    deinit_and_exit(exit_code);
 }

-async fn init() -> Result<String> {
+async fn init() -> Result<()> {
    init_tracing_and_logging(DEFAULT_LOG_LEVEL).await?;

    let mut signals = Signals::new([SIGINT, SIGTERM, SIGQUIT])?;
@@ -199,12 +195,9 @@ async fn init() -> Result<String> {
        }
    });

-    let build_tag = option_env!("BUILD_TAG")
-        .unwrap_or(BUILD_TAG_DEFAULT)
-        .to_string();
-    info!("build_tag: {build_tag}");
+    info!("compute build_tag: {}", &BUILD_TAG.to_string());

-    Ok(build_tag)
+    Ok(())
 }

 fn try_spec_from_cli(cli: &Cli) -> Result<CliSpecParams> {
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,6 +20,7 @@ use futures::future::join_all;
 use futures::stream::FuturesUnordered;
 use nix::sys::signal::{Signal, kill};
 use nix::unistd::Pid;
+use once_cell::sync::Lazy;
 use postgres;
 use postgres::NoTls;
 use postgres::error::SqlState;
@@ -35,6 +36,7 @@ use crate::disk_quota::set_disk_quota;
 use crate::installed_extensions::get_installed_extensions;
 use crate::logger::startup_context_from_env;
 use crate::lsn_lease::launch_lsn_lease_bg_task_for_static;
+use crate::metrics::COMPUTE_CTL_UP;
 use crate::monitor::launch_monitor;
 use crate::pg_helpers::*;
 use crate::rsyslog::{
@@ -49,6 +51,17 @@ use crate::{config, extension_server, local_proxy};

 pub static SYNC_SAFEKEEPERS_PID: AtomicU32 = AtomicU32::new(0);
 pub static PG_PID: AtomicU32 = AtomicU32::new(0);
+// This is an arbitrary build tag. Fine as a default / for testing purposes
+// in case the environment variable is not set.
+const BUILD_TAG_DEFAULT: &str = "latest";
+/// Build tag/version of the compute node binaries/image. It's tricky and ugly
+/// to pass it everywhere as a part of `ComputeNodeParams`, so we use a
+/// global static variable.
+pub static BUILD_TAG: Lazy<String> = Lazy::new(|| {
+    option_env!("BUILD_TAG")
+        .unwrap_or(BUILD_TAG_DEFAULT)
+        .to_string()
+});

 /// Static configuration params that don't change after startup. These mostly
 /// come from the CLI args, or are derived from them.
@@ -72,7 +85,6 @@ pub struct ComputeNodeParams {
    pub pgdata: String,
    pub pgbin: String,
    pub pgversion: String,
-    pub build_tag: String,

    /// The port that the compute's external HTTP server listens on
    pub external_http_port: u16,
@@ -173,6 +185,11 @@ impl ComputeState {
        info!("Changing compute status from {} to {}", prev, status);
        self.status = status;
        state_changed.notify_all();
+
+        COMPUTE_CTL_UP.reset();
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, status.to_string().as_str()])
+            .set(1);
    }

    pub fn set_failed_status(&mut self, err: anyhow::Error, state_changed: &Condvar) {
@@ -343,6 +360,14 @@ impl ComputeNode {
            this.prewarm_postgres()?;
        }

+        // Set the up metric with Empty status before starting the HTTP server.
+        // That way on the first metric scrape, an external observer will see us
+        // as 'up' and 'empty' (unless the compute was started with a spec or
+        // already configured by control plane).
+        COMPUTE_CTL_UP
+            .with_label_values(&[&BUILD_TAG, ComputeStatus::Empty.to_string().as_str()])
+            .set(1);
+
        // Launch the external HTTP server first, so that we can serve control plane
        // requests while configuration is still in progress.
        crate::http::server::Server::External {
@@ -2032,12 +2057,8 @@ LIMIT 100",

        let mut download_tasks = Vec::new();
        for library in &libs_vec {
-            let (ext_name, ext_path) = remote_extensions.get_ext(
-                library,
-                true,
-                &self.params.build_tag,
-                &self.params.pgversion,
-            )?;
+            let (ext_name, ext_path) =
+                remote_extensions.get_ext(library, true, &BUILD_TAG, &self.params.pgversion)?;
            download_tasks.push(self.download_extension(ext_name, ext_path));
        }
        let results = join_all(download_tasks).await;
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -59,9 +59,12 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
        Box::pin(async move {
            let request_id = request.extract_parts::<RequestId>().await.unwrap();

-            // TODO: Remove this check after a successful rollout
-            if jwks.keys.is_empty() {
-                warn!(%request_id, "Authorization has not been configured");
+            // TODO: Remove this stanza after teaching neon_local and the
+            // regression tests to use a JWT + JWKS.
+            //
+            // https://github.com/neondatabase/neon/issues/11316
+            if cfg!(feature = "testing") {
+                warn!(%request_id, "Skipping compute_ctl authorization check");

                return Ok(request);
            }
@@ -110,8 +113,6 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
 impl Authorize {
    /// Verify the token using the JSON Web Key set and return the token data.
    fn verify(jwks: &JwkSet, token: &str, validation: &Validation) -> Result<TokenData<Claims>> {
-        debug_assert!(!jwks.keys.is_empty());
-
        for jwk in jwks.keys.iter() {
            let decoding_key = match DecodingKey::from_jwk(jwk) {
                Ok(key) => key,
--- a/compute_tools/src/http/routes/extension_server.rs
+++ b/compute_tools/src/http/routes/extension_server.rs
@@ -5,7 +5,7 @@ use axum::response::{IntoResponse, Response};
 use http::StatusCode;
 use serde::Deserialize;

-use crate::compute::ComputeNode;
+use crate::compute::{BUILD_TAG, ComputeNode};
 use crate::http::JsonResponse;
 use crate::http::extract::{Path, Query};

@@ -47,7 +47,7 @@ pub(in crate::http) async fn download_extension(
        remote_extensions.get_ext(
            &filename,
            ext_server_params.is_library,
-            &compute.params.build_tag,
+            &BUILD_TAG,
            &compute.params.pgversion,
        )
    };
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -1,7 +1,8 @@
 use metrics::core::{AtomicF64, Collector, GenericGauge};
 use metrics::proto::MetricFamily;
 use metrics::{
-    IntCounterVec, UIntGaugeVec, register_gauge, register_int_counter_vec, register_uint_gauge_vec,
+    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
+    register_int_gauge_vec, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;

@@ -70,8 +71,19 @@ pub(crate) static AUDIT_LOG_DIR_SIZE: Lazy<GenericGauge<AtomicF64>> = Lazy::new(
    .expect("failed to define a metric")
 });

+// Report that `compute_ctl` is up and what the current compute status is.
+pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "compute_ctl_up",
+        "Whether compute_ctl is running",
+        &["build_tag", "status"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub fn collect() -> Vec<MetricFamily> {
-    let mut metrics = INSTALLED_EXTENSIONS.collect();
+    let mut metrics = COMPUTE_CTL_UP.collect();
+    metrics.extend(INSTALLED_EXTENSIONS.collect());
    metrics.extend(CPLANE_REQUESTS_TOTAL.collect());
    metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
    metrics.extend(DB_MIGRATION_FAILED.collect());
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -419,7 +419,7 @@ impl ComputeNode {
                .iter()
                .filter_map(|val| val.parse::<usize>().ok())
                .map(|val| if val > 1 { val - 1 } else { 1 })
-                .last()
+                .next_back()
                .unwrap_or(3)
        }
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -428,11 +428,6 @@ impl PageServerNode {
                .map(|x| x.parse::<usize>())
                .transpose()
                .context("Failed to parse 'l0_flush_delay_threshold' as an integer")?,
-            l0_flush_wait_upload: settings
-                .remove("l0_flush_wait_upload")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'l0_flush_wait_upload' as a boolean")?,
            l0_flush_stall_threshold: settings
                .remove("l0_flush_stall_threshold")
                .map(|x| x.parse::<usize>())
--- a/docs/SUMMARY.md
+++ b/docs/SUMMARY.md
@@ -21,6 +21,7 @@ in this repository.
    - [WAL Redo](./pageserver-walredo.md)
    - [Page cache](./pageserver-pagecache.md)
    - [Storage](./pageserver-storage.md)
+    - [Compaction](./pageserver-compaction.md)
    - [Processing a GetPage request](./pageserver-processing-getpage.md)
    - [Processing WAL](./pageserver-processing-wal.md)

--- a/docs/pageserver-compaction.md
+++ b/docs/pageserver-compaction.md
@@ -0,0 +1,110 @@
+# Pageserver Compaction
+
+Lifted from <https://www.notion.so/neondatabase/Rough-Notes-on-Compaction-1baf189e004780859e65ef63b85cfa81?pvs=4>.
+
+Updated 2025-03-26.
+
+## Pages and WAL
+
+Postgres stores data in 8 KB pages, identified by a page number.
+
+The WAL contains a sequence of page writes: either images (complete page contents) or deltas (patches applied to images). Each write is identified by its byte position in the WAL, aka LSN. 
+
+Each page version is thus identified by page@LSN. Postgres may read pages at past LSNs.
+
+Pageservers ingest WAL by writing WAL records into a key/value store keyed by page@LSN.
+
+Pageservers materialize pages for Postgres reads by finding the most recent page image and applying all subsequent page deltas, up to the read LSN.
+
+## Compaction: Why?
+
+Pageservers store page@LSN keys in a key/value store using a custom variant of an LSM tree. Each timeline on each tenant shard has its own LSM tree.
+
+When Pageservers write new page@LSN entries, they are appended unordered to an ephemeral layer file. When the ephemeral layer file exceeds `checkpoint_distance` (default 256 MB), the key/value pairs are sorted by key and written out to a layer file (for efficient lookups).
+
+As WAL writes continue, more layer files accumulate.
+
+Reads must search through the layer files to find the page’s image and deltas. The more layer files accumulate, the more layer files reads must search through before they find a page image, aka read amplification.
+
+Compaction’s job is to:
+
+- Reduce read amplification by reorganizing and combining layer files.
+- Remove old garbage from layer files.
+
+As part of this, it may combine several page deltas into a single page image where possible.
+
+## Compaction: How?
+
+Neon uses a non-standard variant of an LSM tree made up of two levels of layer files: L0 and L1.
+
+Compaction runs in two phases: L0→L1 compaction, and L1 image compaction.
+
+L0 contains a stack of L0 layers at decreasing LSN ranges. These have been flushed sequentially from ephemeral layers. Each L0 layer covers the entire page space (page 0 to ~infinity) and the LSN range that was ingested into it. L0 layers are therefore particularly bad for read amp, since every read must search all L0 layers below the read LSN. For example:
+
+```
+| Page 0-99 @ LSN 0400-04ff |
+| Page 0-99 @ LSN 0300-03ff |
+| Page 0-99 @ LSN 0200-02ff |
+| Page 0-99 @ LSN 0100-01ff |
+| Page 0-99 @ LSN 0000-00ff |
+```
+
+L0→L1 compaction takes the bottom-most chunk of L0 layer files of between `compaction_threshold` (default 10) and `compaction_upper_limit` (default 20) layers. It uses merge-sort to write out sorted L1 delta layers of size `compaction_target_size` (default 128 MB).
+
+L1 typically consists of a “bed” of image layers with materialized page images at a specific LSN, and then delta layers of various page/LSN ranges above them with page deltas. For example:
+
+```
+Delta layers:               |     30-84@0310-04ff      |
+Delta layers:    | 10-42@0200-02ff |           | 65-92@0174-02aa |
+Image layers: |    0-39@0100    |    40-79@0100    |    80-99@0100    |
+```
+
+L1 image compaction scans across the L1 keyspace at some LSN, materializes page images by reading the image and delta layers below the LSN (via vectored reads), and writes out new sorted image layers of roughly size `compaction_target_size` (default 128 MB) at that LSN.
+
+Layer files below the new image files’ LSN can be garbage collected when they are no longer needed for PITR.
+
+Even though the old layer files are not immediately garbage collected, the new image layers help with read amp because reads can stop traversing the layer stack as soon as they encounter a page image.
+
+## Compaction: When?
+
+Pageservers run a `compaction_loop` background task for each tenant shard. Every `compaction_period` (default 20 seconds) it will wake up and check if any of the shard’s timelines need compaction. Additionally, L0 layer flushes will eagerly wake the compaction loop if the L0 count exceeds `compaction_threshold` (default 10).
+
+L0 compaction runs if the number of L0 layers exceeds `compaction_threshold` (default 10).
+
+L1 image compaction runs across sections of the L1 keyspace that have at least `image_creation_threshold` (default 3) delta layers overlapping image layers.
+
+At most `CONCURRENT_BACKGROUND_TASKS` (default 3 / 4 * CPUs = 6) background tasks can run concurrently on a Pageserver, including compaction. Further compaction tasks must wait.
+
+Because L0 layers cause the most read amp (they overlap the entire keyspace and only contain page deltas), they are aggressively compacted down:
+
+- L0 is compacted down across all tenant timelines before L1 compaction is attempted (`compaction_l0_first`).
+- L0 compaction uses a separate concurrency limit of `CONCURRENT_L0_COMPACTION_TASKS` (default 3 / 4 * CPUs = 6) to avoid waiting for other tasks (`compaction_l0_semaphore`).
+- If L0 compaction is needed on any tenant timeline, L1 image compaction will yield to start an immediate L0 compaction run (except for compaction run via admin APIs).
+
+## Backpressure
+
+With sustained heavy write loads, new L0 layers may be flushed faster than they can be compacted down. This can cause an unbounded buildup of read amplification and compaction debt, which can take hours to resolve even after the writes stop.
+
+To avoid this and allow compaction to keep up, layer flushes will slow writes down to apply backpressure on the workload:
+
+- At `l0_flush_delay_threshold` (default 30) L0 layers, layer flushes are delayed by the flush duration, such that they take 2x as long.
+- At `l0_flush_stall_threshold` (default disabled) L0 layers, layer flushes stall entirely until the L0 count falls back below the threshold. This is currently disabled because we don’t trust L0 compaction to be responsive enough.
+
+This backpressure is propagated to the compute by waiting for layer flushes when WAL ingestion rolls the ephemeral layer. The compute will significantly slow down WAL writes at:
+
+- `max_replication_write_lag` (default 500 MB), when Pageserver WAL ingestion lags
+- `max_replication_flush_lag` (default 10 GB), when Pageserver L0 flushes lag
+
+Combined, this means that the compute will backpressure when there are 30 L0 layers (30 * 256 MB = 7.7 GB) and the Pageserver WAL ingestion lags the compute by 500 MB, for a total of ~8 GB L0+ephemeral compaction debt on a single shard.
+
+Since we only delay L0 flushes by 2x when backpressuring, and haven’t enabled stalls, it is still possible for read amp to increase unbounded if compaction is too slow (although we haven’t seen this in practice). But this is considered better than stalling flushes and causing unavailability for as long as it takes L0 compaction to react, since we don’t trust it to be fast enough — at the expense of continually increasing read latency and CPU usage for this tenant. We should either enable stalls when we have enough confidence in L0 compaction, or scale the flush delay by the number of L0 layers to apply increasing backpressure.
+
+## Circuit Breaker
+
+Compaction can fail, often repeatedly. This can happen e.g. due to data corruption, faulty hardware, S3 outages, etc.
+
+If compaction fails, the compaction loop will naïvely try and fail again almost immediately. It may only fail after doing a significant amount of wasted work, while holding onto the background task semaphore.
+
+To avoid repeatedly doing wasted work and starving out other compaction jobs, each tenant has a compaction circuit breaker. After 5 repeated compaction failures, the circuit breaker trips and disables compaction for the next 24 hours, before resetting the breaker and trying again. This disables compaction across all tenant timelines (faulty or not).
+
+Disabling compaction for a long time is dangerous, since it can lead to unbounded read amp and compaction debt, and continuous workload backpressure. However, continually failing would not help either. Tripped circuit breakers trigger an alert and must be investigated promptly.
--- a/libs/http-utils/src/server.rs
+++ b/libs/http-utils/src/server.rs
@@ -91,14 +91,14 @@ impl Server {
                                        Ok(tls_stream) => tls_stream,
                                        Err(err) => {
                                            if !suppress_io_error(&err) {
-                                                info!("Failed to accept TLS connection: {err:#}");
+                                                info!(%remote_addr, "Failed to accept TLS connection: {err:#}");
                                            }
                                            return;
                                        }
                                    };
                                    if let Err(err) = Self::serve_connection(tls_stream, service, cancel).await {
                                        if !suppress_hyper_error(&err) {
-                                            info!("Failed to serve HTTPS connection: {err:#}");
+                                            info!(%remote_addr, "Failed to serve HTTPS connection: {err:#}");
                                        }
                                    }
                                }
@@ -106,7 +106,7 @@ impl Server {
                                    // Handle HTTP connection.
                                    if let Err(err) = Self::serve_connection(tcp_stream, service, cancel).await {
                                        if !suppress_hyper_error(&err) {
-                                            info!("Failed to serve HTTP connection: {err:#}");
+                                            info!(%remote_addr, "Failed to serve HTTP connection: {err:#}");
                                        }
                                    }
                                }
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -285,12 +285,6 @@ pub struct TenantConfigToml {
    /// Level0 delta layer threshold at which to stall layer flushes. Must be >compaction_threshold
    /// to avoid deadlock. 0 to disable. Disabled by default.
    pub l0_flush_stall_threshold: Option<usize>,
-    /// If true, Level0 delta layer flushes will wait for S3 upload before flushing the next
-    /// layer. This is a temporary backpressure mechanism which should be removed once
-    /// l0_flush_{delay,stall}_threshold is fully enabled.
-    ///
-    /// TODO: this is no longer enabled, remove it when the config option is no longer set.
-    pub l0_flush_wait_upload: bool,
    // Determines how much history is retained, to allow
    // branching and read replicas at an older point in time.
    // The unit is #of bytes of WAL.
@@ -579,8 +573,6 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_COMPACTION_ALGORITHM: crate::models::CompactionAlgorithm =
        crate::models::CompactionAlgorithm::Legacy;

-    pub const DEFAULT_L0_FLUSH_WAIT_UPLOAD: bool = false;
-
    pub const DEFAULT_GC_HORIZON: u64 = 64 * 1024 * 1024;

    // Large DEFAULT_GC_PERIOD is fine as long as PITR_INTERVAL is larger.
@@ -627,7 +619,6 @@ impl Default for TenantConfigToml {
            compaction_l0_semaphore: DEFAULT_COMPACTION_L0_SEMAPHORE,
            l0_flush_delay_threshold: None,
            l0_flush_stall_threshold: None,
-            l0_flush_wait_upload: DEFAULT_L0_FLUSH_WAIT_UPLOAD,
            gc_horizon: DEFAULT_GC_HORIZON,
            gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                .expect("cannot parse default gc period"),
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -523,8 +523,6 @@ pub struct TenantConfigPatch {
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub l0_flush_stall_threshold: FieldPatch<usize>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
-    pub l0_flush_wait_upload: FieldPatch<bool>,
-    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_horizon: FieldPatch<u64>,
    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
    pub gc_period: FieldPatch<String>,
@@ -614,9 +612,6 @@ pub struct TenantConfig {
    #[serde(skip_serializing_if = "Option::is_none")]
    pub l0_flush_stall_threshold: Option<usize>,

-    #[serde(skip_serializing_if = "Option::is_none")]
-    pub l0_flush_wait_upload: Option<bool>,
-
    #[serde(skip_serializing_if = "Option::is_none")]
    pub gc_horizon: Option<u64>,

@@ -712,7 +707,6 @@ impl TenantConfig {
            mut compaction_l0_semaphore,
            mut l0_flush_delay_threshold,
            mut l0_flush_stall_threshold,
-            mut l0_flush_wait_upload,
            mut gc_horizon,
            mut gc_period,
            mut image_creation_threshold,
@@ -765,7 +759,6 @@ impl TenantConfig {
        patch
            .l0_flush_stall_threshold
            .apply(&mut l0_flush_stall_threshold);
-        patch.l0_flush_wait_upload.apply(&mut l0_flush_wait_upload);
        patch.gc_horizon.apply(&mut gc_horizon);
        patch
            .gc_period
@@ -844,7 +837,6 @@ impl TenantConfig {
            compaction_l0_semaphore,
            l0_flush_delay_threshold,
            l0_flush_stall_threshold,
-            l0_flush_wait_upload,
            gc_horizon,
            gc_period,
            image_creation_threshold,
@@ -911,9 +903,6 @@ impl TenantConfig {
            l0_flush_stall_threshold: self
                .l0_flush_stall_threshold
                .or(global_conf.l0_flush_stall_threshold),
-            l0_flush_wait_upload: self
-                .l0_flush_wait_upload
-                .unwrap_or(global_conf.l0_flush_wait_upload),
            gc_horizon: self.gc_horizon.unwrap_or(global_conf.gc_horizon),
            gc_period: self.gc_period.unwrap_or(global_conf.gc_period),
            image_creation_threshold: self
@@ -1429,11 +1418,6 @@ pub struct TimelineInfo {
    pub last_record_lsn: Lsn,
    pub prev_record_lsn: Option<Lsn>,

-    /// Legacy field, retained for one version to enable old storage controller to
-    /// decode (it was a mandatory field).
-    #[serde(default, rename = "latest_gc_cutoff_lsn")]
-    pub _unused: Lsn,
-
    /// The LSN up to which GC has advanced: older data may still exist but it is not available for clients.
    /// This LSN is not suitable for deciding where to create branches etc: use [`TimelineInfo::min_readable_lsn`] instead,
    /// as it is easier to reason about.
--- a/libs/proxy/tokio-postgres2/Cargo.toml
+++ b/libs/proxy/tokio-postgres2/Cargo.toml
@@ -8,10 +8,9 @@ license = "MIT/Apache-2.0"
 bytes.workspace = true
 fallible-iterator.workspace = true
 futures-util = { workspace = true, features = ["sink"] }
-log = "0.4"
+tracing.workspace = true
 parking_lot.workspace = true
 pin-project-lite.workspace = true
-phf = "0.11"
 postgres-protocol2 = { path = "../postgres-protocol2" }
 postgres-types2 = { path = "../postgres-types2" }
 tokio = { workspace = true, features = ["io-util", "time", "net"] }
--- a/libs/proxy/tokio-postgres2/src/connection.rs
+++ b/libs/proxy/tokio-postgres2/src/connection.rs
@@ -6,13 +6,13 @@ use std::task::{Context, Poll};
 use bytes::BytesMut;
 use fallible_iterator::FallibleIterator;
 use futures_util::{Sink, Stream, ready};
-use log::{info, trace};
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tokio::sync::mpsc;
 use tokio_util::codec::Framed;
 use tokio_util::sync::PollSender;
+use tracing::{info, trace};

 use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec};
 use crate::error::DbError;
--- a/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
+++ b/libs/proxy/tokio-postgres2/src/error/sqlstate.rs
--- a/libs/proxy/tokio-postgres2/src/prepare.rs
+++ b/libs/proxy/tokio-postgres2/src/prepare.rs
@@ -5,9 +5,9 @@ use std::sync::Arc;
 use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
 use futures_util::{TryStreamExt, pin_mut};
-use log::debug;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
+use tracing::debug;

 use crate::client::{CachedTypeInfo, InnerClient};
 use crate::codec::FrontendMessage;
--- a/libs/proxy/tokio-postgres2/src/query.rs
+++ b/libs/proxy/tokio-postgres2/src/query.rs
@@ -7,11 +7,11 @@ use std::task::{Context, Poll};
 use bytes::{BufMut, Bytes, BytesMut};
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
-use log::{Level, debug, log_enabled};
 use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
 use postgres_types2::{Format, ToSql, Type};
+use tracing::debug;

 use crate::client::{InnerClient, Responses};
 use crate::codec::FrontendMessage;
@@ -36,7 +36,7 @@ where
    I: IntoIterator<Item = &'a (dyn ToSql + Sync)>,
    I::IntoIter: ExactSizeIterator,
 {
-    let buf = if log_enabled!(Level::Debug) {
+    let buf = if tracing::enabled!(tracing::Level::DEBUG) {
        let params = params.into_iter().collect::<Vec<_>>();
        debug!(
            "executing statement {} with parameters: {:?}",
--- a/libs/proxy/tokio-postgres2/src/simple_query.rs
+++ b/libs/proxy/tokio-postgres2/src/simple_query.rs
@@ -6,10 +6,10 @@ use std::task::{Context, Poll};
 use bytes::Bytes;
 use fallible_iterator::FallibleIterator;
 use futures_util::{Stream, ready};
-use log::debug;
 use pin_project_lite::pin_project;
 use postgres_protocol2::message::backend::Message;
 use postgres_protocol2::message::frontend;
+use tracing::debug;

 use crate::client::{InnerClient, Responses};
 use crate::codec::FrontendMessage;
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -558,7 +558,7 @@ async fn upload_large_enough_file(
 ) -> usize {
    let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
    let body = bytes::Bytes::from(vec![0u8; 1024]);
-    let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
+    let contents = std::iter::once(header).chain(std::iter::repeat_n(body, 128));

    let len = contents.clone().fold(0, |acc, next| acc + next.len());

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -86,17 +86,17 @@ impl Client {
        resp.json().await.map_err(Error::ReceiveBody)
    }

-    /// Get an arbitrary path and returning a streaming Response.  This function is suitable
-    /// for pass-through/proxy use cases where we don't care what the response content looks
-    /// like.
+    /// Send an HTTP request to an arbitrary path with a desired HTTP method and return a streaming
+    /// Response.  This function is suitable for pass-through/proxy use cases where we don't care
+    /// what the response content looks like.
    ///
    /// Use/add one of the properly typed methods below if you know aren't proxying, and
    /// know what kind of response you expect.
-    pub async fn get_raw(&self, path: String) -> Result<reqwest::Response> {
+    pub async fn op_raw(&self, method: Method, path: String) -> Result<reqwest::Response> {
        debug_assert!(path.starts_with('/'));
        let uri = format!("{}{}", self.mgmt_api_endpoint, path);

-        let mut req = self.client.request(Method::GET, uri);
+        let mut req = self.client.request(method, uri);
        if let Some(value) = &self.authorization_header {
            req = req.header(reqwest::header::AUTHORIZATION, value);
        }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1133,6 +1133,40 @@ components:
        applied_gc_cutoff_lsn:
          type: string
          format: hex
+        safekeepers:
+          $ref: "#/components/schemas/TimelineSafekeepersInfo"
+
+    TimelineSafekeepersInfo:
+      type: object
+      required:
+        - tenant_id
+        - timeline_id
+        - generation
+        - safekeepers
+      properties:
+        tenant_id:
+          type: string
+          format: hex
+        timeline_id:
+          type: string
+          format: hex
+        generation:
+          type: integer
+        safekeepers:
+          type: array
+          items:
+            $ref: "#/components/schemas/TimelineSafekeeperInfo"
+
+    TimelineSafekeeperInfo:
+      type: object
+      required:
+        - id
+        - hostname
+      properties:
+        id:
+          type: integer
+        hostname:
+          type: string

    SyntheticSizeResponse:
      type: object
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -74,8 +74,8 @@ use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, Timeline, WaitLsnTimeout,
-    WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
+    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
    GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -445,6 +445,9 @@ async fn build_timeline_info_common(

    let (pitr_history_size, within_ancestor_pitr) = timeline.get_pitr_history_stats();

+    // Externally, expose the lowest LSN that can be used to create a branch.
+    // Internally we distinguish between the planned GC cutoff (PITR point) and the "applied" GC cutoff (where we
+    // actually trimmed data to), which can pass each other when PITR is changed.
    let min_readable_lsn = std::cmp::max(
        timeline.get_gc_cutoff_lsn(),
        *timeline.get_applied_gc_cutoff_lsn(),
@@ -461,7 +464,6 @@ async fn build_timeline_info_common(
        initdb_lsn,
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
-        _unused: Default::default(), // Unused, for legacy decode only
        min_readable_lsn,
        applied_gc_cutoff_lsn: *timeline.get_applied_gc_cutoff_lsn(),
        current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
@@ -2256,7 +2258,6 @@ async fn timeline_compact_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
-    flags |= CompactFlags::NoYield; // run compaction to completion

    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
        flags |= CompactFlags::ForceL0Compaction;
@@ -2336,21 +2337,31 @@ async fn timeline_compact_handler(
 }

 async fn timeline_mark_invisible_handler(
-    request: Request<Body>,
+    mut request: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let compact_request = json_request_maybe::<Option<MarkInvisibleRequest>>(&mut request).await?;
+
    let state = get_state(&request);

+    let visibility = match compact_request {
+        Some(req) => match req.is_visible {
+            Some(true) => TimelineVisibilityState::Visible,
+            Some(false) | None => TimelineVisibilityState::Invisible,
+        },
+        None => TimelineVisibilityState::Invisible,
+    };
+
    async {
        let tenant = state
            .tenant_manager
            .get_attached_tenant_shard(tenant_shard_id)?;
        let timeline = tenant.get_timeline(timeline_id, true)?;
-        timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(TimelineVisibilityState::Invisible).map_err(ApiError::InternalServerError)?;
+        timeline.remote_client.schedule_index_upload_for_timeline_invisible_state(visibility).map_err(ApiError::InternalServerError)?;
        json_response(StatusCode::OK, ())
    }
    .instrument(info_span!("manual_timeline_mark_invisible", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
@@ -2417,7 +2428,6 @@ async fn timeline_checkpoint_handler(
    let state = get_state(&request);

    let mut flags = EnumSet::empty();
-    flags |= CompactFlags::NoYield; // run compaction to completion
    if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? {
        flags |= CompactFlags::ForceL0Compaction;
    }
@@ -3178,7 +3188,8 @@ async fn list_aux_files(
        timeline.gate.enter().map_err(|_| ApiError::Cancelled)?,
    );

-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download)
+        .with_scope_timeline(&timeline);
    let files = timeline
        .list_aux_files(body.lsn, &ctx, io_concurrency)
        .await?;
@@ -3776,7 +3787,7 @@ pub fn make_router(
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/mark_invisible",
-            |r| testing_api_handler("mark timeline invisible", r, timeline_mark_invisible_handler),
+            |r| api_handler( r, timeline_mark_invisible_handler),
        )
        .put(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -10,7 +10,7 @@ use std::time::{Duration, Instant};
 use enum_map::{Enum as _, EnumMap};
 use futures::Future;
 use metrics::{
-    Counter, CounterVec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
+    Counter, CounterVec, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair,
    IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
    register_counter_vec, register_gauge_vec, register_histogram, register_histogram_vec,
    register_int_counter, register_int_counter_pair_vec, register_int_counter_vec,
@@ -499,15 +499,6 @@ pub(crate) static WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS: Lazy<IntCounter> = Lazy::n
    .expect("failed to define a metric")
 });

-static FLUSH_WAIT_UPLOAD_TIME: Lazy<GaugeVec> = Lazy::new(|| {
-    register_gauge_vec!(
-        "pageserver_flush_wait_upload_seconds",
-        "Time spent waiting for preceding uploads during layer flush",
-        &["tenant_id", "shard_id", "timeline_id"]
-    )
-    .expect("failed to define a metric")
-});
-
 static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_last_record_lsn",
@@ -2864,7 +2855,6 @@ pub(crate) struct TimelineMetrics {
    timeline_id: String,
    pub flush_time_histo: StorageTimeMetrics,
    pub flush_delay_histo: StorageTimeMetrics,
-    pub flush_wait_upload_time_gauge: Gauge,
    pub compact_time_histo: StorageTimeMetrics,
    pub create_images_time_histo: StorageTimeMetrics,
    pub logical_size_histo: StorageTimeMetrics,
@@ -2916,9 +2906,6 @@ impl TimelineMetrics {
            &shard_id,
            &timeline_id,
        );
-        let flush_wait_upload_time_gauge = FLUSH_WAIT_UPLOAD_TIME
-            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
-            .unwrap();
        let compact_time_histo = StorageTimeMetrics::new(
            StorageTimeOperation::Compact,
            &tenant_id,
@@ -3046,7 +3033,6 @@ impl TimelineMetrics {
            timeline_id,
            flush_time_histo,
            flush_delay_histo,
-            flush_wait_upload_time_gauge,
            compact_time_histo,
            create_images_time_histo,
            logical_size_histo,
@@ -3096,14 +3082,6 @@ impl TimelineMetrics {
        self.resident_physical_size_gauge.get()
    }

-    pub(crate) fn flush_wait_upload_time_gauge_add(&self, duration: f64) {
-        self.flush_wait_upload_time_gauge.add(duration);
-        crate::metrics::FLUSH_WAIT_UPLOAD_TIME
-            .get_metric_with_label_values(&[&self.tenant_id, &self.shard_id, &self.timeline_id])
-            .unwrap()
-            .add(duration);
-    }
-
    /// Generates TIMELINE_LAYER labels for a persistent layer.
    fn make_layer_labels(&self, layer_desc: &PersistentLayerDesc) -> [&str; 5] {
        let level = match LayerMap::is_l0(&layer_desc.key_range, layer_desc.is_delta()) {
@@ -3207,7 +3185,6 @@ impl TimelineMetrics {
        let shard_id = &self.shard_id;
        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]);
-        let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -219,8 +219,7 @@ pageserver_runtime!(MGMT_REQUEST_RUNTIME, "mgmt request worker");
 pageserver_runtime!(WALRECEIVER_RUNTIME, "walreceiver worker");
 pageserver_runtime!(BACKGROUND_RUNTIME, "background op worker");
 // Bump this number when adding a new pageserver_runtime!
-// SAFETY: it's obviously correct
-const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = unsafe { NonZeroUsize::new_unchecked(4) };
+const NUM_MULTIPLE_RUNTIMES: NonZeroUsize = NonZeroUsize::new(4).unwrap();

 #[derive(Debug, Clone, Copy)]
 pub struct PageserverTaskId(u64);
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3080,6 +3080,7 @@ impl Tenant {
            let mut has_pending_l0 = false;
            for timeline in compact_l0 {
                let ctx = &ctx.with_scope_timeline(&timeline);
+                // NB: don't set CompactFlags::YieldForL0, since this is an L0-only compaction pass.
                let outcome = timeline
                    .compact(cancel, CompactFlags::OnlyL0Compaction.into(), ctx)
                    .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
@@ -3097,14 +3098,9 @@ impl Tenant {
            }
        }

-        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated
-        // more L0 layers, they may also be compacted here.
-        //
-        // NB: image compaction may yield if there is pending L0 compaction.
-        //
-        // TODO: it will only yield if there is pending L0 compaction on the same timeline. If a
-        // different timeline needs compaction, it won't. It should check `l0_compaction_trigger`.
-        // We leave this for a later PR.
+        // Pass 2: image compaction and timeline offloading. If any timelines have accumulated more
+        // L0 layers, they may also be compacted here. Image compaction will yield if there is
+        // pending L0 compaction on any tenant timeline.
        //
        // TODO: consider ordering timelines by some priority, e.g. time since last full compaction,
        // amount of L1 delta debt or garbage, offload-eligible timelines first, etc.
@@ -3115,8 +3111,14 @@ impl Tenant {
            }
            let ctx = &ctx.with_scope_timeline(&timeline);

+            // Yield for L0 if the separate L0 pass is enabled (otherwise there's no point).
+            let mut flags = EnumSet::default();
+            if self.get_compaction_l0_first() {
+                flags |= CompactFlags::YieldForL0;
+            }
+
            let mut outcome = timeline
-                .compact(cancel, EnumSet::default(), ctx)
+                .compact(cancel, flags, ctx)
                .instrument(info_span!("compact_timeline", timeline_id = %timeline.timeline_id))
                .await
                .inspect_err(|err| self.maybe_trip_compaction_breaker(err))?;
@@ -3246,17 +3248,23 @@ impl Tenant {
    async fn housekeeping(&self) {
        // Call through to all timelines to freeze ephemeral layers as needed. This usually happens
        // during ingest, but we don't want idle timelines to hold open layers for too long.
-        let timelines = self
-            .timelines
-            .lock()
-            .unwrap()
-            .values()
-            .filter(|tli| tli.is_active())
-            .cloned()
-            .collect_vec();
+        //
+        // We don't do this if the tenant can't upload layers (i.e. it's in stale attachment mode).
+        // We don't run compaction in this case either, and don't want to keep flushing tiny L0
+        // layers that won't be compacted down.
+        if self.tenant_conf.load().location.may_upload_layers_hint() {
+            let timelines = self
+                .timelines
+                .lock()
+                .unwrap()
+                .values()
+                .filter(|tli| tli.is_active())
+                .cloned()
+                .collect_vec();

-        for timeline in timelines {
-            timeline.maybe_freeze_ephemeral_layer().await;
+            for timeline in timelines {
+                timeline.maybe_freeze_ephemeral_layer().await;
+            }
        }

        // Shut down walredo if idle.
@@ -3681,7 +3689,7 @@ impl Tenant {
                        }
                    }
                }
-                TenantState::Active { .. } => {
+                TenantState::Active => {
                    return Ok(());
                }
                TenantState::Broken { reason, .. } => {
@@ -6516,11 +6524,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        let mut writer = tline.writer().await;
@@ -6537,11 +6541,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        let mut writer = tline.writer().await;
@@ -6558,11 +6558,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        let mut writer = tline.writer().await;
@@ -6579,11 +6575,7 @@ mod tests {

        tline.freeze_and_flush().await?;
        tline
-            .compact(
-                &CancellationToken::new(),
-                CompactFlags::NoYield.into(),
-                &ctx,
-            )
+            .compact(&CancellationToken::new(), EnumSet::default(), &ctx)
            .await?;

        assert_eq!(
@@ -6666,9 +6658,7 @@ mod tests {
            timeline.freeze_and_flush().await?;
            if compact {
                // this requires timeline to be &Arc<Timeline>
-                timeline
-                    .compact(&cancel, CompactFlags::NoYield.into(), ctx)
-                    .await?;
+                timeline.compact(&cancel, EnumSet::default(), ctx).await?;
            }

            // this doesn't really need to use the timeline_id target, but it is closer to what it
@@ -6995,7 +6985,6 @@ mod tests {
        child_timeline.freeze_and_flush().await?;
        let mut flags = EnumSet::new();
        flags.insert(CompactFlags::ForceRepartition);
-        flags.insert(CompactFlags::NoYield);
        child_timeline
            .compact(&CancellationToken::new(), flags, &ctx)
            .await?;
@@ -7374,9 +7363,7 @@ mod tests {

            // Perform a cycle of flush, compact, and GC
            tline.freeze_and_flush().await?;
-            tline
-                .compact(&cancel, CompactFlags::NoYield.into(), &ctx)
-                .await?;
+            tline.compact(&cancel, EnumSet::default(), &ctx).await?;
            tenant
                .gc_iteration(Some(tline.timeline_id), 0, Duration::ZERO, &cancel, &ctx)
                .await?;
@@ -7705,7 +7692,6 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::NoYield);
                            flags
                        } else {
                            EnumSet::empty()
@@ -7756,9 +7742,7 @@ mod tests {
        let before_num_l0_delta_files =
            tline.layers.read().await.layer_map()?.level0_deltas().len();

-        tline
-            .compact(&cancel, CompactFlags::NoYield.into(), &ctx)
-            .await?;
+        tline.compact(&cancel, EnumSet::default(), &ctx).await?;

        let after_num_l0_delta_files = tline.layers.read().await.layer_map()?.level0_deltas().len();

@@ -7923,7 +7907,6 @@ mod tests {
                            let mut flags = EnumSet::new();
                            flags.insert(CompactFlags::ForceImageLayerCreation);
                            flags.insert(CompactFlags::ForceRepartition);
-                            flags.insert(CompactFlags::NoYield);
                            flags
                        },
                        &ctx,
@@ -8386,7 +8369,6 @@ mod tests {
                    let mut flags = EnumSet::new();
                    flags.insert(CompactFlags::ForceImageLayerCreation);
                    flags.insert(CompactFlags::ForceRepartition);
-                    flags.insert(CompactFlags::NoYield);
                    flags
                },
                &ctx,
@@ -8454,7 +8436,6 @@ mod tests {
                    let mut flags = EnumSet::new();
                    flags.insert(CompactFlags::ForceImageLayerCreation);
                    flags.insert(CompactFlags::ForceRepartition);
-                    flags.insert(CompactFlags::NoYield);
                    flags
                },
                &ctx,
@@ -11551,4 +11532,255 @@ mod tests {

        Ok(())
    }
+
+    #[cfg(feature = "testing")]
+    #[tokio::test]
+    async fn test_synthetic_size_calculation_with_invisible_branches() -> anyhow::Result<()> {
+        use pageserver_api::models::TimelineVisibilityState;
+
+        use crate::tenant::size::gather_inputs;
+
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            // Ensure that we don't compute gc_cutoffs (which needs reading the layer files)
+            pitr_interval: Some(Duration::ZERO),
+            ..Default::default()
+        };
+        let harness = TenantHarness::create_custom(
+            "test_synthetic_size_calculation_with_invisible_branches",
+            tenant_conf,
+            TenantId::generate(),
+            ShardIdentity::unsharded(),
+            Generation::new(0xdeadbeef),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let main_tline = tenant
+            .create_test_timeline_with_layers(
+                TIMELINE_ID,
+                Lsn(0x10),
+                DEFAULT_PG_VERSION,
+                &ctx,
+                vec![],
+                vec![],
+                vec![],
+                Lsn(0x100),
+            )
+            .await?;
+
+        let snapshot1 = TimelineId::from_array(hex!("11223344556677881122334455667790"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot1,
+                Some(Lsn(0x20)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let snapshot2 = TimelineId::from_array(hex!("11223344556677881122334455667791"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot2,
+                Some(Lsn(0x30)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let snapshot3 = TimelineId::from_array(hex!("11223344556677881122334455667792"));
+        tenant
+            .branch_timeline_test_with_layers(
+                &main_tline,
+                snapshot3,
+                Some(Lsn(0x40)),
+                &ctx,
+                vec![],
+                vec![],
+                Lsn(0x50),
+            )
+            .await?;
+        let limit = Arc::new(Semaphore::new(1));
+        let max_retention_period = None;
+        let mut logical_size_cache = HashMap::new();
+        let cause = LogicalSizeCalculationCause::EvictionTaskImitation;
+        let cancel = CancellationToken::new();
+
+        let inputs = gather_inputs(
+            &tenant,
+            &limit,
+            max_retention_period,
+            &mut logical_size_cache,
+            cause,
+            &cancel,
+            &ctx,
+        )
+        .instrument(info_span!(
+            "gather_inputs",
+            tenant_id = "unknown",
+            shard_id = "unknown",
+        ))
+        .await?;
+        use crate::tenant::size::{LsnKind, ModelInputs, SegmentMeta};
+        use LsnKind::*;
+        use tenant_size_model::Segment;
+        let ModelInputs { mut segments, .. } = inputs;
+        segments.retain(|s| s.timeline_id == TIMELINE_ID);
+        for segment in segments.iter_mut() {
+            segment.segment.parent = None; // We don't care about the parent for the test
+            segment.segment.size = None; // We don't care about the size for the test
+        }
+        assert_eq!(
+            segments,
+            [
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x10,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchStart,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x20,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x30,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x100,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: GcCutOff,
+                }, // we need to retain everything above the last branch point
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x100,
+                        size: None,
+                        needed: true,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchEnd,
+                },
+            ]
+        );
+
+        main_tline
+            .remote_client
+            .schedule_index_upload_for_timeline_invisible_state(
+                TimelineVisibilityState::Invisible,
+            )?;
+        main_tline.remote_client.wait_completion().await?;
+        let inputs = gather_inputs(
+            &tenant,
+            &limit,
+            max_retention_period,
+            &mut logical_size_cache,
+            cause,
+            &cancel,
+            &ctx,
+        )
+        .instrument(info_span!(
+            "gather_inputs",
+            tenant_id = "unknown",
+            shard_id = "unknown",
+        ))
+        .await?;
+        let ModelInputs { mut segments, .. } = inputs;
+        segments.retain(|s| s.timeline_id == TIMELINE_ID);
+        for segment in segments.iter_mut() {
+            segment.segment.parent = None; // We don't care about the parent for the test
+            segment.segment.size = None; // We don't care about the size for the test
+        }
+        assert_eq!(
+            segments,
+            [
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x10,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchStart,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x20,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x30,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40,
+                        size: None,
+                        needed: false,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchPoint,
+                },
+                SegmentMeta {
+                    segment: Segment {
+                        parent: None,
+                        lsn: 0x40, // Branch end LSN == last branch point LSN
+                        size: None,
+                        needed: true,
+                    },
+                    timeline_id: TIMELINE_ID,
+                    kind: BranchEnd,
+                },
+            ]
+        );
+        Ok(())
+    }
 }
--- a/pageserver/src/tenant/layer_map/layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/layer_coverage.rs
@@ -53,7 +53,7 @@ impl<Value: Clone> LayerCoverage<Value> {
    ///
    /// Complexity: O(log N)
    fn add_node(&mut self, key: i128) {
-        let value = match self.nodes.range(..=key).last() {
+        let value = match self.nodes.range(..=key).next_back() {
            Some((_, Some(v))) => Some(v.clone()),
            Some((_, None)) => None,
            None => None,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -58,7 +58,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

 /// For a tenant that appears in TenantsMap, it may either be
 /// - `Attached`: has a full Tenant object, is elegible to service
-///    reads and ingest WAL.
+///   reads and ingest WAL.
 /// - `Secondary`: is only keeping a local cache warm.
 ///
 /// Secondary is a totally distinct state rather than being a mode of a `Tenant`, because
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1968,9 +1968,7 @@ impl RemoteTimelineClient {
    /// Pick next tasks from the queue, and start as many of them as possible without violating
    /// the ordering constraints.
    ///
-    /// TODO: consider limiting the number of in-progress tasks, beyond what remote_storage does.
-    /// This can launch an unbounded number of queued tasks. `UploadQueue::next_ready()` also has
-    /// worst-case quadratic cost in the number of tasks, and may struggle beyond 10,000 tasks.
+    /// The number of in-progress tasks is limited by `Self::inprogress_tasks`; see `next_ready`.
    fn launch_queued_tasks(self: &Arc<Self>, upload_queue: &mut UploadQueueInitialized) {
        while let Some((mut next_op, coalesced_ops)) = upload_queue.next_ready() {
            debug!("starting op: {next_op}");
@@ -2218,6 +2216,11 @@ impl RemoteTimelineClient {
                    }
                    res
                }
+                // TODO: this should wait for the deletion to be executed by the deletion queue.
+                // Otherwise, the deletion may race with an upload and wrongfully delete a newer
+                // file. Some of the above logic attempts to work around this; it should be replaced
+                // by the upload queue ordering guarantees (see `can_bypass`). See:
+                // <https://github.com/neondatabase/neon/issues/10283>.
                UploadOp::Delete(delete) => {
                    if self.config.read().unwrap().block_deletions {
                        let mut queue_locked = self.upload_queue.lock().unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -130,7 +130,7 @@ impl IndexPart {
    /// Version history
    /// - 2: added `deleted_at`
    /// - 3: no longer deserialize `timeline_layers` (serialized format is the same, but timeline_layers
-    ///      is always generated from the keys of `layer_metadata`)
+    ///   is always generated from the keys of `layer_metadata`)
    /// - 4: timeline_layers is fully removed.
    /// - 5: lineage was added
    /// - 6: last_aux_file_policy is added.
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -33,7 +33,7 @@ pub struct ModelInputs {
 }

 /// A [`Segment`], with some extra information for display purposes
-#[derive(Debug, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, serde::Serialize, serde::Deserialize, PartialEq, Eq)]
 pub struct SegmentMeta {
    pub segment: Segment,
    pub timeline_id: TimelineId,
@@ -248,6 +248,8 @@ pub(super) async fn gather_inputs(
            None
        };

+        let branch_is_invisible = timeline.is_invisible() == Some(true);
+
        let lease_points = gc_info
            .leases
            .keys()
@@ -271,7 +273,10 @@ pub(super) async fn gather_inputs(
            .map(|(lsn, _child_id, _is_offloaded)| (lsn, LsnKind::BranchPoint))
            .collect::<Vec<_>>();

-        lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+        if !branch_is_invisible {
+            // Do not count lease points for invisible branches.
+            lsns.extend(lease_points.iter().map(|&lsn| (lsn, LsnKind::LeasePoint)));
+        }

        drop(gc_info);

@@ -287,7 +292,9 @@ pub(super) async fn gather_inputs(

        // Add a point for the PITR cutoff
        let branch_start_needed = next_pitr_cutoff <= branch_start_lsn;
-        if !branch_start_needed {
+        if !branch_start_needed && !branch_is_invisible {
+            // Only add the GcCutOff point when the timeline is visible; otherwise, do not compute the size for the LSN
+            // range from the last branch point to the latest data.
            lsns.push((next_pitr_cutoff, LsnKind::GcCutOff));
        }

@@ -373,11 +380,19 @@ pub(super) async fn gather_inputs(
            }
        }

+        let branch_end_lsn = if branch_is_invisible {
+            // If the branch is invisible, the branch end is the last requested LSN (likely a branch cutoff point).
+            segments.last().unwrap().segment.lsn
+        } else {
+            // Otherwise, the branch end is the last record LSN.
+            last_record_lsn.0
+        };
+
        // Current end of the timeline
        segments.push(SegmentMeta {
            segment: Segment {
                parent: Some(parent),
-                lsn: last_record_lsn.0,
+                lsn: branch_end_lsn,
                size: None, // Filled in later, if necessary
                needed: true,
            },
@@ -609,6 +624,7 @@ async fn calculate_logical_size(
    Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }

+#[cfg(test)]
 #[test]
 fn verify_size_for_multiple_branches() {
    // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
@@ -766,6 +782,7 @@ fn verify_size_for_multiple_branches() {
    assert_eq!(inputs.calculate(), 37_851_408);
 }

+#[cfg(test)]
 #[test]
 fn verify_size_for_one_branch() {
    let doc = r#"
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -84,8 +84,8 @@ use self::eviction_task::EvictionTaskTimelineState;
 use self::layer_manager::LayerManager;
 use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};
+use super::remote_timeline_client::RemoteTimelineClient;
 use super::remote_timeline_client::index::{GcCompactionState, IndexPart};
-use super::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError};
 use super::secondary::heatmap::HeatMapLayer;
 use super::storage_layer::{LayerFringe, LayerVisibilityHint, ReadableLayer};
 use super::tasks::log_compaction_error;
@@ -870,9 +870,14 @@ pub(crate) enum CompactFlags {
    OnlyL0Compaction,
    EnhancedGcBottomMostCompaction,
    DryRun,
-    /// Disables compaction yielding e.g. due to high L0 count. This is set e.g. when requesting
-    /// compaction via HTTP API.
-    NoYield,
+    /// Makes image compaction yield if there's pending L0 compaction. This should always be used in
+    /// the background compaction task, since we want to aggressively compact down L0 to bound
+    /// read amplification.
+    ///
+    /// It only makes sense to use this when `compaction_l0_first` is enabled (such that we yield to
+    /// an L0 compaction pass), and without `OnlyL0Compaction` (L0 compaction shouldn't yield for L0
+    /// compaction).
+    YieldForL0,
 }

 #[serde_with::serde_as]
@@ -890,6 +895,12 @@ pub(crate) struct CompactRequest {
    pub sub_compaction_max_job_size_mb: Option<u64>,
 }

+#[derive(Debug, Clone, serde::Deserialize)]
+pub(crate) struct MarkInvisibleRequest {
+    #[serde(default)]
+    pub is_visible: Option<bool>,
+}
+
 #[derive(Debug, Clone, Default)]
 pub(crate) struct CompactOptions {
    pub flags: EnumSet<CompactFlags>,
@@ -1891,18 +1902,19 @@ impl Timeline {
        // out by other background tasks (including image compaction). We request this via
        // `BackgroundLoopKind::L0Compaction`.
        //
-        // If this is a regular compaction pass, and L0-only compaction is enabled in the config,
-        // then we should yield for immediate L0 compaction if necessary while we're waiting for the
-        // background task semaphore. There's no point yielding otherwise, since we'd just end up
-        // right back here.
+        // Yield for pending L0 compaction while waiting for the semaphore.
        let is_l0_only = options.flags.contains(CompactFlags::OnlyL0Compaction);
        let semaphore_kind = match is_l0_only && self.get_compaction_l0_semaphore() {
            true => BackgroundLoopKind::L0Compaction,
            false => BackgroundLoopKind::Compaction,
        };
-        let yield_for_l0 = !is_l0_only
-            && self.get_compaction_l0_first()
-            && !options.flags.contains(CompactFlags::NoYield);
+        let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0);
+        if yield_for_l0 {
+            // If this is an L0 pass, it doesn't make sense to yield for L0.
+            debug_assert!(!is_l0_only, "YieldForL0 during L0 pass");
+            // If `compaction_l0_first` is disabled, there's no point yielding.
+            debug_assert!(self.get_compaction_l0_first(), "YieldForL0 without L0 pass");
+        }

        let acquire = async move {
            let guard = self.compaction_lock.lock().await;
@@ -2209,6 +2221,10 @@ impl Timeline {
        self.remote_client.is_archived()
    }

+    pub(crate) fn is_invisible(&self) -> Option<bool> {
+        self.remote_client.is_invisible()
+    }
+
    pub(crate) fn is_stopping(&self) -> bool {
        self.current_state() == TimelineState::Stopping
    }
@@ -2231,7 +2247,7 @@ impl Timeline {
                        .await
                        .expect("holding a reference to self");
                }
-                TimelineState::Active { .. } => {
+                TimelineState::Active => {
                    return Ok(());
                }
                TimelineState::Broken { .. } | TimelineState::Stopping => {
@@ -2562,14 +2578,6 @@ impl Timeline {
        Some(max(l0_flush_stall_threshold, compaction_threshold))
    }

-    fn get_l0_flush_wait_upload(&self) -> bool {
-        let tenant_conf = self.tenant_conf.load();
-        tenant_conf
-            .tenant_conf
-            .l0_flush_wait_upload
-            .unwrap_or(self.conf.default_tenant_conf.l0_flush_wait_upload)
-    }
-
    fn get_image_creation_threshold(&self) -> usize {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -4591,27 +4599,6 @@ impl Timeline {
            // release lock on 'layers'
        };

-        // Backpressure mechanism: wait with continuation of the flush loop until we have uploaded all layer files.
-        // This makes us refuse ingest until the new layers have been persisted to the remote
-        // TODO: remove this, and rely on l0_flush_{delay,stall}_threshold instead.
-        if self.get_l0_flush_wait_upload() {
-            let start = Instant::now();
-            self.remote_client
-                .wait_completion()
-                .await
-                .map_err(|e| match e {
-                    WaitCompletionError::UploadQueueShutDownOrStopped
-                    | WaitCompletionError::NotInitialized(
-                        NotInitialized::ShuttingDown | NotInitialized::Stopped,
-                    ) => FlushLayerError::Cancelled,
-                    WaitCompletionError::NotInitialized(NotInitialized::Uninitialized) => {
-                        FlushLayerError::Other(anyhow!(e).into())
-                    }
-                })?;
-            let duration = start.elapsed().as_secs_f64();
-            self.metrics.flush_wait_upload_time_gauge_add(duration);
-        }
-
        // FIXME: between create_delta_layer and the scheduling of the upload in `update_metadata_file`,
        // a compaction can delete the file and then it won't be available for uploads any more.
        // We still schedule the upload, resulting in an error, but ideally we'd somehow avoid this
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -394,8 +394,8 @@ impl GcCompactionQueue {
                if job.dry_run {
                    flags |= CompactFlags::DryRun;
                }
-                if options.flags.contains(CompactFlags::NoYield) {
-                    flags |= CompactFlags::NoYield;
+                if options.flags.contains(CompactFlags::YieldForL0) {
+                    flags |= CompactFlags::YieldForL0;
                }
                let options = CompactOptions {
                    flags,
@@ -983,7 +983,7 @@ impl Timeline {

        // Yield if we have pending L0 compaction. The scheduler will do another pass.
        if (l0_outcome == CompactionOutcome::Pending || l0_outcome == CompactionOutcome::YieldForL0)
-            && !options.flags.contains(CompactFlags::NoYield)
+            && options.flags.contains(CompactFlags::YieldForL0)
        {
            info!("image/ancestor compaction yielding for L0 compaction");
            return Ok(CompactionOutcome::YieldForL0);
@@ -1028,7 +1028,7 @@ impl Timeline {
                            .load()
                            .as_ref()
                            .clone(),
-                        !options.flags.contains(CompactFlags::NoYield),
+                        options.flags.contains(CompactFlags::YieldForL0),
                    )
                    .await
                    .inspect_err(|err| {
@@ -2635,7 +2635,7 @@ impl Timeline {
    ) -> Result<CompactionOutcome, CompactionError> {
        let sub_compaction = options.sub_compaction;
        let job = GcCompactJob::from_compact_options(options.clone());
-        let no_yield = options.flags.contains(CompactFlags::NoYield);
+        let yield_for_l0 = options.flags.contains(CompactFlags::YieldForL0);
        if sub_compaction {
            info!(
                "running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"
@@ -2650,7 +2650,7 @@ impl Timeline {
                    idx + 1,
                    jobs_len
                );
-                self.compact_with_gc_inner(cancel, job, ctx, no_yield)
+                self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
                    .await?;
            }
            if jobs_len == 0 {
@@ -2658,7 +2658,8 @@ impl Timeline {
            }
            return Ok(CompactionOutcome::Done);
        }
-        self.compact_with_gc_inner(cancel, job, ctx, no_yield).await
+        self.compact_with_gc_inner(cancel, job, ctx, yield_for_l0)
+            .await
    }

    async fn compact_with_gc_inner(
@@ -2666,7 +2667,7 @@ impl Timeline {
        cancel: &CancellationToken,
        job: GcCompactJob,
        ctx: &RequestContext,
-        no_yield: bool,
+        yield_for_l0: bool,
    ) -> Result<CompactionOutcome, CompactionError> {
        // Block other compaction/GC tasks from running for now. GC-compaction could run along
        // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
@@ -2936,18 +2937,15 @@ impl Timeline {
            if cancel.is_cancelled() {
                return Err(CompactionError::ShuttingDown);
            }
-            if !no_yield {
-                let should_yield = self
+            let should_yield = yield_for_l0
+                && self
                    .l0_compaction_trigger
                    .notified()
                    .now_or_never()
                    .is_some();
-                if should_yield {
-                    tracing::info!(
-                        "preempt gc-compaction when downloading layers: too many L0 layers"
-                    );
-                    return Ok(CompactionOutcome::YieldForL0);
-                }
+            if should_yield {
+                tracing::info!("preempt gc-compaction when downloading layers: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
            }
            let resident_layer = layer
                .download_and_keep_resident(ctx)
@@ -3081,21 +3079,17 @@ impl Timeline {
                return Err(CompactionError::ShuttingDown);
            }

-            if !no_yield {
-                keys_processed += 1;
-                if keys_processed % 1000 == 0 {
-                    let should_yield = self
-                        .l0_compaction_trigger
-                        .notified()
-                        .now_or_never()
-                        .is_some();
-                    if should_yield {
-                        tracing::info!(
-                            "preempt gc-compaction in the main loop: too many L0 layers"
-                        );
-                        return Ok(CompactionOutcome::YieldForL0);
-                    }
-                }
+            keys_processed += 1;
+            let should_yield = yield_for_l0
+                && keys_processed % 1000 == 0
+                && self
+                    .l0_compaction_trigger
+                    .notified()
+                    .now_or_never()
+                    .is_some();
+            if should_yield {
+                tracing::info!("preempt gc-compaction in the main loop: too many L0 layers");
+                return Ok(CompactionOutcome::YieldForL0);
            }
            if self.shard_identity.is_key_disposable(&key) {
                // If this shard does not need to store this key, simply skip it.
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -2,10 +2,14 @@ use std::collections::HashSet;
 use std::sync::Arc;

 use anyhow::Context;
+use bytes::Bytes;
 use http_utils::error::ApiError;
+use pageserver_api::key::Key;
+use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::DetachBehavior;
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::shard::ShardIdentity;
+use pageserver_compaction::helpers::overlaps_with;
 use tokio::sync::Semaphore;
 use tokio_util::sync::CancellationToken;
 use tracing::Instrument;
@@ -22,7 +26,10 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::Tenant;
 use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
 use crate::tenant::storage_layer::layer::local_layer_path;
-use crate::tenant::storage_layer::{AsLayerDesc as _, DeltaLayerWriter, Layer, ResidentLayer};
+use crate::tenant::storage_layer::{
+    AsLayerDesc as _, DeltaLayerWriter, ImageLayerWriter, IoConcurrency, Layer, ResidentLayer,
+    ValuesReconstructState,
+};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};

 #[derive(Debug, thiserror::Error)]
@@ -170,6 +177,92 @@ impl Attempt {
    }
 }

+async fn generate_tombstone_image_layer(
+    detached: &Arc<Timeline>,
+    ancestor: &Arc<Timeline>,
+    ancestor_lsn: Lsn,
+    ctx: &RequestContext,
+) -> Result<Option<ResidentLayer>, Error> {
+    tracing::info!(
+        "removing non-inherited keys by writing an image layer with tombstones at the detach LSN"
+    );
+    let io_concurrency = IoConcurrency::spawn_from_conf(
+        detached.conf,
+        detached.gate.enter().map_err(|_| Error::ShuttingDown)?,
+    );
+    let mut reconstruct_state = ValuesReconstructState::new(io_concurrency);
+    // Directly use `get_vectored_impl` to skip the max_vectored_read_key limit check. Note that the keyspace should
+    // not contain too many keys, otherwise this takes a lot of memory. Currently we limit it to 10k keys in the compute.
+    let key_range = Key::sparse_non_inherited_keyspace();
+    // avoid generating a "future layer" which will then be removed
+    let image_lsn = ancestor_lsn;
+
+    {
+        let layers = detached.layers.read().await;
+        for layer in layers.all_persistent_layers() {
+            if !layer.is_delta
+                && layer.lsn_range.start == image_lsn
+                && overlaps_with(&key_range, &layer.key_range)
+            {
+                tracing::warn!(
+                    layer=%layer, "image layer at the detach LSN already exists, skipping removing aux files"
+                );
+                return Ok(None);
+            }
+        }
+    }
+
+    let data = ancestor
+        .get_vectored_impl(
+            KeySpace::single(key_range.clone()),
+            image_lsn,
+            &mut reconstruct_state,
+            ctx,
+        )
+        .await
+        .context("failed to retrieve aux keys")
+        .map_err(|e| Error::launder(e, Error::Prepare))?;
+    if !data.is_empty() {
+        // TODO: is it possible that we can have an image at `image_lsn`? Unlikely, because image layers are only generated
+        // upon compaction, but theoretically possible.
+        let mut image_layer_writer = ImageLayerWriter::new(
+            detached.conf,
+            detached.timeline_id,
+            detached.tenant_shard_id,
+            &key_range,
+            image_lsn,
+            ctx,
+        )
+        .await
+        .context("failed to create image layer writer")
+        .map_err(Error::Prepare)?;
+        for key in data.keys() {
+            image_layer_writer
+                .put_image(*key, Bytes::new(), ctx)
+                .await
+                .context("failed to write key")
+                .map_err(|e| Error::launder(e, Error::Prepare))?;
+        }
+        let (desc, path) = image_layer_writer
+            .finish(ctx)
+            .await
+            .context("failed to finish image layer writer for removing the metadata keys")
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        let generated = Layer::finish_creating(detached.conf, detached, desc, &path)
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        detached
+            .remote_client
+            .upload_layer_file(&generated, &detached.cancel)
+            .await
+            .map_err(|e| Error::launder(e, Error::Prepare))?;
+        tracing::info!(layer=%generated, "wrote image layer");
+        Ok(Some(generated))
+    } else {
+        tracing::info!("no aux keys found in ancestor");
+        Ok(None)
+    }
+}
+
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
    detached: &Arc<Timeline>,
@@ -235,7 +328,7 @@ pub(super) async fn prepare(
        return Err(NoAncestor);
    }

-    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+    check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn, behavior)?;

    if let DetachBehavior::MultiLevelAndNoReparent = behavior {
        // If the ancestor has an ancestor, we might be able to fast-path detach it if the current ancestor does not have any data written/used by the detaching timeline.
@@ -249,7 +342,13 @@ pub(super) async fn prepare(
            ancestor_lsn = ancestor.ancestor_lsn; // Get the LSN first before resetting the `ancestor` variable
            ancestor = ancestor_of_ancestor;
            // TODO: do we still need to check if we don't want to reparent?
-            check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;
+            check_no_archived_children_of_ancestor(
+                tenant,
+                detached,
+                &ancestor,
+                ancestor_lsn,
+                behavior,
+            )?;
        }
    } else if ancestor.ancestor_timeline.is_some() {
        // non-technical requirement; we could flatten N ancestors just as easily but we chose
@@ -346,10 +445,16 @@ pub(super) async fn prepare(

    // TODO: copying and lsn prefix copying could be done at the same time with a single fsync after
    let mut new_layers: Vec<Layer> =
-        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len());
+        Vec::with_capacity(straddling_branchpoint.len() + rest_of_historic.len() + 1);
+
+    if let Some(tombstone_layer) =
+        generate_tombstone_image_layer(detached, &ancestor, ancestor_lsn, ctx).await?
+    {
+        new_layers.push(tombstone_layer.into());
+    }

    {
-        tracing::debug!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");
+        tracing::info!(to_rewrite = %straddling_branchpoint.len(), "copying prefix of delta layers");

        let mut tasks = tokio::task::JoinSet::new();

@@ -1156,31 +1261,44 @@ fn check_no_archived_children_of_ancestor(
    detached: &Arc<Timeline>,
    ancestor: &Arc<Timeline>,
    ancestor_lsn: Lsn,
+    detach_behavior: DetachBehavior,
 ) -> Result<(), Error> {
-    let timelines = tenant.timelines.lock().unwrap();
-    let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
-    for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
-        if timeline.is_archived() == Some(true) {
-            return Err(Error::Archived(timeline.timeline_id));
-        }
-    }
-    for timeline_offloaded in timelines_offloaded.values() {
-        if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
-            continue;
-        }
-        // This forbids the detach ancestor feature if flattened timelines are present,
-        // even if the ancestor_lsn is from after the branchpoint of the detached timeline.
-        // But as per current design, we don't record the ancestor_lsn of flattened timelines.
-        // This is a bit unfortunate, but as of writing this we don't support flattening
-        // anyway. Maybe we can evolve the data model in the future.
-        if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
-            let is_earlier = retain_lsn <= ancestor_lsn;
-            if !is_earlier {
-                continue;
+    match detach_behavior {
+        DetachBehavior::NoAncestorAndReparent => {
+            let timelines = tenant.timelines.lock().unwrap();
+            let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
+
+            for timeline in
+                reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn)
+            {
+                if timeline.is_archived() == Some(true) {
+                    return Err(Error::Archived(timeline.timeline_id));
+                }
+            }
+
+            for timeline_offloaded in timelines_offloaded.values() {
+                if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
+                    continue;
+                }
+                // This forbids the detach ancestor feature if flattened timelines are present,
+                // even if the ancestor_lsn is from after the branchpoint of the detached timeline.
+                // But as per current design, we don't record the ancestor_lsn of flattened timelines.
+                // This is a bit unfortunate, but as of writing this we don't support flattening
+                // anyway. Maybe we can evolve the data model in the future.
+                if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
+                    let is_earlier = retain_lsn <= ancestor_lsn;
+                    if !is_earlier {
+                        continue;
+                    }
+                }
+                return Err(Error::Archived(timeline_offloaded.timeline_id));
            }
        }
-        return Err(Error::Archived(timeline_offloaded.timeline_id));
+        DetachBehavior::MultiLevelAndNoReparent => {
+            // We don't need to check anything if the user requested to not reparent.
+        }
    }
+
    Ok(())
 }

--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs
@@ -25,8 +25,8 @@ impl<const A: usize> AlignedBufferMut<ConstAlign<A>> {
    /// * `align` must be a power of two,
    ///
    /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
+    ///   must not overflow isize (i.e., the rounded value must be
+    ///   less than or equal to `isize::MAX`).
    pub fn with_capacity(capacity: usize) -> Self {
        AlignedBufferMut {
            raw: RawAlignedBuffer::with_capacity(capacity),
--- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs
@@ -37,8 +37,8 @@ impl<const A: usize> RawAlignedBuffer<ConstAlign<A>> {
    /// * `align` must be a power of two,
    ///
    /// * `capacity`, when rounded up to the nearest multiple of `align`,
-    ///    must not overflow isize (i.e., the rounded value must be
-    ///    less than or equal to `isize::MAX`).
+    ///   must not overflow isize (i.e., the rounded value must be
+    ///   less than or equal to `isize::MAX`).
    pub fn with_capacity(capacity: usize) -> Self {
        let align = ConstAlign::<A>;
        let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout");
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -647,18 +647,25 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	return found;
 }

+#if PG_MAJORVERSION_NUM >= 16
+static PGIOAlignedBlock voidblock = {0};
+#else
+static PGAlignedBlock voidblock = {0};
+#endif
+#define SCRIBBLEPAGE (&voidblock.data)
+
 /*
 * Try to read pages from local cache.
 * Returns the number of pages read from the local cache, and sets bits in
- * 'read' for the pages which were read. This may scribble over buffers not
- * marked in 'read', so be careful with operation ordering.
+ * 'mask' for the pages which were read. This may scribble over buffers not
+ * marked in 'mask', so be careful with operation ordering.
 *
 * In case of error local file cache is disabled (lfc->limit is set to zero),
- * and -1 is returned. Note that 'read' and the buffers may be touched and in
- * an otherwise invalid state.
+ * and -1 is returned.
 *
- * If the mask argument is supplied, bits will be set at the offsets of pages
- * that were present and read from the LFC.
+ * If the mask argument is supplied, we'll only try to read those pages which
+ * don't have their bits set on entry. At exit, pages which were successfully
+ * read from LFC will have their bits set.
 */
 int
 lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
@@ -693,34 +700,57 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	while (nblocks > 0)
 	{
 		struct iovec iov[PG_IOV_MAX];
-		int		chunk_offs = blkno & (BLOCKS_PER_CHUNK - 1);
-		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - chunk_offs);
+		int8	chunk_mask[BLOCKS_PER_CHUNK / 8] = {0};
+		int		chunk_offs = (blkno & (BLOCKS_PER_CHUNK - 1));
+		int		blocks_in_chunk = Min(nblocks, BLOCKS_PER_CHUNK - (blkno % BLOCKS_PER_CHUNK));
 		int		iteration_hits = 0;
 		int		iteration_misses = 0;
 		uint64	io_time_us = 0;
-		int     n_blocks_to_read = 0;
+		int		n_blocks_to_read = 0;
+		int		iov_last_used = 0;
+		int		first_block_in_chunk_read = -1;
 		ConditionVariable* cv;

 		Assert(blocks_in_chunk > 0);

 		for (int i = 0; i < blocks_in_chunk; i++)
 		{
-			n_blocks_to_read += (BITMAP_ISSET(mask, buf_offset + i) != 0);
-			iov[i].iov_base = buffers[buf_offset + i];
 			iov[i].iov_len = BLCKSZ;
+			/* mask not set = we must do work */
+			if (!BITMAP_ISSET(mask, buf_offset + i))
+			{
+				iov[i].iov_base = buffers[buf_offset + i];
+				n_blocks_to_read++;
+				iov_last_used = i + 1;
+
+				if (first_block_in_chunk_read == -1)
+				{
+					first_block_in_chunk_read = i;
+				}
+			}
+			/* mask set = we must do no work */
+			else
+			{
+				/* don't scribble on pages we weren't requested to write to */
+				iov[i].iov_base = SCRIBBLEPAGE;
+			}
 		}
+
+		/* shortcut IO */
 		if (n_blocks_to_read == 0)
 		{
-			for (int i = 0; i < blocks_in_chunk; i++)
-			{
-				BITMAP_CLR(mask, buf_offset + i);
-			}
 			buf_offset += blocks_in_chunk;
 			nblocks -= blocks_in_chunk;
 			blkno += blocks_in_chunk;
 			continue;
 		}

+		/*
+		 * The effective iov size must be >= the number of blocks we're about
+		 * to read.
+		 */
+		Assert(iov_last_used - first_block_in_chunk_read >= n_blocks_to_read);
+
 		tag.blockNum = blkno - chunk_offs;
 		hash = get_hash_value(lfc_hash, &tag);
 		cv = &lfc_ctl->cv[hash % N_COND_VARS];
@@ -747,10 +777,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		if (entry == NULL)
 		{
 			/* Pages are not cached */
-			for (int i = 0; i < blocks_in_chunk; i++)
-			{
-				BITMAP_CLR(mask, buf_offset + i);
-			}
 			lfc_ctl->misses += blocks_in_chunk;
 			pgBufferUsage.file_cache.misses += blocks_in_chunk;
 			LWLockRelease(lfc_lock);
@@ -769,12 +795,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		generation = lfc_ctl->generation;
 		entry_offset = entry->offset;

-		for (int i = 0; i < blocks_in_chunk; i++)
+		for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
 		{
 			FileCacheBlockState state = UNAVAILABLE;
 			bool sleeping = false;

-			if (!BITMAP_ISSET(mask, buf_offset + i))
+			/* no need to work on something we're not interested in */
+			if (BITMAP_ISSET(mask, buf_offset + i))
 				continue;

 			while (lfc_ctl->generation == generation)
@@ -800,13 +827,11 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 			}
 			if (state == AVAILABLE)
 			{
+				BITMAP_SET(chunk_mask, i);
 				iteration_hits++;
 			}
 			else
-			{
-				BITMAP_CLR(mask, buf_offset + i);
 				iteration_misses++;
-			}
 		}
 		LWLockRelease(lfc_lock);

@@ -814,36 +839,33 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,

 		if (iteration_hits != 0)
 		{
-			if (blocks_in_chunk == n_blocks_to_read)
-			{
-				pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
-				rc = preadv(lfc_desc, iov, blocks_in_chunk,
-							((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ);
-				pgstat_report_wait_end();
+			/* chunk offset (# of pages) into the LFC file */
+			off_t	first_read_offset = (off_t) entry_offset * BLOCKS_PER_CHUNK;
+			int		nwrite = iov_last_used - first_block_in_chunk_read;
+			/* offset of first IOV */
+			first_read_offset += chunk_offs + first_block_in_chunk_read;

-				if (rc != (BLCKSZ * blocks_in_chunk))
-				{
-					lfc_disable("read");
-					return -1;
-				}
-			}
-			else
+			pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
+
+			/* Read only the span from the first to the last block we're interested in */
+			rc = preadv(lfc_desc, &iov[first_block_in_chunk_read],
+						nwrite, first_read_offset * BLCKSZ);
+			pgstat_report_wait_end();
+
+			if (rc != (BLCKSZ * nwrite))
 			{
-				/* Some blocks are already prefetched in provided buffers, we should not rewrite them, so we can not use vector read */
-				for (int i = 0; i < blocks_in_chunk; i++)
-				{
-					if (BITMAP_ISSET(mask, buf_offset + i))
-					{
-						pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ);
-						rc = pread(lfc_desc, iov[i].iov_base, BLCKSZ, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs + i) * BLCKSZ);
-						pgstat_report_wait_end();
-						if (rc != BLCKSZ)
-						{
-							lfc_disable("read");
-							return -1;
-						}
-					}
-				}
+				lfc_disable("read");
+				return -1;
+			}
+
+			/*
+			 * We successfully read the pages we know were valid when we
+			 * started reading; now mark those pages as read
+			 */
+			for (int i = first_block_in_chunk_read; i < iov_last_used; i++)
+			{
+				if (BITMAP_ISSET(chunk_mask, i))
+					BITMAP_SET(mask, buf_offset + i);
 			}
 		}

@@ -1034,12 +1056,12 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		LWLockRelease(lfc_lock);
 		return false;
 	}
-
+	
 	lwlsn = neon_get_lwlsn(rinfo, forknum, blkno);

 	if (lwlsn > lsn)
 	{
-		elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_modified_since LSN %X/%X",
+		elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
 			 blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn));
 		LWLockRelease(lfc_lock);
 		return false;
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -315,7 +315,7 @@ static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 void *buffer)
 {
-	bits8		rv = 1;
+	bits8		rv = 0;
 	return lfc_readv_select(rinfo, forkNum, blkno, &buffer, 1, &rv) == 1;
 }

--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -99,7 +99,7 @@ static char *hexdump_page(char *page);

 #define IS_LOCAL_REL(reln) (\
 	NInfoGetDbOid(InfoFromSMgrRel(reln)) != 0 && \
-		NInfoGetRelNumber(InfoFromSMgrRel(reln)) > FirstNormalObjectId \
+		NInfoGetRelNumber(InfoFromSMgrRel(reln)) >= FirstNormalObjectId \
 )

 const int	SmgrTrace = DEBUG5;
@@ -1081,6 +1081,9 @@ prefetch_lookup(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkn, neon_r
 * pageserver. If NULL, we utilize the lastWrittenLsn -infrastructure
 * to calculate the LSNs to send.
 *
+ * Bits set in *mask (if present) indicate pages already read; i.e. pages we
+ * can skip in this process.
+ *
 * When performing a prefetch rather than a synchronous request,
 * is_prefetch==true. Currently, it only affects how the request is accounted
 * in the perf counters.
@@ -1126,7 +1129,7 @@ Retry:
 		uint64		ring_index;
 		neon_request_lsns *lsns;

-		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
 			continue;

 		if (frlsns)
@@ -2381,7 +2384,6 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 						 LSN_FORMAT_ARGS(last_written_lsn),
 						 LSN_FORMAT_ARGS(flushlsn));
 				XLogFlush(last_written_lsn);
-				flushlsn = last_written_lsn;
 			}

 			/*
@@ -2397,18 +2399,35 @@ neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 			 * requesting the latest page, by setting request LSN to
 			 * UINT64_MAX.
 			 *
-			 * Remember the current LSN, however, so that we can later
-			 * correctly determine if the response to the request is still
-			 * valid. The most up-to-date LSN we could use for that purpose
-			 * would be the current insert LSN, but to avoid the overhead of
-			 * looking it up, use 'flushlsn' instead. This relies on the
-			 * assumption that if the page was modified since the last WAL
-			 * flush, it should still be in the buffer cache, and we
-			 * wouldn't be requesting it.
+			 * effective_request_lsn is used to check that a received response is still valid.
+			 * On a primary node it is the last written LSN. Originally we used flush_lsn here,
+			 * but that is not correct. Consider the following scenario:
+			 * 1. Backend A wants to prefetch block X
+			 * 2. Backend A checks that block X is not present in the shared buffer cache
+			 * 3. Backend A calls prefetch_do_request, which calls neon_get_request_lsns
+			 * 4. neon_get_request_lsns obtains LwLSN=11 for the block
+			 * 5. Backend B downloads block X, updates and wallogs it with LSN=13
+			 * 6. Block X is once again evicted from shared buffers, its LwLSN is set to LSN=13
+			 * 7. Backend A is still executing in neon_get_request_lsns(). It calls 'flushlsn = GetFlushRecPtr();'.
+			 *    Let's say that it is LSN=14
+			 * 8. Backend A uses LSN=14 as effective_lsn in the prefetch slot. The request stored in the slot is
+			 *    [not_modified_since=11, effective_request_lsn=14]
+			 * 9. Backend A sends the prefetch request, pageserver processes it, and sends response.
+			 *    The last LSN that the pageserver had processed was LSN=12, so the page image in the response is valid at LSN=12.
+			 * 10. Backend A calls smgrread() for page X with LwLSN=13
+			 * 11. Backend A finds in the prefetch ring the response for the prefetch request with [not_modified_since=11, effective_request_lsn=14],
+			 *     so it satisfies the neon_prefetch_response_usable condition, even though the page image (valid at LSN=12) predates the update at LSN=13.
+			 *
+			 * Things go wrong in step 7-8, when [not_modified_since=11, effective_request_lsn=14] is determined for the request.
+			 * That is incorrect, because the page has in fact been modified at LSN=13. The invariant is that for any request,
+			 * there should not be any modifications to a page between its not_modified_since and (effective_)request_lsn values.
+			 *
+			 * The problem can be fixed by calling GetFlushRecPtr() before checking if the page is in the buffer cache.
+			 * But that can't be done within smgrprefetch(); it would require modifying the caller.
 			 */
 			result->request_lsn = UINT64_MAX;
 			result->not_modified_since = last_written_lsn;
-			result->effective_request_lsn = flushlsn;
+			result->effective_request_lsn = last_written_lsn;
 		}
 	}
 }
@@ -2467,11 +2486,8 @@ neon_prefetch_response_usable(neon_request_lsns *request_lsns,
 	 * `not_modified_since` and `request_lsn` are sent to the pageserver, but
 	 * in the primary node, we always use UINT64_MAX as the `request_lsn`, so
 	 * we remember `effective_request_lsn` separately. In a primary,
-	 * `effective_request_lsn` is the last flush WAL position when the request
-	 * was sent to the pageserver. That's logically the LSN that we are
-	 * requesting the page at, but we send UINT64_MAX to the pageserver so
-	 * that if the GC horizon advances past that position, we still get a
-	 * valid response instead of an error.
+	 * `effective_request_lsn` is the same as `not_modified_since`.
+	 * See the comments in neon_get_request_lsns for why we cannot use the last flush WAL position here.
 	 *
 	 * To determine whether a response to a GetPage request issued earlier is
 	 * still valid to satisfy a new page read, we look at the
@@ -3026,9 +3042,6 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,

 		tag.blockNum = blocknum;

-		for (int i = 0; i < PG_IOV_MAX / 8; i++)
-			lfc_present[i] = ~(lfc_present[i]);
-
 		ring_index = prefetch_register_bufferv(tag, NULL, iterblocks,
 											   lfc_present, true);

@@ -3134,6 +3147,15 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 #endif
 }

+/*
+ * Read N pages at a specific LSN.
+ *
+ * *mask is set for pages read at a previous point in time, and which we
+ * should not touch, nor overwrite.
+ * New bits should be set in *mask for the pages we successfully read.
+ *
+ * The offsets in request_lsns, buffers, and mask are linked.
+ */
 static void
 #if PG_MAJORVERSION_NUM < 16
 neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_blockno, neon_request_lsns *request_lsns,
@@ -3186,7 +3208,7 @@ neon_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber base_block
 		neon_request_lsns *reqlsns = &request_lsns[i];
 		TimestampTz		start_ts, end_ts;

-		if (PointerIsValid(mask) && !BITMAP_ISSET(mask, i))
+		if (PointerIsValid(mask) && BITMAP_ISSET(mask, i))
 			continue;

 		start_ts = GetCurrentTimestamp();
@@ -3485,9 +3507,7 @@ static void
 neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		   void **buffers, BlockNumber nblocks)
 {
-	bits8		prefetch_hits[PG_IOV_MAX / 8] = {0};
-	bits8		lfc_hits[PG_IOV_MAX / 8];
-	bits8		read[PG_IOV_MAX / 8];
+	bits8		read_pages[PG_IOV_MAX / 8];
 	neon_request_lsns request_lsns[PG_IOV_MAX];
 	int			lfc_result;
 	int			prefetch_result;
@@ -3519,19 +3539,18 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 						  request_lsns, nblocks);

+	memset(read_pages, 0, sizeof(read_pages));

-	prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, nblocks, buffers, prefetch_hits);
+	prefetch_result = prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
+									   blocknum, request_lsns, nblocks,
+									   buffers, read_pages);

 	if (prefetch_result == nblocks)
 		return;

-	/* invert the result: exclude prefetched blocks */
-	for (int i = 0; i < PG_IOV_MAX / 8; i++)
-		lfc_hits[i] = ~prefetch_hits[i];
-
 	/* Try to read from local file cache */
 	lfc_result = lfc_readv_select(InfoFromSMgrRel(reln), forknum, blocknum, buffers,
-								  nblocks, lfc_hits);
+								  nblocks, read_pages);

 	if (lfc_result > 0)
 		MyNeonCounters->file_cache_hits_total += lfc_result;
@@ -3540,21 +3559,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	if (prefetch_result + lfc_result == nblocks)
 		return;

-	if (lfc_result <= 0)
-	{
-		/* can't use the LFC result, so read all blocks from PS */
-		for (int i = 0; i < PG_IOV_MAX / 8; i++)
-			read[i] = ~prefetch_hits[i];
-	}
-	else
-	{
-		/* invert the result: exclude blocks read from lfc */
-		for (int i = 0; i < PG_IOV_MAX / 8; i++)
-			read[i] = ~(prefetch_hits[i] | lfc_hits[i]);
-	}
-
 	neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns,
-					  buffers, nblocks, read);
+					  buffers, nblocks, read_pages);

 	/*
 	 * Try to receive prefetch results once again just to make sure we don't leave the smgr code while the OS might still have buffered bytes.
--- a/poetry.lock
+++ b/poetry.lock
@@ -3111,30 +3111,30 @@ six = "*"

 [[package]]
 name = "ruff"
-version = "0.7.0"
+version = "0.11.2"
 description = "An extremely fast Python linter and code formatter, written in Rust."
 optional = false
 python-versions = ">=3.7"
 groups = ["dev"]
 files = [
-    {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"},
-    {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"},
-    {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"},
-    {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"},
-    {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"},
-    {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"},
-    {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"},
-    {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"},
-    {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"},
-    {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"},
-    {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"},
-    {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"},
+    {file = "ruff-0.11.2-py3-none-linux_armv6l.whl", hash = "sha256:c69e20ea49e973f3afec2c06376eb56045709f0212615c1adb0eda35e8a4e477"},
+    {file = "ruff-0.11.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:2c5424cc1c4eb1d8ecabe6d4f1b70470b4f24a0c0171356290b1953ad8f0e272"},
+    {file = "ruff-0.11.2-py3-none-macosx_11_0_arm64.whl", hash = "sha256:ecf20854cc73f42171eedb66f006a43d0a21bfb98a2523a809931cda569552d9"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0c543bf65d5d27240321604cee0633a70c6c25c9a2f2492efa9f6d4b8e4199bb"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:20967168cc21195db5830b9224be0e964cc9c8ecf3b5a9e3ce19876e8d3a96e3"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:955a9ce63483999d9f0b8f0b4a3ad669e53484232853054cc8b9d51ab4c5de74"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:86b3a27c38b8fce73bcd262b0de32e9a6801b76d52cdb3ae4c914515f0cef608"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a3b66a03b248c9fcd9d64d445bafdf1589326bee6fc5c8e92d7562e58883e30f"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0397c2672db015be5aa3d4dac54c69aa012429097ff219392c018e21f5085147"},
+    {file = "ruff-0.11.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:869bcf3f9abf6457fbe39b5a37333aa4eecc52a3b99c98827ccc371a8e5b6f1b"},
+    {file = "ruff-0.11.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:2a2b50ca35457ba785cd8c93ebbe529467594087b527a08d487cf0ee7b3087e9"},
+    {file = "ruff-0.11.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:7c69c74bf53ddcfbc22e6eb2f31211df7f65054bfc1f72288fc71e5f82db3eab"},
+    {file = "ruff-0.11.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:6e8fb75e14560f7cf53b15bbc55baf5ecbe373dd5f3aab96ff7aa7777edd7630"},
+    {file = "ruff-0.11.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:842a472d7b4d6f5924e9297aa38149e5dcb1e628773b70e6387ae2c97a63c58f"},
+    {file = "ruff-0.11.2-py3-none-win32.whl", hash = "sha256:aca01ccd0eb5eb7156b324cfaa088586f06a86d9e5314b0eb330cb48415097cc"},
+    {file = "ruff-0.11.2-py3-none-win_amd64.whl", hash = "sha256:3170150172a8f994136c0c66f494edf199a0bbea7a409f649e4bc8f4d7084080"},
+    {file = "ruff-0.11.2-py3-none-win_arm64.whl", hash = "sha256:52933095158ff328f4c77af3d74f0379e34fd52f175144cefc1b192e7ccd32b4"},
+    {file = "ruff-0.11.2.tar.gz", hash = "sha256:ec47591497d5a1050175bdf4e1a4e6272cddff7da88a2ad595e1e326041d8d94"},
 ]

 [[package]]
@@ -3844,4 +3844,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "715fc8c896dcfa1b15054deeddcdec557ef93af91b26e1c8e4688fe4dbef5296"
+content-hash = "fb50cb6b291169dce3188560cdb31a14af95647318f8f0f0d718131dbaf1817a"
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -314,9 +314,9 @@ pub async fn run() -> anyhow::Result<()> {
            None => {
                bail!("plain auth requires redis_notifications to be set");
            }
-            Some(url) => Some(
-                ConnectionWithCredentialsProvider::new_with_static_credentials(url.to_string()),
-            ),
+            Some(url) => {
+                Some(ConnectionWithCredentialsProvider::new_with_static_credentials(url.clone()))
+            }
        },
        ("irsa", _) => match (&args.redis_host, args.redis_port) {
            (Some(host), Some(port)) => Some(
--- a/proxy/src/control_plane/client/mock.rs
+++ b/proxy/src/control_plane/client/mock.rs
@@ -1,5 +1,6 @@
 //! Mock console backend which relies on a user-provided postgres instance.

+use std::io;
 use std::net::{IpAddr, Ipv4Addr};
 use std::str::FromStr;
 use std::sync::Arc;
@@ -22,7 +23,6 @@ use crate::control_plane::errors::{
 };
 use crate::control_plane::messages::MetricsAuxInfo;
 use crate::control_plane::{AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo};
-use crate::error::io_error;
 use crate::intern::RoleNameInt;
 use crate::types::{BranchId, EndpointId, ProjectId, RoleName};
 use crate::url::ApiUrl;
@@ -36,13 +36,13 @@ enum MockApiError {

 impl From<MockApiError> for ControlPlaneError {
    fn from(e: MockApiError) -> Self {
-        io_error(e).into()
+        io::Error::other(e).into()
    }
 }

 impl From<tokio_postgres::Error> for ControlPlaneError {
    fn from(e: tokio_postgres::Error) -> Self {
-        io_error(e).into()
+        io::Error::other(e).into()
    }
 }

--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -1,8 +1,10 @@
+use std::io;
+
 use thiserror::Error;

 use crate::control_plane::client::ApiLockError;
 use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason};
-use crate::error::{ErrorKind, ReportableError, UserFacingError, io_error};
+use crate::error::{ErrorKind, ReportableError, UserFacingError};
 use crate::proxy::retry::CouldRetry;

 /// A go-to error message which doesn't leak any detail.
@@ -79,13 +81,13 @@ impl CouldRetry for ControlPlaneError {

 impl From<reqwest::Error> for ControlPlaneError {
    fn from(e: reqwest::Error) -> Self {
-        io_error(e).into()
+        io::Error::other(e).into()
    }
 }

 impl From<reqwest_middleware::Error> for ControlPlaneError {
    fn from(e: reqwest_middleware::Error) -> Self {
-        io_error(e).into()
+        io::Error::other(e).into()
    }
 }

--- a/proxy/src/error.rs
+++ b/proxy/src/error.rs
@@ -1,15 +1,9 @@
-use std::error::Error as StdError;
-use std::{fmt, io};
+use std::fmt;

 use anyhow::Context;
 use measured::FixedCardinalityLabel;
 use tokio::task::JoinError;

-/// Upcast (almost) any error into an opaque [`io::Error`].
-pub(crate) fn io_error(e: impl Into<Box<dyn StdError + Send + Sync>>) -> io::Error {
-    io::Error::new(io::ErrorKind::Other, e)
-}
-
 /// Marks errors that may be safely shown to a client.
 /// This trait can be seen as a specialized version of [`ToString`].
 ///
--- a/proxy/src/protocol2.rs
+++ b/proxy/src/protocol2.rs
@@ -163,8 +163,7 @@ fn process_proxy_payload(
        // other values are unassigned and must not be emitted by senders. Receivers
        // must drop connections presenting unexpected values here.
        #[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384
-        _ => return Err(io::Error::new(
-            io::ErrorKind::Other,
+        _ => return Err(io::Error::other(
            format!(
                "invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)",
                header.version_and_command
@@ -178,21 +177,20 @@ fn process_proxy_payload(
        TCP_OVER_IPV4 | UDP_OVER_IPV4 => {
            let addr = payload
                .try_get::<ProxyProtocolV2HeaderV4>()
-                .ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?;
+                .ok_or_else(|| io::Error::other(size_err))?;

            SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
        }
        TCP_OVER_IPV6 | UDP_OVER_IPV6 => {
            let addr = payload
                .try_get::<ProxyProtocolV2HeaderV6>()
-                .ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?;
+                .ok_or_else(|| io::Error::other(size_err))?;

            SocketAddr::from((addr.src_addr.get(), addr.src_port.get()))
        }
        // unspecified or unix stream. ignore the addresses
        _ => {
-            return Err(io::Error::new(
-                io::ErrorKind::Other,
+            return Err(io::Error::other(
                "invalid proxy protocol address family/transport protocol.",
            ));
        }
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -143,6 +143,8 @@ impl ConnectionWithCredentialsProvider {
                        db: 0,
                        username: Some(username),
                        password: Some(password.clone()),
+                        // TODO: switch to RESP3 after testing new client version.
+                        protocol: redis::ProtocolVersion::RESP2,
                    },
                })
            }
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -19,7 +19,7 @@ fn json_value_to_pg_text(value: &Value) -> Option<String> {
        v @ (Value::Bool(_) | Value::Number(_) | Value::Object(_)) => Some(v.to_string()),

        // avoid escaping here, as we pass this as a parameter
-        Value::String(s) => Some(s.to_string()),
+        Value::String(s) => Some(s.clone()),

        // special care for arrays
        Value::Array(_) => json_array_to_pg_array(value),
--- a/proxy/src/serverless/local_conn_pool.rs
+++ b/proxy/src/serverless/local_conn_pool.rs
@@ -41,7 +41,7 @@ use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::metrics::Metrics;

 pub(crate) const EXT_NAME: &str = "pg_session_jwt";
-pub(crate) const EXT_VERSION: &str = "0.2.0";
+pub(crate) const EXT_VERSION: &str = "0.3.0";
 pub(crate) const EXT_SCHEMA: &str = "auth";

 #[derive(Clone)]
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -866,7 +866,7 @@ impl QueryData {
        let (inner, mut discard) = client.inner();
        let cancel_token = inner.cancel_token();

-        let res = match select(
+        match select(
            pin!(query_to_json(
                config,
                &mut *inner,
@@ -889,7 +889,7 @@ impl QueryData {
            // The query failed with an error
            Either::Left((Err(e), __not_yet_cancelled)) => {
                discard.discard();
-                return Err(e);
+                Err(e)
            }
            // The query was cancelled.
            Either::Right((_cancelled, query)) => {
@@ -930,8 +930,7 @@ impl QueryData {
                    }
                }
            }
-        };
-        res
+        }
    }
 }

--- a/proxy/src/serverless/websocket.rs
+++ b/proxy/src/serverless/websocket.rs
@@ -15,7 +15,7 @@ use tracing::warn;
 use crate::cancellation::CancellationHandler;
 use crate::config::ProxyConfig;
 use crate::context::RequestContext;
-use crate::error::{ReportableError, io_error};
+use crate::error::ReportableError;
 use crate::metrics::Metrics;
 use crate::proxy::{ClientMode, ErrorSource, handle_client};
 use crate::rate_limiter::EndpointRateLimiter;
@@ -50,23 +50,23 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncWrite for WebSocketRw<S> {
        let this = self.project();
        let mut stream = this.stream;

-        ready!(stream.as_mut().poll_ready(cx).map_err(io_error))?;
+        ready!(stream.as_mut().poll_ready(cx).map_err(io::Error::other))?;

        this.send.put(buf);
        match stream.as_mut().start_send(Frame::binary(this.send.split())) {
            Ok(()) => Poll::Ready(Ok(buf.len())),
-            Err(e) => Poll::Ready(Err(io_error(e))),
+            Err(e) => Poll::Ready(Err(io::Error::other(e))),
        }
    }

    fn poll_flush(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
        let stream = self.project().stream;
-        stream.poll_flush(cx).map_err(io_error)
+        stream.poll_flush(cx).map_err(io::Error::other)
    }

    fn poll_shutdown(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<io::Result<()>> {
        let stream = self.project().stream;
-        stream.poll_close(cx).map_err(io_error)
+        stream.poll_close(cx).map_err(io::Error::other)
    }
 }

@@ -97,7 +97,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
            }

            let res = ready!(this.stream.as_mut().poll_next(cx));
-            match res.transpose().map_err(io_error)? {
+            match res.transpose().map_err(io::Error::other)? {
                Some(message) => match message.opcode {
                    OpCode::Ping => {}
                    OpCode::Pong => {}
@@ -105,7 +105,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AsyncBufRead for WebSocketRw<S> {
                        // We expect to see only binary messages.
                        let error = "unexpected text message in the websocket";
                        warn!(length = message.payload.len(), error);
-                        return Poll::Ready(Err(io_error(error)));
+                        return Poll::Ready(Err(io::Error::other(error)));
                    }
                    OpCode::Binary | OpCode::Continuation => {
                        debug_assert!(this.recv.is_empty());
--- a/proxy/src/tls/server_config.rs
+++ b/proxy/src/tls/server_config.rs
@@ -173,7 +173,7 @@ impl CertResolver {
    }

    pub fn get_common_names(&self) -> HashSet<String> {
-        self.certs.keys().map(|s| s.to_string()).collect()
+        self.certs.keys().cloned().collect()
    }
 }

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,7 +53,7 @@ jsonnet = "^0.21.0-rc2"

 [tool.poetry.group.dev.dependencies]
 mypy = "==1.13.0"
-ruff = "^0.7.0"
+ruff = "^0.11.2"

 [build-system]
 requires = ["poetry-core>=1.0.0"]
@@ -109,4 +109,5 @@ select = [
    "W", # pycodestyle
    "B", # bugbear
    "UP", # pyupgrade
+    "TC", # flake8-type-checking
 ]
--- a/rust-toolchain.toml
+++ b/rust-toolchain.toml
@@ -1,5 +1,5 @@
 [toolchain]
-channel = "1.85.0"
+channel = "1.86.0"
 profile = "default"
 # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy.
 # https://rust-lang.github.io/rustup/concepts/profiles.html
--- a/safekeeper/src/receive_wal.rs
+++ b/safekeeper/src/receive_wal.rs
@@ -94,10 +94,10 @@ impl WalReceivers {

    /// Get reference to locked slot contents. Slot must exist (registered
    /// earlier).
-    fn get_slot<'a>(
-        self: &'a Arc<WalReceivers>,
+    fn get_slot(
+        self: &Arc<WalReceivers>,
        id: WalReceiverId,
-    ) -> MappedMutexGuard<'a, WalReceiverState> {
+    ) -> MappedMutexGuard<'_, WalReceiverState> {
        MutexGuard::map(self.mutex.lock(), |locked| {
            locked.slots[id]
                .as_mut()
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -699,7 +699,7 @@ impl Timeline {
    }

    /// Take a writing mutual exclusive lock on timeline shared_state.
-    pub async fn write_shared_state<'a>(self: &'a Arc<Self>) -> WriteGuardSharedState<'a> {
+    pub async fn write_shared_state(self: &Arc<Self>) -> WriteGuardSharedState<'_> {
        WriteGuardSharedState::new(self.clone(), self.mutex.write().await)
    }

--- a/safekeeper/tests/misc_test.rs
+++ b/safekeeper/tests/misc_test.rs
@@ -116,7 +116,7 @@ fn test_many_tx() -> anyhow::Result<()> {
            }
            None
        })
-        .last()
+        .next_back()
        .unwrap();

    let initdb_lsn = 21623024;
--- a/scripts/download_basebackup.py
+++ b/scripts/download_basebackup.py
@@ -8,9 +8,12 @@
 from __future__ import annotations

 import argparse
+from typing import TYPE_CHECKING

 import psycopg2
-from psycopg2.extensions import connection as PgConnection
+
+if TYPE_CHECKING:
+    from psycopg2.extensions import connection as PgConnection


 def main(args: argparse.Namespace):
--- a/scripts/force_layer_download.py
+++ b/scripts/force_layer_download.py
@@ -7,13 +7,13 @@ import logging
 import signal
 import sys
 from collections import defaultdict
-from collections.abc import Awaitable
 from dataclasses import dataclass
 from typing import TYPE_CHECKING

 import aiohttp

 if TYPE_CHECKING:
+    from collections.abc import Awaitable
    from typing import Any


--- a/storage_controller/src/heartbeater.rs
+++ b/storage_controller/src/heartbeater.rs
@@ -253,7 +253,7 @@ impl HeartBeat<Node, PageserverState> for HeartbeaterTask<Node, PageserverState>
                PageserverState::WarmingUp { .. } => {
                    warming_up += 1;
                }
-                PageserverState::Offline { .. } => offline += 1,
+                PageserverState::Offline => offline += 1,
                PageserverState::Available { .. } => {}
            }
        }
@@ -391,7 +391,7 @@ impl HeartBeat<Safekeeper, SafekeeperState> for HeartbeaterTask<Safekeeper, Safe
        let mut offline = 0;
        for state in new_state.values() {
            match state {
-                SafekeeperState::Offline { .. } => offline += 1,
+                SafekeeperState::Offline => offline += 1,
                SafekeeperState::Available { .. } => {}
            }
        }
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -24,9 +24,9 @@ use pageserver_api::controller_api::{
    ShardsPreferredAzsRequest, TenantCreateRequest, TenantPolicyRequest, TenantShardMigrateRequest,
 };
 use pageserver_api::models::{
-    DetachBehavior, TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
-    TenantShardSplitRequest, TenantTimeTravelRequest, TimelineArchivalConfigRequest,
-    TimelineCreateRequest,
+    DetachBehavior, LsnLeaseRequest, TenantConfigPatchRequest, TenantConfigRequest,
+    TenantLocationConfigRequest, TenantShardSplitRequest, TenantTimeTravelRequest,
+    TimelineArchivalConfigRequest, TimelineCreateRequest,
 };
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
@@ -582,6 +582,32 @@ async fn handle_tenant_timeline_download_heatmap_layers(
    json_response(StatusCode::OK, ())
 }

+async fn handle_tenant_timeline_lsn_lease(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+
+    check_permissions(&req, Scope::PageServerApi)?;
+    maybe_rate_limit(&req, tenant_id).await;
+
+    let mut req = match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(req) => req,
+    };
+
+    let lsn_lease_request = json_request::<LsnLeaseRequest>(&mut req).await?;
+
+    service
+        .tenant_timeline_lsn_lease(tenant_id, timeline_id, lsn_lease_request.lsn)
+        .await?;
+
+    json_response(StatusCode::OK, ())
+}
+
 // For metric labels where we would like to include the approximate path, but exclude high-cardinality fields like query parameters
 // and tenant/timeline IDs.  Since we are proxying to arbitrary paths, we don't have routing templates to
 // compare to, so we can just filter out our well known ID format with regexes.
@@ -613,6 +639,15 @@ async fn handle_tenant_timeline_passthrough(
        return Err(ApiError::BadRequest(anyhow::anyhow!("Missing path")));
    };

+    let method = match *req.method() {
+        hyper::Method::GET => reqwest::Method::GET,
+        hyper::Method::POST => reqwest::Method::POST,
+        hyper::Method::PUT => reqwest::Method::PUT,
+        hyper::Method::DELETE => reqwest::Method::DELETE,
+        hyper::Method::PATCH => reqwest::Method::PATCH,
+        _ => return Err(ApiError::BadRequest(anyhow::anyhow!("Unsupported method"))),
+    };
+
    tracing::info!(
        "Proxying request for tenant {} ({})",
        tenant_or_shard_id.tenant_id,
@@ -660,7 +695,7 @@ async fn handle_tenant_timeline_passthrough(
        node.base_url(),
        service.get_config().pageserver_jwt_token.as_deref(),
    );
-    let resp = client.get_raw(path).await.map_err(|e|
+    let resp = client.op_raw(method, path).await.map_err(|e|
        // We return 503 here because if we can't successfully send a request to the pageserver,
        // either we aren't available or the pageserver is unavailable.
        ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
@@ -1381,6 +1416,12 @@ async fn handle_upsert_safekeeper(mut req: Request<Body>) -> Result<Response<Bod
        )));
    }

+    if id <= 0 {
+        return Err(ApiError::BadRequest(anyhow::anyhow!(
+            "id not allowed to be zero or negative: {id}"
+        )));
+    }
+
    let req = match maybe_forward(req).await {
        ForwardOutcome::Forwarded(res) => {
            return res;
@@ -2192,6 +2233,17 @@ pub fn make_router(
                )
            },
        )
+        // LSN lease passthrough to all shards
+        .post(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/lsn_lease",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_lsn_lease,
+                    RequestName("v1_tenant_timeline_lsn_lease"),
+                )
+            },
+        )
        // Tenant detail GET passthrough to shard zero:
        .get("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(
@@ -2210,6 +2262,17 @@ pub fn make_router(
                RequestName("v1_tenant_passthrough"),
            )
        })
+        // Tenant timeline mark_invisible passthrough to shard zero
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_passthrough,
+                    RequestName("v1_tenant_timeline_mark_invisible_passthrough"),
+                )
+            },
+        )
 }

 #[cfg(test)]
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -115,19 +115,17 @@ struct Cli {
    #[arg(long)]
    split_threshold: Option<u64>,

-    /// Maximum number of shards during autosplits. 0 disables autosplits.
-    // TODO: defaults to 8 for backwards compatibility, should default to 255.
-    #[arg(long, default_value = "8")]
+    /// Maximum number of shards during autosplits. 0 disables autosplits. Defaults
+    /// to 16 as a safety to avoid too many shards by accident.
+    #[arg(long, default_value = "16")]
    max_split_shards: u8,

    /// Size threshold for initial shard splits of unsharded tenants. 0 disables initial splits.
-    // TODO: defaults to 64 GB for backwards compatibility. Should default to None.
-    #[arg(long, default_value = "68719476736")]
-    initial_split_threshold: u64,
+    #[arg(long)]
+    initial_split_threshold: Option<u64>,

-    /// Number of target shards for initial splits. 0 or 1 disables initial splits.
-    // TODO: defaults to 8 for backwards compatibility. Should default to 2.
-    #[arg(long, default_value = "8")]
+    /// Number of target shards for initial splits. 0 or 1 disables initial splits. Defaults to 2.
+    #[arg(long, default_value = "2")]
    initial_split_shards: u8,

    /// Maximum number of normal-priority reconcilers that may run in parallel
@@ -285,10 +283,8 @@ impl Secrets {
    fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
        if let Some(v) = cli {
            Some(v.clone())
-        } else if let Ok(v) = std::env::var(env_name) {
-            Some(v)
        } else {
-            None
+            std::env::var(env_name).ok()
        }
    }
 }
@@ -417,7 +413,7 @@ async fn async_main() -> anyhow::Result<()> {
        tenant_rate_limit: args.tenant_rate_limit,
        split_threshold: args.split_threshold,
        max_split_shards: args.max_split_shards,
-        initial_split_threshold: Some(args.initial_split_threshold),
+        initial_split_threshold: args.initial_split_threshold,
        initial_split_shards: args.initial_split_shards,
        neon_local_repo_dir: args.neon_local_repo_dir,
        max_secondary_lag_bytes: args.max_secondary_lag_bytes,
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -1,6 +1,6 @@
 use pageserver_api::models::detach_ancestor::AncestorDetached;
 use pageserver_api::models::{
-    DetachBehavior, LocationConfig, LocationConfigListResponse, PageserverUtilization,
+    DetachBehavior, LocationConfig, LocationConfigListResponse, LsnLease, PageserverUtilization,
    SecondaryProgress, TenantScanRemoteStorageResponse, TenantShardSplitRequest,
    TenantShardSplitResponse, TenantWaitLsnRequest, TimelineArchivalConfigRequest,
    TimelineCreateRequest, TimelineInfo, TopTenantShardsRequest, TopTenantShardsResponse,
@@ -10,6 +10,7 @@ use pageserver_client::BlockUnblock;
 use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;

 /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
@@ -195,6 +196,22 @@ impl PageserverClient {
        )
    }

+    pub(crate) async fn timeline_lease_lsn(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<LsnLease> {
+        measured_request!(
+            "timeline_lease_lsn",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner
+                .timeline_init_lsn_lease(tenant_shard_id, timeline_id, lsn)
+                .await
+        )
+    }
+
    pub(crate) async fn tenant_shard_split(
        &self,
        tenant_shard_id: TenantShardId,
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -1369,6 +1369,65 @@ impl Persistence {
        Ok(timeline_from_db)
    }

+    /// Set `deleted_at` for the given timeline
+    pub(crate) async fn timeline_set_deleted_at(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<()> {
+        use crate::schema::timelines;
+
+        let deletion_time = chrono::Local::now().to_utc();
+        self.with_measured_conn(DatabaseOperation::InsertTimeline, move |conn| {
+            Box::pin(async move {
+                let updated = diesel::update(timelines::table)
+                    .filter(timelines::tenant_id.eq(tenant_id.to_string()))
+                    .filter(timelines::timeline_id.eq(timeline_id.to_string()))
+                    .set(timelines::deleted_at.eq(Some(deletion_time)))
+                    .execute(conn)
+                    .await?;
+
+                match updated {
+                    0 => Ok(()),
+                    1 => Ok(()),
+                    _ => Err(DatabaseError::Logical(format!(
+                        "unexpected number of rows ({})",
+                        updated
+                    ))),
+                }
+            })
+        })
+        .await
+    }
+
+    /// Delete timeline from db. Does nothing if no matching row is present.
+    ///
+    /// Only works if `deleted_at` is set, so you should call [`Self::timeline_set_deleted_at`] before.
+    pub(crate) async fn delete_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<()> {
+        use crate::schema::timelines::dsl;
+
+        let tenant_id = &tenant_id;
+        let timeline_id = &timeline_id;
+        self.with_measured_conn(DatabaseOperation::GetTimeline, move |conn| {
+            Box::pin(async move {
+                diesel::delete(dsl::timelines)
+                    .filter(dsl::tenant_id.eq(&tenant_id.to_string()))
+                    .filter(dsl::timeline_id.eq(&timeline_id.to_string()))
+                    .filter(dsl::deleted_at.is_not_null())
+                    .execute(conn)
+                    .await?;
+                Ok(())
+            })
+        })
+        .await?;
+
+        Ok(())
+    }
+
    /// Loads a list of all timelines from database.
    pub(crate) async fn list_timelines_for_tenant(
        &self,
@@ -1491,6 +1550,34 @@ impl Persistence {

        Ok(timeline_from_db)
    }
+    /// List pending operations for a given timeline (including tenant-global ones)
+    pub(crate) async fn list_pending_ops_for_timeline(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> DatabaseResult<Vec<TimelinePendingOpPersistence>> {
+        use crate::schema::safekeeper_timeline_pending_ops::dsl;
+
+        let timelines_from_db = self
+            .with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| {
+                Box::pin(async move {
+                    let from_db: Vec<TimelinePendingOpPersistence> =
+                        dsl::safekeeper_timeline_pending_ops
+                            .filter(dsl::tenant_id.eq(tenant_id.to_string()))
+                            .filter(
+                                dsl::timeline_id
+                                    .eq(timeline_id.to_string())
+                                    .or(dsl::timeline_id.eq("")),
+                            )
+                            .load(conn)
+                            .await?;
+                    Ok(from_db)
+                })
+            })
+            .await?;
+
+        Ok(timelines_from_db)
+    }

    /// Delete all pending ops for the given timeline.
    ///
@@ -1974,7 +2061,7 @@ impl ToSql<crate::schema::sql_types::PgLsn, Pg> for LsnWrapper {
    }
 }

-#[derive(Insertable, AsChangeset, Queryable, Selectable, Clone)]
+#[derive(Insertable, AsChangeset, Clone)]
 #[diesel(table_name = crate::schema::timelines)]
 pub(crate) struct TimelinePersistence {
    pub(crate) tenant_id: String,
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -686,6 +686,8 @@ impl Reconciler {
                .await?,
        );

+        pausable_failpoint!("reconciler-live-migrate-post-generation-inc");
+
        let dest_conf = build_location_config(
            &self.shard,
            &self.config,
@@ -760,7 +762,9 @@ impl Reconciler {
        Ok(())
    }

-    async fn maybe_refresh_observed(&mut self) -> Result<(), ReconcileError> {
+    /// Returns true if the observed state of the attached location was refreshed
+    /// and false otherwise.
+    async fn maybe_refresh_observed(&mut self) -> Result<bool, ReconcileError> {
        // If the attached node has uncertain state, read it from the pageserver before proceeding: this
        // is important to avoid spurious generation increments.
        //
@@ -770,7 +774,7 @@ impl Reconciler {

        let Some(attached_node) = self.intent.attached.as_ref() else {
            // Nothing to do
-            return Ok(());
+            return Ok(false);
        };

        if matches!(
@@ -815,7 +819,7 @@ impl Reconciler {
            }
        }

-        Ok(())
+        Ok(true)
    }

    /// Reconciling a tenant makes API calls to pageservers until the observed state
@@ -831,7 +835,7 @@ impl Reconciler {
    /// state where it still requires later reconciliation.
    pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
        // Prepare: if we have uncertain `observed` state for our would-be attachement location, then refresh it
-        self.maybe_refresh_observed().await?;
+        let refreshed = self.maybe_refresh_observed().await?;

        // Special case: live migration
        self.maybe_live_migrate().await?;
@@ -855,8 +859,14 @@ impl Reconciler {
            );
            match self.observed.locations.get(&node.get_id()) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
-                    // Nothing to do
-                    tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
+                    if refreshed {
+                        tracing::info!(
+                            node_id=%node.get_id(), "Observed configuration correct after refresh. Notifying compute.");
+                        self.compute_notify().await?;
+                    } else {
+                        // Nothing to do
+                        tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.");
+                    }
                }
                observed => {
                    // In all cases other than a matching observed configuration, we will
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -12,7 +12,7 @@ use std::ops::{Deref, DerefMut};
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::{Duration, Instant, SystemTime};

 use anyhow::Context;
 use context_iterator::TenantShardContextIterator;
@@ -34,7 +34,7 @@ use pageserver_api::controller_api::{
    TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 use pageserver_api::models::{
-    self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode,
+    self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
    PageserverUtilization, SecondaryProgress, ShardParameters, TenantConfig,
    TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest,
    TenantLocationConfigResponse, TenantShardLocation, TenantShardSplitRequest,
@@ -60,6 +60,7 @@ use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 use utils::completion::Barrier;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
+use utils::lsn::Lsn;
 use utils::sync::gate::Gate;
 use utils::{failpoint_support, pausable_failpoint};

@@ -152,6 +153,7 @@ enum TenantOperations {
    TimelineGcBlockUnblock,
    DropDetached,
    DownloadHeatmapLayers,
+    TimelineLsnLease,
 }

 #[derive(Clone, strum_macros::Display)]
@@ -3987,6 +3989,75 @@ impl Service {
        Ok(())
    }

+    pub(crate) async fn tenant_timeline_lsn_lease(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        lsn: Lsn,
+    ) -> Result<LsnLease, ApiError> {
+        let _tenant_lock = trace_shared_lock(
+            &self.tenant_op_locks,
+            tenant_id,
+            TenantOperations::TimelineLsnLease,
+        )
+        .await;
+
+        let targets = {
+            let locked = self.inner.read().unwrap();
+            let mut targets = Vec::new();
+
+            // If the request got an unsharded tenant id, then apply
+            // the operation to all shards. Otherwise, apply it to a specific shard.
+            let shards_range = TenantShardId::tenant_range(tenant_id);
+
+            for (tenant_shard_id, shard) in locked.tenants.range(shards_range) {
+                if let Some(node_id) = shard.intent.get_attached() {
+                    let node = locked
+                        .nodes
+                        .get(node_id)
+                        .expect("Pageservers may not be deleted while referenced");
+
+                    targets.push((*tenant_shard_id, node.clone()));
+                }
+            }
+            targets
+        };
+
+        let res = self
+            .tenant_for_shards_api(
+                targets,
+                |tenant_shard_id, client| async move {
+                    client
+                        .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn)
+                        .await
+                },
+                1,
+                1,
+                SHORT_RECONCILE_TIMEOUT,
+                &self.cancel,
+            )
+            .await;
+
+        let mut valid_until = None;
+        for r in res {
+            match r {
+                Ok(lease) => {
+                    if let Some(ref mut valid_until) = valid_until {
+                        *valid_until = std::cmp::min(*valid_until, lease.valid_until);
+                    } else {
+                        valid_until = Some(lease.valid_until);
+                    }
+                }
+                Err(e) => {
+                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
+                }
+            }
+        }
+        Ok(LsnLease {
+            valid_until: valid_until.unwrap_or_else(SystemTime::now),
+        })
+    }
+
    pub(crate) async fn tenant_timeline_download_heatmap_layers(
        &self,
        tenant_shard_id: TenantShardId,
--- a/storage_controller/src/service/chaos_injector.rs
+++ b/storage_controller/src/service/chaos_injector.rs
@@ -4,7 +4,7 @@ use std::time::Duration;

 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use rand::seq::SliceRandom;
-use rand::thread_rng;
+use rand::{Rng, thread_rng};
 use tokio_util::sync::CancellationToken;
 use utils::id::NodeId;
 use utils::shard::TenantShardId;
@@ -64,17 +64,22 @@ impl ChaosInjector {
        let mut interval = tokio::time::interval(self.interval);
        #[derive(Debug)]
        enum ChaosEvent {
-            ShuffleTenant,
-            ForceKill,
+            MigrationsToSecondary,
+            ForceKillController,
+            GracefulMigrationsAnywhere,
        }
        loop {
            let cron_interval = self.get_cron_interval_sleep_future();
            let chaos_type = tokio::select! {
                _ = interval.tick() => {
-                    ChaosEvent::ShuffleTenant
+                    if thread_rng().gen_bool(0.5) {
+                        ChaosEvent::MigrationsToSecondary
+                    } else {
+                        ChaosEvent::GracefulMigrationsAnywhere
+                    }
                }
                Some(_) = maybe_sleep(cron_interval) => {
-                    ChaosEvent::ForceKill
+                    ChaosEvent::ForceKillController
                }
                _ = cancel.cancelled() => {
                    tracing::info!("Shutting down");
@@ -83,16 +88,29 @@ impl ChaosInjector {
            };
            tracing::info!("Chaos iteration: {chaos_type:?}...");
            match chaos_type {
-                ChaosEvent::ShuffleTenant => {
-                    self.inject_chaos().await;
+                ChaosEvent::MigrationsToSecondary => {
+                    self.inject_migrations_to_secondary();
                }
-                ChaosEvent::ForceKill => {
+                ChaosEvent::GracefulMigrationsAnywhere => {
+                    self.inject_graceful_migrations_anywhere();
+                }
+                ChaosEvent::ForceKillController => {
                    self.force_kill().await;
                }
            }
        }
    }

+    fn is_shard_eligible_for_chaos(&self, shard: &TenantShard) -> bool {
+        // - Skip non-active scheduling policies, so that a shard with a policy like Pause can
+        //   be pinned without being disrupted by us.
+        // - Skip shards doing a graceful migration already, so that we allow these to run to
+        //   completion rather than only exercising the first part and then cancelling with
+        //   some other chaos.
+        matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active)
+            && shard.get_preferred_node().is_none()
+    }
+
    /// If a shard has a secondary and attached location, then re-assign the secondary to be
    /// attached and the attached to be secondary.
    ///
@@ -108,13 +126,7 @@ impl ChaosInjector {
            .get_mut(&tenant_shard_id)
            .expect("Held lock between choosing ID and this get");

-        if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) {
-            // Skip non-active scheduling policies, so that a shard with a policy like Pause can
-            // be pinned without being disrupted by us.
-            tracing::info!(
-                "Skipping shard {tenant_shard_id}: scheduling policy is {:?}",
-                shard.get_scheduling_policy()
-            );
+        if !self.is_shard_eligible_for_chaos(shard) {
            return;
        }

@@ -152,7 +164,77 @@ impl ChaosInjector {
        std::process::exit(1);
    }

-    async fn inject_chaos(&mut self) {
+    // Unlike [`Self::inject_migrations_to_secondary`], this function will not only cut over to secondary, it
+    // will migrate a tenant to a random node in its home AZ using a graceful migration of the same type
+    // that may be initiated by an API caller using prewarm=true.
+    //
+    // This is a much more expensive operation in terms of I/O and time, as we will fully warm up
+    // some new location in order to migrate the tenant there.  For that reason we do far fewer of these.
+    fn inject_graceful_migrations_anywhere(&mut self) {
+        let batch_size = 1;
+        let mut inner = self.service.inner.write().unwrap();
+        let (nodes, tenants, _scheduler) = inner.parts_mut();
+
+        let mut candidates = tenants
+            .values_mut()
+            .filter(|shard| self.is_shard_eligible_for_chaos(shard))
+            .collect::<Vec<_>>();
+
+        tracing::info!(
+            "Injecting chaos: found {} candidates for graceful migrations anywhere",
+            candidates.len()
+        );
+
+        let mut victims: Vec<&mut TenantShard> = Vec::new();
+
+        // Pick our victims: use a hand-rolled loop rather than choose_multiple() because we want
+        // to take the mutable refs from our candidates rather than ref'ing them.
+        while !candidates.is_empty() && victims.len() < batch_size {
+            let i = thread_rng().gen_range(0..candidates.len());
+            victims.push(candidates.swap_remove(i));
+        }
+
+        for victim in victims.into_iter() {
+            // Find a node in the same AZ as the shard, or if the shard has no AZ preference, which
+            // is not where they are currently attached.
+            let candidate_nodes = nodes
+                .values()
+                .filter(|node| {
+                    if let Some(preferred_az) = victim.preferred_az() {
+                        node.get_availability_zone_id() == preferred_az
+                    } else if let Some(attached) = *victim.intent.get_attached() {
+                        node.get_id() != attached
+                    } else {
+                        true
+                    }
+                })
+                .collect::<Vec<_>>();
+
+            let Some(victim_node) = candidate_nodes.choose(&mut thread_rng()) else {
+                // This can happen if e.g. we are in a small region with only one pageserver per AZ.
+                tracing::info!(
+                    "no candidate nodes found for migrating shard {tenant_shard_id} within its home AZ",
+                    tenant_shard_id = victim.tenant_shard_id
+                );
+                continue;
+            };
+
+            // This doesn't change intent immediately: next iteration of Service::optimize_all should do that.  We avoid
+            // doing it here because applying optimizations requires dropping lock to do some async work to check the optimisation
+            // is valid given remote state, and it would be a shame to duplicate that dance here.
+            tracing::info!(
+                "Injecting chaos: migrate {} to {}",
+                victim.tenant_shard_id,
+                victim_node
+            );
+            victim.set_preferred_node(Some(victim_node.get_id()));
+        }
+    }
+
+    /// Migrations of attached locations to their secondary location.  This exercises reconciliation in general,
+    /// live migration in particular, and the pageserver code for cleanly shutting down and starting up tenants
+    /// during such migrations.
+    fn inject_migrations_to_secondary(&mut self) {
        // Pick some shards to interfere with
        let batch_size = 128;
        let mut inner = self.service.inner.write().unwrap();
--- a/storage_controller/src/service/safekeeper_reconciler.rs
+++ b/storage_controller/src/service/safekeeper_reconciler.rs
@@ -160,9 +160,8 @@ pub(crate) struct ScheduleRequest {
 }

 struct ReconcilerHandle {
-    tx: UnboundedSender<(ScheduleRequest, Arc<CancellationToken>)>,
-    #[allow(clippy::type_complexity)]
-    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), Arc<CancellationToken>>>,
+    tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
+    ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
    cancel: CancellationToken,
 }

@@ -172,13 +171,13 @@ impl ReconcilerHandle {
        &self,
        tenant_id: TenantId,
        timeline_id: Option<TimelineId>,
-    ) -> Arc<CancellationToken> {
+    ) -> CancellationToken {
        let entry = self.ongoing_tokens.entry((tenant_id, timeline_id));
        if let Entry::Occupied(entry) = &entry {
            let cancel: &CancellationToken = entry.get();
            cancel.cancel();
        }
-        entry.insert(Arc::new(self.cancel.child_token())).clone()
+        entry.insert(self.cancel.child_token()).clone()
    }
    /// Cancel an ongoing reconciliation
    fn cancel_reconciliation(&self, tenant_id: TenantId, timeline_id: Option<TimelineId>) {
@@ -197,7 +196,7 @@ impl ReconcilerHandle {

 pub(crate) struct SafekeeperReconciler {
    service: Arc<Service>,
-    rx: UnboundedReceiver<(ScheduleRequest, Arc<CancellationToken>)>,
+    rx: UnboundedReceiver<(ScheduleRequest, CancellationToken)>,
    cancel: CancellationToken,
 }

@@ -243,7 +242,7 @@ impl SafekeeperReconciler {
                .await;
        }
    }
-    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: Arc<CancellationToken>) {
+    async fn reconcile_one(&self, req: ScheduleRequest, req_cancel: CancellationToken) {
        let req_host = req.safekeeper.skp.host.clone();
        match req.kind {
            SafekeeperTimelineOpKind::Pull => {
@@ -300,36 +299,96 @@ impl SafekeeperReconciler {
            SafekeeperTimelineOpKind::Delete => {
                let tenant_id = req.tenant_id;
                if let Some(timeline_id) = req.timeline_id {
-                    self.reconcile_inner(
+                    let deleted = self.reconcile_inner(
                        req,
                        async |client| client.delete_timeline(tenant_id, timeline_id).await,
                        |_resp| {
-                            tracing::info!("deleted timeline from {req_host}");
+                            tracing::info!(%tenant_id, %timeline_id, "deleted timeline from {req_host}");
                        },
                        req_cancel,
                    )
                    .await;
+                    if deleted {
+                        self.delete_timeline_from_db(tenant_id, timeline_id).await;
+                    }
                } else {
-                    self.reconcile_inner(
-                        req,
-                        async |client| client.delete_tenant(tenant_id).await,
-                        |_resp| {
-                            tracing::info!("deleted tenant from {req_host}");
-                        },
-                        req_cancel,
-                    )
-                    .await;
+                    let deleted = self
+                        .reconcile_inner(
+                            req,
+                            async |client| client.delete_tenant(tenant_id).await,
+                            |_resp| {
+                                tracing::info!(%tenant_id, "deleted tenant from {req_host}");
+                            },
+                            req_cancel,
+                        )
+                        .await;
+                    if deleted {
+                        self.delete_tenant_timelines_from_db(tenant_id).await;
+                    }
                }
            }
        }
    }
+    async fn delete_timeline_from_db(&self, tenant_id: TenantId, timeline_id: TimelineId) {
+        match self
+            .service
+            .persistence
+            .list_pending_ops_for_timeline(tenant_id, timeline_id)
+            .await
+        {
+            Ok(list) => {
+                if !list.is_empty() {
+                    tracing::info!(%tenant_id, %timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len());
+                    return;
+                }
+            }
+            Err(e) => {
+                tracing::warn!(%tenant_id, %timeline_id, "couldn't query pending ops: {e}");
+                return;
+            }
+        }
+        tracing::info!(%tenant_id, %timeline_id, "deleting timeline from db after all reconciles succeeded");
+        // In theory we could crash right after deleting the op from the db and right before reaching this,
+        // but then we'll boot up with a timeline that has deleted_at set, so hopefully we'll issue deletion ops for it again.
+        if let Err(err) = self
+            .service
+            .persistence
+            .delete_timeline(tenant_id, timeline_id)
+            .await
+        {
+            tracing::warn!(%tenant_id, %timeline_id, "couldn't delete timeline from db: {err}");
+        }
+    }
+    async fn delete_tenant_timelines_from_db(&self, tenant_id: TenantId) {
+        let timeline_list = match self
+            .service
+            .persistence
+            .list_timelines_for_tenant(tenant_id)
+            .await
+        {
+            Ok(timeline_list) => timeline_list,
+            Err(e) => {
+                tracing::warn!(%tenant_id, "couldn't query timelines: {e}");
+                return;
+            }
+        };
+        for timeline in timeline_list {
+            let Ok(timeline_id) = TimelineId::from_str(&timeline.timeline_id) else {
+                tracing::warn!("Invalid timeline ID in database {}", timeline.timeline_id);
+                continue;
+            };
+            self.delete_timeline_from_db(tenant_id, timeline_id).await;
+        }
+    }
+    /// Returns whether the reconciliation happened successfully
    async fn reconcile_inner<T, F, U>(
        &self,
        req: ScheduleRequest,
        closure: impl Fn(SafekeeperClient) -> F,
        log_success: impl FnOnce(T) -> U,
-        req_cancel: Arc<CancellationToken>,
-    ) where
+        req_cancel: CancellationToken,
+    ) -> bool
+    where
        F: Future<Output = Result<T, safekeeper_client::mgmt_api::Error>>,
    {
        let jwt = self
@@ -373,11 +432,11 @@ impl SafekeeperReconciler {
                            req.safekeeper.skp.host
                        );
                    }
-                    return;
+                    return true;
                }
                Err(mgmt_api::Error::Cancelled) => {
                    // On cancellation, the code that issued it will take care of removing db entries (if needed)
-                    return;
+                    return false;
                }
                Err(e) => {
                    tracing::info!(
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -1,4 +1,4 @@
-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
@@ -313,25 +313,32 @@ impl Service {
            );
            return Ok(());
        };
+        self.persistence
+            .timeline_set_deleted_at(tenant_id, timeline_id)
+            .await?;
        let all_sks = tl
            .new_sk_set
            .iter()
-            .flat_map(|sks| {
-                sks.iter()
-                    .map(|sk| (*sk, SafekeeperTimelineOpKind::Exclude))
-            })
-            .chain(
-                tl.sk_set
-                    .iter()
-                    .map(|v| (*v, SafekeeperTimelineOpKind::Delete)),
-            )
-            .collect::<HashMap<_, _>>();
+            .flatten()
+            .chain(tl.sk_set.iter())
+            .collect::<HashSet<_>>();

        // Schedule reconciliations
+        for &sk_id in all_sks.iter() {
+            let pending_op = TimelinePendingOpPersistence {
+                tenant_id: tenant_id.to_string(),
+                timeline_id: timeline_id.to_string(),
+                generation: tl.generation,
+                op_kind: SafekeeperTimelineOpKind::Delete,
+                sk_id: *sk_id,
+            };
+            tracing::info!("writing pending op for sk id {sk_id}");
+            self.persistence.insert_pending_op(pending_op).await?;
+        }
        {
            let mut locked = self.inner.write().unwrap();
-            for (sk_id, kind) in all_sks {
-                let sk_id = NodeId(sk_id as u64);
+            for sk_id in all_sks {
+                let sk_id = NodeId(*sk_id as u64);
                let Some(sk) = locked.safekeepers.get(&sk_id) else {
                    return Err(ApiError::InternalServerError(anyhow::anyhow!(
                        "Couldn't find safekeeper with id {sk_id}"
@@ -345,7 +352,7 @@ impl Service {
                    tenant_id,
                    timeline_id: Some(timeline_id),
                    generation: tl.generation as u32,
-                    kind,
+                    kind: SafekeeperTimelineOpKind::Delete,
                };
                locked.safekeeper_reconcilers.schedule_request(self, req);
            }
@@ -379,32 +386,50 @@ impl Service {
            })
            .collect::<Result<Vec<_>, ApiError>>()?;

-        // Remove pending ops from db.
+        // Remove pending ops from db, and set `deleted_at`.
        // We cancel them in a later iteration once we hold the state lock.
        for (timeline_id, _timeline) in timeline_list.iter() {
            self.persistence
                .remove_pending_ops_for_timeline(tenant_id, Some(*timeline_id))
                .await?;
+            self.persistence
+                .timeline_set_deleted_at(tenant_id, *timeline_id)
+                .await?;
        }

-        let mut locked = self.inner.write().unwrap();
-
        // The list of safekeepers that have any of the timelines
        let mut sk_list = HashSet::new();

        // List all pending ops for all timelines, cancel them
-        for (timeline_id, timeline) in timeline_list.iter() {
+        for (_timeline_id, timeline) in timeline_list.iter() {
            let sk_iter = timeline
                .sk_set
                .iter()
                .chain(timeline.new_sk_set.iter().flatten())
                .map(|id| NodeId(*id as u64));
-            for sk_id in sk_iter.clone() {
+            sk_list.extend(sk_iter);
+        }
+
+        for &sk_id in sk_list.iter() {
+            let pending_op = TimelinePendingOpPersistence {
+                tenant_id: tenant_id.to_string(),
+                timeline_id: String::new(),
+                generation: i32::MAX,
+                op_kind: SafekeeperTimelineOpKind::Delete,
+                sk_id: sk_id.0 as i64,
+            };
+            tracing::info!("writing pending op for sk id {sk_id}");
+            self.persistence.insert_pending_op(pending_op).await?;
+        }
+
+        let mut locked = self.inner.write().unwrap();
+
+        for (timeline_id, _timeline) in timeline_list.iter() {
+            for sk_id in sk_list.iter() {
                locked
                    .safekeeper_reconcilers
-                    .cancel_reconciles_for_timeline(sk_id, tenant_id, Some(*timeline_id));
+                    .cancel_reconciles_for_timeline(*sk_id, tenant_id, Some(*timeline_id));
            }
-            sk_list.extend(sk_iter);
        }

        // unwrap is safe: we return above for an empty timeline list
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -622,7 +622,7 @@ impl TenantShard {
            .collect::<Vec<_>>();

        attached_locs.sort_by_key(|i| i.1);
-        if let Some((node_id, _gen)) = attached_locs.into_iter().last() {
+        if let Some((node_id, _gen)) = attached_locs.into_iter().next_back() {
            self.intent.set_attached(scheduler, Some(*node_id));
        }

--- a/storage_scrubber/src/find_large_objects.rs
+++ b/storage_scrubber/src/find_large_objects.rs
@@ -18,7 +18,7 @@ enum LargeObjectKind {

 impl LargeObjectKind {
    fn from_key(key: &str) -> Self {
-        let fname = key.split('/').last().unwrap();
+        let fname = key.split('/').next_back().unwrap();

        let Ok((layer_name, _generation)) = parse_layer_object_name(fname) else {
            return LargeObjectKind::Other;
--- a/test_runner/cloud_regress/test_cloud_regress.py
+++ b/test_runner/cloud_regress/test_cloud_regress.py
@@ -4,11 +4,15 @@ Run the regression tests on the cloud instance of Neon

 from __future__ import annotations

-from pathlib import Path
+from typing import TYPE_CHECKING

 import pytest
-from fixtures.neon_fixtures import RemotePostgres
-from fixtures.pg_version import PgVersion
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from fixtures.neon_fixtures import RemotePostgres
+    from fixtures.pg_version import PgVersion


@pytest.mark.timeout(7200)
--- a/test_runner/fixtures/auth_tokens.py
+++ b/test_runner/fixtures/auth_tokens.py
@@ -2,11 +2,12 @@ from __future__ import annotations

 from dataclasses import dataclass
 from enum import StrEnum
-from typing import Any
+from typing import TYPE_CHECKING, Any

 import jwt

-from fixtures.common_types import TenantId
+if TYPE_CHECKING:
+    from fixtures.common_types import TenantId


@dataclass
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -15,18 +15,20 @@ from typing import TYPE_CHECKING

 import allure
 import pytest
-from _pytest.config import Config
-from _pytest.config.argparsing import Parser
-from _pytest.fixtures import FixtureRequest
-from _pytest.terminal import TerminalReporter

-from fixtures.common_types import TenantId, TimelineId
 from fixtures.log_helper import log
-from fixtures.neon_fixtures import NeonPageserver

 if TYPE_CHECKING:
    from collections.abc import Callable, Iterator, Mapping

+    from _pytest.config import Config
+    from _pytest.config.argparsing import Parser
+    from _pytest.fixtures import FixtureRequest
+    from _pytest.terminal import TerminalReporter
+
+    from fixtures.common_types import TenantId, TimelineId
+    from fixtures.neon_fixtures import NeonPageserver
+

 """
 This file contains fixtures for micro-benchmarks.
--- a/test_runner/fixtures/compare_fixtures.py
+++ b/test_runner/fixtures/compare_fixtures.py
@@ -11,7 +11,6 @@ from pathlib import Path
 from typing import TYPE_CHECKING, final

 import pytest
-from _pytest.fixtures import FixtureRequest
 from typing_extensions import override

 from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
@@ -24,11 +23,14 @@ from fixtures.neon_fixtures import (
    VanillaPostgres,
    wait_for_last_flush_lsn,
 )
-from fixtures.pg_stats import PgStatTable

 if TYPE_CHECKING:
    from collections.abc import Iterator

+    from _pytest.fixtures import FixtureRequest
+
+    from fixtures.pg_stats import PgStatTable
+

 class PgCompare(ABC):
    """Common interface of all postgres implementations, useful for benchmarks.
--- a/test_runner/fixtures/compute_reconfigure.py
+++ b/test_runner/fixtures/compute_reconfigure.py
@@ -4,8 +4,6 @@ import concurrent.futures
 from typing import TYPE_CHECKING

 import pytest
-from pytest_httpserver import HTTPServer
-from werkzeug.wrappers.request import Request
 from werkzeug.wrappers.response import Response

 from fixtures.common_types import TenantId
@@ -15,6 +13,9 @@ if TYPE_CHECKING:
    from collections.abc import Callable
    from typing import Any

+    from pytest_httpserver import HTTPServer
+    from werkzeug.wrappers.request import Request
+

 class ComputeReconfigure:
    def __init__(self, server: HTTPServer):
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -5,6 +5,8 @@ import urllib.parse
 import requests
 from requests.adapters import HTTPAdapter

+from fixtures.log_helper import log
+

 class EndpointHttpClient(requests.Session):
    def __init__(
@@ -51,6 +53,7 @@ class EndpointHttpClient(requests.Session):
    def metrics(self) -> str:
        res = self.get(f"http://localhost:{self.external_port}/metrics")
        res.raise_for_status()
+        log.debug("raw compute metrics: %s", res.text)
        return res.text

    # Current compute status.
--- a/test_runner/fixtures/fast_import.py
+++ b/test_runner/fixtures/fast_import.py
@@ -147,7 +147,7 @@ def fast_import(
        pg_distrib_dir,
        pg_version,
        workdir,
-        cleanup=not cast(bool, pytestconfig.getoption("--preserve-database-files")),
+        cleanup=not cast("bool", pytestconfig.getoption("--preserve-database-files")),
    ) as fi:
        yield fi

--- a/test_runner/fixtures/h2server.py
+++ b/test_runner/fixtures/h2server.py
@@ -10,7 +10,6 @@ import asyncio
 import collections
 import io
 import json
-from collections.abc import AsyncIterable
 from typing import TYPE_CHECKING, final

 import pytest_asyncio
@@ -31,6 +30,7 @@ from h2.settings import SettingCodes
 from typing_extensions import override

 if TYPE_CHECKING:
+    from collections.abc import AsyncIterable
    from typing import Any


--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -1,12 +1,15 @@
 from __future__ import annotations

 from collections import defaultdict
+from typing import TYPE_CHECKING

 from prometheus_client.parser import text_string_to_metric_families
-from prometheus_client.samples import Sample

 from fixtures.log_helper import log

+if TYPE_CHECKING:
+    from prometheus_client.samples import Sample
+

 class Metrics:
    metrics: dict[str, list[Sample]]
@@ -168,7 +171,6 @@ PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
    "pageserver_evictions_with_low_residence_duration_total",
    "pageserver_aux_file_estimated_size",
    "pageserver_valid_lsn_lease_count",
-    "pageserver_flush_wait_upload_seconds",
    counter("pageserver_tenant_throttling_count_accounted_start"),
    counter("pageserver_tenant_throttling_count_accounted_finish"),
    counter("pageserver_tenant_throttling_wait_usecs_sum"),
--- a/Show More
+++ b/Show More
@@ -0,0 +1 @@
+SELECT lfc_value AS lfc_used_pages FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used_pages';