tests: update tests that presume a single AZ

tests: use 3 AZs by default
CI/CD Hardening: Fixing StepSecurity Flagged Issues (#11724)
2026-05-16 04:30:38 +00:00 · 2025-04-25 18:49:33 +02:00 · 2025-04-25 18:49:03 +02:00 · 2025-04-25 14:36:45 +00:00 · 2025-04-25 14:28:56 +00:00 · 2025-04-25 11:46:15 +00:00
250 changed files with 13079 additions and 4027 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,7 +19,7 @@
 !pageserver/
 !pgxn/
 !proxy/
-!object_storage/
+!endpoint_storage/
 !storage_scrubber/
 !safekeeper/
 !storage_broker/
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -113,8 +113,6 @@ runs:
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
-        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
-        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
        RERUN_FAILED: ${{ inputs.rerun_failed }}
        PG_VERSION: ${{ inputs.pg_version }}
        SANITIZERS: ${{ inputs.sanitizers }}
@@ -135,6 +133,7 @@ runs:
        fi

        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+        echo "PERF_REPORT_DIR=${PERF_REPORT_DIR}" >> ${GITHUB_ENV}
        rm -rf $PERF_REPORT_DIR

        TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
@@ -211,11 +210,12 @@ runs:
          --verbose \
          -rA $TEST_SELECTION $EXTRA_PARAMS

-        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
-          export REPORT_FROM="$PERF_REPORT_DIR"
-          export REPORT_TO="$PLATFORM"
-          scripts/generate_and_push_perf_report.sh
-        fi
+    - name: Upload performance report
+      if: ${{ !cancelled() && inputs.save_perf_report == 'true' }}
+      shell: bash -euxo pipefail {0}
+      run: |
+        export REPORT_FROM="${PERF_REPORT_DIR}"
+        scripts/generate_and_push_perf_report.sh

    - name: Upload compatibility snapshot
      # Note, that we use `github.base_ref` which is a target branch for a PR
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -272,10 +272,13 @@ jobs:
          # run pageserver tests with different settings
          for get_vectored_concurrent_io in sequential sidecar-task; do
            for io_engine in std-fs tokio-epoll-uring ; do
-              NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                ${cov_prefix} \
-                cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+                for io_mode in buffered direct direct-rw ; do
+                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE=$io_mode \
+                  ${cov_prefix} \
+                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+              done
            done
          done

@@ -346,7 +349,7 @@ jobs:
      contents: read
      statuses: write
    needs: [ build-neon ]
-    runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }}
    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
@@ -392,6 +395,7 @@ jobs:
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

      # Temporary disable this step until we figure out why it's so flaky
--- a/.github/workflows/_meta.yml
+++ b/.github/workflows/_meta.yml
@@ -165,5 +165,5 @@ jobs:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CURRENT_SHA: ${{ github.sha }}
        run: |
-          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
+          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
          echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/build-macos.yml
+++ b/.github/workflows/build-macos.yml
@@ -63,13 +63,8 @@ jobs:

      - name: Cache postgres ${{ matrix.postgres-version }} build
        id: cache_pg
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/${{ matrix.postgres-version }}
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-${{ matrix.postgres-version }}-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

@@ -134,25 +129,15 @@ jobs:

      - name: Cache postgres v17 build
        id: cache_pg
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache walproposer-lib
        id: cache_walproposer_lib
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

@@ -218,57 +203,32 @@ jobs:

      - name: Cache postgres v14 build
        id: cache_pg
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/v14
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v14-${{ steps.pg_rev_v14.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v15 build
        id: cache_pg_v15
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/v15
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v15-${{ steps.pg_rev_v15.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v16 build
        id: cache_pg_v16
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/v16
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v16-${{ steps.pg_rev_v16.outputs.pg_rev }}-${{ hashFiles('Makefile') }}
      - name: Cache postgres v17 build
        id: cache_pg_v17
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/v17
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-pg-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

      - name: Cache cargo deps (only for v17)
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: |
            ~/.cargo/registry
            !~/.cargo/registry/src
@@ -278,13 +238,8 @@ jobs:

      - name: Cache walproposer-lib
        id: cache_walproposer_lib
-        uses: tespkg/actions-cache@b7bf5fcc2f98a52ac6080eb0fd282c2f752074b1  # v1.8.0
+        uses: actions/cache@5a3ec84eff668545956fd18022155c47e93e2684 # v4.2.3
        with:
-          endpoint: ${{ vars.HETZNER_CACHE_REGION }}.${{ vars.HETZNER_CACHE_ENDPOINT }}
-          bucket: ${{ vars.HETZNER_CACHE_BUCKET }}
-          accessKey: ${{ secrets.HETZNER_CACHE_ACCESS_KEY }}
-          secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
-          use-fallback: false
          path: pg_install/build/walproposer-lib
          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ env.BUILD_TYPE }}-walproposer_lib-v17-${{ steps.pg_rev_v17.outputs.pg_rev }}-${{ hashFiles('Makefile') }}

--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,6 +323,8 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct-rw
          SYNC_BETWEEN_TESTS: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
@@ -1236,7 +1238,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
+          TIMEOUT=5400 # 90 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
          INTERVAL=15 # try each N seconds

          last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
--- a/.github/workflows/check-permissions.yml
+++ b/.github/workflows/check-permissions.yml
@@ -19,7 +19,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
    - name: Harden the runner (Audit all outbound calls)
-      uses: step-security/harden-runner@v2
+      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
      with:
        egress-policy: audit

--- a/.github/workflows/cleanup-caches-by-a-branch.yml
+++ b/.github/workflows/cleanup-caches-by-a-branch.yml
@@ -12,7 +12,7 @@ jobs:
    runs-on: ubuntu-22.04
    steps:
      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@v2
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
        with:
          egress-policy: audit

--- a/.github/workflows/fast-forward.yml
+++ b/.github/workflows/fast-forward.yml
@@ -14,7 +14,7 @@ jobs:

    steps:
      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@v2
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
        with:
          egress-policy: audit

@@ -27,15 +27,17 @@ jobs:
      - name: Fast forwarding
        uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
        # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
-        if: ${{ github.event.pull_request.mergeable_state  == 'clean' }}
+        if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
        with:
          merge: true
          comment: on-error
          github_token: ${{ secrets.CI_ACCESS_TOKEN }}

      - name: Comment if mergeable_state is not clean
-        if: ${{ github.event.pull_request.mergeable_state  != 'clean' }}
+        if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          gh pr comment ${{ github.event.pull_request.number }} \
            --repo "${GITHUB_REPOSITORY}" \
-            --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."
+            --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`."
--- a/.github/workflows/label-for-external-users.yml
+++ b/.github/workflows/label-for-external-users.yml
@@ -28,7 +28,7 @@ jobs:

    steps:
    - name: Harden the runner (Audit all outbound calls)
-      uses: step-security/harden-runner@v2
+      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
      with:
        egress-policy: audit

@@ -75,7 +75,7 @@ jobs:

    steps:
    - name: Harden the runner (Audit all outbound calls)
-      uses: step-security/harden-runner@v2
+      uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
      with:
        egress-policy: audit

--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -30,7 +30,7 @@ permissions:
  statuses: write # require for posting a status update

 env:
-  DEFAULT_PG_VERSION: 16
+  DEFAULT_PG_VERSION: 17
  PLATFORM: neon-captest-new
  AWS_DEFAULT_REGION: eu-central-1

@@ -42,6 +42,8 @@ jobs:
      github-event-name: ${{ github.event_name }}

  build-build-tools-image:
+    permissions:
+      packages: write
    needs: [ check-permissions ]
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -41,7 +41,7 @@ jobs:

    steps:
      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@v2
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
        with:
          egress-policy: audit

--- a/.github/workflows/random-ops-test.yml
+++ b/.github/workflows/random-ops-test.yml
@@ -0,0 +1,93 @@
+name: Random Operations Test
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │  ┌───────────── hour (0 - 23)
+    #          │  │  ┌───────────── day of the month (1 - 31)
+    #          │  │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │  │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '23 */2 * * *' # at minute 23 of every 2nd hour
+  workflow_dispatch:
+    inputs:
+      random_seed:
+        type: number
+        description: 'The random seed'
+        required: false
+        default: 0
+      num_operations:
+        type: number
+        description: "The number of operations to test"
+        default: 250
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+permissions: {} # nothing granted workflow-wide; the job below opts in to what it needs
+
+env:
+  DEFAULT_PG_VERSION: 16
+  PLATFORM: neon-captest-new
+  AWS_DEFAULT_REGION: eu-central-1
+
+jobs:
+  run-random-rests: # NOTE(review): probably meant "run-random-tests" - confirm before renaming the job id
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+    runs-on: small
+    permissions:
+      id-token: write # presumably for the AWS OIDC role passed to the actions below - confirm
+      statuses: write
+
+    strategy:
+      fail-fast: false # keep the other pg-version leg running if one fails
+      matrix:
+        pg-version: [16, 17]
+
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+      - name: Run tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: remote
+          test_selection: random_ops
+          run_in_parallel: false
+          extra_params: -m remote_cluster
+          pg_version: ${{ matrix.pg-version }}
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+          RANDOM_SEED: ${{ inputs.random_seed }} # NOTE(review): empty on scheduled runs (inputs exist only for workflow_dispatch) - confirm the test copes
+          NUM_OPERATIONS: ${{ inputs.num_operations }} # NOTE(review): likewise empty on scheduled runs
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }} # still runs when the test step failed
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -35,7 +35,7 @@ jobs:

    steps:
      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@v2
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
        with:
          egress-policy: audit

@@ -73,7 +73,7 @@ jobs:
        }}
    steps:
      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@v2
+        uses: step-security/harden-runner@0634a2670c59f64b4a01f0f96f84700a4088b9f0 # v2.12.0
        with:
          egress-policy: audit

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -40,7 +40,7 @@ dependencies = [
 "getrandom 0.2.11",
 "once_cell",
 "version_check",
- "zerocopy",
+ "zerocopy 0.7.31",
 ]

 [[package]]
@@ -1323,7 +1323,6 @@ dependencies = [
 "serde_json",
 "serde_with",
 "signal-hook",
- "spki 0.7.3",
 "tar",
 "thiserror 1.0.69",
 "tokio",
@@ -2037,6 +2036,33 @@ dependencies = [
 "zeroize",
 ]

+[[package]]
+name = "endpoint_storage"
+version = "0.0.1"
+dependencies = [
+ "anyhow",
+ "axum",
+ "axum-extra",
+ "camino",
+ "camino-tempfile",
+ "futures",
+ "http-body-util",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "prometheus",
+ "rand 0.8.5",
+ "remote_storage",
+ "serde",
+ "serde_json",
+ "test-log",
+ "tokio",
+ "tokio-util",
+ "tower 0.5.2",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "enum-map"
 version = "2.5.0"
@@ -3998,33 +4024,6 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "object_storage"
-version = "0.0.1"
-dependencies = [
- "anyhow",
- "axum",
- "axum-extra",
- "camino",
- "camino-tempfile",
- "futures",
- "http-body-util",
- "itertools 0.10.5",
- "jsonwebtoken",
- "prometheus",
- "rand 0.8.5",
- "remote_storage",
- "serde",
- "serde_json",
- "test-log",
- "tokio",
- "tokio-util",
- "tower 0.5.2",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -4285,6 +4284,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "pageserver_compaction",
+ "pem",
 "pin-project-lite",
 "postgres-protocol",
 "postgres-types",
@@ -4301,6 +4301,7 @@ dependencies = [
 "remote_storage",
 "reqwest",
 "rpds",
+ "rstest",
 "rustls 0.23.18",
 "scopeguard",
 "send-future",
@@ -4352,6 +4353,7 @@ dependencies = [
 "humantime-serde",
 "itertools 0.10.5",
 "nix 0.27.1",
+ "once_cell",
 "postgres_backend",
 "postgres_ffi",
 "rand 0.8.5",
@@ -4413,9 +4415,9 @@ dependencies = [

 [[package]]
 name = "papaya"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
+checksum = "6827e3fc394523c21d4464d02c0bb1c19966ea4a58a9844ad6d746214179d2bc"
 dependencies = [
 "equivalent",
 "seize",
@@ -5202,7 +5204,7 @@ dependencies = [
 "walkdir",
 "workspace_hack",
 "x509-cert",
- "zerocopy",
+ "zerocopy 0.8.24",
 ]

 [[package]]
@@ -5592,7 +5594,7 @@ dependencies = [
 "wasm-bindgen-futures",
 "wasm-streams",
 "web-sys",
- "webpki-roots 0.26.1",
+ "webpki-roots",
 "winreg",
 ]

@@ -6000,6 +6002,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "parking_lot 0.12.1",
+ "pem",
 "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
@@ -6192,13 +6195,13 @@ checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"

 [[package]]
 name = "sentry"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
+checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.21.12",
+ "rustls 0.23.18",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -6206,14 +6209,14 @@ dependencies = [
 "sentry-tracing",
 "tokio",
 "ureq",
- "webpki-roots 0.25.2",
+ "webpki-roots",
 ]

 [[package]]
 name = "sentry-backtrace"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e"
+checksum = "00293cd332a859961f24fd69258f7e92af736feaeb91020cff84dac4188a4302"
 dependencies = [
 "backtrace",
 "once_cell",
@@ -6223,9 +6226,9 @@ dependencies = [

 [[package]]
 name = "sentry-contexts"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
+checksum = "961990f9caa76476c481de130ada05614cd7f5aa70fb57c2142f0e09ad3fb2aa"
 dependencies = [
 "hostname",
 "libc",
@@ -6237,9 +6240,9 @@ dependencies = [

 [[package]]
 name = "sentry-core"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826"
+checksum = "1a6409d845707d82415c800290a5d63be5e3df3c2e417b0997c60531dfbd35ef"
 dependencies = [
 "once_cell",
 "rand 0.8.5",
@@ -6250,9 +6253,9 @@ dependencies = [

 [[package]]
 name = "sentry-panic"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d"
+checksum = "609b1a12340495ce17baeec9e08ff8ed423c337c1a84dffae36a178c783623f3"
 dependencies = [
 "sentry-backtrace",
 "sentry-core",
@@ -6260,9 +6263,9 @@ dependencies = [

 [[package]]
 name = "sentry-tracing"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe"
+checksum = "49f4e86402d5c50239dc7d8fd3f6d5e048221d5fcb4e026d8d50ab57fe4644cb"
 dependencies = [
 "sentry-backtrace",
 "sentry-core",
@@ -6272,9 +6275,9 @@ dependencies = [

 [[package]]
 name = "sentry-types"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c"
+checksum = "3d3f117b8755dbede8260952de2aeb029e20f432e72634e8969af34324591631"
 dependencies = [
 "debugid",
 "hex",
@@ -6613,12 +6616,14 @@ dependencies = [
 "anyhow",
 "async-stream",
 "bytes",
+ "camino",
 "clap",
 "const_format",
 "futures",
 "futures-core",
 "futures-util",
 "http-body-util",
+ "http-utils",
 "humantime",
 "hyper 1.4.1",
 "hyper-util",
@@ -6628,6 +6633,7 @@ dependencies = [
 "prost 0.13.3",
 "rustls 0.23.18",
 "tokio",
+ "tokio-rustls 0.26.0",
 "tonic",
 "tonic-build",
 "tracing",
@@ -6708,8 +6714,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-stream",
- "aws-config",
- "aws-sdk-s3",
 "camino",
 "chrono",
 "clap",
@@ -7798,7 +7802,7 @@ dependencies = [
 "rustls 0.23.18",
 "rustls-pki-types",
 "url",
- "webpki-roots 0.26.1",
+ "webpki-roots",
 ]

 [[package]]
@@ -8166,12 +8170,6 @@ dependencies = [
 "wasm-bindgen",
 ]

-[[package]]
-name = "webpki-roots"
-version = "0.25.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
-
 [[package]]
 name = "webpki-roots"
 version = "0.26.1"
@@ -8479,6 +8477,8 @@ dependencies = [
 "regex-syntax 0.8.2",
 "reqwest",
 "rustls 0.23.18",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.8",
 "scopeguard",
 "sec1 0.7.3",
 "serde",
@@ -8507,7 +8507,6 @@ dependencies = [
 "tracing-log",
 "url",
 "uuid",
- "zerocopy",
 "zeroize",
 "zstd",
 "zstd-safe",
@@ -8611,8 +8610,16 @@ version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
 dependencies = [
- "byteorder",
- "zerocopy-derive",
+ "zerocopy-derive 0.7.31",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
+dependencies = [
+ "zerocopy-derive 0.8.24",
 ]

 [[package]]
@@ -8626,6 +8633,17 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "zerocopy-derive"
+version = "0.8.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "zerofrom"
 version = "0.1.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,7 +40,7 @@ members = [
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
-    "object_storage",
+    "endpoint_storage",
 ]

 [workspace.package]
@@ -164,7 +164,7 @@ scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
 send-future = "0.1.0"
-sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.37", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
@@ -220,7 +220,7 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 rustls-native-certs = "0.8"
 whoami = "1.5.1"
-zerocopy = { version = "0.7", features = ["derive"] }
+zerocopy = { version = "0.8", features = ["derive", "simd"] }
 json-structural-diff = { version = "0.2.0" }
 x509-cert = { version = "0.2.5" }

--- a/Dockerfile
+++ b/Dockerfile
@@ -89,7 +89,7 @@ RUN set -e \
      --bin storage_broker  \
      --bin storage_controller  \
      --bin proxy  \
-      --bin object_storage \
+      --bin endpoint_storage \
      --bin neon_local \
      --bin storage_scrubber \
      --locked --release
@@ -122,7 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage    /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin

--- a/README.md
+++ b/README.md
@@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:

 ```sh
-DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest
 ```

 ## Flamegraphs
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -173,7 +173,7 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
    && rm -rf protoc.zip protoc

 # s5cmd
-ENV S5CMD_VERSION=2.2.2
+ENV S5CMD_VERSION=2.3.0
 RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
    && chmod +x s5cmd \
    && mv s5cmd /usr/local/bin/s5cmd
@@ -206,7 +206,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip

 # Mold: A Modern Linker
-ENV MOLD_VERSION=v2.34.1
+ENV MOLD_VERSION=v2.37.1
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
@@ -268,7 +268,7 @@ WORKDIR /home/nonroot
 RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc

 # Python
-ENV PYTHON_VERSION=3.11.10 \
+ENV PYTHON_VERSION=3.11.12 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
@@ -296,12 +296,12 @@ ENV RUSTC_VERSION=1.86.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
-ARG CARGO_HAKARI_VERSION=0.9.33
-ARG CARGO_DENY_VERSION=0.16.2
-ARG CARGO_HACK_VERSION=0.6.33
-ARG CARGO_NEXTEST_VERSION=0.9.85
+ARG CARGO_HAKARI_VERSION=0.9.36
+ARG CARGO_DENY_VERSION=0.18.2
+ARG CARGO_HACK_VERSION=0.6.36
+ARG CARGO_NEXTEST_VERSION=0.9.94
 ARG CARGO_CHEF_VERSION=0.1.71
-ARG CARGO_DIESEL_CLI_VERSION=2.2.6
+ARG CARGO_DIESEL_CLI_VERSION=2.2.9
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
--- a/clippy.toml
+++ b/clippy.toml
@@ -12,3 +12,5 @@ disallowed-macros = [
    # cannot disallow this, because clippy finds used from tokio macros
    #"tokio::pin",
 ]
+
+allow-unwrap-in-tests = true
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1677,7 +1677,7 @@ RUN set -e \
    && apt clean && rm -rf /var/lib/apt/lists/*

 # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-ENV PGBOUNCER_TAG=pgbouncer_1_22_1
+ENV PGBOUNCER_TAG=pgbouncer_1_24_1
 RUN set -e \
    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
    && cd pgbouncer \
--- a/compute/patches/pg_anon.patch
+++ b/compute/patches/pg_anon.patch
@@ -1,265 +0,0 @@
-commit 00aa659afc9c7336ab81036edec3017168aabf40
-Author: Heikki Linnakangas <heikki@neon.tech>
-Date:   Tue Nov 12 16:59:19 2024 +0200
-
-    Temporarily disable test that depends on timezone
-
-diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
-index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
-+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
-@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
-  ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
- (1 row)
- 
-SELECT anon.generalize_tstzrange('19041107','millennium');
-                      generalize_tstzrange                       
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-   generalize_daterange   
-diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
-index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
-+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
-@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
- SELECT anon.generalize_tstzrange('19041107','year');
- SELECT anon.generalize_tstzrange('19041107','decade');
- SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- 
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-
-commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Fri May 31 06:34:26 2024 +0000
-
-    These alternative expected files were added to consider the neon features
-
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-new file mode 100644
-index 0000000..2539cfd
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-@@ -0,0 +1,101 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE mallory_the_masked_user;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.anonymize_table('t1');
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ERROR:  Only supersusers can start the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT * FROM mask.t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  SELECT * FROM public.t1;
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  Only supersusers can stop the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-new file mode 100644
-index 0000000..8b090fe
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-@@ -0,0 +1,104 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE oscar_the_owner;
-+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE oscar_the_owner;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+SELECT anon.anonymize_table('t1');
-+ anonymize_table 
-+-----------------
-+ t
-+(1 row)
-+
-+SELECT * FROM t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+UPDATE t1 SET t='test' WHERE i=1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+SELECT * FROM t1;
-+ i |  t   
-+---+------
-+ 1 | test
-+(1 row)
-+
-+--SELECT * FROM mask.t1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  permission denied for schema mask
-+CONTEXT:  SQL statement "DROP VIEW mask.t1;"
-+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
-+SQL statement "SELECT anon.mask_drop_view(oid)
-+  FROM pg_catalog.pg_class
-+  WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
-+  AND relkind IN ('r','p','f')"
-+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
--- a/compute/patches/pg_repack.patch
+++ b/compute/patches/pg_repack.patch
@@ -11,6 +11,14 @@ index bf6edcb..89b4c7f 100644
 
 USE_PGXS = 1	# use pgxs if not in contrib directory
 PGXS := $(shell $(PG_CONFIG) --pgxs)
+diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
+index 9f2e171..f6e4f8d 100644
+--- a/regress/expected/init-extension.out
+++ b/regress/expected/init-extension.out
+@@ -1,3 +1,2 @@
+ SET client_min_messages = warning;
+ CREATE EXTENSION pg_repack;
+-RESET client_min_messages;
 diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
 index 8d0a94e..63b68bf 100644
 --- a/regress/expected/nosuper.out
@@ -42,6 +50,14 @@ index 8d0a94e..63b68bf 100644
 INFO: repacking table "public.tbl_cluster"
 ERROR: query failed: ERROR:  current transaction is aborted, commands ignored until end of transaction block
 DETAIL: query was: RESET lock_timeout
+diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
+index 9f2e171..f6e4f8d 100644
+--- a/regress/sql/init-extension.sql
+++ b/regress/sql/init-extension.sql
+@@ -1,3 +1,2 @@
+ SET client_min_messages = warning;
+ CREATE EXTENSION pg_repack;
+-RESET client_min_messages;
 diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
 index 072f0fa..dbe60f8 100644
 --- a/regress/sql/nosuper.sql
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -44,7 +44,6 @@ serde.workspace = true
 serde_with.workspace = true
 serde_json.workspace = true
 signal-hook.workspace = true
-spki = { version = "0.7.3", features = ["std"] }
 tar.workspace = true
 tower.workspace = true
 tower-http.workspace = true
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -116,9 +116,7 @@ struct Cli {
    #[arg(long)]
    pub set_disk_quota_for_fs: Option<String>,

-    // TODO(tristan957): remove alias after compatibility tests are no longer
-    // an issue
-    #[arg(short = 'c', long, alias = "spec-path")]
+    #[arg(short = 'c', long)]
    pub config: Option<OsString>,

    #[arg(short = 'i', long, group = "compute-id")]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -641,7 +641,26 @@ impl ComputeNode {

                let log_directory_path = Path::new(&self.params.pgdata).join("log");
                let log_directory_path = log_directory_path.to_string_lossy().to_string();
-                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Add a `project_id/endpoint_id` tag to identify the logs.
+                //
+                // These ids are passed from cplane,
+                // for backwards compatibility (old computes that don't have them),
+                // a missing id is rendered as `None`; no tag is set if both are missing.
+                // TODO: Clean up this code when all computes have them.
+                let tag: Option<String> = match (
+                    pspec.spec.project_id.as_deref(),
+                    pspec.spec.endpoint_id.as_deref(),
+                ) {
+                    (Some(project_id), Some(endpoint_id)) => {
+                        Some(format!("{project_id}/{endpoint_id}"))
+                    }
+                    (Some(project_id), None) => Some(format!("{project_id}/None")),
+                    (None, Some(endpoint_id)) => Some(format!("None/{endpoint_id}")),
+                    (None, None) => None,
+                };
+
+                configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;

                // Launch a background task to clean up the audit logs
                launch_pgaudit_gc(log_directory_path);
--- a/compute_tools/src/http/extract/mod.rs
+++ b/compute_tools/src/http/extract/mod.rs
@@ -6,4 +6,5 @@ pub(crate) mod request_id;
 pub(crate) use json::Json;
 pub(crate) use path::Path;
 pub(crate) use query::Query;
+#[allow(unused)]
 pub(crate) use request_id::RequestId;
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -13,7 +13,7 @@ use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
 use tower_http::auth::AsyncAuthorizeRequest;
 use tracing::{debug, warn};

-use crate::http::{JsonResponse, extract::RequestId};
+use crate::http::JsonResponse;

 #[derive(Clone, Debug)]
 pub(in crate::http) struct Authorize {
@@ -52,18 +52,6 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
        let validation = self.validation.clone();

        Box::pin(async move {
-            let request_id = request.extract_parts::<RequestId>().await.unwrap();
-
-            // TODO(tristan957): Remove this stanza after teaching neon_local
-            // and the regression tests to use a JWT + JWKS.
-            //
-            // https://github.com/neondatabase/neon/issues/11316
-            if cfg!(feature = "testing") {
-                warn!(%request_id, "Skipping compute_ctl authorization check");
-
-                return Ok(request);
-            }
-
            let TypedHeader(Authorization(bearer)) = request
                .extract_parts::<TypedHeader<Authorization<Bearer>>>()
                .await
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -1,8 +1,8 @@
-use metrics::core::{AtomicF64, Collector, GenericGauge};
+use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge};
 use metrics::proto::MetricFamily;
 use metrics::{
-    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
-    register_int_gauge_vec, register_uint_gauge_vec,
+    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter,
+    register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;

@@ -81,6 +81,22 @@ pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+pub(crate) static PG_CURR_DOWNTIME_MS: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
+    register_gauge!(
+        "compute_pg_current_downtime_ms",
+        "Non-cumulative duration of Postgres downtime in ms; resets after successful check",
+    )
+    .expect("failed to define a metric")
+});
+
+pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::new(|| {
+    register_int_counter!(
+        "compute_pg_downtime_ms_total",
+        "Cumulative duration of Postgres downtime in ms",
+    )
+    .expect("failed to define a metric")
+});
+
 pub fn collect() -> Vec<MetricFamily> {
    let mut metrics = COMPUTE_CTL_UP.collect();
    metrics.extend(INSTALLED_EXTENSIONS.collect());
@@ -88,5 +104,7 @@ pub fn collect() -> Vec<MetricFamily> {
    metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
    metrics.extend(DB_MIGRATION_FAILED.collect());
    metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
+    metrics.extend(PG_CURR_DOWNTIME_MS.collect());
+    metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
    metrics
 }
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -6,197 +6,294 @@ use chrono::{DateTime, Utc};
 use compute_api::responses::ComputeStatus;
 use compute_api::spec::ComputeFeature;
 use postgres::{Client, NoTls};
-use tracing::{debug, error, info, warn};
+use tracing::{Level, error, info, instrument, span};

 use crate::compute::ComputeNode;
+use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};

 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

-// Spin in a loop and figure out the last activity time in the Postgres.
-// Then update it in the shared state. This function never errors out.
-// NB: the only expected panic is at `Mutex` unwrap(), all other errors
-// should be handled gracefully.
-fn watch_compute_activity(compute: &ComputeNode) {
-    // Suppose that `connstr` doesn't change
-    let connstr = compute.params.connstr.clone();
-    let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
+struct ComputeMonitor {
+    compute: Arc<ComputeNode>,

-    // During startup and configuration we connect to every Postgres database,
-    // but we don't want to count this as some user activity. So wait until
-    // the compute fully started before monitoring activity.
-    wait_for_postgres_start(compute);
+    /// The moment when Postgres had some activity,
+    /// that should prevent compute from being suspended.
+    last_active: Option<DateTime<Utc>>,

-    // Define `client` outside of the loop to reuse existing connection if it's active.
-    let mut client = conf.connect(NoTls);
+    /// The moment when we last tried to check Postgres.
+    last_checked: DateTime<Utc>,
+    /// The last moment we did a successful Postgres check.
+    last_up: DateTime<Utc>,

-    let mut sleep = false;
-    let mut prev_active_time: Option<f64> = None;
-    let mut prev_sessions: Option<i64> = None;
+    /// Only used for internal statistics change tracking
+    /// between monitor runs and can be outdated.
+    active_time: Option<f64>,
+    /// Only used for internal statistics change tracking
+    /// between monitor runs and can be outdated.
+    sessions: Option<i64>,

-    if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-        info!("starting experimental activity monitor for {}", connstr);
-    } else {
-        info!("starting activity monitor for {}", connstr);
+    /// Use experimental statistics-based activity monitor. It's no longer
+    /// 'experimental' per se, as it's enabled for everyone, but we still
+    /// keep the flag as an option to turn it off in some cases if it will
+    /// misbehave.
+    experimental: bool,
+}
+
+impl ComputeMonitor {
+    fn report_down(&self) {
+        let now = Utc::now();
+
+        // Report current downtime (time since the last successful check). The
+        // wall clock can step backwards, so clamp at zero instead of going negative.
+        let downtime = now.signed_duration_since(self.last_up);
+        PG_CURR_DOWNTIME_MS.set(downtime.num_milliseconds().max(0) as f64);
+
+        // Add the time since the previous check to the cumulative total. Clamp
+        // at zero: a negative i64 cast with `as u64` would wrap to a huge value.
+        let inc = now
+            .signed_duration_since(self.last_checked)
+            .num_milliseconds();
+        PG_TOTAL_DOWNTIME_MS.inc_by(inc.max(0) as u64);
    }

-    loop {
-        // We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
-        // But skip the first sleep, so we can connect to Postgres immediately.
-        if sleep {
-            // Should be outside of the mutex lock to allow others to read while we sleep.
-            thread::sleep(MONITOR_CHECK_INTERVAL);
-        } else {
-            sleep = true;
-        }
+    fn report_up(&mut self) {
+        self.last_up = Utc::now();
+        PG_CURR_DOWNTIME_MS.set(0.0);
+    }

-        match &mut client {
-            Ok(cli) => {
-                if cli.is_closed() {
-                    info!("connection to Postgres is closed, trying to reconnect");
+    fn downtime_info(&self) -> String {
+        // Read both downtime metrics up front, then render them together with
+        // the `last_up` timestamp as a single string for log fields.
+        let total_ms = PG_TOTAL_DOWNTIME_MS.get();
+        let current_ms = PG_CURR_DOWNTIME_MS.get();
+
+        format!("total_ms: {}, current_ms: {}, last_up: {}", total_ms, current_ms, self.last_up)
+    }

-                    // Connection is closed, reconnect and try again.
-                    client = conf.connect(NoTls);
-                    continue;
-                }
+    /// Spin in a loop and figure out the last activity time in the Postgres.
+    /// Then update it in the shared state. This function never errors out.
+    /// NB: the only expected panic is at `Mutex` unwrap(), all other errors
+    /// should be handled gracefully.
+    #[instrument(skip_all)]
+    pub fn run(&mut self) {
+        // Suppose that `connstr` doesn't change
+        let connstr = self.compute.params.connstr.clone();
+        let conf = self
+            .compute
+            .get_conn_conf(Some("compute_ctl:compute_monitor"));

-                // This is a new logic, only enable if the feature flag is set.
-                // TODO: remove this once we are sure that it works OR drop it altogether.
-                if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-                    // First, check if the total active time or sessions across all databases has changed.
-                    // If it did, it means that user executed some queries. In theory, it can even go down if
-                    // some databases were dropped, but it's still a user activity.
-                    match get_database_stats(cli) {
-                        Ok((active_time, sessions)) => {
-                            let mut detected_activity = false;
+        // During startup and configuration we connect to every Postgres database,
+        // but we don't want to count this as some user activity. So wait until
+        // the compute fully started before monitoring activity.
+        wait_for_postgres_start(&self.compute);

-                            prev_active_time = match prev_active_time {
-                                Some(prev_active_time) => {
-                                    if active_time != prev_active_time {
-                                        detected_activity = true;
-                                    }
-                                    Some(active_time)
-                                }
-                                None => Some(active_time),
-                            };
-                            prev_sessions = match prev_sessions {
-                                Some(prev_sessions) => {
-                                    if sessions != prev_sessions {
-                                        detected_activity = true;
-                                    }
-                                    Some(sessions)
-                                }
-                                None => Some(sessions),
-                            };
+        // Define `client` outside of the loop to reuse existing connection if it's active.
+        let mut client = conf.connect(NoTls);

-                            if detected_activity {
-                                // Update the last active time and continue, we don't need to
-                                // check backends state change.
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            error!("could not get database statistics: {}", e);
-                            continue;
-                        }
-                    }
-                }
+        info!("starting compute monitor for {}", connstr);

-                // Second, if database statistics is the same, check all backends state change,
-                // maybe there is some with more recent activity. `get_backends_state_change()`
-                // can return None or stale timestamp, so it's `compute.update_last_active()`
-                // responsibility to check if the new timestamp is more recent than the current one.
-                // This helps us to discover new sessions, that did nothing yet.
-                match get_backends_state_change(cli) {
-                    Ok(last_active) => {
-                        compute.update_last_active(last_active);
-                    }
-                    Err(e) => {
-                        error!("could not get backends state change: {}", e);
-                    }
-                }
-
-                // Finally, if there are existing (logical) walsenders, do not suspend.
-                //
-                // walproposer doesn't currently show up in pg_stat_replication,
-                // but protect if it will be
-                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
-                match cli.query_one(ws_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_ws) => {
-                            if num_ws > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse walsenders count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of walsenders: {:?}", e);
-                        continue;
-                    }
-                }
-                //
-                // Don't suspend compute if there is an active logical replication subscription
-                //
-                // `where pid is not null` – to filter out read only computes and subscription on branches
-                //
-                let logical_subscriptions_query =
-                    "select count(*) from pg_stat_subscription where pid is not null;";
-                match cli.query_one(logical_subscriptions_query, &[]) {
-                    Ok(row) => match row.try_get::<&str, i64>("count") {
-                        Ok(num_subscribers) => {
-                            if num_subscribers > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!(
-                            "failed to get list of active logical replication subscriptions: {:?}",
-                            e
+        loop {
+            match &mut client {
+                Ok(cli) => {
+                    if cli.is_closed() {
+                        info!(
+                            downtime_info = self.downtime_info(),
+                            "connection to Postgres is closed, trying to reconnect"
                        );
-                        continue;
-                    }
-                }
-                //
-                // Do not suspend compute if autovacuum is running
-                //
-                let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
-                match cli.query_one(autovacuum_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_workers) => {
-                            if num_workers > 0 {
-                                compute.update_last_active(Some(Utc::now()));
-                                continue;
+                        self.report_down();
+
+                        // Connection is closed, reconnect and try again.
+                        client = conf.connect(NoTls);
+                    } else {
+                        match self.check(cli) {
+                            Ok(_) => {
+                                self.report_up();
+                                self.compute.update_last_active(self.last_active);
+                            }
+                            Err(e) => {
+                                // Although we have many places where we can return errors in `check()`,
+                                // normally it shouldn't happen. I.e., we will likely return error if
+                                // connection got broken, query timed out, Postgres returned invalid data, etc.
+                                // In all such cases it's suspicious, so let's report this as downtime.
+                                self.report_down();
+                                error!(
+                                    downtime_info = self.downtime_info(),
+                                    "could not check Postgres: {}", e
+                                );
+
+                                // Reconnect to Postgres just in case. During tests, I noticed
+                                // that queries in `check()` can fail with `connection closed`,
+                                // but `cli.is_closed()` above doesn't detect it. Even if old
+                                // connection is still alive, it will be dropped when we reassign
+                                // `client` to a new connection.
+                                client = conf.connect(NoTls);
                            }
                        }
-                        Err(e) => {
-                            warn!("failed to parse autovacuum workers count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of autovacuum workers: {:?}", e);
-                        continue;
                    }
                }
-            }
-            Err(e) => {
-                debug!("could not connect to Postgres: {}, retrying", e);
+                Err(e) => {
+                    info!(
+                        downtime_info = self.downtime_info(),
+                        "could not connect to Postgres: {}, retrying", e
+                    );
+                    self.report_down();

-                // Establish a new connection and try again.
-                client = conf.connect(NoTls);
+                    // Establish a new connection and try again.
+                    client = conf.connect(NoTls);
+                }
+            }
+
+            // Reset the `last_checked` timestamp and sleep before the next iteration.
+            self.last_checked = Utc::now();
+            thread::sleep(MONITOR_CHECK_INTERVAL);
+        }
+    }
+
+    #[instrument(skip_all)]
+    fn check(&mut self, cli: &mut Client) -> anyhow::Result<()> {
+        // This is new logic; it is only enabled when the experimental feature flag is set.
+        // TODO: remove this once we are sure that it works OR drop it altogether.
+        if self.experimental {
+            // Check if the total active time or sessions across all databases has changed.
+            // If it did, it means that the user executed some queries. In theory, it can even go
+            // down if some databases were dropped, but it's still user activity.
+            match get_database_stats(cli) {
+                Ok((active_time, sessions)) => {
+                    let mut detected_activity = false;
+
+                    if let Some(prev_active_time) = self.active_time {
+                        if active_time != prev_active_time {
+                            detected_activity = true;
+                        }
+                    }
+                    self.active_time = Some(active_time);
+
+                    if let Some(prev_sessions) = self.sessions {
+                        if sessions != prev_sessions {
+                            detected_activity = true;
+                        }
+                    }
+                    self.sessions = Some(sessions);
+
+                    if detected_activity {
+                        // Update the last active time and return early; there is no need to
+                        // check for backend state changes.
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    return Err(anyhow::anyhow!("could not get database statistics: {}", e));
+                }
            }
        }
+
+        // If database statistics are unchanged (or the experimental check is disabled), check
+        // all backends for state changes; maybe some have more recent activity. This also helps
+        // us to discover new sessions that have not done anything yet.
+        // `get_backends_state_change()` can return `None` or a stale timestamp, so only a newer
+        // value is stored here; `compute.update_last_active()` must still reject stale ones.
+        match get_backends_state_change(cli) {
+            Ok(last_active) => match (last_active, self.last_active) {
+                (Some(last_active), Some(prev_last_active)) => {
+                    if last_active > prev_last_active {
+                        self.last_active = Some(last_active);
+                        return Ok(());
+                    }
+                }
+                (Some(last_active), None) => {
+                    self.last_active = Some(last_active);
+                    return Ok(());
+                }
+                _ => {}
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "could not get backends state change: {}",
+                    e
+                ));
+            }
+        }
+
+        // If there are existing (logical) walsenders, do not suspend.
+        //
+        // N.B. walproposer doesn't currently show up in pg_stat_replication,
+        // but exclude it explicitly in case it ever does.
+        const WS_COUNT_QUERY: &str =
+            "select count(*) from pg_stat_replication where application_name != 'walproposer';";
+        match cli.query_one(WS_COUNT_QUERY, &[]) {
+            Ok(r) => match r.try_get::<&str, i64>("count") {
+                Ok(num_ws) => {
+                    if num_ws > 0 {
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    let err: anyhow::Error = e.into();
+                    return Err(err.context("failed to parse walsenders count"));
+                }
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!("failed to get list of walsenders: {}", e));
+            }
+        }
+
+        // Don't suspend compute if there is an active logical replication subscription.
+        //
+        // `where pid is not null` filters out read-only computes and subscriptions on branches.
+        const LOGICAL_SUBSCRIPTIONS_QUERY: &str =
+            "select count(*) from pg_stat_subscription where pid is not null;";
+        match cli.query_one(LOGICAL_SUBSCRIPTIONS_QUERY, &[]) {
+            Ok(row) => match row.try_get::<&str, i64>("count") {
+                Ok(num_subscribers) => {
+                    if num_subscribers > 0 {
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    return Err(anyhow::anyhow!(
+                        "failed to parse 'pg_stat_subscription' count: {}",
+                        e
+                    ));
+                }
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "failed to get list of active logical replication subscriptions: {}",
+                    e
+                ));
+            }
+        }
+
+        // Do not suspend compute if autovacuum is running.
+        const AUTOVACUUM_COUNT_QUERY: &str =
+            "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
+        match cli.query_one(AUTOVACUUM_COUNT_QUERY, &[]) {
+            Ok(r) => match r.try_get::<&str, i64>("count") {
+                Ok(num_workers) => {
+                    if num_workers > 0 {
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    };
+                }
+                Err(e) => {
+                    return Err(anyhow::anyhow!(
+                        "failed to parse autovacuum workers count: {}",
+                        e
+                    ));
+                }
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "failed to get list of autovacuum workers: {}",
+                    e
+                ));
+            }
+        }
+
+        Ok(()) // No new activity detected; `self.last_active` is left unchanged.
    }
 }

@@ -315,9 +412,24 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
 /// Launch a separate compute monitor thread and return its `JoinHandle`.
 pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);
+    let experimental = compute.has_feature(ComputeFeature::ActivityMonitorExperimental);
+    let now = Utc::now();
+    let mut monitor = ComputeMonitor {
+        compute,
+        last_active: None,
+        last_checked: now,
+        last_up: now,
+        active_time: None,
+        sessions: None,
+        experimental,
+    };

+    let span = span!(Level::INFO, "compute_monitor");
    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&compute))
+        .spawn(move || {
+            let _enter = span.enter();
+            monitor.run();
+        })
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -50,13 +50,13 @@ fn restart_rsyslog() -> Result<()> {

 pub fn configure_audit_rsyslog(
    log_directory: String,
-    tag: &str,
+    tag: Option<String>,
    remote_endpoint: &str,
 ) -> Result<()> {
    let config_content: String = format!(
        include_str!("config_template/compute_audit_rsyslog_template.conf"),
        log_directory = log_directory,
-        tag = tag,
+        tag = tag.unwrap_or_default(), // a missing tag renders as an empty string
        remote_endpoint = remote_endpoint
    );

--- a/compute_tools/src/tls.rs
+++ b/compute_tools/src/tls.rs
@@ -3,7 +3,6 @@ use std::{io::Write, os::unix::fs::OpenOptionsExt, path::Path, time::Duration};
 use anyhow::{Context, Result, bail};
 use compute_api::responses::TlsConfig;
 use ring::digest;
-use spki::der::{Decode, PemReader};
 use x509_cert::Certificate;

 #[derive(Clone, Copy)]
@@ -52,7 +51,7 @@ pub fn update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) {
        match try_update_key_path_blocking(pg_data, tls_config) {
            Ok(()) => break,
            Err(e) => {
-                tracing::error!("could not create key file {e:?}");
+                tracing::error!(error = ?e, "could not create key file");
                std::thread::sleep(Duration::from_secs(1))
            }
        }
@@ -92,8 +91,14 @@ fn try_update_key_path_blocking(pg_data: &Path, tls_config: &TlsConfig) -> Resul
 fn verify_key_cert(key: &str, cert: &str) -> Result<()> {
    use x509_cert::der::oid::db::rfc5912::ECDSA_WITH_SHA_256;

-    let cert = Certificate::decode(&mut PemReader::new(cert.as_bytes()).context("pem reader")?)
-        .context("decode cert")?;
+    let certs = Certificate::load_pem_chain(cert.as_bytes())
+        .context("decoding PEM encoded certificates")?;
+
+    // First certificate is our server-cert,
+    // all the rest of the certs are the CA cert chain.
+    let Some(cert) = certs.first() else {
+        bail!("no certificates found");
+    };

    match cert.signature_algorithm.oid {
        ECDSA_WITH_SHA_256 => {
@@ -115,3 +120,82 @@ fn verify_key_cert(key: &str, cert: &str) -> Result<()> {

    Ok(())
 }
+
+#[cfg(test)]
+mod tests {
+    use super::verify_key_cert;
+
+    /// Real certificate chain file, generated by cert-manager in dev.
+    /// The server auth certificate expired on 2025-04-24T15:41:35Z.
+    const CERT: &str = "
+-----BEGIN CERTIFICATE-----
+MIICCDCCAa+gAwIBAgIQKhLomFcNULbZA/bPdGzaSzAKBggqhkjOPQQDAjBEMQsw
+CQYDVQQGEwJVUzESMBAGA1UEChMJTmVvbiBJbmMuMSEwHwYDVQQDExhOZW9uIEs4
+cyBJbnRlcm1lZGlhdGUgQ0EwHhcNMjUwNDIzMTU0MTM1WhcNMjUwNDI0MTU0MTM1
+WjBBMT8wPQYDVQQDEzZjb21wdXRlLXdpc3B5LWdyYXNzLXcwY21laWp3LmRlZmF1
+bHQuc3ZjLmNsdXN0ZXIubG9jYWwwWTATBgcqhkjOPQIBBggqhkjOPQMBBwNCAATF
+QCcG2m/EVHAiZtSsYgVnHgoTjUL/Jtwfdrpvz2t0bVRZmBmSKhlo53uPV9Y5eKFG
+AmR54p9/gT2eO3xU7vAgo4GFMIGCMA4GA1UdDwEB/wQEAwIFoDAMBgNVHRMBAf8E
+AjAAMB8GA1UdIwQYMBaAFFR2JAhXkeiNQNEixTvAYIwxUu3QMEEGA1UdEQQ6MDiC
+NmNvbXB1dGUtd2lzcHktZ3Jhc3MtdzBjbWVpancuZGVmYXVsdC5zdmMuY2x1c3Rl
+ci5sb2NhbDAKBggqhkjOPQQDAgNHADBEAiBLG22wKG8XS9e9RxBT+kmUx/kIThcP
+DIpp7jx0PrFcdQIgEMTdnXpx5Cv/Z0NIEDxtMHUD7G0vuRPfztki36JuakM=
+-----END CERTIFICATE-----
+-----BEGIN CERTIFICATE-----
+MIICFzCCAb6gAwIBAgIUbbX98N2Ip6lWAONRk8dU9hSz+YIwCgYIKoZIzj0EAwIw
+RDELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEhMB8GA1UEAxMYTmVv
+biBBV1MgSW50ZXJtZWRpYXRlIENBMB4XDTI1MDQyMjE1MTAxMFoXDTI1MDcyMTE1
+MTAxMFowRDELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEhMB8GA1UE
+AxMYTmVvbiBLOHMgSW50ZXJtZWRpYXRlIENBMFkwEwYHKoZIzj0CAQYIKoZIzj0D
+AQcDQgAE5++m5owqNI4BPMTVNIUQH0qvU7pYhdpHGVGhdj/Lgars6ROvE6uSNQV4
+SAmJN5HBzj5/6kLQaTPWpXW7EHXjK6OBjTCBijAOBgNVHQ8BAf8EBAMCAQYwEgYD
+VR0TAQH/BAgwBgEB/wIBADAdBgNVHQ4EFgQUVHYkCFeR6I1A0SLFO8BgjDFS7dAw
+HwYDVR0jBBgwFoAUgHfNXfyKtHO0V9qoLOWCjkNiaI8wJAYDVR0eAQH/BBowGKAW
+MBSCEi5zdmMuY2x1c3Rlci5sb2NhbDAKBggqhkjOPQQDAgNHADBEAiBObVFFdXaL
+QpOXmN60dYUNnQRwjKreFduEkQgOdOlssgIgVAdJJQFgvlrvEOBhY8j5WyeKRwUN
+k/ALs6KpgaFBCGY=
+-----END CERTIFICATE-----
+-----BEGIN CERTIFICATE-----
+MIIB4jCCAYegAwIBAgIUFlxWFn/11yoGdmD+6gf+yQMToS0wCgYIKoZIzj0EAwIw
+ODELMAkGA1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEVMBMGA1UEAxMMTmVv
+biBSb290IENBMB4XDTI1MDQwMzA3MTUyMloXDTI2MDQwMzA3MTUyMlowRDELMAkG
+A1UEBhMCVVMxEjAQBgNVBAoTCU5lb24gSW5jLjEhMB8GA1UEAxMYTmVvbiBBV1Mg
+SW50ZXJtZWRpYXRlIENBMFkwEwYHKoZIzj0CAQYIKoZIzj0DAQcDQgAEqonG/IQ6
+ZxtEtOUTkkoNopPieXDO5CBKUkNFTGeJEB7OxRlSpYJgsBpaYIaD6Vc4sVk3thIF
+p+pLw52idQOIN6NjMGEwDgYDVR0PAQH/BAQDAgEGMA8GA1UdEwEB/wQFMAMBAf8w
+HQYDVR0OBBYEFIB3zV38irRztFfaqCzlgo5DYmiPMB8GA1UdIwQYMBaAFKh7M4/G
+FHvr/ORDQZt4bMLlJvHCMAoGCCqGSM49BAMCA0kAMEYCIQCbS4x7QPslONzBYbjC
+UQaQ0QLDW4CJHvQ4u4gbWFG87wIhAJMsHQHjP9qTT27Q65zQCR7O8QeLAfha1jrH
+Ag/LsxSr
+-----END CERTIFICATE-----
+";
+
+    /// The key corresponding to [`CERT`]
+    const KEY: &str = "
+-----BEGIN EC PRIVATE KEY-----
+MHcCAQEEIDnAnrqmIJjndCLWP1iIO5X3X63Aia48TGpGuMXwvm6IoAoGCCqGSM49
+AwEHoUQDQgAExUAnBtpvxFRwImbUrGIFZx4KE41C/ybcH3a6b89rdG1UWZgZkioZ
+aOd7j1fWOXihRgJkeeKff4E9njt8VO7wIA==
+-----END EC PRIVATE KEY-----
+";
+
+    /// An incorrect key.
+    const INCORRECT_KEY: &str = "
+-----BEGIN EC PRIVATE KEY-----
+MHcCAQEEIL6WqqBDyvM0HWz7Ir5M5+jhFWB7IzOClGn26OPrzHCXoAoGCCqGSM49
+AwEHoUQDQgAE7XVvdOy5lfwtNKb+gJEUtnG+DrnnXLY5LsHDeGQKV9PTRcEMeCrG
+YZzHyML4P6Sr4yi2ts+4B9i47uvAG8+XwQ==
+-----END EC PRIVATE KEY-----
+";
+
+    #[test]
+    fn certificate_verification() {
+        verify_key_cert(KEY, CERT).unwrap();
+    }
+
+    #[test]
+    #[should_panic(expected = "private key file does not match certificate")]
+    fn certificate_verification_fail() {
+        verify_key_cert(INCORRECT_KEY, CERT).unwrap();
+    }
+}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -17,19 +17,19 @@ use std::time::Duration;
 use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::spec::ComputeMode;
+use control_plane::broker::StorageBroker;
 use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage};
+use control_plane::local_env;
 use control_plane::local_env::{
-    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    ObjectStorageConf, SafekeeperConf,
+    EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf,
+    NeonLocalInitPageserverConf, SafekeeperConf,
 };
-use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
-use control_plane::object_storage::ObjectStorage;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
    NeonStorageControllerStartArgs, NeonStorageControllerStopArgs, StorageController,
 };
-use control_plane::{broker, local_env};
 use nix::fcntl::{FlockArg, flock};
 use pageserver_api::config::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
@@ -63,7 +63,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: u32 = 16;
+const DEFAULT_PG_VERSION: u32 = 17;

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -93,7 +93,7 @@ enum NeonLocalCmd {
    #[command(subcommand)]
    Safekeeper(SafekeeperCmd),
    #[command(subcommand)]
-    ObjectStorage(ObjectStorageCmd),
+    EndpointStorage(EndpointStorageCmd),
    #[command(subcommand)]
    Endpoint(EndpointCmd),
    #[command(subcommand)]
@@ -460,14 +460,14 @@ enum SafekeeperCmd {

 #[derive(clap::Subcommand)]
 #[clap(about = "Manage object storage")]
-enum ObjectStorageCmd {
-    Start(ObjectStorageStartCmd),
-    Stop(ObjectStorageStopCmd),
+enum EndpointStorageCmd {
+    Start(EndpointStorageStartCmd),
+    Stop(EndpointStorageStopCmd),
 }

 #[derive(clap::Args)]
 #[clap(about = "Start object storage")]
-struct ObjectStorageStartCmd {
+struct EndpointStorageStartCmd {
    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
@@ -475,7 +475,7 @@ struct ObjectStorageStartCmd {

 #[derive(clap::Args)]
 #[clap(about = "Stop object storage")]
-struct ObjectStorageStopCmd {
+struct EndpointStorageStopCmd {
    #[arg(value_enum, default_value = "fast")]
    #[clap(
        short = 'm',
@@ -797,7 +797,9 @@ fn main() -> Result<()> {
            }
            NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
            NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
-            NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
+            NeonLocalCmd::EndpointStorage(subcmd) => {
+                rt.block_on(handle_endpoint_storage(&subcmd, env))
+            }
            NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
            NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
        };
@@ -987,7 +989,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
        NeonLocalInitConf {
            control_plane_api: Some(DEFAULT_PAGESERVER_CONTROL_PLANE_API.parse().unwrap()),
            broker: NeonBroker {
-                listen_addr: DEFAULT_BROKER_ADDR.parse().unwrap(),
+                listen_addr: Some(DEFAULT_BROKER_ADDR.parse().unwrap()),
+                listen_https_addr: None,
            },
            safekeepers: vec![SafekeeperConf {
                id: DEFAULT_SAFEKEEPER_ID,
@@ -1014,8 +1017,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                    }
                })
                .collect(),
-            object_storage: ObjectStorageConf {
-                port: OBJECT_STORAGE_DEFAULT_PORT,
+            endpoint_storage: EndpointStorageConf {
+                port: ENDPOINT_STORAGE_DEFAULT_PORT,
            },
            pg_distrib_dir: None,
            neon_distrib_dir: None,
@@ -1544,7 +1547,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
            let jwt = endpoint.generate_jwt()?;

-            println!("{jwt}");
+            print!("{jwt}");
        }
    }

@@ -1735,12 +1738,15 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
-    use ObjectStorageCmd::*;
-    let storage = ObjectStorage::from_env(env);
+async fn handle_endpoint_storage(
+    subcmd: &EndpointStorageCmd,
+    env: &local_env::LocalEnv,
+) -> Result<()> {
+    use EndpointStorageCmd::*;
+    let storage = EndpointStorage::from_env(env);

    // In tests like test_forward_compatibility or test_graceful_cluster_restart
-    // old neon binaries (without object_storage) are present
+    // old neon binaries (without endpoint_storage) are present
    if !storage.bin.exists() {
        eprintln!(
            "{} binary not found. Ignore if this is a compatibility test",
@@ -1750,13 +1756,13 @@ async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::Local
    }

    match subcmd {
-        Start(ObjectStorageStartCmd { start_timeout }) => {
+        Start(EndpointStorageStartCmd { start_timeout }) => {
            if let Err(e) = storage.start(start_timeout).await {
-                eprintln!("object_storage start failed: {e}");
+                eprintln!("endpoint_storage start failed: {e}");
                exit(1);
            }
        }
-        Stop(ObjectStorageStopCmd { stop_mode }) => {
+        Stop(EndpointStorageStopCmd { stop_mode }) => {
            let immediate = match stop_mode {
                StopMode::Fast => false,
                StopMode::Immediate => true,
@@ -1773,7 +1779,8 @@ async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::Local
 async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::LocalEnv) -> Result<()> {
    match subcmd {
        StorageBrokerCmd::Start(args) => {
-            if let Err(e) = broker::start_broker_process(env, &args.start_timeout).await {
+            let storage_broker = StorageBroker::from_env(env);
+            if let Err(e) = storage_broker.start(&args.start_timeout).await {
                eprintln!("broker start failed: {e}");
                exit(1);
            }
@@ -1781,7 +1788,8 @@ async fn handle_storage_broker(subcmd: &StorageBrokerCmd, env: &local_env::Local

        StorageBrokerCmd::Stop(_args) => {
            // FIXME: stop_mode unused
-            if let Err(e) = broker::stop_broker_process(env) {
+            let storage_broker = StorageBroker::from_env(env);
+            if let Err(e) = storage_broker.stop() {
                eprintln!("broker stop failed: {e}");
                exit(1);
            }
@@ -1831,8 +1839,11 @@ async fn handle_start_all_impl(
    #[allow(clippy::redundant_closure_call)]
    (|| {
        js.spawn(async move {
-            let retry_timeout = retry_timeout;
-            broker::start_broker_process(env, &retry_timeout).await
+            let storage_broker = StorageBroker::from_env(env);
+            storage_broker
+                .start(&retry_timeout)
+                .await
+                .map_err(|e| e.context("start storage_broker"))
        });

        js.spawn(async move {
@@ -1866,10 +1877,10 @@ async fn handle_start_all_impl(
        }

        js.spawn(async move {
-            ObjectStorage::from_env(env)
+            EndpointStorage::from_env(env)
                .start(&retry_timeout)
                .await
-                .map_err(|e| e.context("start object_storage"))
+                .map_err(|e| e.context("start endpoint_storage"))
        });
    })();

@@ -1968,9 +1979,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    let storage = ObjectStorage::from_env(env);
+    let storage = EndpointStorage::from_env(env);
    if let Err(e) = storage.stop(immediate) {
-        eprintln!("object_storage stop failed: {:#}", e);
+        eprintln!("endpoint_storage stop failed: {:#}", e);
    }

    for ps_conf in &env.pageservers {
@@ -1987,7 +1998,8 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    if let Err(e) = broker::stop_broker_process(env) {
+    let storage_broker = StorageBroker::from_env(env);
+    if let Err(e) = storage_broker.stop() {
        eprintln!("neon broker stop failed: {e:#}");
    }

--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -3,60 +3,86 @@
 //! In the local test environment, the storage broker stores its data directly in
 //!
 //! ```text
-//!   .neon
+//!   .neon/storage_broker
 //! ```
 use std::time::Duration;

 use anyhow::Context;
 use camino::Utf8PathBuf;

-use crate::{background_process, local_env};
+use crate::{background_process, local_env::LocalEnv};

-pub async fn start_broker_process(
-    env: &local_env::LocalEnv,
-    retry_timeout: &Duration,
-) -> anyhow::Result<()> {
-    let broker = &env.broker;
-    let listen_addr = &broker.listen_addr;
-
-    print!("Starting neon broker at {}", listen_addr);
-
-    let args = [format!("--listen-addr={listen_addr}")];
-
-    let client = reqwest::Client::new();
-    background_process::start_process(
-        "storage_broker",
-        &env.base_data_dir,
-        &env.storage_broker_bin(),
-        args,
-        [],
-        background_process::InitialPidFile::Create(storage_broker_pid_file_path(env)),
-        retry_timeout,
-        || async {
-            let url = broker.client_url();
-            let status_url = url.join("status").with_context(|| {
-                format!("Failed to append /status path to broker endpoint {url}")
-            })?;
-            let request = client
-                .get(status_url)
-                .build()
-                .with_context(|| format!("Failed to construct request to broker endpoint {url}"))?;
-            match client.execute(request).await {
-                Ok(resp) => Ok(resp.status().is_success()),
-                Err(_) => Ok(false),
-            }
-        },
-    )
-    .await
-    .context("Failed to spawn storage_broker subprocess")?;
-    Ok(())
+pub struct StorageBroker {
+    env: LocalEnv,
 }

-pub fn stop_broker_process(env: &local_env::LocalEnv) -> anyhow::Result<()> {
-    background_process::stop_process(true, "storage_broker", &storage_broker_pid_file_path(env))
-}
+impl StorageBroker {
+    /// Create a new `StorageBroker` instance from the environment.
+    pub fn from_env(env: &LocalEnv) -> Self {
+        Self { env: env.clone() }
+    }

-fn storage_broker_pid_file_path(env: &local_env::LocalEnv) -> Utf8PathBuf {
-    Utf8PathBuf::from_path_buf(env.base_data_dir.join("storage_broker.pid"))
-        .expect("non-Unicode path")
+    pub fn initialize(&self) -> anyhow::Result<()> {
+        if self.env.generate_local_ssl_certs {
+            self.env.generate_ssl_cert(
+                &self.env.storage_broker_data_dir().join("server.crt"),
+                &self.env.storage_broker_data_dir().join("server.key"),
+            )?;
+        }
+        Ok(())
+    }
+
+    /// Start the storage broker process.
+    pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
+        let broker = &self.env.broker;
+
+        print!("Starting neon broker at {}", broker.client_url());
+
+        let mut args = Vec::new();
+
+        if let Some(addr) = &broker.listen_addr {
+            args.push(format!("--listen-addr={addr}"));
+        }
+        if let Some(addr) = &broker.listen_https_addr {
+            args.push(format!("--listen-https-addr={addr}"));
+        }
+
+        let client = self.env.create_http_client();
+        background_process::start_process(
+            "storage_broker",
+            &self.env.storage_broker_data_dir(),
+            &self.env.storage_broker_bin(),
+            args,
+            [],
+            background_process::InitialPidFile::Create(self.pid_file_path()),
+            retry_timeout,
+            || async {
+                let url = broker.client_url();
+                let status_url = url.join("status").with_context(|| {
+                    format!("Failed to append /status path to broker endpoint {url}")
+                })?;
+                let request = client.get(status_url).build().with_context(|| {
+                    format!("Failed to construct request to broker endpoint {url}")
+                })?;
+                match client.execute(request).await {
+                    Ok(resp) => Ok(resp.status().is_success()),
+                    Err(_) => Ok(false),
+                }
+            },
+        )
+        .await
+        .context("Failed to spawn storage_broker subprocess")?;
+        Ok(())
+    }
+
+    /// Stop the storage broker process.
+    pub fn stop(&self) -> anyhow::Result<()> {
+        background_process::stop_process(true, "storage_broker", &self.pid_file_path())
+    }
+
+    /// Get the path to the PID file for the storage broker.
+    fn pid_file_path(&self) -> Utf8PathBuf {
+        Utf8PathBuf::from_path_buf(self.env.base_data_dir.join("storage_broker.pid"))
+            .expect("non-Unicode path")
+    }
 }
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -766,10 +766,6 @@ impl Endpoint {
            }
        };

-        // TODO(tristan957): Remove the write to spec.json after compatibility
-        // tests work themselves out
-        let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
        let config_path = self.endpoint_path().join("config.json");
        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;

@@ -779,16 +775,6 @@ impl Endpoint {
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;

-        // TODO(tristan957): Remove when compatibility tests are no longer an
-        // issue
-        let old_compute_ctl = {
-            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-            let help_output = cmd.arg("--help").output()?;
-            let help_output = String::from_utf8_lossy(&help_output.stdout);
-
-            !help_output.contains("--config")
-        };
-
        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{}'", conn_str);
@@ -807,19 +793,8 @@ impl Endpoint {
        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
-        // TODO(tristan957): Change this to --config when compatibility tests
-        // are no longer an issue
-        .args([
-            "--spec-path",
-            self.endpoint_path()
-                .join(if old_compute_ctl {
-                    "spec.json"
-                } else {
-                    "config.json"
-                })
-                .to_str()
-                .unwrap(),
-        ])
+        .arg("--config")
+        .arg(self.endpoint_path().join("config.json").as_os_str())
        .args([
            "--pgbin",
            self.env
--- a/control_plane/src/endpoint_storage.rs
+++ b/control_plane/src/endpoint_storage.rs
@@ -1,34 +1,33 @@
 use crate::background_process::{self, start_process, stop_process};
 use crate::local_env::LocalEnv;
-use anyhow::anyhow;
 use anyhow::{Context, Result};
 use camino::Utf8PathBuf;
 use std::io::Write;
 use std::time::Duration;

 /// Directory within .neon which will be used by default for LocalFs remote storage.
-pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
-pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
+pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage";
+pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993;

-pub struct ObjectStorage {
+pub struct EndpointStorage {
    pub bin: Utf8PathBuf,
    pub data_dir: Utf8PathBuf,
    pub pemfile: Utf8PathBuf,
    pub port: u16,
 }

-impl ObjectStorage {
-    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
-        ObjectStorage {
-            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
-            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
+impl EndpointStorage {
+    pub fn from_env(env: &LocalEnv) -> EndpointStorage {
+        EndpointStorage {
+            bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(),
+            data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(),
            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
-            port: env.object_storage.port,
+            port: env.endpoint_storage.port,
        }
    }

    fn config_path(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.json")
+        self.data_dir.join("endpoint_storage.json")
    }

    fn listen_addr(&self) -> Utf8PathBuf {
@@ -49,7 +48,7 @@ impl ObjectStorage {
        let cfg = Cfg {
            listen: self.listen_addr(),
            pemfile: parent.join(self.pemfile.clone()),
-            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
+            local_path: parent.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR),
            r#type: "LocalFs".to_string(),
        };
        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
@@ -59,24 +58,19 @@ impl ObjectStorage {
    }

    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
-        println!("Starting s3 proxy at {}", self.listen_addr());
+        println!("Starting endpoint_storage at {}", self.listen_addr());
        std::io::stdout().flush().context("flush stdout")?;

        let process_status_check = || async {
-            tokio::time::sleep(Duration::from_millis(500)).await;
-            let res = reqwest::Client::new()
-                .get(format!("http://{}/metrics", self.listen_addr()))
-                .send()
-                .await;
-            match res {
-                Ok(response) if response.status().is_success() => Ok(true),
-                Ok(_) => Err(anyhow!("Failed to query /metrics")),
-                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
+            let res = reqwest::Client::new().get(format!("http://{}/metrics", self.listen_addr()));
+            match res.send().await {
+                Ok(res) => Ok(res.status().is_success()),
+                Err(_) => Ok(false),
            }
        };

        let res = start_process(
-            "object_storage",
+            "endpoint_storage",
            &self.data_dir.clone().into_std_path_buf(),
            &self.bin.clone().into_std_path_buf(),
            vec![self.config_path().to_string()],
@@ -94,14 +88,14 @@ impl ObjectStorage {
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        stop_process(immediate, "object_storage", &self.pid_file())
+        stop_process(immediate, "endpoint_storage", &self.pid_file())
    }

    fn log_file(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.log")
+        self.data_dir.join("endpoint_storage.log")
    }

    fn pid_file(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.pid")
+        self.data_dir.join("endpoint_storage.pid")
    }
 }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,8 +9,8 @@
 mod background_process;
 pub mod broker;
 pub mod endpoint;
+pub mod endpoint_storage;
 pub mod local_env;
-pub mod object_storage;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -4,7 +4,7 @@
 //! script which will use local paths.

 use std::collections::HashMap;
-use std::net::{IpAddr, Ipv4Addr, SocketAddr};
+use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
 use std::time::Duration;
@@ -14,16 +14,17 @@ use anyhow::{Context, bail};
 use clap::ValueEnum;
 use pem::Pem;
 use postgres_backend::AuthType;
-use reqwest::Url;
+use reqwest::{Certificate, Url};
 use serde::{Deserialize, Serialize};
 use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};

-use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
+use crate::broker::StorageBroker;
+use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage};
 use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 //
 // This data structures represents neon_local CLI config
@@ -72,7 +73,7 @@ pub struct LocalEnv {

    pub safekeepers: Vec<SafekeeperConf>,

-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,

    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
@@ -110,7 +111,7 @@ pub struct OnDiskConfig {
    )]
    pub pageservers: Vec<PageServerConf>,
    pub safekeepers: Vec<SafekeeperConf>,
-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub control_plane_compute_hook_api: Option<Url>,
@@ -144,7 +145,7 @@ pub struct NeonLocalInitConf {
    pub storage_controller: Option<NeonStorageControllerConf>,
    pub pageservers: Vec<NeonLocalInitPageserverConf>,
    pub safekeepers: Vec<SafekeeperConf>,
-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub generate_local_ssl_certs: bool,
@@ -152,16 +153,21 @@ pub struct NeonLocalInitConf {

 #[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
-pub struct ObjectStorageConf {
+pub struct EndpointStorageConf {
    pub port: u16,
 }

 /// Broker config for cluster internal communication.
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
+#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug, Default)]
 #[serde(default)]
 pub struct NeonBroker {
-    /// Broker listen address for storage nodes coordination, e.g. '127.0.0.1:50051'.
-    pub listen_addr: SocketAddr,
+    /// Broker listen HTTP address for storage nodes coordination, e.g. '127.0.0.1:50051'.
+    /// At least one of listen_addr or listen_https_addr must be set.
+    pub listen_addr: Option<SocketAddr>,
+    /// Broker listen HTTPS address for storage nodes coordination, e.g. '127.0.0.1:50051'.
+    /// At least one of listen_addr or listen_https_addr must be set.
+    /// listen_https_addr is preferred over listen_addr in neon_local.
+    pub listen_https_addr: Option<SocketAddr>,
 }

 /// A part of storage controller's config the neon_local knows about.
@@ -235,18 +241,19 @@ impl Default for NeonStorageControllerConf {
    }
 }

-// Dummy Default impl to satisfy Deserialize derive.
-impl Default for NeonBroker {
-    fn default() -> Self {
-        NeonBroker {
-            listen_addr: SocketAddr::new(IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)), 0),
-        }
-    }
-}
-
 impl NeonBroker {
    pub fn client_url(&self) -> Url {
-        Url::parse(&format!("http://{}", self.listen_addr)).expect("failed to construct url")
+        let url = if let Some(addr) = self.listen_https_addr {
+            format!("https://{}", addr)
+        } else {
+            format!(
+                "http://{}",
+                self.listen_addr
+                    .expect("at least one address should be set")
+            )
+        };
+
+        Url::parse(&url).expect("failed to construct url")
    }
 }

@@ -413,8 +420,8 @@ impl LocalEnv {
        self.pg_dir(pg_version, "lib")
    }

-    pub fn object_storage_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("object_storage")
+    pub fn endpoint_storage_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("endpoint_storage")
    }

    pub fn pageserver_bin(&self) -> PathBuf {
@@ -441,6 +448,10 @@ impl LocalEnv {
        self.base_data_dir.join("endpoints")
    }

+    pub fn storage_broker_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("storage_broker")
+    }
+
    pub fn pageserver_data_dir(&self, pageserver_id: NodeId) -> PathBuf {
        self.base_data_dir
            .join(format!("pageserver_{pageserver_id}"))
@@ -450,8 +461,8 @@ impl LocalEnv {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

-    pub fn object_storage_data_dir(&self) -> PathBuf {
-        self.base_data_dir.join("object_storage")
+    pub fn endpoint_storage_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("endpoint_storage")
    }

    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
@@ -503,6 +514,23 @@ impl LocalEnv {
        )
    }

+    /// Creates HTTP client with local SSL CA certificates.
+    pub fn create_http_client(&self) -> reqwest::Client {
+        let ssl_ca_certs = self.ssl_ca_cert_path().map(|ssl_ca_file| {
+            let buf = std::fs::read(ssl_ca_file).expect("SSL CA file should exist");
+            Certificate::from_pem_bundle(&buf).expect("SSL CA file should be valid")
+        });
+
+        let mut http_client = reqwest::Client::builder();
+        for ssl_ca_cert in ssl_ca_certs.unwrap_or_default() {
+            http_client = http_client.add_root_certificate(ssl_ca_cert);
+        }
+
+        http_client
+            .build()
+            .expect("HTTP client should construct with no error")
+    }
+
    /// Inspect the base data directory and extract the instance id and instance directory path
    /// for all storage controller instances
    pub async fn storage_controller_instances(&self) -> std::io::Result<Vec<(u8, PathBuf)>> {
@@ -615,7 +643,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: _,
                branch_name_mappings,
                generate_local_ssl_certs,
-                object_storage,
+                endpoint_storage,
            } = on_disk_config;
            LocalEnv {
                base_data_dir: repopath.to_owned(),
@@ -632,7 +660,7 @@ impl LocalEnv {
                control_plane_hooks_api,
                branch_name_mappings,
                generate_local_ssl_certs,
-                object_storage,
+                endpoint_storage,
            }
        };

@@ -742,7 +770,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: None,
                branch_name_mappings: self.branch_name_mappings.clone(),
                generate_local_ssl_certs: self.generate_local_ssl_certs,
-                object_storage: self.object_storage.clone(),
+                endpoint_storage: self.endpoint_storage.clone(),
            },
        )
    }
@@ -849,7 +877,7 @@ impl LocalEnv {
            control_plane_api,
            generate_local_ssl_certs,
            control_plane_hooks_api,
-            object_storage,
+            endpoint_storage,
        } = conf;

        // Find postgres binaries.
@@ -901,7 +929,7 @@ impl LocalEnv {
            control_plane_hooks_api,
            branch_name_mappings: Default::default(),
            generate_local_ssl_certs,
-            object_storage,
+            endpoint_storage,
        };

        if generate_local_ssl_certs {
@@ -911,6 +939,12 @@ impl LocalEnv {
        // create endpoints dir
        fs::create_dir_all(env.endpoints_path())?;

+        // create storage broker dir
+        fs::create_dir_all(env.storage_broker_data_dir())?;
+        StorageBroker::from_env(&env)
+            .initialize()
+            .context("storage broker init failed")?;
+
        // create safekeeper dirs
        for safekeeper in &env.safekeepers {
            fs::create_dir_all(SafekeeperNode::datadir_path_by_id(&env, safekeeper.id))?;
@@ -929,13 +963,13 @@ impl LocalEnv {
                .context("pageserver init failed")?;
        }

-        ObjectStorage::from_env(&env)
+        EndpointStorage::from_env(&env)
            .init()
            .context("object storage init failed")?;

-        // setup remote remote location for default LocalFs remote storage
+        // setup remote location for default LocalFs remote storage
        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
-        std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
+        std::fs::create_dir_all(env.base_data_dir.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR))?;

        env.persist_config()
    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -21,7 +21,6 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
 use postgres_connection::{PgConnectionConfig, parse_host_port};
-use reqwest::Certificate;
 use utils::auth::{Claims, Scope};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -51,19 +50,6 @@ impl PageServerNode {
            parse_host_port(&conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
        let port = port.unwrap_or(5432);

-        let ssl_ca_certs = env.ssl_ca_cert_path().map(|ssl_ca_file| {
-            let buf = std::fs::read(ssl_ca_file).expect("SSL root CA file should exist");
-            Certificate::from_pem_bundle(&buf).expect("SSL CA file should be valid")
-        });
-
-        let mut http_client = reqwest::Client::builder();
-        for ssl_ca_cert in ssl_ca_certs.unwrap_or_default() {
-            http_client = http_client.add_root_certificate(ssl_ca_cert);
-        }
-        let http_client = http_client
-            .build()
-            .expect("Client constructs with no errors");
-
        let endpoint = if env.storage_controller.use_https_pageserver_api {
            format!(
                "https://{}",
@@ -80,7 +66,7 @@ impl PageServerNode {
            conf: conf.clone(),
            env: env.clone(),
            http_client: mgmt_api::Client::new(
-                http_client,
+                env.create_http_client(),
                endpoint,
                {
                    match conf.http_auth_type {
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -87,7 +87,7 @@ impl SafekeeperNode {
            conf: conf.clone(),
            pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
            env: env.clone(),
-            http_client: reqwest::Client::new(),
+            http_client: env.create_http_client(),
            http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
            listen_addr,
        }
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -20,7 +20,7 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use pem::Pem;
 use postgres_backend::AuthType;
-use reqwest::{Certificate, Method};
+use reqwest::Method;
 use serde::de::DeserializeOwned;
 use serde::{Deserialize, Serialize};
 use tokio::process::Command;
@@ -153,24 +153,11 @@ impl StorageController {
            }
        };

-        let ssl_ca_certs = env.ssl_ca_cert_path().map(|ssl_ca_file| {
-            let buf = std::fs::read(ssl_ca_file).expect("SSL CA file should exist");
-            Certificate::from_pem_bundle(&buf).expect("SSL CA file should be valid")
-        });
-
-        let mut http_client = reqwest::Client::builder();
-        for ssl_ca_cert in ssl_ca_certs.unwrap_or_default() {
-            http_client = http_client.add_root_certificate(ssl_ca_cert);
-        }
-        let http_client = http_client
-            .build()
-            .expect("HTTP client should construct with no error");
-
        Self {
            env: env.clone(),
            private_key,
            public_key,
-            client: http_client,
+            client: env.create_http_client(),
            config: env.storage_controller.clone(),
            listen_port: OnceLock::default(),
        }
--- a/deny.toml
+++ b/deny.toml
@@ -45,9 +45,7 @@ allow = [
    "ISC",
    "MIT",
    "MPL-2.0",
-    "OpenSSL",
    "Unicode-3.0",
-    "Zlib",
 ]
 confidence-threshold = 0.8
 exceptions = [
@@ -56,14 +54,6 @@ exceptions = [
    { allow = ["Zlib"], name = "const_format", version = "*" },
 ]

-[[licenses.clarify]]
-name = "ring"
-version = "*"
-expression = "MIT AND ISC AND OpenSSL"
-license-files = [
-    { path = "LICENSE", hash = 0xbd0eed23 }
-]
-
 [licenses.private]
 ignore = true
 registries = []
@@ -116,7 +106,11 @@ name = "openssl"
 unknown-registry = "warn"
 unknown-git = "warn"
 allow-registry = ["https://github.com/rust-lang/crates.io-index"]
-allow-git = []
+allow-git = [
+    # Crate pinned to commit in origin repo due to opentelemetry version.
+    # TODO: Remove this once crate is fetched from crates.io again.
+    "https://github.com/mattiapenati/tower-otel",
+]

 [sources.allow-org]
 github = [
--- a/docker-compose/README.md
+++ b/docker-compose/README.md
@@ -1,4 +1,3 @@
-
 # Example docker compose configuration

 The configuration in this directory is used for testing Neon docker images: it is
@@ -8,3 +7,13 @@ you can experiment with a miniature Neon system, use `cargo neon` rather than co
 This configuration does not start the storage controller, because the controller
 needs a way to reconfigure running computes, and no such thing exists in this setup.

+## Generating the JWKS for a compute
+
+```shell
+openssl genpkey -algorithm Ed25519 -out private-key.pem
+openssl pkey -in private-key.pem -pubout -out public-key.pem
+openssl pkey -pubin -inform pem -in public-key.pem -pubout -outform der -out public-key.der
+key="$(xxd -plain -cols 32 -s -32 public-key.der)"
+key_id="$(printf '%s' "$key" | sha256sum | awk '{ print $1 }' | basenc --base64url --wrap=0)"
+x="$(printf '%s' "$key" | basenc --base64url --wrap=0)"
+```
--- a/docker-compose/compute_wrapper/private-key.pem
+++ b/docker-compose/compute_wrapper/private-key.pem
@@ -0,0 +1,3 @@
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEIOmnRbzt2AJ0d+S3aU1hiYOl/tXpvz1FmWBfwHYBgOma
+-----END PRIVATE KEY-----
--- a/docker-compose/compute_wrapper/public-key.der
+++ b/docker-compose/compute_wrapper/public-key.der
--- a/docker-compose/compute_wrapper/public-key.pem
+++ b/docker-compose/compute_wrapper/public-key.pem
@@ -0,0 +1,3 @@
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEADY0al/U0bgB3+9fUGk+3PKWnsck9OyxN5DjHIN6Xep0=
+-----END PUBLIC KEY-----
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -81,19 +81,9 @@ sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}

 cat ${CONFIG_FILE}

-# TODO(tristan957): Remove these workarounds for backwards compatibility after
-# the next compute release. That includes these next few lines and the
-# --spec-path in the compute_ctl invocation.
-if compute_ctl --help | grep --quiet -- '--config'; then
-  SPEC_PATH="$CONFIG_FILE"
-else
-  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
-  SPEC_PATH=/tmp/spec.json
-fi
-
 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-$RANDOM"                          \
-     --spec-path "$SPEC_PATH"
+     --config "$CONFIG_FILE"
--- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -142,7 +142,19 @@
    },
    "compute_ctl_config": {
        "jwks": {
-            "keys": []
+            "keys": [
+                {
+                    "use": "sig",
+                    "key_ops": [
+                        "verify"
+                    ],
+                    "alg": "EdDSA",
+                    "kid": "ZGIxMzAzOGY0YWQwODk2ODU1MTk1NzMxMDFkYmUyOWU2NzZkOWNjNjMyMGRkZGJjOWY0MjdjYWVmNzE1MjUyOAo=",
+                    "kty": "OKP",
+                    "crv": "Ed25519",
+                    "x": "MGQ4ZDFhOTdmNTM0NmUwMDc3ZmJkN2Q0MWE0ZmI3M2NhNWE3YjFjOTNkM2IyYzRkZTQzOGM3MjBkZTk3N2E5ZAo="
+                }
+            ]
        }
    }
 }
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -9,21 +9,20 @@
 # to verify custom image builds (e.g pre-published ones).
 #
 # A test script for postgres extensions
-# Currently supports only v16
+# Currently supports only v16+
 #
 set -eux -o pipefail

-COMPOSE_FILE='docker-compose.yml'
-cd $(dirname $0)
-COMPUTE_CONTAINER_NAME=docker-compose-compute-1
-TEST_CONTAINER_NAME=docker-compose-neon-test-extensions-1
+export COMPOSE_FILE='docker-compose.yml'
+export COMPOSE_PROFILES=test-extensions
+cd "$(dirname "${0}")"
 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"

-cleanup() {
+function cleanup() {
    echo "show container information"
    docker ps
    echo "stop containers..."
-    docker compose --profile test-extensions -f $COMPOSE_FILE down
+    docker compose down
 }

 for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
@@ -31,55 +30,55 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
    echo "clean up containers if exists"
    cleanup
    PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
-    PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --quiet-pull --build -d
+    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d

    echo "wait until the compute is ready. timeout after 60s. "
    cnt=0
    while sleep 3; do
        # check timeout
-        cnt=`expr $cnt + 3`
-        if [ $cnt -gt 60 ]; then
+        (( cnt += 3 ))
+        if [[ ${cnt} -gt 60 ]]; then
            echo "timeout before the compute is ready."
            exit 1
        fi
-        if docker compose --profile test-extensions -f $COMPOSE_FILE logs "compute_is_ready" | grep -q "accepting connections"; then
+        if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then
            echo "OK. The compute is ready to connect."
            echo "execute simple queries."
-            docker exec $COMPUTE_CONTAINER_NAME /bin/bash -c "psql $PSQL_OPTION"
+            docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
            break
        fi
    done

-    if [ $pg_version -ge 16 ]; then
+    if [[ ${pg_version} -ge 16 ]]; then
        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
        echo Adding dummy config
-        docker exec $COMPUTE_CONTAINER_NAME touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+        docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
        # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
        TMPDIR=$(mktemp -d)
-        docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/ext-src/pg_hint_plan-src/
-        rm -rf $TMPDIR
+        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
+        docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/
+        rm -rf "${TMPDIR}"
        # The following block does the same for the contrib/file_fdw test
        TMPDIR=$(mktemp -d)
-        docker cp $TEST_CONTAINER_NAME:/postgres/contrib/file_fdw/data $TMPDIR/data
-        docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/postgres/contrib/file_fdw/data
-        rm -rf $TMPDIR
+        docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data"
+        docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data
+        rm -rf "${TMPDIR}"
        # Apply patches
-        cat ../compute/patches/contrib_pg${pg_version}.patch | docker exec -i $TEST_CONTAINER_NAME bash -c "(cd /postgres && patch -p1)"
+        docker compose exec -i neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
        # We are running tests now
        rm -f testout.txt testout_contrib.txt
-        docker exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
-        $TEST_CONTAINER_NAME /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
-        docker exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
-        $TEST_CONTAINER_NAME /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
-        if [ $EXT_SUCCESS -eq 0 ] || [ $CONTRIB_SUCCESS -eq 0 ]; then
+        docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,postgis-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
+        neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
+        docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
+        neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
+        if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
            CONTRIB_FAILED=
            FAILED=
-            [ $EXT_SUCCESS -eq 0 ] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
-            [ $CONTRIB_SUCCESS -eq 0 ] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=0;i<=NF;i++){print "/postgres/contrib/"$i;}}')
-            for d in $FAILED $CONTRIB_FAILED; do
-                docker exec $TEST_CONTAINER_NAME bash -c 'for file in $(find '"$d"' -name regression.diffs -o -name regression.out); do cat $file; done' || [ $? -eq 1 ]
+            [[ ${EXT_SUCCESS} -eq 0 ]] && FAILED=$(tail -1 testout.txt | awk '{for(i=1;i<=NF;i++){print "/ext-src/"$i;}}')
+            [[ ${CONTRIB_SUCCESS} -eq 0 ]] && CONTRIB_FAILED=$(tail -1 testout_contrib.txt | awk '{for(i=1;i<=NF;i++){print "/postgres/contrib/"$i;}}')  # awk fields are 1-based; index 0 is the whole record
+            for d in ${FAILED} ${CONTRIB_FAILED}; do
+                docker compose exec neon-test-extensions bash -c 'for file in $(find '"${d}"' -name regression.diffs -o -name regression.out); do cat ${file}; done' || [[ ${?} -eq 1 ]]
            done
        exit 1
        fi
--- a/endpoint_storage/Cargo.toml
+++ b/endpoint_storage/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "object_storage"
+name = "endpoint_storage"
 version = "0.0.1"
 edition.workspace = true
 license.workspace = true
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -2,7 +2,7 @@ use anyhow::anyhow;
 use axum::body::{Body, Bytes};
 use axum::response::{IntoResponse, Response};
 use axum::{Router, http::StatusCode};
-use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
+use endpoint_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
 use remote_storage::TimeoutOrCancel;
 use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
 use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
@@ -46,12 +46,12 @@ async fn metrics() -> Result {

 async fn get(S3Path { path }: S3Path, state: State) -> Result {
    info!(%path, "downloading");
-    let download_err = |e| {
-        if let DownloadError::NotFound = e {
-            info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
+    let download_err = |err| {
+        if let DownloadError::NotFound = err {
+            info!(%path, %err, "downloading"); // 404 is not an issue of _this_ service
            return not_found(&path);
        }
-        internal_error(e, &path, "downloading")
+        internal_error(err, &path, "downloading")
    };
    let cancel = state.cancel.clone();
    let opts = &DownloadOpts::default();
@@ -249,7 +249,7 @@ mod tests {
        };

        let proxy = Storage {
-            auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
+            auth: endpoint_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
            storage,
            cancel: cancel.clone(),
            max_upload_file_limit: usize::MAX,
@@ -343,14 +343,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
    fn token() -> String {
-        let claims = object_storage::Claims {
+        let claims = endpoint_storage::Claims {
            tenant_id: TENANT_ID,
            timeline_id: TIMELINE_ID,
            endpoint_id: ENDPOINT_ID.into(),
            exp: u64::MAX,
        };
        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
-        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
        jsonwebtoken::encode(&header, &claims, &key).unwrap()
    }

@@ -364,7 +364,10 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
            vec![ENDPOINT_ID, "ep-ololo"]
        )
-        .skip(1);
+        // The first combination is a fully valid path; the second is valid for GET, as
+        // read paths may have a different endpoint if tenant and timeline match
+        // (needed for prewarming an RO->RW replica).
+        .skip(2);

        for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
            info!(%uri, %method, %tenant, %timeline, %endpoint);
@@ -475,6 +478,16 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        requests_chain(chain.into_iter(), |_| token()).await;
    }

+    #[testlog(tokio::test)]
+    async fn read_other_endpoint_data() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/other_endpoint/key");
+        let chain = vec![
+            (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
+            (uri.clone(), "PUT", "", StatusCode::UNAUTHORIZED, false),
+        ];
+        requests_chain(chain.into_iter(), |_| token()).await;
+    }
+
    fn delete_prefix_token(uri: &str) -> String {
        use serde::Serialize;
        let parts = uri.split("/").collect::<Vec<&str>>();
@@ -482,7 +495,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        struct PrefixClaims {
            tenant_id: TenantId,
            timeline_id: Option<TimelineId>,
-            endpoint_id: Option<object_storage::EndpointId>,
+            endpoint_id: Option<endpoint_storage::EndpointId>,
            exp: u64,
        }
        let claims = PrefixClaims {
@@ -492,7 +505,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            exp: u64::MAX,
        };
        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
-        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
        jsonwebtoken::encode(&header, &claims, &key).unwrap()
    }

--- a/endpoint_storage/src/lib.rs
+++ b/endpoint_storage/src/lib.rs
@@ -169,10 +169,19 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
            .auth
            .decode(bearer.token())
            .map_err(|e| bad_request(e, "decoding token"))?;
+
+        // Read paths may have different endpoint ids. For readonly -> readwrite replica
+        // prewarming, endpoint must read other endpoint's data.
+        let endpoint_id = if parts.method == axum::http::Method::GET {
+            claims.endpoint_id.clone()
+        } else {
+            path.endpoint_id.clone()
+        };
+
        let route = Claims {
            tenant_id: path.tenant_id,
            timeline_id: path.timeline_id,
-            endpoint_id: path.endpoint_id.clone(),
+            endpoint_id,
            exp: claims.exp,
        };
        if route != claims {
--- a/endpoint_storage/src/main.rs
+++ b/endpoint_storage/src/main.rs
@@ -1,4 +1,4 @@
-//! `object_storage` is a service which provides API for uploading and downloading
+//! `endpoint_storage` is a service which provides API for uploading and downloading
 //! files. It is used by compute and control plane for accessing LFC prewarm data.
 //! This service is deployed either as a separate component or as part of compute image
 //! for large computes.
@@ -33,7 +33,7 @@ async fn main() -> anyhow::Result<()> {

    let config: String = std::env::args().skip(1).take(1).collect();
    if config.is_empty() {
-        anyhow::bail!("Usage: object_storage config.json")
+        anyhow::bail!("Usage: endpoint_storage config.json")
    }
    info!("Reading config from {config}");
    let config = std::fs::read_to_string(config.clone())?;
@@ -41,7 +41,7 @@ async fn main() -> anyhow::Result<()> {
    info!("Reading pemfile from {}", config.pemfile.clone());
    let pemfile = std::fs::read(config.pemfile.clone())?;
    info!("Loading public key from {}", config.pemfile.clone());
-    let auth = object_storage::JwtAuth::new(&pemfile)?;
+    let auth = endpoint_storage::JwtAuth::new(&pemfile)?;

    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
    info!("listening on {}", listener.local_addr().unwrap());
@@ -50,7 +50,7 @@ async fn main() -> anyhow::Result<()> {
    let cancel = tokio_util::sync::CancellationToken::new();
    app::check_storage_permissions(&storage, cancel.clone()).await?;

-    let proxy = std::sync::Arc::new(object_storage::Storage {
+    let proxy = std::sync::Arc::new(endpoint_storage::Storage {
        auth,
        storage,
        cancel: cancel.clone(),
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -242,13 +242,22 @@ impl RemoteExtSpec {

        match self.extension_data.get(real_ext_name) {
            Some(_ext_data) => {
+                // We have decided to use the Go naming convention due to Kubernetes.
+
+                let arch = match std::env::consts::ARCH {
+                    "x86_64" => "amd64",
+                    "aarch64" => "arm64",
+                    arch => arch,
+                };
+
                // Construct the path to the extension archive
-                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
+                // BUILD_TAG/ARCH/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
                //
                // Keep it in sync with path generation in
                // https://github.com/neondatabase/build-custom-extensions/tree/main
-                let archive_path_str =
-                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+                let archive_path_str = format!(
+                    "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
+                );
                Ok((
                    real_ext_name.to_string(),
                    RemotePath::from_string(&archive_path_str)?,
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -76,7 +76,14 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    mfs
 }

-
+static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "libmetrics_disk_io_bytes_total",
+        "Bytes written and read from disk, grouped by the operation (read|write)",
+        &["io_operation"]
+    )
+    .expect("Failed to register disk i/o bytes int gauge vec")
+});

 static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
@@ -254,7 +261,12 @@ const BYTES_IN_BLOCK: i64 = 512;
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

-    
+    DISK_IO_BYTES
+        .with_label_values(&["read"])
+        .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
+    DISK_IO_BYTES
+        .with_label_values(&["write"])
+        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);

    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
    #[cfg(target_os = "macos")]
@@ -345,7 +357,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }

-    
+    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }

 impl<P: Atomic> GenericCounterPair<P> {
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -35,6 +35,7 @@ nix = {workspace = true, optional = true}
 reqwest.workspace = true
 rand.workspace = true
 tracing-utils.workspace = true
+once_cell.workspace = true

 [dev-dependencies]
 bincode.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -181,6 +181,7 @@ pub struct ConfigToml {
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
    pub enable_tls_page_service_api: bool,
+    pub dev_mode: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -657,6 +658,7 @@ impl Default for ConfigToml {
            generate_unarchival_heatmap: None,
            tracing: None,
            enable_tls_page_service_api: false,
+            dev_mode: false,
        }
    }
 }
@@ -682,10 +684,10 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;

    // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
-    // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
-    // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
-    // with this config, we can get a maximum peak compaction usage of 9 GB.
-    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
+    // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
+    // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
+    // compaction usage of 15360MB.
+    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
    // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
    // read amp.
    pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
@@ -702,8 +704,11 @@ pub mod tenant_conf_defaults {
    // Relevant: https://github.com/neondatabase/neon/issues/3394
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
-    // layer creation will end immediately. Set to 0 to disable.
+    // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
+    // without looking at the exact number of L0 layers.
+    // It was expected to have the following behavior:
+    // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // > layer creation will end immediately. Set to 0 to disable.
    pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -169,6 +169,8 @@ pub struct TenantDescribeResponseShard {
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
+    /// A timeline is being imported into this tenant
+    pub is_importing: bool,

    pub scheduling_policy: ShardSchedulingPolicy,

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -320,6 +320,35 @@ pub struct TimelineCreateRequest {
    pub mode: TimelineCreateRequestMode,
 }

+impl TimelineCreateRequest {
+    pub fn mode_tag(&self) -> &'static str {
+        match &self.mode {
+            TimelineCreateRequestMode::Branch { .. } => "branch",
+            TimelineCreateRequestMode::ImportPgdata { .. } => "import",
+            TimelineCreateRequestMode::Bootstrap { .. } => "bootstrap",
+        }
+    }
+
+    pub fn is_import(&self) -> bool {
+        matches!(self.mode, TimelineCreateRequestMode::ImportPgdata { .. })
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum ShardImportStatus {
+    InProgress,
+    Done,
+    Error(String),
+}
+impl ShardImportStatus {
+    pub fn is_terminal(&self) -> bool {
+        match self {
+            ShardImportStatus::InProgress => false,
+            ShardImportStatus::Done | ShardImportStatus::Error(_) => true,
+        }
+    }
+}
+
 /// Storage controller specific extensions to [`TimelineInfo`].
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateResponseStorcon {
@@ -1774,6 +1803,8 @@ pub struct TopTenantShardsResponse {
 }

 pub mod virtual_file {
+    use std::sync::LazyLock;
+
    #[derive(
        Copy,
        Clone,
@@ -1811,14 +1842,38 @@ pub mod virtual_file {
    pub enum IoMode {
        /// Uses buffered IO.
        Buffered,
-        /// Uses direct IO, error out if the operation fails.
+        /// Uses direct IO for reads only.
        #[cfg(target_os = "linux")]
        Direct,
+        /// Uses direct IO for reads and writes.
+        #[cfg(target_os = "linux")]
+        DirectRw,
    }

    impl IoMode {
-        pub const fn preferred() -> Self {
-            Self::Buffered
+        pub fn preferred() -> Self {
+            // The default behavior when running Rust unit tests without any further
+            // flags is to use the newest behavior (DirectRw on Linux, Buffered elsewhere).
+            // The CI uses the following environment variable to run the unit tests in
+            // all the different modes.
+            // NB: the Python regression & perf tests have their own defaults management
+            // that writes pageserver.toml; they do not use this variable.
+            if cfg!(test) {
+                static CACHED: LazyLock<IoMode> = LazyLock::new(|| {
+                    utils::env::var_serde_json_string(
+                        "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
+                    )
+                    .unwrap_or(
+                        #[cfg(target_os = "linux")]
+                        IoMode::DirectRw,
+                        #[cfg(not(target_os = "linux"))]
+                        IoMode::Buffered,
+                    )
+                });
+                *CACHED
+            } else {
+                IoMode::Buffered
+            }
        }
    }

@@ -1830,6 +1885,8 @@ pub mod virtual_file {
                v if v == (IoMode::Buffered as u8) => IoMode::Buffered,
                #[cfg(target_os = "linux")]
                v if v == (IoMode::Direct as u8) => IoMode::Direct,
+                #[cfg(target_os = "linux")]
+                v if v == (IoMode::DirectRw as u8) => IoMode::DirectRw,
                x => return Err(x),
            })
        }
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -4,10 +4,10 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TimelineId};

 use crate::controller_api::NodeRegisterRequest;
-use crate::models::LocationConfigMode;
+use crate::models::{LocationConfigMode, ShardImportStatus};
 use crate::shard::TenantShardId;

 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
@@ -62,3 +62,10 @@ pub struct ValidateResponseTenant {
    pub id: TenantShardId,
    pub valid: bool,
 }
+
+#[derive(Serialize, Deserialize)]
+pub struct PutTimelineImportStatusRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub status: ShardImportStatus,
+}
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -14,8 +14,9 @@ use anyhow::{Context, Result};
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
 use azure_storage::StorageCredentials;
-use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::blob::operations::GetBlobBuilder;
+use azure_storage_blobs::blob::{Blob, CopyStatus};
+use azure_storage_blobs::container::operations::ListBlobsBuilder;
 use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
 use bytes::Bytes;
 use futures::FutureExt;
@@ -23,6 +24,7 @@ use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::{StreamExt, TryStreamExt};
 use http_types::{StatusCode, Url};
+use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
@@ -31,7 +33,7 @@ use utils::backoff::exponential_backoff_duration_seconds;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use crate::config::AzureConfig;
 use crate::error::Cancelled;
-use crate::metrics::RequestKind;
+use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
    ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
@@ -164,7 +166,7 @@ impl AzureBlobStorage {
        let mut last_modified = None;
        let mut metadata = HashMap::new();

-      
+        let started_at = start_measuring_requests(kind);

        let download = async {
            let response = builder
@@ -236,58 +238,31 @@ impl AzureBlobStorage {
                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
            },
        };
-        
-      
+        let started_at = ScopeGuard::into_inner(started_at);
+        let outcome = match &download {
+            Ok(_) => AttemptOutcome::Ok,
+            // At this level in the stack 404 and 304 responses do not indicate an error.
+            // There are expected cases where a blob may not exist or hasn't been modified since
+            // the last get (e.g. probing for timeline indices and heatmap downloads).
+            // Callers should handle errors if they are unexpected.
+            Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
        download
    }

-    async fn permit(
-        &self,
-        kind: RequestKind,
-        cancel: &CancellationToken,
-    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
-        let acquire = self.concurrency_limiter.acquire(kind);
-
-        tokio::select! {
-            permit = acquire => Ok(permit.expect("never closed")),
-            _ = cancel.cancelled() => Err(Cancelled),
-        }
-    }
-
-    pub fn container_name(&self) -> &str {
-        &self.container_name
-    }
-}
-
-fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
-    let mut res = Metadata::new();
-    for (k, v) in metadata.0.into_iter() {
-        res.insert(k, v);
-    }
-    res
-}
-
-fn to_download_error(error: azure_core::Error) -> DownloadError {
-    if let Some(http_err) = error.as_http_error() {
-        match http_err.status() {
-            StatusCode::NotFound => DownloadError::NotFound,
-            StatusCode::NotModified => DownloadError::Unmodified,
-            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
-            _ => DownloadError::Other(anyhow::Error::new(error)),
-        }
-    } else {
-        DownloadError::Other(error.into())
-    }
-}
-
-impl RemoteStorage for AzureBlobStorage {
-    fn list_streaming(
+    fn list_streaming_for_fn<T: Default + ListingCollector>(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        request_kind: RequestKind,
+        customize_builder: impl Fn(ListBlobsBuilder) -> ListBlobsBuilder,
+    ) -> impl Stream<Item = Result<T, DownloadError>> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| {
            self.prefix_in_container.clone().map(|mut s| {
@@ -299,7 +274,7 @@ impl RemoteStorage for AzureBlobStorage {
        });

        async_stream::stream! {
-            let _permit = self.permit(RequestKind::List, cancel).await?;
+            let _permit = self.permit(request_kind, cancel).await?;

            let mut builder = self.client.list_blobs();

@@ -315,6 +290,8 @@ impl RemoteStorage for AzureBlobStorage {
                builder = builder.max_results(MaxResults::new(limit));
            }

+            builder = customize_builder(builder);
+
            let mut next_marker = None;

            let mut timeout_try_cnt = 1;
@@ -370,26 +347,20 @@ impl RemoteStorage for AzureBlobStorage {
                    break;
                };

-                let mut res = Listing::default();
+                let mut res = T::default();
                next_marker = entry.continuation();
                let prefix_iter = entry
                    .blobs
                    .prefixes()
                    .map(|prefix| self.name_to_relative_path(&prefix.name));
-                res.prefixes.extend(prefix_iter);
+                res.add_prefixes(self, prefix_iter);

                let blob_iter = entry
                    .blobs
-                    .blobs()
-                    .map(|k| ListingObject{
-                        key: self.name_to_relative_path(&k.name),
-                        last_modified: k.properties.last_modified.into(),
-                        size: k.properties.content_length,
-                    }
-                );
+                    .blobs();

                for key in blob_iter {
-                    res.keys.push(key);
+                    res.add_blob(self, key);

                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
@@ -411,6 +382,128 @@ impl RemoteStorage for AzureBlobStorage {
        }
    }

+    async fn permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
+        let acquire = self.concurrency_limiter.acquire(kind);
+
+        tokio::select! {
+            permit = acquire => Ok(permit.expect("never closed")),
+            _ = cancel.cancelled() => Err(Cancelled),
+        }
+    }
+
+    pub fn container_name(&self) -> &str {
+        &self.container_name
+    }
+}
+
+trait ListingCollector {
+    fn add_prefixes(&mut self, abs: &AzureBlobStorage, prefix_it: impl Iterator<Item = RemotePath>);
+    fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob);
+}
+
+impl ListingCollector for Listing {
+    fn add_prefixes(
+        &mut self,
+        _abs: &AzureBlobStorage,
+        prefix_it: impl Iterator<Item = RemotePath>,
+    ) {
+        self.prefixes.extend(prefix_it);
+    }
+    fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob) {
+        self.keys.push(ListingObject {
+            key: abs.name_to_relative_path(&blob.name),
+            last_modified: blob.properties.last_modified.into(),
+            size: blob.properties.content_length,
+        });
+    }
+}
+
+impl ListingCollector for crate::VersionListing {
+    fn add_prefixes(
+        &mut self,
+        _abs: &AzureBlobStorage,
+        _prefix_it: impl Iterator<Item = RemotePath>,
+    ) {
+        // nothing
+    }
+    fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob) {
+        let id = crate::VersionId(blob.version_id.clone().expect("didn't find version ID"));
+        self.versions.push(crate::Version {
+            key: abs.name_to_relative_path(&blob.name),
+            last_modified: blob.properties.last_modified.into(),
+            kind: crate::VersionKind::Version(id),
+        });
+    }
+}
+
+fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
+    let mut res = Metadata::new();
+    for (k, v) in metadata.0.into_iter() {
+        res.insert(k, v);
+    }
+    res
+}
+
+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    if let Some(http_err) = error.as_http_error() {
+        match http_err.status() {
+            StatusCode::NotFound => DownloadError::NotFound,
+            StatusCode::NotModified => DownloadError::Unmodified,
+            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
+            _ => DownloadError::Other(anyhow::Error::new(error)),
+        }
+    } else {
+        DownloadError::Other(error.into())
+    }
+}
+
+impl RemoteStorage for AzureBlobStorage {
+    fn list_streaming(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        // Plain listings are accounted as `List`; `ListVersions` is reserved for `list_versions`.
+        let kind = RequestKind::List;
+        self.list_streaming_for_fn(prefix, mode, max_keys, cancel, kind, |builder| builder)
+    }
+
+    /// Lists blob versions under `prefix`, draining every page of the listing stream
+    /// into a single [`crate::VersionListing`]. Panics if the stream yields no item.
+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> std::result::Result<crate::VersionListing, DownloadError> {
+        // Same listing request as `list_streaming`, but with versions included.
+        let customize_builder = |builder: ListBlobsBuilder| builder.include_versions(true);
+        let kind = RequestKind::ListVersions;
+
+        let mut stream = std::pin::pin!(self.list_streaming_for_fn(
+            prefix,
+            mode,
+            max_keys,
+            cancel,
+            kind,
+            customize_builder
+        ));
+        let mut combined: crate::VersionListing =
+            stream.next().await.expect("At least one item required")?;
+        while let Some(list) = stream.next().await {
+            let list = list?;
+            combined.versions.extend(list.versions);
+        }
+        Ok(combined)
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
@@ -419,7 +512,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
        let properties_future = blob_client.get_properties().into_future();
@@ -431,9 +524,12 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(_inner) = &res {
-            
-         
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
        }

        let data = match res {
@@ -461,7 +557,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -494,7 +590,14 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-      
+        let outcome = match res {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);

        res
    }
@@ -510,7 +613,12 @@ impl RemoteStorage for AzureBlobStorage {
        let mut builder = blob_client.get();

        if let Some(ref etag) = opts.etag {
-            builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()))
+            builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()));
+        }
+
+        if let Some(ref version_id) = opts.version_id {
+            let version_id = azure_storage_blobs::prelude::VersionId::new(version_id.0.clone());
+            builder = builder.blob_versioning(version_id);
        }

        if let Some((start, end)) = opts.byte_range() {
@@ -540,7 +648,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let _permit = self.permit(kind, cancel).await?;
-    
+        let started_at = start_measuring_requests(kind);

        let op = async {
            // TODO batch requests are not supported by the SDK
@@ -606,8 +714,10 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
        res
    }

@@ -623,7 +733,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
        let _permit = self.permit(kind, cancel).await?;
-  
+        let started_at = start_measuring_requests(kind);

        let timeout = tokio::time::sleep(self.timeout);

@@ -677,8 +787,10 @@ impl RemoteStorage for AzureBlobStorage {
            },
        };

-        
-      
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
        res
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -176,6 +176,32 @@ pub struct Listing {
    pub keys: Vec<ListingObject>,
 }

+#[derive(Default)]
+pub struct VersionListing {
+    pub versions: Vec<Version>,
+}
+
+pub struct Version {
+    pub key: RemotePath,
+    pub last_modified: SystemTime,
+    pub kind: VersionKind,
+}
+
+impl Version {
+    pub fn version_id(&self) -> Option<&VersionId> {
+        match &self.kind {
+            VersionKind::Version(id) => Some(id),
+            VersionKind::DeletionMarker => None,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum VersionKind {
+    DeletionMarker,
+    Version(VersionId),
+}
+
 /// Options for downloads. The default value is a plain GET.
 pub struct DownloadOpts {
    /// If given, returns [`DownloadError::Unmodified`] if the object still has
@@ -186,6 +212,8 @@ pub struct DownloadOpts {
    /// The end of the byte range to download, or unbounded. Must be after the
    /// start bound.
    pub byte_end: Bound<u64>,
+    /// Optionally request a specific version of a key
+    pub version_id: Option<VersionId>,
    /// Indicate whether we're downloading something small or large: this indirectly controls
    /// timeouts: for something like an index/manifest/heatmap, we should time out faster than
    /// for layer files
@@ -197,12 +225,16 @@ pub enum DownloadKind {
    Small,
 }

+#[derive(Debug, Clone)]
+pub struct VersionId(pub String);
+
 impl Default for DownloadOpts {
    fn default() -> Self {
        Self {
            etag: Default::default(),
            byte_start: Bound::Unbounded,
            byte_end: Bound::Unbounded,
+            version_id: None,
            kind: DownloadKind::Large,
        }
    }
@@ -295,6 +327,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
        Ok(combined)
    }

+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<VersionListing, DownloadError>;
+
    /// Obtain metadata information about an object.
    async fn head_object(
        &self,
@@ -475,6 +515,22 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    // See [`RemoteStorage::list_versions`].
+    pub async fn list_versions<'a>(
+        &'a self,
+        prefix: Option<&'a RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &'a CancellationToken,
+    ) -> Result<VersionListing, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+        }
+    }
+
    // See [`RemoteStorage::head_object`].
    pub async fn head_object(
        &self,
@@ -727,6 +783,7 @@ impl ConcurrencyLimiter {
            RequestKind::Copy => &self.write,
            RequestKind::TimeTravel => &self.write,
            RequestKind::Head => &self.read,
+            RequestKind::ListVersions => &self.read,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,6 +445,16 @@ impl RemoteStorage for LocalFs {
        }
    }

+    async fn list_versions(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _mode: ListingMode,
+        _max_keys: Option<NonZeroU32>,
+        _cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        unimplemented!()
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -1,7 +1,9 @@
+use metrics::{
+    Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec,
+};
+use once_cell::sync::Lazy;

-
-
-
+pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
 pub(crate) enum RequestKind {
@@ -12,11 +14,68 @@ pub(crate) enum RequestKind {
    Copy = 4,
    TimeTravel = 5,
    Head = 6,
+    ListVersions = 7,
 }

-
+use RequestKind::*;
 use scopeguard::ScopeGuard;

+impl RequestKind {
+    const fn as_str(&self) -> &'static str {
+        match self {
+            Get => "get_object",
+            Put => "put_object",
+            Delete => "delete_object",
+            List => "list_objects",
+            Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
+            Head => "head_object",
+            ListVersions => "list_versions",
+        }
+    }
+    const fn as_index(&self) -> usize {
+        *self as usize
+    }
+}
+
+const REQUEST_KIND_LIST: &[RequestKind] =
+    &[Get, Put, Delete, List, Copy, TimeTravel, Head, ListVersions];
+
+const REQUEST_KIND_COUNT: usize = REQUEST_KIND_LIST.len();
+pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
+
+impl<C> RequestTyped<C> {
+    pub(crate) fn get(&self, kind: RequestKind) -> &C {
+        &self.0[kind.as_index()]
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
+        let mut it = REQUEST_KIND_LIST.iter();
+        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
+            let next = it.next().unwrap();
+            assert_eq!(index, next.as_index());
+            f(*next)
+        });
+
+        if let Some(next) = it.next() {
+            panic!("unexpected {next:?}");
+        }
+
+        RequestTyped(arr)
+    }
+}
+
+impl RequestTyped<Histogram> {
+    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+        self.get(kind).observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(crate) struct PassFailCancelledRequestTyped<C> {
+    success: RequestTyped<C>,
+    fail: RequestTyped<C>,
+    cancelled: RequestTyped<C>,
+}

 #[derive(Debug, Clone, Copy)]
 pub(crate) enum AttemptOutcome {
@@ -34,22 +93,138 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
    }
 }

+impl AttemptOutcome {
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            AttemptOutcome::Ok => "ok",
+            AttemptOutcome::Err => "err",
+            AttemptOutcome::Cancelled => "cancelled",
+        }
+    }
+}

+impl<C> PassFailCancelledRequestTyped<C> {
+    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+        let target = match outcome {
+            AttemptOutcome::Ok => &self.success,
+            AttemptOutcome::Err => &self.fail,
+            AttemptOutcome::Cancelled => &self.cancelled,
+        };
+        target.get(kind)
+    }

+    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
+        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
+        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
+        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));

+        PassFailCancelledRequestTyped {
+            success,
+            fail,
+            cancelled,
+        }
+    }
+}

+impl PassFailCancelledRequestTyped<Histogram> {
+    pub(crate) fn observe_elapsed(
+        &self,
+        kind: RequestKind,
+        outcome: impl Into<AttemptOutcome>,
+        started_at: std::time::Instant,
+    ) {
+        self.get(kind, outcome.into())
+            .observe(started_at.elapsed().as_secs_f64())
+    }
+}

-
-
-
-/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
-pub(crate) fn start_measuring_requests(
-    _kind: RequestKind,
+/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
+pub(crate) fn start_counting_cancelled_wait(
+    kind: RequestKind,
 ) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_started_at| {
-        
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        crate::metrics::BUCKET_METRICS
+            .cancelled_waits
+            .get(kind)
+            .inc()
    })
 }

+/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
+pub(crate) fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
+}

+pub(crate) struct BucketMetrics {
+    /// Full request duration, per request type and outcome (success, error or cancellation).
+    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Seconds spent waiting for a concurrency permit, per request type.
+    pub(crate) wait_seconds: RequestTyped<Histogram>,

+    /// Counts how many semaphore waits were cancelled, per request type.
+    ///
+    /// Kept so that we notice if cancellations happen more often than expected.
+    pub(crate) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total number of deleted objects, in batches or single requests.
+    pub(crate) deleted_objects_total: IntCounter,
+}
+
+impl Default for BucketMetrics {
+    fn default() -> Self {
+        // Shared by both histograms. The first bucket (100 microseconds) is there to count
+        // waits that did not need to queue at all and got a permit immediately.
+        let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+
+        let req_seconds = register_histogram_vec!(
+            "remote_storage_s3_request_seconds",
+            "Seconds to complete a request",
+            &["request_type", "result"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
+            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
+        });
+
+        let wait_seconds = register_histogram_vec!(
+            "remote_storage_s3_wait_seconds",
+            "Seconds rate limited",
+            &["request_type"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let wait_seconds =
+            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
+
+        let cancelled_waits = register_int_counter_vec!(
+            "remote_storage_s3_cancelled_waits_total",
+            "Times a semaphore wait has been cancelled per request type",
+            &["request_type"],
+        )
+        .unwrap();
+        let cancelled_waits =
+            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
+
+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
+        Self {
+            req_seconds,
+            wait_seconds,
+            cancelled_waits,
+            deleted_objects_total,
+        }
+    }
+}
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -21,9 +21,8 @@ use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep};
 use aws_sdk_s3::error::SdkError;
 use aws_sdk_s3::operation::get_object::GetObjectError;
 use aws_sdk_s3::operation::head_object::HeadObjectError;
-use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass};
+use aws_sdk_s3::types::{Delete, ObjectIdentifier, StorageClass};
 use aws_smithy_async::rt::sleep::TokioSleep;
-use aws_smithy_types::DateTime;
 use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
 use aws_smithy_types::date_time::ConversionError;
@@ -41,12 +40,12 @@ use super::StorageMetadata;
 use crate::config::S3Config;
 use crate::error::Cancelled;
 pub(super) use crate::metrics::RequestKind;
-use crate::metrics::{AttemptOutcome, start_measuring_requests};
+use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests};
 use crate::support::PermitCarrying;
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
    MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage,
-    TimeTravelError, TimeoutOrCancel,
+    TimeTravelError, TimeoutOrCancel, Version, VersionId, VersionKind, VersionListing,
 };

 /// AWS S3 storage.
@@ -66,6 +65,7 @@ struct GetObjectRequest {
    key: String,
    etag: Option<String>,
    range: Option<String>,
+    version_id: Option<String>,
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
@@ -199,7 +199,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
-       
+        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire(kind);

        let permit = tokio::select! {
@@ -207,8 +207,10 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);

        Ok(permit)
    }
@@ -218,7 +220,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
-       
+        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire_owned(kind);

        let permit = tokio::select! {
@@ -226,8 +228,10 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
        Ok(permit)
    }

@@ -247,6 +251,7 @@ impl S3Bucket {
            .get_object()
            .bucket(request.bucket)
            .key(request.key)
+            .set_version_id(request.version_id)
            .set_range(request.range);

        if let Some(etag) = request.etag {
@@ -269,7 +274,11 @@ impl S3Bucket {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::NotFound);
            }
            Err(SdkError::ServiceError(e))
@@ -279,11 +288,19 @@ impl S3Bucket {
                if e.raw().status().as_u16() == StatusCode::NotModified =>
            {
                // Count an unmodified file as a success.
-               
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::Unmodified);
            }
            Err(e) => {
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
@@ -330,11 +347,11 @@ impl S3Bucket {
        delete_objects: &[ObjectIdentifier],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-   
+        let kind = RequestKind::Delete;
        let mut cancel = std::pin::pin!(cancel.cancelled());

        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
-           
+            let started_at = start_measuring_requests(kind);

            let req = self
                .client
@@ -354,10 +371,15 @@ impl S3Bucket {
                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
            };

-          
-            
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
            let resp = resp.context("request deletion")?;
-            
+            crate::metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(chunk.len() as u64);

            if let Some(errors) = resp.errors {
                // Log a bounded number of the errors within the response:
@@ -384,6 +406,124 @@ impl S3Bucket {
        Ok(())
    }

+    async fn list_versions_with_permit(
+        &self,
+        _permit: &tokio::sync::SemaphorePermit<'_>,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        // Use the passed prefix if it is set, otherwise fall back to `prefix_in_bucket`.
+        let prefix = prefix
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |e: &_| matches!(e, DownloadError::Cancelled);
+
+        let mut key_marker = None;
+        let mut version_id_marker = None;
+        let mut versions_and_deletes = Vec::new();
+
+        loop {
+            let response = backoff::retry(
+                || async {
+                    let mut request = self
+                        .client
+                        .list_object_versions()
+                        .bucket(self.bucket_name.clone())
+                        .set_prefix(prefix.clone())
+                        .set_key_marker(key_marker.clone())
+                        .set_version_id_marker(version_id_marker.clone());
+
+                    if let ListingMode::WithDelimiter = mode {
+                        request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+                    }
+
+                    let op = request.send();
+
+                    tokio::select! {
+                        res = op => res.map_err(|e| DownloadError::Other(e.into())),
+                        _ = cancel.cancelled() => Err(DownloadError::Cancelled),
+                    }
+                },
+                is_permanent,
+                warn_threshold,
+                max_retries,
+                "listing object versions",
+                cancel,
+            )
+            .await
+            .ok_or_else(|| DownloadError::Cancelled)
+            .and_then(|x| x)?;
+
+            tracing::trace!(
+                "  Got List response version_id_marker={:?}, key_marker={:?}",
+                response.version_id_marker,
+                response.key_marker
+            );
+            let versions = response
+                .versions
+                .unwrap_or_default()
+                .into_iter()
+                .map(|version| {
+                    let key = version.key.context("response does not contain a key")?;
+                    let key = self.s3_object_to_relative_path(&key);
+                    let version_id = VersionId(version.version_id.context("needing version id")?);
+                    let last_modified =
+                        SystemTime::try_from(version.last_modified.context("no last_modified")?)?;
+                    Ok(Version {
+                        key,
+                        last_modified,
+                        kind: crate::VersionKind::Version(version_id),
+                    })
+                });
+            let deletes = response
+                .delete_markers
+                .unwrap_or_default()
+                .into_iter()
+                .map(|version| {
+                    let key = version.key.context("response does not contain a key")?;
+                    let key = self.s3_object_to_relative_path(&key);
+                    let last_modified =
+                        SystemTime::try_from(version.last_modified.context("no last_modified")?)?;
+                    Ok(Version {
+                        key,
+                        last_modified,
+                        kind: crate::VersionKind::DeletionMarker,
+                    })
+                });
+            itertools::process_results(versions.chain(deletes), |n_vds| {
+                versions_and_deletes.extend(n_vds)
+            })
+            .map_err(DownloadError::Other)?;
+            fn none_if_empty(v: Option<String>) -> Option<String> {
+                v.filter(|v| !v.is_empty())
+            }
+            version_id_marker = none_if_empty(response.next_version_id_marker);
+            key_marker = none_if_empty(response.next_key_marker);
+            if version_id_marker.is_none() {
+                // Without a next marker this was the last page, which must not be truncated
+                if response.is_truncated.unwrap_or_default() {
+                    return Err(DownloadError::Other(anyhow::anyhow!(
+                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
+                    )));
+                }
+                break;
+            }
+            if let Some(max_keys) = max_keys {
+                if versions_and_deletes.len() >= max_keys.get().try_into().unwrap() {
+                    return Err(DownloadError::Other(anyhow::anyhow!("too many versions")));
+                }
+            }
+        }
+        Ok(VersionListing {
+            versions: versions_and_deletes,
+        })
+    }
+
    pub fn bucket_name(&self) -> &str {
        &self.bucket_name
    }
@@ -424,8 +564,8 @@ pin_project_lite::pin_project! {
    }

    impl<S> PinnedDrop for TimedDownload<S> {
-        fn drop(mut _this: Pin<&mut Self>) {
-           
+        fn drop(mut this: Pin<&mut Self>) {
+            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
 }
@@ -490,7 +630,7 @@ impl RemoteStorage for S3Bucket {

            let mut continuation_token = None;
            'outer: loop {
-           
+                let started_at = start_measuring_requests(kind);

                // min of two Options, returning Some if one is value and another is
                // None (None is smaller than anything, so plain min doesn't work).
@@ -523,9 +663,11 @@ impl RemoteStorage for S3Bucket {
                    .context("Failed to list S3 prefixes")
                    .map_err(DownloadError::Other);

-               
+                let started_at = ScopeGuard::into_inner(started_at);

-                
+                crate::metrics::BUCKET_METRICS
+                    .req_seconds
+                    .observe_elapsed(kind, &response, started_at);

                let response = match response {
                    Ok(response) => response,
@@ -598,6 +740,19 @@ impl RemoteStorage for S3Bucket {
        }
    }

+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        let kind = RequestKind::ListVersions;
+        let permit = self.permit(kind, cancel).await?;
+        self.list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
+            .await
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
@@ -606,7 +761,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-     
+        let started_at = start_measuring_requests(kind);

        let head_future = self
            .client
@@ -625,18 +780,30 @@ impl RemoteStorage for S3Bucket {
        let res = res.map_err(|_e| DownloadError::Timeout)?;

        // do not incl. timeouts as errors in metrics but cancellations
-  
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        if res.is_ok() { // errors are observed exactly once, in the match arms below
+            let req_seconds = &crate::metrics::BUCKET_METRICS.req_seconds;
+            req_seconds.observe_elapsed(kind, AttemptOutcome::Ok, started_at);
+        }
        let data = match res {
            Ok(object_output) => object_output,
            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("s3 head object"),
@@ -669,7 +836,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
        let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
@@ -692,10 +859,12 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(_inner) = &res {
+        if let Ok(inner) = &res {
            // do not incl. timeouts as errors in metrics but cancellations
-       
-            
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
        }

        match res {
@@ -716,7 +885,7 @@ impl RemoteStorage for S3Bucket {

        let timeout = tokio::time::sleep(self.timeout);

-       
+        let started_at = start_measuring_requests(kind);

        // we need to specify bucket_name as a prefix
        let copy_source = format!(
@@ -740,8 +909,10 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);

        res?;

@@ -762,6 +933,7 @@ impl RemoteStorage for S3Bucket {
                key: self.relative_path_to_s3_object(from),
                etag: opts.etag.as_ref().map(|e| e.to_string()),
                range: opts.byte_range_header(),
+                version_id: opts.version_id.as_ref().map(|v| v.0.to_owned()),
            },
            cancel,
        )
@@ -806,94 +978,25 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::TimeTravel;
        let permit = self.permit(kind, cancel).await?;

-        let timestamp = DateTime::from(timestamp);
-        let done_if_after = DateTime::from(done_if_after);
-
        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");

-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let prefix = prefix
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
+        // Limit the number of versions and deletions, mostly so that we don't
+        // keep requesting forever if the list is too long, as we'd put the
+        // list in RAM.
+        // Building a list of 100k entries that reaches the limit roughly takes
+        // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
+        const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);

-        let warn_threshold = 3;
-        let max_retries = 10;
-        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
-
-        let mut key_marker = None;
-        let mut version_id_marker = None;
-        let mut versions_and_deletes = Vec::new();
-
-        loop {
-            let response = backoff::retry(
-                || async {
-                    let op = self
-                        .client
-                        .list_object_versions()
-                        .bucket(self.bucket_name.clone())
-                        .set_prefix(prefix.clone())
-                        .set_key_marker(key_marker.clone())
-                        .set_version_id_marker(version_id_marker.clone())
-                        .send();
-
-                    tokio::select! {
-                        res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
-                        _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
-                    }
-                },
-                is_permanent,
-                warn_threshold,
-                max_retries,
-                "listing object versions for time_travel_recover",
-                cancel,
-            )
+        let mode = ListingMode::NoDelimiter;
+        let version_listing = self
+            .list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel)
            .await
-            .ok_or_else(|| TimeTravelError::Cancelled)
-            .and_then(|x| x)?;
-
-            tracing::trace!(
-                "  Got List response version_id_marker={:?}, key_marker={:?}",
-                response.version_id_marker,
-                response.key_marker
-            );
-            let versions = response
-                .versions
-                .unwrap_or_default()
-                .into_iter()
-                .map(VerOrDelete::from_version);
-            let deletes = response
-                .delete_markers
-                .unwrap_or_default()
-                .into_iter()
-                .map(VerOrDelete::from_delete_marker);
-            itertools::process_results(versions.chain(deletes), |n_vds| {
-                versions_and_deletes.extend(n_vds)
-            })
-            .map_err(TimeTravelError::Other)?;
-            fn none_if_empty(v: Option<String>) -> Option<String> {
-                v.filter(|v| !v.is_empty())
-            }
-            version_id_marker = none_if_empty(response.next_version_id_marker);
-            key_marker = none_if_empty(response.next_key_marker);
-            if version_id_marker.is_none() {
-                // The final response is not supposed to be truncated
-                if response.is_truncated.unwrap_or_default() {
-                    return Err(TimeTravelError::Other(anyhow::anyhow!(
-                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
-                    )));
-                }
-                break;
-            }
-            // Limit the number of versions deletions, mostly so that we don't
-            // keep requesting forever if the list is too long, as we'd put the
-            // list in RAM.
-            // Building a list of 100k entries that reaches the limit roughly takes
-            // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
-            const COMPLEXITY_LIMIT: usize = 100_000;
-            if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
-                return Err(TimeTravelError::TooManyVersions);
-            }
-        }
+            .map_err(|err| match err {
+                DownloadError::Other(e) => TimeTravelError::Other(e),
+                DownloadError::Cancelled => TimeTravelError::Cancelled,
+                other => TimeTravelError::Other(other.into()),
+            })?;
+        let versions_and_deletes = version_listing.versions;

        tracing::info!(
            "Built list for time travel with {} versions and deletions",
@@ -909,24 +1012,26 @@ impl RemoteStorage for S3Bucket {
        let mut vds_for_key = HashMap::<_, Vec<_>>::new();

        for vd in &versions_and_deletes {
-            let VerOrDelete {
-                version_id, key, ..
-            } = &vd;
-            if version_id == "null" {
+            let Version { key, .. } = &vd;
+            let version_id = vd.version_id().map(|v| v.0.as_str());
+            if version_id == Some("null") {
                return Err(TimeTravelError::Other(anyhow!(
                    "Received ListVersions response for key={key} with version_id='null', \
                    indicating either disabled versioning, or legacy objects with null version id values"
                )));
            }
-            tracing::trace!(
-                "Parsing version key={key} version_id={version_id} kind={:?}",
-                vd.kind
-            );
+            tracing::trace!("Parsing version key={key} kind={:?}", vd.kind);

            vds_for_key.entry(key).or_default().push(vd);
        }
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
+
        for (key, versions) in vds_for_key {
            let last_vd = versions.last().unwrap();
+            let key = self.relative_path_to_s3_object(key);
            if last_vd.last_modified > done_if_after {
                tracing::trace!("Key {key} has version later than done_if_after, skipping");
                continue;
@@ -951,11 +1056,11 @@ impl RemoteStorage for S3Bucket {
                do_delete = true;
            } else {
                match &versions[version_to_restore_to - 1] {
-                    VerOrDelete {
-                        kind: VerOrDeleteKind::Version,
-                        version_id,
+                    Version {
+                        kind: VersionKind::Version(version_id),
                        ..
                    } => {
+                        let version_id = &version_id.0;
                        tracing::trace!("Copying old version {version_id} for {key}...");
                        // Restore the state to the last version by copying
                        let source_id =
@@ -967,7 +1072,7 @@ impl RemoteStorage for S3Bucket {
                                    .client
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
-                                    .key(key)
+                                    .key(&key)
                                    .set_storage_class(self.upload_storage_class.clone())
                                    .copy_source(&source_id)
                                    .send();
@@ -988,8 +1093,8 @@ impl RemoteStorage for S3Bucket {
                        .and_then(|x| x)?;
                        tracing::info!(%version_id, %key, "Copied old version in S3");
                    }
-                    VerOrDelete {
-                        kind: VerOrDeleteKind::DeleteMarker,
+                    Version {
+                        kind: VersionKind::DeletionMarker,
                        ..
                    } => {
                        do_delete = true;
@@ -997,7 +1102,7 @@ impl RemoteStorage for S3Bucket {
                }
            };
            if do_delete {
-                if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
+                if matches!(last_vd.kind, VersionKind::DeletionMarker) {
                    // Key has since been deleted (but there was some history), no need to do anything
                    tracing::trace!("Key {key} already deleted, skipping.");
                } else {
@@ -1025,62 +1130,6 @@ impl RemoteStorage for S3Bucket {
    }
 }

-// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
-struct VerOrDelete {
-    kind: VerOrDeleteKind,
-    last_modified: DateTime,
-    version_id: String,
-    key: String,
-}
-
-#[derive(Debug)]
-enum VerOrDeleteKind {
-    Version,
-    DeleteMarker,
-}
-
-impl VerOrDelete {
-    fn with_kind(
-        kind: VerOrDeleteKind,
-        last_modified: Option<DateTime>,
-        version_id: Option<String>,
-        key: Option<String>,
-    ) -> anyhow::Result<Self> {
-        let lvk = (last_modified, version_id, key);
-        let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
-            anyhow::bail!(
-                "One (or more) of last_modified, key, and id is None. \
-            Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
-                lvk.0,
-                lvk.1,
-                lvk.2,
-            );
-        };
-        Ok(Self {
-            kind,
-            last_modified,
-            version_id,
-            key,
-        })
-    }
-    fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
-        Self::with_kind(
-            VerOrDeleteKind::Version,
-            v.last_modified,
-            v.version_id,
-            v.key,
-        )
-    }
-    fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
-        Self::with_kind(
-            VerOrDeleteKind::DeleteMarker,
-            v.last_modified,
-            v.version_id,
-            v.key,
-        )
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use std::num::NonZeroUsize;
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -139,6 +139,20 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list(prefix, mode, max_keys, cancel).await
    }

+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned())) // gate the call, reported as a `ListPrefixes` op for this prefix; NOTE(review): presumably `attempt` decides whether to simulate a failure - confirm
+            .map_err(DownloadError::Other)?; // an `attempt` error returns early, wrapped as `DownloadError::Other`
+        self.inner // otherwise delegate to the wrapped storage with all arguments unchanged
+            .list_versions(prefix, mode, max_keys, cancel)
+            .await
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -1,6 +1,7 @@
 use std::fmt::Display;
 use std::time::{Duration, Instant};

+use metrics::IntCounter;

 /// Circuit breakers are for operations that are expensive and fallible.
 ///
@@ -53,7 +54,7 @@ impl CircuitBreaker {
        }
    }

-    pub fn fail<E>(&mut self,  error: E)
+    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
@@ -63,18 +64,18 @@ impl CircuitBreaker {

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
-            self.break_circuit( error);
+            self.break_circuit(metric, error);
        }
    }

    /// Call this after successfully executing an operation
-    pub fn success(&mut self) {
+    pub fn success(&mut self, metric: &IntCounter) { // `metric`: caller-supplied counter, bumped only when this call closes a broken circuit
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
-          
+            metric.inc(); // counted once per broken -> healthy transition, not on every success
        }
    }

@@ -97,13 +98,13 @@ impl CircuitBreaker {
        }
    }

-    fn break_circuit<E>(&mut self,  error: E)
+    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
-        
+        metric.inc(); // bump the caller-supplied counter each time the circuit is marked broken
    }

    fn reset_circuit(&mut self) {
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -78,6 +78,7 @@ metrics.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
+pem.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
@@ -105,6 +106,7 @@ hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
 indoc.workspace = true
 uuid.workspace = true
+rstest.workspace = true

 [[bench]]
 name = "bench_layer_map"
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -11,6 +11,7 @@ use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::storage_layer::InMemoryLayer;
 use pageserver::{page_cache, virtual_file};
 use pageserver_api::key::Key;
+use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::value::Value;
 use tokio_util::sync::CancellationToken;
@@ -28,6 +29,7 @@ fn murmurhash32(mut h: u32) -> u32 {
    h
 }

+#[derive(serde::Serialize, Clone, Copy, Debug)]
 enum KeyLayout {
    /// Sequential unique keys
    Sequential,
@@ -37,6 +39,7 @@ enum KeyLayout {
    RandomReuse(u32),
 }

+#[derive(serde::Serialize, Clone, Copy, Debug)]
 enum WriteDelta {
    Yes,
    No,
@@ -138,12 +141,15 @@ async fn ingest(
 /// Wrapper to instantiate a tokio runtime
 fn ingest_main(
    conf: &'static PageServerConf,
+    io_mode: IoMode,
    put_size: usize,
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
 ) {
-    let runtime = tokio::runtime::Builder::new_current_thread()
+    pageserver::virtual_file::set_io_mode(io_mode);
+
+    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();
@@ -174,93 +180,245 @@ fn criterion_benchmark(c: &mut Criterion) {
    virtual_file::init(
        16384,
        virtual_file::io_engine_for_bench(),
+        // immaterial, each `ingest_main` invocation below overrides this
        conf.virtual_file_io_mode,
+        // without actually doing syncs, buffered writes have an unfair advantage over direct IO writes
        virtual_file::SyncMode::Sync,
    );
    page_cache::init(conf.page_cache_size);

-    {
-        let mut group = c.benchmark_group("ingest-small-values");
-        let put_size = 100usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/100b seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Random,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::RandomReuse(0x3ff),
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
+    #[derive(serde::Serialize)]
+    struct ExplodedParameters { // one concrete benchmark case: a hand-picked parameter set combined with one IoMode
+        io_mode: IoMode, // forwarded to `ingest_main`, which applies it via `set_io_mode`
+        volume_mib: usize, // total ingested volume in MiB; put_count = volume_mib * 1024 * 1024 / key_size
+        key_size: usize, // bytes per put: forwarded to `ingest_main` as its `put_size` argument, despite the name
+        key_layout: KeyLayout, // forwarded unchanged to `ingest_main`
+        write_delta: WriteDelta, // forwarded unchanged to `ingest_main`
    }
-
-    {
-        let mut group = c.benchmark_group("ingest-big-values");
-        let put_size = 8192usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
+    #[derive(Clone)] // the list of these is cloned once per IoMode when building `exploded_parameters`
+    struct HandPickedParameters { // a benchmark case without the IoMode dimension; same fields as ExplodedParameters minus `io_mode`
+        volume_mib: usize, // total ingested volume in MiB
+        key_size: usize, // bytes per put (ends up as `put_size` in `ingest_main`)
+        key_layout: KeyLayout,
+        write_delta: WriteDelta,
+    }
+    let expect = vec![
+        // Small values (100b) tests
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::Random,
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::RandomReuse(0x3ff),
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::No,
+        },
+        // Large values (8k) tests
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 8192,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 8192,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::No,
+        },
+    ];
+    let exploded_parameters = {
+        let mut out = Vec::new();
+        for io_mode in [
+            IoMode::Buffered,
+            #[cfg(target_os = "linux")]
+            IoMode::Direct,
+            #[cfg(target_os = "linux")]
+            IoMode::DirectRw,
+        ] {
+            for param in expect.clone() {
+                let HandPickedParameters {
+                    volume_mib,
+                    key_size,
+                    key_layout,
+                    write_delta,
+                } = param;
+                out.push(ExplodedParameters {
+                    io_mode,
+                    volume_mib,
+                    key_size,
+                    key_layout,
+                    write_delta,
+                });
+            }
+        }
+        out
+    };
+    impl ExplodedParameters {
+        fn benchmark_id(&self) -> String { // builds the name passed to `group.bench_function`: every parameter as `name=value`
+            let ExplodedParameters { // exhaustive destructuring: adding a field without naming it here fails to compile
+                io_mode,
+                volume_mib,
+                key_size,
+                key_layout,
+                write_delta,
+            } = self;
+            format!( // values use Debug formatting, e.g. `key_layout=RandomReuse(1023)`
+                "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
+            )
+        }
+    }
+    let mut group = c.benchmark_group("ingest");
+    for params in exploded_parameters {
+        let id = params.benchmark_id();
+        let ExplodedParameters {
+            io_mode,
+            volume_mib,
+            key_size,
+            key_layout,
+            write_delta,
+        } = params;
+        let put_count = volume_mib * 1024 * 1024 / key_size;
+        group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
        group.sample_size(10);
-        group.bench_function("ingest 128MB/8k seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
+        group.bench_function(id, |b| {
+            b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
        });
    }
 }

 criterion_group!(benches, criterion_benchmark);
 criterion_main!(benches);
+
+/*
+cargo bench --bench bench_ingest
+
+im4gn.2xlarge:
+
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.2901 s 1.2943 s 1.2991 s]
+                        thrpt:  [98.533 MiB/s 98.892 MiB/s 99.220 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [2.1387 s 2.1623 s 2.1845 s]
+                        thrpt:  [58.595 MiB/s 59.197 MiB/s 59.851 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
+                        time:   [1.2036 s 1.2074 s 1.2122 s]
+                        thrpt:  [105.60 MiB/s 106.01 MiB/s 106.35 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [520.55 ms 521.46 ms 522.57 ms]
+                        thrpt:  [244.94 MiB/s 245.47 MiB/s 245.89 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [440.33 ms 442.24 ms 444.10 ms]
+                        thrpt:  [288.22 MiB/s 289.43 MiB/s 290.69 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [168.78 ms 169.42 ms 170.18 ms]
+                        thrpt:  [752.16 MiB/s 755.52 MiB/s 758.40 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.2978 s 1.3094 s 1.3227 s]
+                        thrpt:  [96.775 MiB/s 97.758 MiB/s 98.632 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [2.1976 s 2.2067 s 2.2154 s]
+                        thrpt:  [57.777 MiB/s 58.006 MiB/s 58.245 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes
+                        time:   [1.2103 s 1.2160 s 1.2233 s]
+                        thrpt:  [104.64 MiB/s 105.26 MiB/s 105.76 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [525.05 ms 526.37 ms 527.79 ms]
+                        thrpt:  [242.52 MiB/s 243.17 MiB/s 243.79 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [443.06 ms 444.88 ms 447.15 ms]
+                        thrpt:  [286.26 MiB/s 287.72 MiB/s 288.90 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [169.40 ms 169.80 ms 170.17 ms]
+                        thrpt:  [752.21 MiB/s 753.81 MiB/s 755.60 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.2844 s 1.2915 s 1.2990 s]
+                        thrpt:  [98.536 MiB/s 99.112 MiB/s 99.657 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [2.1431 s 2.1663 s 2.1900 s]
+                        thrpt:  [58.446 MiB/s 59.087 MiB/s 59.726 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
+                        time:   [1.1906 s 1.1926 s 1.1947 s]
+                        thrpt:  [107.14 MiB/s 107.33 MiB/s 107.51 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [516.86 ms 518.25 ms 519.47 ms]
+                        thrpt:  [246.40 MiB/s 246.98 MiB/s 247.65 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [536.50 ms 536.53 ms 536.60 ms]
+                        thrpt:  [238.54 MiB/s 238.57 MiB/s 238.59 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [267.77 ms 267.90 ms 268.04 ms]
+                        thrpt:  [477.53 MiB/s 477.79 MiB/s 478.02 MiB/s]
+
+Hetzner AX102:
+
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [836.58 ms 861.93 ms 886.57 ms]
+                        thrpt:  [144.38 MiB/s 148.50 MiB/s 153.00 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [1.2782 s 1.3191 s 1.3665 s]
+                        thrpt:  [93.668 MiB/s 97.037 MiB/s 100.14 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
+                        time:   [791.27 ms 807.08 ms 822.95 ms]
+                        thrpt:  [155.54 MiB/s 158.60 MiB/s 161.77 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [310.78 ms 314.66 ms 318.47 ms]
+                        thrpt:  [401.92 MiB/s 406.79 MiB/s 411.87 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [377.11 ms 387.77 ms 399.21 ms]
+                        thrpt:  [320.63 MiB/s 330.10 MiB/s 339.42 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [128.37 ms 132.96 ms 138.55 ms]
+                        thrpt:  [923.83 MiB/s 962.69 MiB/s 997.11 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [900.38 ms 914.88 ms 928.86 ms]
+                        thrpt:  [137.80 MiB/s 139.91 MiB/s 142.16 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [1.2538 s 1.2936 s 1.3313 s]
+                        thrpt:  [96.149 MiB/s 98.946 MiB/s 102.09 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes
+                        time:   [787.17 ms 803.89 ms 820.63 ms]
+                        thrpt:  [155.98 MiB/s 159.23 MiB/s 162.61 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [318.78 ms 321.89 ms 324.74 ms]
+                        thrpt:  [394.16 MiB/s 397.65 MiB/s 401.53 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [374.01 ms 383.45 ms 393.20 ms]
+                        thrpt:  [325.53 MiB/s 333.81 MiB/s 342.24 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [137.98 ms 141.31 ms 143.57 ms]
+                        thrpt:  [891.58 MiB/s 905.79 MiB/s 927.66 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [613.69 ms 622.48 ms 630.97 ms]
+                        thrpt:  [202.86 MiB/s 205.63 MiB/s 208.57 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [1.0299 s 1.0766 s 1.1273 s]
+                        thrpt:  [113.55 MiB/s 118.90 MiB/s 124.29 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
+                        time:   [637.80 ms 647.78 ms 658.01 ms]
+                        thrpt:  [194.53 MiB/s 197.60 MiB/s 200.69 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [266.09 ms 267.20 ms 268.31 ms]
+                        thrpt:  [477.06 MiB/s 479.04 MiB/s 481.04 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [269.34 ms 273.27 ms 277.69 ms]
+                        thrpt:  [460.95 MiB/s 468.40 MiB/s 475.24 MiB/s]
+ingest/io_mode=DirectRw volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [123.18 ms 124.24 ms 125.15 ms]
+                        thrpt:  [1022.8 MiB/s 1.0061 GiB/s 1.0148 GiB/s]
+*/
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -419,6 +419,23 @@ impl Client {
        }
    }

+    pub async fn timeline_detail(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<TimelineInfo> {
+        let uri = format!( // timeline resource under the given tenant shard on the management API endpoint
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::GET, &uri, ()) // GET with `()` as the third argument; NOTE(review): presumably means "no request body" - confirm
+            .await? // errors from `request` propagate to the caller unchanged
+            .json() // parse the response as JSON into the `TimelineInfo` return type
+            .await
+            .map_err(Error::ReceiveBody) // a failure at this stage is reported as `Error::ReceiveBody`
+    }
+
    pub async fn timeline_archival_config(
        &self,
        tenant_shard_id: TenantShardId,
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -68,6 +68,13 @@ pub(crate) struct Args {
    targets: Option<Vec<TenantTimelineId>>,
 }

+/// State shared by all clients
+#[derive(Debug)]
+struct SharedState {
+    start_work_barrier: tokio::sync::Barrier, // sized for the stats-dump task + every worker task + main_impl; each waits on it before starting
+    live_stats: LiveStats, // counters updated by workers; the stats-dump task swaps them back to 0 about once per second
+}
+
 #[derive(Debug, Default)]
 struct LiveStats {
    completed_requests: AtomicU64,
@@ -240,24 +247,26 @@ async fn main_impl(
        all_ranges
    };

-    let live_stats = Arc::new(LiveStats::default());
-
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = args.num_clients.get() * timelines.len();
    let num_main_impl = 1;

-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_live_stats_dump + num_work_sender_tasks + num_main_impl,
-    ));
+    let shared_state = Arc::new(SharedState {
+        start_work_barrier: tokio::sync::Barrier::new(
+            num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        ),
+        live_stats: LiveStats::default(),
+    });
+    let cancel = CancellationToken::new();

+    let ss = shared_state.clone();
    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
        async move {
-            start_work_barrier.wait().await;
+            ss.start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let stats = &ss.live_stats;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let missed = stats.missed.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
@@ -270,14 +279,12 @@ async fn main_impl(
        }
    });

-    let cancel = CancellationToken::new();
-
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
-        let live_stats = live_stats.clone();
-        let start_work_barrier = start_work_barrier.clone();
+        let ss = shared_state.clone();
+        let cancel = cancel.clone();
        let ranges: Vec<KeyRange> = all_ranges
            .iter()
            .filter(|r| r.timeline == worker_id.timeline)
@@ -287,85 +294,8 @@ async fn main_impl(
            rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
                .unwrap();

-        let cancel = cancel.clone();
        Box::pin(async move {
-            let client =
-                pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-                    .await
-                    .unwrap();
-            let mut client = client
-                .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
-                .await
-                .unwrap();
-
-            start_work_barrier.wait().await;
-            let client_start = Instant::now();
-            let mut ticks_processed = 0;
-            let mut inflight = VecDeque::new();
-            while !cancel.is_cancelled() {
-                // Detect if a request took longer than the RPS rate
-                if let Some(period) = &rps_period {
-                    let periods_passed_until_now =
-                        usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
-                            .unwrap();
-
-                    if periods_passed_until_now > ticks_processed {
-                        live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
-                    }
-                    ticks_processed = periods_passed_until_now;
-                }
-
-                while inflight.len() < args.queue_depth.get() {
-                    let start = Instant::now();
-                    let req = {
-                        let mut rng = rand::thread_rng();
-                        let r = &ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        assert!(key.is_rel_block_key());
-                        let (rel_tag, block_no) = key
-                            .to_rel_block()
-                            .expect("we filter non-rel-block keys out above");
-                        PagestreamGetPageRequest {
-                            hdr: PagestreamRequest {
-                                reqid: 0,
-                                request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                                    Lsn::MAX
-                                } else {
-                                    r.timeline_lsn
-                                },
-                                not_modified_since: r.timeline_lsn,
-                            },
-                            rel: rel_tag,
-                            blkno: block_no,
-                        }
-                    };
-                    client.getpage_send(req).await.unwrap();
-                    inflight.push_back(start);
-                }
-
-                let start = inflight.pop_front().unwrap();
-                client.getpage_recv().await.unwrap();
-                let end = Instant::now();
-                live_stats.request_done();
-                ticks_processed += 1;
-                STATS.with(|stats| {
-                    stats
-                        .borrow()
-                        .lock()
-                        .unwrap()
-                        .observe(end.duration_since(start))
-                        .unwrap();
-                });
-
-                if let Some(period) = &rps_period {
-                    let next_at = client_start
-                        + Duration::from_micros(
-                            (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
-                        );
-                    tokio::time::sleep_until(next_at.into()).await;
-                }
-            }
+            client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
        })
    };

@@ -387,7 +317,7 @@ async fn main_impl(
    };

    info!("waiting for everything to become ready");
-    start_work_barrier.wait().await;
+    shared_state.start_work_barrier.wait().await;
    info!("work started");
    if let Some(runtime) = args.runtime {
        tokio::time::sleep(runtime.into()).await;
@@ -416,3 +346,91 @@ async fn main_impl(

    anyhow::Ok(())
 }
+
+async fn client_libpq(
+    args: &Args,
+    worker_id: WorkerId,
+    shared_state: Arc<SharedState>,
+    cancel: CancellationToken, // checked at the top of each loop iteration to stop the worker
+    rps_period: Option<Duration>, // target time between requests; `None` disables pacing and missed-tick accounting
+    ranges: Vec<KeyRange>, // key ranges to draw random getpage targets from
+    weights: rand::distributions::weighted::WeightedIndex<i128>, // distribution over indices into `ranges`
+) { // one worker: opens a pagestream, then pipelines random getpage requests until cancelled
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();
+    let mut client = client
+        .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
+        .await
+        .unwrap();
+
+    shared_state.start_work_barrier.wait().await; // connection setup is done before this rendezvous with the other tasks
+    let client_start = Instant::now(); // origin of this worker's pacing schedule
+    let mut ticks_processed = 0; // pacing periods accounted for so far (completed requests + periods reported as missed)
+    let mut inflight = VecDeque::new(); // send timestamps of requests awaiting a response, oldest at the front
+    while !cancel.is_cancelled() {
+        // Detect if a request took longer than the RPS rate
+        if let Some(period) = &rps_period {
+            let periods_passed_until_now = // NOTE(review): `period.as_micros()` is 0 for periods under 1us (rate above 1_000_000/s), which would panic on division by zero
+                usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
+
+            if periods_passed_until_now > ticks_processed { // behind schedule: report the skipped periods as missed
+                shared_state
+                    .live_stats
+                    .missed((periods_passed_until_now - ticks_processed) as u64);
+            }
+            ticks_processed = periods_passed_until_now; // resynchronize the tick counter with wall-clock time
+        }
+
+        while inflight.len() < args.queue_depth.get() { // top up the pipeline until `queue_depth` requests are outstanding
+            let start = Instant::now(); // latency is measured from just before the request is built and sent
+            let req = {
+                let mut rng = rand::thread_rng();
+                let r = &ranges[weights.sample(&mut rng)]; // pick a range according to `weights`
+                let key: i128 = rng.gen_range(r.start..r.end); // random key in the half-open range [start, end)
+                let key = Key::from_i128(key);
+                assert!(key.is_rel_block_key());
+                let (rel_tag, block_no) = key
+                    .to_rel_block()
+                    .expect("we filter non-rel-block keys out above");
+                PagestreamGetPageRequest {
+                    hdr: PagestreamRequest {
+                        reqid: 0, // every request is sent with the same request id
+                        request_lsn: if rng.gen_bool(args.req_latest_probability) { // with this probability ask at Lsn::MAX, otherwise at the range's timeline LSN
+                            Lsn::MAX
+                        } else {
+                            r.timeline_lsn
+                        },
+                        not_modified_since: r.timeline_lsn,
+                    },
+                    rel: rel_tag,
+                    blkno: block_no,
+                }
+            };
+            client.getpage_send(req).await.unwrap();
+            inflight.push_back(start);
+        }
+
+        let start = inflight.pop_front().unwrap(); // assumes queue_depth >= 1 so the fill loop left at least one entry (`.get()` suggests a NonZero type - confirm)
+        client.getpage_recv().await.unwrap(); // the response is attributed to the oldest outstanding send timestamp
+        let end = Instant::now();
+        shared_state.live_stats.request_done();
+        ticks_processed += 1; // one completed request consumes one pacing tick
+        STATS.with(|stats| { // record the send-to-receive duration of this request
+            stats
+                .borrow()
+                .lock()
+                .unwrap()
+                .observe(end.duration_since(start))
+                .unwrap();
+        });
+
+        if let Some(period) = &rps_period {
+            let next_at = client_start // scheduled time of the next tick, computed from the schedule origin rather than from now
+                + Duration::from_micros(
+                    (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
+                );
+            tokio::time::sleep_until(next_at.into()).await; // pace to the schedule before the next iteration
+        }
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -20,6 +20,7 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{
    BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
 };
@@ -320,9 +321,10 @@ where
    }
 }

-fn startup_checkpoint(started_at: Instant, _phase: &str, human_phase: &str) {
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
    let elapsed = started_at.elapsed();
    let secs = elapsed.as_secs_f64();
+    STARTUP_DURATION.with_label_values(&[phase]).set(secs);

    info!(
        elapsed_ms = elapsed.as_millis(),
@@ -353,7 +355,10 @@ fn start_pageserver(
    set_launch_timestamp_metric(launch_ts);
    #[cfg(target_os = "linux")]
    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
-    
+    metrics::register_internal(Box::new(
+        pageserver::metrics::tokio_epoll_uring::Collector::new(),
+    ))
+    .unwrap();
    pageserver::preinitialize_metrics(conf, ignored);

    // If any failpoints were set from FAILPOINTS environment variable,
@@ -411,8 +416,18 @@ fn start_pageserver(
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
        .block_on(async {
+            let tls_config = storage_broker::ClientTlsConfig::new().ca_certificates(
+                conf.ssl_ca_certs
+                    .iter()
+                    .map(pem::encode)
+                    .map(storage_broker::Certificate::from_pem),
+            );
            // Note: we do not attempt connecting here (but validate endpoints sanity).
-            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
+            storage_broker::connect(
+                conf.broker_endpoint.clone(),
+                conf.broker_keepalive_interval,
+                tls_config,
+            )
        })
        .with_context(|| {
            format!(
@@ -497,6 +512,7 @@ fn start_pageserver(
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
    //
@@ -572,6 +588,7 @@ fn start_pageserver(
                    "initial_tenant_load",
                    "Initial load completed",
                );
+                STARTUP_IS_LOADING.set(0);
            });

            let WaitForPhaseResult {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -17,9 +17,10 @@ use once_cell::sync::OnceCell;
 use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
+use pem::Pem;
 use postgres_backend::AuthType;
 use remote_storage::{RemotePath, RemoteStorageConfig};
-use reqwest::{Certificate, Url};
+use reqwest::Url;
 use storage_broker::Uri;
 use utils::id::{NodeId, TimelineId};
 use utils::logging::{LogFormat, SecretString};
@@ -67,8 +68,8 @@ pub struct PageServerConf {
    /// Period to reload certificate and private key from files.
    /// Default: 60s.
    pub ssl_cert_reload_period: Duration,
-    /// Trusted root CA certificates to use in https APIs.
-    pub ssl_ca_certs: Vec<Certificate>,
+    /// Trusted root CA certificates to use in https APIs in PEM format.
+    pub ssl_ca_certs: Vec<Pem>,

    /// Current availability zone. Used for traffic metrics.
    pub availability_zone: Option<String>,
@@ -118,13 +119,13 @@ pub struct PageServerConf {
    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
    pub concurrent_tenant_warmup: ConfigurableSemaphore,

-    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
+    /// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
-    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
+    /// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
+    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -224,6 +225,11 @@ pub struct PageServerConf {
    /// Does not force TLS: the client negotiates TLS usage during the handshake.
    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
    pub enable_tls_page_service_api: bool,
+
+    /// Run in development mode, which disables certain safety checks
+    /// such as authentication requirements for HTTP and PostgreSQL APIs.
+    /// This is insecure and should only be used in development environments.
+    pub dev_mode: bool,
 }

 /// Token for authentication to safekeepers
@@ -397,6 +403,7 @@ impl PageServerConf {
            generate_unarchival_heatmap,
            tracing,
            enable_tls_page_service_api,
+            dev_mode,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -448,6 +455,7 @@ impl PageServerConf {
            get_vectored_concurrent_io,
            tracing,
            enable_tls_page_service_api,
+            dev_mode,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
@@ -497,7 +505,10 @@ impl PageServerConf {
            ssl_ca_certs: match ssl_ca_file {
                Some(ssl_ca_file) => {
                    let buf = std::fs::read(ssl_ca_file)?;
-                    Certificate::from_pem_bundle(&buf)?
+                    pem::parse_many(&buf)?
+                        .into_iter()
+                        .filter(|pem| pem.tag() == "CERTIFICATE")
+                        .collect()
                }
                None => Vec::new(),
            },
@@ -588,10 +599,10 @@ impl ConfigurableSemaphore {
    /// Initializse using a non-zero amount of permits.
    ///
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
-    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
+    /// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
+    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{LogicalSizeCalculationCause, TenantShard};

 mod disk_cache;
 mod metrics;
@@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker(
    }
 }

-async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) {
    const CAUSE: LogicalSizeCalculationCause =
        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -175,9 +175,9 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// [`Tenant::remote_size`]
+    /// [`TenantShard::remote_size`]
    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    /// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size
    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
        MetricsKey {
            tenant_id,
@@ -199,9 +199,9 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
    /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
        MetricsKey {
@@ -254,16 +254,18 @@ pub(super) async fn collect_all_metrics(

 async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
 where
-    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
+    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::TenantShard>)>,
 {
    let mut current_metrics: Vec<NewRawMetric> = Vec::new();

    let mut tenants = std::pin::pin!(tenants);

    while let Some((tenant_id, tenant)) = tenants.next().await {
-        let tenant_resident_size = 0;
+        let mut tenant_resident_size = 0;

-        for timeline in tenant.list_timelines() {
+        let timelines = tenant.list_timelines();
+        let timelines_len = timelines.len();
+        for timeline in timelines {
            let timeline_id = timeline.timeline_id;

            match TimelineSnapshot::collect(&timeline, ctx) {
@@ -286,6 +288,12 @@ where
                }
            }

+            tenant_resident_size += timeline.resident_physical_size();
+        }
+
+        if timelines_len == 0 {
+            // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
+            tenant_resident_size = 1;
        }

        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
@@ -307,7 +315,7 @@ impl TenantSnapshot {
    ///
    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
    /// cannot just list timelines here.
-    fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
+    fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
        TenantSnapshot {
            resident_size,
            remote_size: t.remote_size(),
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -91,12 +91,12 @@

 use std::{sync::Arc, time::Duration};

-
+use once_cell::sync::Lazy;
 use tracing::warn;
 use utils::{id::TimelineId, shard::TenantShardId};

 use crate::{
-    metrics::TimelineMetrics,
+    metrics::{StorageIoSizeMetrics, TimelineMetrics},
    task_mgr::TaskKind,
    tenant::Timeline,
 };
@@ -122,35 +122,38 @@ pub struct RequestContext {
 #[derive(Clone)]
 pub(crate) enum Scope {
    Global {
-        
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTenant {
-       
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTimeline {
-       
+        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
    },
    Timeline {
-       // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
+        // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
        // context creation contending for the ref counters of the Arc<TimelineMetrics>,
        // which are shared among all tasks that operate on the timeline, especially
        // concurrent page_service connections.
        #[allow(clippy::redundant_allocation)]
-        #[allow(dead_code)]
-        arc_arc: Arc<Arc<TimelineMetrics>>,    },
+        arc_arc: Arc<Arc<TimelineMetrics>>,
+    },
    #[cfg(test)]
    UnitTest {
-       
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    DebugTools {
-        
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
 }

+static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
+    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));

 impl Scope {
    pub(crate) fn new_global() -> Self {
        Scope::Global {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -170,13 +173,18 @@ impl Scope {
        }
    }
    pub(crate) fn new_secondary_timeline(
-        _tenant_shard_id: &TenantShardId,
-        _timeline_id: &TimelineId,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
    ) -> Self {
        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.

+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = tenant_shard_id.shard_slug().to_string();
+        let timeline_id = timeline_id.to_string();

-        Scope::SecondaryTimeline {  }
+        let io_size_metrics =
+            crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
+        Scope::SecondaryTimeline { io_size_metrics }
    }
    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
        // Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -189,19 +197,19 @@ impl Scope {
        // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
        // at this point, so, we were able to completely side-step tenant-scoped stuff there).
        Scope::SecondaryTenant {
-           
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
    #[cfg(test)]
    pub(crate) fn new_unit_test() -> Self {
        Scope::UnitTest {
-          
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }

    pub(crate) fn new_debug_tools() -> Self {
        Scope::DebugTools {
-          
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
 }
@@ -515,18 +523,58 @@ impl RequestContext {
        self.access_stats_behavior
    }

+    pub(crate) fn page_content_kind(&self) -> PageContentKind {
+        self.page_content_kind
+    }

    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }

+    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
+        match &self.scope {
+            Scope::Global { io_size_metrics } => {
+                let is_unit_test = cfg!(test);
+                let is_regress_test_build = cfg!(feature = "testing");
+                if is_unit_test || is_regress_test_build {
+                    panic!("all VirtualFile instances are timeline-scoped");
+                } else {
+                    use once_cell::sync::Lazy;
+                    use std::sync::Mutex;
+                    use std::time::Duration;
+                    use utils::rate_limit::RateLimit;
+                    static LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+                    let mut guard = LIMIT.lock().unwrap();
+                    guard.call2(|rate_limit_stats| {
+                        warn!(
+                            %rate_limit_stats,
+                            backtrace=%std::backtrace::Backtrace::force_capture(),
+                            "all VirtualFile instances are timeline-scoped",
+                        );
+                    });
+
+                    io_size_metrics
+                }
+            }
+            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
+            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
+            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
+            #[cfg(test)]
+            Scope::UnitTest { io_size_metrics } => io_size_metrics,
+            Scope::DebugTools { io_size_metrics } => io_size_metrics,
+        }
+    }
+
    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
        if duration == Duration::ZERO {
            return;
        }

        match &self.scope {
-            Scope::Timeline { arc_arc: _ } => {},
+            Scope::Timeline { arc_arc } => arc_arc
+                .wait_ondemand_download_time
+                .observe(self.task_kind, duration),
            _ => {
                use once_cell::sync::Lazy;
                use std::sync::Mutex;
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -3,17 +3,19 @@ use std::collections::HashMap;
 use futures::Future;
 use pageserver_api::config::NodeMetadata;
 use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest};
+use pageserver_api::models::ShardImportStatus;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
-    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-    ValidateRequestTenant, ValidateResponse,
+    PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
+    ValidateRequest, ValidateRequestTenant, ValidateResponse,
 };
+use reqwest::Certificate;
 use serde::Serialize;
 use serde::de::DeserializeOwned;
 use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::generation::Generation;
-use utils::id::NodeId;
+use utils::id::{NodeId, TimelineId};
 use utils::{backoff, failpoint_support};

 use crate::config::PageServerConf;
@@ -45,6 +47,12 @@ pub trait StorageControllerUpcallApi {
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
+    fn put_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        status: ShardImportStatus,
+    ) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
 }

 impl StorageControllerUpcallClient {
@@ -76,8 +84,8 @@ impl StorageControllerUpcallClient {
            client = client.default_headers(headers);
        }

-        for ssl_ca_cert in &conf.ssl_ca_certs {
-            client = client.add_root_certificate(ssl_ca_cert.clone());
+        for cert in &conf.ssl_ca_certs {
+            client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
        }

        Ok(Some(Self {
@@ -272,4 +280,30 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {

        Ok(result.into_iter().collect())
    }
+
+    /// Send a shard import status to the storage controller
+    ///
+    /// The implementation must have at-least-once delivery semantics.
+    /// To this end, we retry the request until it succeeds. If the pageserver
+    /// restarts or crashes, the shard import will start again from the beginning.
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
+    async fn put_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        status: ShardImportStatus,
+    ) -> Result<(), RetryForeverError> {
+        let url = self
+            .base_url
+            .join("timeline_import_status")
+            .expect("Failed to build path");
+
+        let request = PutTimelineImportStatusRequest {
+            tenant_shard_id,
+            timeline_id,
+            status,
+        };
+
+        self.retry_http_forever(&url, request).await
+    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -27,6 +27,7 @@ use self::list_writer::{DeletionOp, ListWriter, RecoverOp};
 use self::validator::Validator;
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::StorageControllerUpcallApi;
+use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -162,6 +163,11 @@ struct TenantDeletionList {
    generation: Generation,
 }

+impl TenantDeletionList {
+    pub(crate) fn len(&self) -> usize {
+        self.timelines.values().map(|v| v.len()).sum()
+    }
+}

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
@@ -461,6 +467,9 @@ impl DeletionQueueClient {
        // they may be historical.
        assert!(!current_generation.is_none());

+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(layers.len() as u64);
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
@@ -544,6 +553,9 @@ impl DeletionQueueClient {
        &self,
        objects: Vec<RemotePath>,
    ) -> Result<(), DeletionQueueError> {
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(objects.len() as u64);
        self.executor_tx
            .send(DeleterMessage::Delete(objects))
            .await
@@ -775,6 +787,15 @@ mod test {

            Ok(result)
        }
+
+        async fn put_timeline_import_status(
+            &self,
+            _tenant_shard_id: TenantShardId,
+            _timeline_id: TimelineId,
+            _status: pageserver_api::models::ShardImportStatus,
+        ) -> Result<(), RetryForeverError> {
+            unimplemented!()
+        }
    }

    async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -14,6 +14,7 @@ use tracing::{info, warn};
 use utils::{backoff, pausable_failpoint};

 use super::{DeletionQueueError, FlushOp};
+use crate::metrics;

 const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);

@@ -59,6 +60,10 @@ impl Deleter {
                fail::fail_point!("deletion-queue-before-execute", |_| {
                    info!("Skipping execution, failpoint set");

+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                });

@@ -85,6 +90,9 @@ impl Deleter {
                Ok(()) => {
                    // Note: we assume that the remote storage layer returns Ok(()) if some
                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
                    info!(
                        "Executed deletion batch {}..{}",
                        self.accumulator
@@ -101,6 +109,10 @@ impl Deleter {
                        return Err(DeletionQueueError::ShuttingDown);
                    }
                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
                }
            };
        }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -25,6 +25,7 @@ use utils::id::TimelineId;
 use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage};
 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error};
@@ -151,7 +152,7 @@ impl ListWriter {
                }
            }
            Err(e) => {
-                
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
                warn!(
                    sequence = self.pending.sequence,
                    "Failed to write deletion list, will retry later ({e:#})"
@@ -179,6 +180,7 @@ impl ListWriter {
                        // This should never happen unless we make a mistake with our serialization.
                        // Ignoring a deletion header is not consequential for correctnes because all deletions
                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        Ok(None)
                    }
                }
@@ -247,6 +249,7 @@ impl ListWriter {
                    .as_str()
            } else {
                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
                continue;
            };

@@ -254,6 +257,7 @@ impl ListWriter {
                Ok(s) => s,
                Err(e) => {
                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -282,6 +286,7 @@ impl ListWriter {
                    // Drop the list on the floor: any objects it referenced will be left behind
                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -324,6 +329,9 @@ impl ListWriter {

            // We will drop out of recovery if this fails: it indicates that we are shutting down
            // or the backend has panicked
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
            self.tx
                .send(ValidatorQueueMessage::Delete(deletion_list))
                .await?;
@@ -345,6 +353,7 @@ impl ListWriter {
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
            return;
        }

@@ -413,6 +422,7 @@ impl ListWriter {
                            tracing::error!(
                                "Failed to enqueue deletions, leaking objects.  This is a bug."
                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
                        }
                    }
                }
@@ -440,6 +450,7 @@ impl ListWriter {
                        tracing::error!(
                            "Deletion queue recovery called more than once.  This is a bug."
                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
                        continue;
                    }
@@ -451,6 +462,7 @@ impl ListWriter {
                        info!(
                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        return;
                    } else {
                        self.recovered = true;
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -26,6 +26,7 @@ use super::deleter::DeleterMessage;
 use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates};
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi};
+use crate::metrics;
 use crate::virtual_file::MaybeFatalIo;

 // After this length of time, do any validation work that is pending,
@@ -185,6 +186,7 @@ where
                    "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}",
                    tenant_lsn_state.generation
                );
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
            }
        }

@@ -219,8 +221,11 @@ where

                if !this_list_valid {
                    info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
-                } 
+                } else {
+                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
+                }
                this_list_valid
            });
            list.validated = true;
@@ -232,7 +237,7 @@ where
                    // Highly unexpected.  Could happen if e.g. disk full.
                    // If we didn't save the trimmed list, it is _not_ valid to execute.
                    warn!("Failed to save modified deletion list {list}: {e:#}");
-                    
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();

                    // Rather than have a complex retry process, just drop it and leak the objects,
                    // scrubber will clean up eventually.
@@ -271,7 +276,7 @@ where
                // The save() function logs a warning on error.
                if let Err(e) = header.save(self.conf).await {
                    warn!("Failed to write deletion queue header: {e:#}");
-                    
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                }
            }
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -56,6 +56,7 @@ use utils::completion;
 use utils::id::TimelineId;

 use crate::config::PageServerConf;
+use crate::metrics::disk_usage_based_eviction::METRICS;
 use crate::task_mgr::{self, BACKGROUND_RUNTIME};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
@@ -387,7 +388,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        }
    };

-
+    METRICS.layers_collected.inc_by(candidates.len() as u64);

    tracing::info!(
        elapsed_ms = collection_time.as_millis(),
@@ -427,7 +428,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let (evicted_amount, usage_planned) =
        select_victims(&candidates, usage_pre).into_amount_and_planned();

- 
+    METRICS.layers_selected.inc_by(evicted_amount as u64);

    // phase2: evict layers

@@ -456,6 +457,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            if let Some(next) = next {
                match next {
                    Ok(Ok(file_size)) => {
+                        METRICS.layers_evicted.inc();
                        usage_assumed.add_available_bytes(file_size);
                    }
                    Ok(Err((
@@ -786,6 +788,7 @@ async fn collect_eviction_candidates(
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
+    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
    let tenants = tenant_manager
@@ -819,7 +822,7 @@ async fn collect_eviction_candidates(
            continue;
        }

-        
+        let started_at = std::time::Instant::now();

        // collect layers from all timelines in this tenant
        //
@@ -914,11 +917,25 @@ async fn collect_eviction_candidates(
                    (partition, candidate)
                });

+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);

        candidates.extend(tenant_candidates);

-       
+        let elapsed = started_at.elapsed();
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());

+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -945,7 +962,7 @@ async fn collect_eviction_candidates(
            layer_info.resident_layers.len()
        );

-       
+        let started_at = std::time::Instant::now();

        layer_info
            .resident_layers
@@ -967,13 +984,28 @@ async fn collect_eviction_candidates(
                        candidate,
                    )
                });
+
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
        candidates.extend(tenant_candidates);

        tokio::task::yield_now().await;

-    
+        let elapsed = started_at.elapsed();

-        
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    debug_assert!(
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler(
        &ShardParameters::default(),
    );

-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

@@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler(
        &ShardParameters::default(),
    );

-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -24,6 +24,7 @@ use wal_decoder::models::InterpretedWalRecord;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::{WalIngest, WalIngestErrorKind};
@@ -323,6 +324,7 @@ async fn import_wal(
                walingest
                    .ingest_record(interpreted, &mut modification, ctx)
                    .await?;
+                WAL_INGEST.records_committed.inc();

                modification.commit(ctx).await?;
                last_lsn = lsn;
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -77,6 +77,7 @@ use anyhow::Context;
 use once_cell::sync::OnceCell;

 use crate::context::RequestContext;
+use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics};
 use crate::virtual_file::{IoBufferMut, IoPageSlice};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -194,7 +195,7 @@ impl SlotInner {
 }

 pub struct PageCache {
-    immutable_page_maps: [std::sync::RwLock<HashMap<(FileId, u32), usize>>; 16],
+    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -204,103 +205,8 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
-}

-impl PageCache {
-    /// Helper function to determine the shard index based on the low 4 bits of the u32 in the key tuple.
-    fn shard_index(_file_id: &FileId, blkno: u32) -> usize {
-        (blkno & 0xF) as usize
-    }
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Returns the slot index, if any.
-    ///
-    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
-    /// get recycled for an unrelated page immediately after this function
-    /// returns. The caller is responsible for re-checking that the slot still
-    /// contains the page with the same key before using it.
-    ///
-    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
-        match cache_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let map = self.immutable_page_maps[shard_idx].read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    ///
-    /// Remove mapping for given key.
-    ///
-    fn remove_mapping(&self, old_key: &CacheKey) {
-        match old_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-            }
-        }
-    }
-
-    ///
-    /// Insert mapping for given key.
-    ///
-    /// If a mapping already existed for the given key, returns the slot index
-    /// of the existing mapping and leaves it untouched.
-    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
-        match new_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        None
-                    }
-                }
-            }
-        }
-    }
-
-    /// Initialize a new page cache
-    ///
-    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
-        assert!(num_pages > 0, "page cache size must be > 0");
-
-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
-        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
-
-        let slots = page_buffer
-            .chunks_exact_mut(PAGE_SZ)
-            .map(|chunk| {
-                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
-                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
-
-                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        permit: std::sync::Mutex::new(Weak::new()),
-                    }),
-                    usage_count: AtomicU8::new(0),
-                }
-            })
-            .collect();
-
-        Self {
-            immutable_page_maps: Default::default(),
-            slots,
-            next_evict_slot: AtomicUsize::new(0),
-            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-        }
-    }
+    size_metrics: &'static PageCacheSizeMetrics,
 }

 struct PinnedSlotsPermit {
@@ -508,17 +414,32 @@ impl PageCache {
    async fn lock_for_read(
        &self,
        cache_key: &CacheKey,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

+        let (read_access, hit) = match cache_key {
+            CacheKey::ImmutableFilePage { .. } => (
+                &crate::metrics::PAGE_CACHE
+                    .for_ctx(ctx)
+                    .read_accesses_immutable,
+                &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
+            ),
+        };
+        read_access.inc();
+
+        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
                debug_assert!(permit.is_none());
+                if is_first_iteration {
+                    hit.inc();
+                }
                return Ok(ReadBufResult::Found(read_guard));
            }
            debug_assert!(permit.is_some());
+            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) = self
@@ -563,6 +484,63 @@ impl PageCache {
        }
    }

+    //
+    // Section 3: Mapping functions
+    //
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Returns the slot index, if any.
+    ///
+    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
+    /// get recycled for an unrelated page immediately after this function
+    /// returns. The caller is responsible for re-checking that the slot still
+    /// contains the page with the same key before using it.
+    ///
+    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
+        match cache_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let map = self.immutable_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    ///
+    /// Remove mapping for given key.
+    ///
+    fn remove_mapping(&self, old_key: &CacheKey) {
+        match old_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let mut map = self.immutable_page_map.write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
+            }
+        }
+    }
+
+    ///
+    /// Insert mapping for given key.
+    ///
+    /// If a mapping already existed for the given key, returns the slot index
+    /// of the existing mapping and leaves it untouched.
+    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
+        match new_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let mut map = self.immutable_page_map.write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
+                        None
+                    }
+                }
+            }
+        }
+    }
+
    //
    // Section 4: Misc internal helpers
    //
@@ -617,7 +595,11 @@ impl PageCache {
                            // Note that just yielding to tokio during iteration without such
                            // priority boosting is likely counter-productive. We'd just give more opportunities
                            // for B to bump usage count, further starving A.
-                            
+                            page_cache_eviction_metrics::observe(
+                                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                                    iters: iters.try_into().unwrap(),
+                                },
+                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -627,12 +609,84 @@ impl PageCache {
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
                    inner.key = None;
-                    
-                } 
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                } else {
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                }
                return Ok((slot_idx, inner));
            }
        }
    }

+    /// Initialize a new page cache
+    ///
+    /// This should be called only once at page server startup.
+    fn new(num_pages: usize) -> Self {
+        assert!(num_pages > 0, "page cache size must be > 0");
+
+        // We could use Vec::leak here, but that potentially also leaks uninitialized
+        // reserved capacity; into_boxed_slice + Box::leak avoids that.
+        // NOTE(review): the code below now uses `IoBufferMut::leak` — confirm this still holds.
+        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
+
+        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
+        size_metrics.max_bytes.set_page_sz(num_pages);
+        size_metrics.current_bytes_immutable.set_page_sz(0);
+
+        let slots = page_buffer
+            .chunks_exact_mut(PAGE_SZ)
+            .map(|chunk| {
+                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
+                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
+
+                Slot {
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
+                    usage_count: AtomicU8::new(0),
+                }
+            })
+            .collect();
+
+        Self {
+            immutable_page_map: Default::default(),
+            slots,
+            next_evict_slot: AtomicUsize::new(0),
+            size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
+        }
+    }
 }

+trait PageSzBytesMetric {
+    fn set_page_sz(&self, count: usize);
+    fn add_page_sz(&self, count: usize);
+    fn sub_page_sz(&self, count: usize);
+}
+
+#[inline(always)]
+fn count_times_page_sz(count: usize) -> u64 {
+    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
+}
+
+impl PageSzBytesMetric for metrics::UIntGauge {
+    fn set_page_sz(&self, count: usize) {
+        self.set(count_times_page_sz(count));
+    }
+    fn add_page_sz(&self, count: usize) {
+        self.add(count_times_page_sz(count));
+    }
+    fn sub_page_sz(&self, count: usize) {
+        self.sub(count_times_page_sz(count));
+    }
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -59,7 +59,8 @@ use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, GetPageBatchBreakReason, SmgrOpTimer, TimelineMetrics,
+    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
+    SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -75,7 +76,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
 use crate::{basebackup, timed_after_cancellation};

-/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
+/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`] and/or a [`crate::tenant::TenantShard`] which
 /// is not yet in state [`TenantState::Active`].
 ///
 /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
@@ -274,6 +275,9 @@ async fn page_service_conn_main(
    cancel: CancellationToken,
    gate_guard: GateGuard,
 ) -> ConnectionHandlerResult {
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["page_service"])
+        .guard();

    socket
        .set_nodelay(true)
@@ -637,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {

 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
+    timer: SmgrOpTimer,
    effective_request_lsn: Lsn,
    ctx: RequestContext,
 }
@@ -644,6 +649,7 @@ struct BatchedGetPageRequest {
 #[cfg(feature = "testing")]
 struct BatchedTestRequest {
    req: models::PagestreamTestRequest,
+    timer: SmgrOpTimer,
 }

 /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
@@ -653,13 +659,13 @@ struct BatchedTestRequest {
 enum BatchedFeMessage {
    Exists {
        span: Span,
-       
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamExistsRequest,
    },
    Nblocks {
        span: Span,
-     
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamNblocksRequest,
    },
@@ -671,13 +677,13 @@ enum BatchedFeMessage {
    },
    DbSize {
        span: Span,
-  
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamDbSizeRequest,
    },
    GetSlruSegment {
        span: Span,
-   
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
@@ -698,7 +704,27 @@ impl BatchedFeMessage {
        self.into()
    }

-    fn observe_execution_start(&mut self, _at: Instant) {
+    fn observe_execution_start(&mut self, at: Instant) {
+        match self {
+            BatchedFeMessage::Exists { timer, .. }
+            | BatchedFeMessage::Nblocks { timer, .. }
+            | BatchedFeMessage::DbSize { timer, .. }
+            | BatchedFeMessage::GetSlruSegment { timer, .. } => {
+                timer.observe_execution_start(at);
+            }
+            BatchedFeMessage::GetPage { pages, .. } => {
+                for page in pages {
+                    page.timer.observe_execution_start(at);
+                }
+            }
+            #[cfg(feature = "testing")]
+            BatchedFeMessage::Test { requests, .. } => {
+                for req in requests {
+                    req.timer.observe_execution_start(at);
+                }
+            }
+            BatchedFeMessage::RespondError { .. } => {}
+        }
    }

    fn should_break_batch(
@@ -938,7 +964,7 @@ impl PageServerHandler {
                    .await?;
                debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
                let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelExists,
                    received_at,
@@ -946,7 +972,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Exists {
                    span,
-                
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -956,7 +982,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelSize,
                    received_at,
@@ -964,7 +990,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Nblocks {
                    span,
-                   
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -974,7 +1000,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetDbSize,
                    received_at,
@@ -982,7 +1008,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::DbSize {
                    span,
-              
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -992,7 +1018,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetSlruSegment,
                    received_at,
@@ -1000,7 +1026,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::GetSlruSegment {
                    span,
-               
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -1099,7 +1125,7 @@ impl PageServerHandler {
                // request handler log messages contain the request-specific fields.
                let span = mkspan!(shard.tenant_shard_id.shard_slug());

-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetPageAtLsn,
                    received_at,
@@ -1132,6 +1158,7 @@ impl PageServerHandler {
                    shard: shard.downgrade(),
                    pages: smallvec::smallvec![BatchedGetPageRequest {
                        req,
+                        timer,
                        effective_request_lsn,
                        ctx,
                    }],
@@ -1147,12 +1174,13 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
+                let timer =
+                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
                BatchedFeMessage::Test {
                    span,
                    shard: shard.downgrade(),
-                    requests: vec![BatchedTestRequest { req,  }],
+                    requests: vec![BatchedTestRequest { req, timer }],
                }
            }
        };
@@ -1253,7 +1281,7 @@ impl PageServerHandler {

        // Dispatch the batch to the appropriate request handler.
        let log_slow_name = batch.as_static_str();
-        let (handler_results, span) = {
+        let (mut handler_results, span) = {
            // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
            // won't fit on the stack.
            let mut boxpinned =
@@ -1283,31 +1311,31 @@ impl PageServerHandler {
        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
-        // let flush_timers = {
-        //     let flushing_start_time = Instant::now();
-        //     let mut flush_timers = Vec::with_capacity(handler_results.len());
-        //     for handler_result in &mut handler_results {
-        //         let flush_timer = match handler_result {
-        //             Ok((_, timer)) => Some(
-        //                 timer
-        //                     .observe_execution_end(flushing_start_time)
-        //                     .expect("we are the first caller"),
-        //             ),
-        //             Err(_) => {
-        //                 // TODO: measure errors
-        //                 None
-        //             }
-        //         };
-        //         flush_timers.push(flush_timer);
-        //     }
-        //     assert_eq!(flush_timers.len(), handler_results.len());
-        //     flush_timers
-        // };
+        let flush_timers = {
+            let flushing_start_time = Instant::now();
+            let mut flush_timers = Vec::with_capacity(handler_results.len());
+            for handler_result in &mut handler_results {
+                let flush_timer = match handler_result {
+                    Ok((_, timer)) => Some(
+                        timer
+                            .observe_execution_end(flushing_start_time)
+                            .expect("we are the first caller"),
+                    ),
+                    Err(_) => {
+                        // TODO: measure errors
+                        None
+                    }
+                };
+                flush_timers.push(flush_timer);
+            }
+            assert_eq!(flush_timers.len(), handler_results.len());
+            flush_timers
+        };

        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        for handler_result in handler_results.into_iter() {
+        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
            let response_msg = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
@@ -1339,7 +1367,7 @@ impl PageServerHandler {
                        })
                    }
                },
-                Ok((response_msg, )) => response_msg,
+                Ok((response_msg, _op_timer_already_observed)) => response_msg,
            };

            //
@@ -1353,17 +1381,17 @@ impl PageServerHandler {
            failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);

            // what we want to do
-           
+            let socket_fd = pgb_writer.socket_fd;
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
-            // let flush_fut = match flushing_timer {
-            //     Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
-            //         Instant::now(),
-            //         flush_fut,
-            //         socket_fd,
-            //     )),
-            //     None => futures::future::Either::Right(flush_fut),
-            // };
+            let flush_fut = match flushing_timer {
+                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+                    Instant::now(),
+                    flush_fut,
+                    socket_fd,
+                )),
+                None => futures::future::Either::Right(flush_fut),
+            };
            // do it while respecting cancellation
            let _: () = async move {
                tokio::select! {
@@ -1393,7 +1421,7 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<
        (
-            Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>>,
+            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
            Span,
        ),
        QueryError,
@@ -1409,7 +1437,7 @@ impl PageServerHandler {
        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
-               
+                timer,
                shard,
                req,
            } => {
@@ -1420,7 +1448,7 @@ impl PageServerHandler {
                        self.handle_get_rel_exists_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1428,7 +1456,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::Nblocks {
                span,
-           
+                timer,
                shard,
                req,
            } => {
@@ -1439,7 +1467,7 @@ impl PageServerHandler {
                        self.handle_get_nblocks_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1475,6 +1503,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::DbSize {
                span,
+                timer,
                shard,
                req,
            } => {
@@ -1485,7 +1514,7 @@ impl PageServerHandler {
                        self.handle_db_size_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1493,6 +1522,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::GetSlruSegment {
                span,
+                timer,
                shard,
                req,
            } => {
@@ -1503,7 +1533,7 @@ impl PageServerHandler {
                        self.handle_get_slru_segment_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -2149,11 +2179,15 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
        io_concurrency: IoConcurrency,
-        _batch_break_reason: GetPageBatchBreakReason,
+        batch_break_reason: GetPageBatchBreakReason,
        ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        timeline
+            .query_metrics
+            .observe_getpage_batch_start(requests.len(), batch_break_reason);
+
        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
            let time = SystemTime::now();
@@ -2253,7 +2287,7 @@ impl PageServerHandler {
                                req: req.req,
                                page,
                            }),
-                            
+                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2298,7 +2332,7 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: Vec<BatchedTestRequest>,
        _ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage,), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        // real requests would do something with the timeline
        let mut results = Vec::with_capacity(requests.len());
        for _req in requests.iter() {
@@ -2324,6 +2358,7 @@ impl PageServerHandler {
                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
                                req: req.req.clone(),
                            }),
+                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2878,7 +2913,12 @@ where
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;
-                
+                let command_kind = match protocol_version {
+                    PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
+                    PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
+                };
+                COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
+
                self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
                    .await?;
            }
@@ -2895,7 +2935,10 @@ where

                self.check_permission(Some(tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Basebackup)
+                    .inc();
+                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
                let res = async {
                    self.handle_basebackup_request(
                        pgb,
@@ -2913,7 +2956,7 @@ where
                    Result::<(), QueryError>::Ok(())
                }
                .await;
-
+                metric_recording.observe(&res);
                res?;
            }
            // same as basebackup, but result includes relational data as well
@@ -2929,7 +2972,9 @@ where

                self.check_permission(Some(tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Fullbackup)
+                    .inc();

                // Check that the timeline exists
                self.handle_basebackup_request(
@@ -2963,7 +3008,9 @@ where

                self.check_permission(Some(tenant_shard_id.tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::LeaseLsn)
+                    .inc();

                match self
                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,13 +36,15 @@ use tracing::{debug, info, info_span, trace, warn};
 use utils::bin_ser::{BeSer, DeserializeError};
 use utils::lsn::Lsn;
 use utils::pausable_failpoint;
-use wal_decoder::serialized_batch::SerializedValueBatch ;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
 use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-
+use crate::metrics::{
+    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
+};
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -1030,16 +1032,19 @@ impl Timeline {
            )
            .await?;
        let mut result = HashMap::new();
-
+        let mut sz = 0;
        for (_, v) in kv {
            let v = v?;
            let v = aux_file::decode_file_value_bytes(&v)
                .context("value decode")
                .map_err(PageReconstructError::Other)?;
            for (fname, content) in v {
+                sz += fname.len();
+                sz += content.len();
                result.insert(fname, content);
            }
        }
+        self.aux_file_size_estimator.on_initial(sz);
        Ok(result)
    }

@@ -1310,12 +1315,12 @@ impl Timeline {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
-                
+                RELSIZE_CACHE_HITS.inc();
                return Some(*nblocks);
            }
-            
+            RELSIZE_CACHE_MISSES_OLD.inc();
        }
-       
+        RELSIZE_CACHE_MISSES.inc();
        None
    }

@@ -1340,21 +1345,25 @@ impl Timeline {
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((lsn, nblocks));
-               
+                RELSIZE_CACHE_ENTRIES.inc();
            }
        }
    }

    /// Store cached relation size
-    pub fn set_cached_rel_size(&self, _tag: RelTag, _lsn: Lsn, _nblocks: BlockNumber) {
-       
-        
+    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_CACHE_ENTRIES.inc();
+        }
    }

    /// Remove cached relation size
-    pub fn remove_cached_rel_size(&self, _tag: &RelTag) {
-        
-        
+    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.remove(tag).is_some() {
+            RELSIZE_CACHE_ENTRIES.dec();
+        }
    }
 }

@@ -1429,7 +1438,25 @@ impl DatadirModification<'_> {
            .is_some_and(|b| b.has_data())
    }

-    
+    /// Returns statistics about the currently pending modifications.
+    pub(crate) fn stats(&self) -> DatadirModificationStats {
+        let mut stats = DatadirModificationStats::default();
+        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
+            match value {
+                Value::Image(_) => stats.metadata_images += 1,
+                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
+                Value::WalRecord(_) => stats.metadata_deltas += 1,
+            }
+        }
+        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
+            match valuemeta {
+                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
+                ValueMeta::Serialized(_) => stats.data_deltas += 1,
+                ValueMeta::Observed(_) => {}
+            }
+        }
+        stats
+    }

    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
@@ -2304,15 +2331,20 @@ impl DatadirModification<'_> {
        }
        let mut new_files = other_files;
        match (modifying_file, content.is_empty()) {
-            (Some(_old_content), false) => {
-                
+            (Some(old_content), false) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_update(old_content.len(), content.len());
                new_files.push((path, content));
            }
-            (Some(_old_content), true) => {
-               
+            (Some(old_content), true) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_remove(old_content.len());
                // not adding the file key to the final `new_files` vec.
            }
            (None, false) => {
+                self.tline.aux_file_size_estimator.on_add(content.len());
                new_files.push((path, content));
            }
            // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -83,6 +83,11 @@ use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
+use crate::metrics::{
+    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
+    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+};
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
 use crate::tenant::gc_result::GcResult;
@@ -153,7 +158,7 @@ pub struct TenantSharedResources {
    pub l0_flush_global_state: L0FlushGlobalState,
 }

-/// A [`Tenant`] is really an _attached_ tenant.  The configuration
+/// A [`TenantShard`] is really an _attached_ tenant.  The configuration
 /// for an attached tenant is a subset of the [`LocationConf`], represented
 /// in this struct.
 #[derive(Clone)]
@@ -240,7 +245,7 @@ pub(crate) enum SpawnMode {
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
-pub struct Tenant {
+pub struct TenantShard {
    // Global pageserver config parameters
    pub conf: &'static PageServerConf,

@@ -262,7 +267,7 @@ pub struct Tenant {
    shard_identity: ShardIdentity,

    /// The remote storage generation, used to protect S3 objects from split-brain.
-    /// Does not change over the lifetime of the [`Tenant`] object.
+    /// Does not change over the lifetime of the [`TenantShard`] object.
    ///
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
@@ -304,7 +309,7 @@ pub struct Tenant {
    // Access to global deletion queue for when this tenant wants to schedule a deletion
    deletion_queue_client: DeletionQueueClient,

-    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
+    /// Cached logical sizes updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,

@@ -332,12 +337,12 @@ pub struct Tenant {
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,

-    // Users of the Tenant such as the page service must take this Gate to avoid
-    // trying to use a Tenant which is shutting down.
+    // Users of the TenantShard such as the page service must take this Gate to avoid
+    // trying to use a TenantShard which is shutting down.
    pub(crate) gate: Gate,

    /// Throttle applied at the top of [`Timeline::get`].
-    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
+    /// All [`TenantShard::timelines`] of a given [`TenantShard`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) pagestream_throttle: Arc<throttle::Throttle>,

    pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
@@ -357,7 +362,7 @@ pub struct Tenant {

    l0_flush_global_state: L0FlushGlobalState,
 }
-impl std::fmt::Debug for Tenant {
+impl std::fmt::Debug for TenantShard {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
    }
@@ -836,7 +841,7 @@ impl Debug for SetStoppingError {
    }
 }

-/// Arguments to [`Tenant::create_timeline`].
+/// Arguments to [`TenantShard::create_timeline`].
 ///
 /// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
 /// is `None`, the result of the timeline create call is not deterministic.
@@ -871,7 +876,7 @@ pub(crate) struct CreateTimelineParamsImportPgdata {
    pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

-/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in  [`Tenant::start_creating_timeline`] in  [`Tenant::start_creating_timeline`].
+/// What is used to determine idempotency of a [`TenantShard::create_timeline`] call in [`TenantShard::start_creating_timeline`].
 ///
 /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
 ///
@@ -909,7 +914,7 @@ pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
    idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

-/// What is returned by [`Tenant::start_creating_timeline`].
+/// What is returned by [`TenantShard::start_creating_timeline`].
 #[must_use]
 enum StartCreatingTimelineResult {
    CreateGuard(TimelineCreateGuard),
@@ -938,13 +943,13 @@ struct TimelineInitAndSyncNeedsSpawnImportPgdata {
    guard: TimelineCreateGuard,
 }

-/// What is returned by [`Tenant::create_timeline`].
+/// What is returned by [`TenantShard::create_timeline`].
 enum CreateTimelineResult {
    Created(Arc<Timeline>),
    Idempotent(Arc<Timeline>),
-    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
+    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`TenantShard::timelines`] when
    /// we return this result, nor will this concrete object ever be added there.
-    /// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
+    /// Cf method comment on [`TenantShard::create_timeline_import_pgdata`].
    ImportSpawned(Arc<Timeline>),
 }

@@ -1077,7 +1082,7 @@ pub(crate) enum LoadConfigError {
    NotFound(Utf8PathBuf),
 }

-impl Tenant {
+impl TenantShard {
    /// Yet another helper for timeline initialization.
    ///
    /// - Initializes the Timeline struct and inserts it into the tenant's hash map
@@ -1298,7 +1303,7 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
+    ) -> Result<Arc<TenantShard>, GlobalShutDown> {
        let wal_redo_manager =
            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;

@@ -1312,7 +1317,7 @@ impl Tenant {
        let attach_mode = attached_conf.location.attach_mode;
        let generation = attached_conf.location.generation;

-        let tenant = Arc::new(Tenant::new(
+        let tenant = Arc::new(TenantShard::new(
            TenantState::Attaching,
            conf,
            attached_conf,
@@ -1329,7 +1334,7 @@ impl Tenant {
        let attach_gate_guard = tenant
            .gate
            .enter()
-            .expect("We just created the Tenant: nothing else can have shut it down yet");
+            .expect("We just created the TenantShard: nothing else can have shut it down yet");

        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
@@ -1353,11 +1358,11 @@ impl Tenant {
                let starting_up = init_order.is_some();
                scopeguard::defer! {
                    if starting_up {
-                       
+                        TENANT.startup_complete.inc();
                    }
                }

-                fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
+                fn make_broken_or_stopping(t: &TenantShard, err: anyhow::Error) {
                    t.state.send_modify(|state| match state {
                        // TODO: the old code alluded to DeleteTenantFlow sometimes setting
                        // TenantState::Stopping before we get here, but this may be outdated.
@@ -1456,7 +1461,7 @@ impl Tenant {

                let preload = match &mode {
                    SpawnMode::Eager | SpawnMode::Lazy => {
-                      
+                        let _preload_timer = TENANT.preload.start_timer();
                        let res = tenant_clone
                            .preload(&remote_storage, task_mgr::shutdown_token())
                            .await;
@@ -1478,7 +1483,7 @@ impl Tenant {
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
                let attach_start = std::time::Instant::now();
                let attached = {
-                
+                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };
                let attach_duration = attach_start.elapsed();
@@ -1622,7 +1627,7 @@ impl Tenant {
    /// No background tasks are started as part of this routine.
    ///
    async fn attach(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -1952,7 +1957,7 @@ impl Tenant {
    }

    async fn load_timelines_metadata(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        timeline_ids: HashSet<TimelineId>,
        remote_storage: &GenericRemoteStorage,
        heatmap: Option<(HeatMapTenant, std::time::Instant)>,
@@ -2023,7 +2028,7 @@ impl Tenant {
    }

    fn load_timeline_metadata(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        timeline_id: TimelineId,
        remote_storage: GenericRemoteStorage,
        previous_heatmap: Option<PreviousHeatmap>,
@@ -2424,14 +2429,14 @@ impl Tenant {
    /// This is used by tests & import-from-basebackup.
    ///
    /// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
-    /// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
+    /// a state that will fail [`TenantShard::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
    ///
    /// The caller is responsible for getting the timeline into a state that will be accepted
-    /// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
+    /// by [`TenantShard::load_remote_timeline`] / [`TenantShard::attach`].
    /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
-    /// to the [`Tenant::timelines`].
+    /// to the [`TenantShard::timelines`].
    ///
-    /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
+    /// Tests should use `TenantShard::create_test_timeline` to set up the minimum required metadata keys.
    pub(crate) async fn create_empty_timeline(
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
@@ -2579,7 +2584,7 @@ impl Tenant {
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        params: CreateTimelineParams,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
@@ -2746,13 +2751,13 @@ impl Tenant {
        Ok(activated_timeline)
    }

-    /// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
+    /// The returned [`Arc<Timeline>`] is NOT in the [`TenantShard::timelines`] map until the import
    /// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
-    /// [`Tenant::timelines`] map when the import completes.
+    /// [`TenantShard::timelines`] map when the import completes.
    /// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
    /// for the response.
    async fn create_timeline_import_pgdata(
-        self: &Arc<Tenant>,
+        self: &Arc<Self>,
        params: CreateTimelineParamsImportPgdata,
        activate: ActivateTimelineArgs,
        ctx: &RequestContext,
@@ -2849,7 +2854,7 @@ impl Tenant {

    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
    async fn create_timeline_import_pgdata_task(
-        self: Arc<Tenant>,
+        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
@@ -2877,7 +2882,7 @@ impl Tenant {
    }

    async fn create_timeline_import_pgdata_task_impl(
-        self: Arc<Tenant>,
+        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
@@ -2894,10 +2899,10 @@ impl Tenant {
        // Reload timeline from remote.
        // This proves that the remote state is attachable, and it reuses the code.
        //
-        // TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
+        // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown.
        // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
-        // But our activate() call might launch new background tasks after Tenant::shutdown
-        // already went past shutting down the Tenant::timelines, which this timeline here is no part of.
+        // But our activate() call might launch new background tasks after TenantShard::shutdown
+        // already went past shutting down the TenantShard::timelines, which this timeline here is no part of.
        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
        // down while bootstrapping/branching + activating), but, the race condition is much more likely
        // to manifest because of the long runtime of this import task.
@@ -2912,7 +2917,7 @@ impl Tenant {
        // };
        let timeline_id = timeline.timeline_id;

-        // load from object storage like Tenant::attach does
+        // load from object storage like TenantShard::attach does
        let resources = self.build_timeline_resources(timeline_id);
        let index_part = resources
            .remote_client
@@ -3180,7 +3185,7 @@ impl Tenant {
        self.compaction_circuit_breaker
            .lock()
            .unwrap()
-            .success();
+            .success(&CIRCUIT_BREAKERS_UNBROKEN);

        match has_pending {
            true => Ok(CompactionOutcome::Pending),
@@ -3201,13 +3206,13 @@ impl Tenant {
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
-                    .fail( err);
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
            }
            CompactionError::Other(err) => {
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
-                    .fail( err);
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
            }
            CompactionError::AlreadyRunning(_) => {}
        }
@@ -3387,7 +3392,7 @@ impl Tenant {
                    "activation attempt finished"
                );

-              
+                TENANT.activation.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -3512,6 +3517,7 @@ impl Tenant {
        // Wait for any in-flight operations to complete
        self.gate.close().await;

+        remove_tenant_metrics(&self.tenant_shard_id);

        Ok(())
    }
@@ -3810,6 +3816,24 @@ impl Tenant {
                MaybeDeletedIndexPart::IndexPart(p) => p,
            };

+            // A shard split may not take place while a timeline import is on-going
+            // for the tenant. Timeline imports run as part of each tenant shard
+            // and rely on the sharding scheme to split the work among pageservers.
+            // If we were to split in the middle of this process, we would have to
+            // either ensure that it's driven to completion on the old shard set
+            // or transfer it to the new shard set. It's technically possible, but complex.
+            match index_part.import_pgdata {
+                Some(ref import) if !import.is_done() => {
+                    anyhow::bail!(
+                        "Cannot split due to import with idempotency key: {:?}",
+                        import.idempotency_key()
+                    );
+                }
+                Some(_) | None => {
+                    // fallthrough
+                }
+            }
+
            for child_shard in child_shards {
                tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index());
                upload_index_part(
@@ -3844,13 +3868,33 @@ impl Tenant {
    }

    pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
-         TopTenantShardItem {
+        let mut result = TopTenantShardItem {
            id: self.tenant_shard_id,
            resident_size: 0,
            physical_size: 0,
            max_logical_size: 0,
            max_logical_size_per_shard: 0,
+        };
+
+        for timeline in self.timelines.lock().unwrap().values() {
+            result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
+
+            result.physical_size += timeline
+                .remote_client
+                .metrics
+                .remote_physical_size_gauge
+                .get();
+            result.max_logical_size = std::cmp::max(
+                result.max_logical_size,
+                timeline.metrics.current_logical_size_gauge.get(),
+            );
        }
+
+        result.max_logical_size_per_shard = result
+            .max_logical_size
+            .div_ceil(self.tenant_shard_id.shard_count.count() as u64);
+
+        result
    }
 }

@@ -3912,7 +3956,7 @@ enum ActivateTimelineArgs {
    No,
 }

-impl Tenant {
+impl TenantShard {
    pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
        self.tenant_conf.load().tenant_conf.clone()
    }
@@ -4070,7 +4114,7 @@ impl Tenant {
        update: F,
    ) -> anyhow::Result<pageserver_api::models::TenantConfig> {
        // Use read-copy-update in order to avoid overwriting the location config
-        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // state if this races with [`TenantShard::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.
@@ -4193,7 +4237,7 @@ impl Tenant {
        Ok((timeline, timeline_ctx))
    }

-    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
+    /// [`TenantShard::shutdown`] must be called before dropping the returned [`TenantShard`] object
    /// to ensure proper cleanup of background tasks and metrics.
    //
    // Allow too_many_arguments because a constructor's argument list naturally grows with the
@@ -4209,7 +4253,7 @@ impl Tenant {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
-    ) -> Tenant {
+    ) -> TenantShard {
        debug_assert!(
            !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
        );
@@ -4217,19 +4261,59 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
-            
-            loop {
+            // reflect tenant state in metrics:
+            // - global per tenant state: TENANT_STATE_METRIC
+            // - "set" of broken tenants: BROKEN_TENANTS_SET
+            //
+            // set of broken tenants should not have zero counts so that it remains accessible for
+            // alerting.

+            let tid = tenant_shard_id.to_string();
+            let shard_id = tenant_shard_id.shard_slug().to_string();
+            let set_key = &[tid.as_str(), shard_id.as_str()][..];
+
+            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
+                ([state.into()], matches!(state, TenantState::Broken { .. }))
+            }
+
+            let mut tuple = inspect_state(&rx.borrow_and_update());
+
+            let is_broken = tuple.1;
+            let mut counted_broken = if is_broken {
+                // add the id to the set right away; there should not be any further updates on the
+                // channel before the tenant is removed, if ever
+                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                true
+            } else {
+                false
+            };
+
+            loop {
+                let labels = &tuple.0;
+                let current = TENANT_STATE_METRIC.with_label_values(labels);
+                current.inc();

                if rx.changed().await.is_err() {
-                   
+                    // tenant has been dropped
+                    current.dec();
+                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                    break;
                }

+                current.dec();
+                tuple = inspect_state(&rx.borrow_and_update());
+
+                let is_broken = tuple.1;
+                if is_broken && !counted_broken {
+                    counted_broken = true;
+                    // insert the tenant_id (back) into the set while avoiding needless counter
+                    // access
+                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                }
            }
        });

-        Tenant {
+        TenantShard {
            tenant_shard_id,
            shard_identity,
            generation: attached_conf.location.generation,
@@ -4264,7 +4348,7 @@ impl Tenant {
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            pagestream_throttle: Arc::new(throttle::Throttle::new(
-                Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
+                TenantShard::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
            )),
            pagestream_throttle_metrics: Arc::new(
                crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
@@ -4400,11 +4484,11 @@ impl Tenant {

        // Perform GC for each timeline.
        //
-        // Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the
+        // Note that we don't hold the `TenantShard::gc_cs` lock here because we don't want to delay the
        // branch creation task, which requires the GC lock. A GC iteration can run concurrently
        // with branch creation.
        //
-        // See comments in [`Tenant::branch_timeline`] for more information about why branch
+        // See comments in [`TenantShard::branch_timeline`] for more information about why branch
        // creation task can run concurrently with timeline's GC iteration.
        for timeline in gc_timelines {
            if cancel.is_cancelled() {
@@ -4434,7 +4518,7 @@ impl Tenant {

    /// Refreshes the Timeline::gc_info for all timelines, returning the
    /// vector of timelines which have [`Timeline::get_last_record_lsn`] past
-    /// [`Tenant::get_gc_horizon`].
+    /// [`TenantShard::get_gc_horizon`].
    ///
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
    pub(crate) async fn refresh_gc_info(
@@ -4600,6 +4684,10 @@ impl Tenant {
                let now = SystemTime::now();
                target.leases.retain(|_, lease| !lease.is_expired(&now));

+                timeline
+                    .metrics
+                    .valid_lsn_lease_count_gauge
+                    .set(target.leases.len() as u64);

                // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
@@ -4609,6 +4697,22 @@ impl Tenant {
                    }
                }

+                // Update metrics that depend on GC state
+                timeline
+                    .metrics
+                    .archival_size
+                    .set(if target.within_ancestor_pitr {
+                        timeline.metrics.current_logical_size_gauge.get()
+                    } else {
+                        0
+                    });
+                timeline.metrics.pitr_history_size.set(
+                    timeline
+                        .get_last_record_lsn()
+                        .checked_sub(target.cutoffs.time)
+                        .unwrap_or(Lsn(0))
+                        .0,
+                );

                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                // - this timeline was created while we were finding cutoffs
@@ -5358,6 +5462,10 @@ impl Tenant {
        // Only shard zero should be calculating synthetic sizes
        debug_assert!(self.shard_identity.is_shard_zero());

+        TENANT_SYNTHETIC_SIZE_METRIC
+            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
+            .unwrap()
+            .set(size);
    }

    pub fn cached_synthetic_size(&self) -> u64 {
@@ -5409,7 +5517,7 @@ impl Tenant {
            }
        }

-        // The flushes we did above were just writes, but the Tenant might have had
+        // The flushes we did above were just writes, but the TenantShard might have had
        // pending deletions as well from recent compaction/gc: we want to flush those
        // as well.  This requires flushing the global delete queue.  This is cheap
        // because it's typically a no-op.
@@ -5427,25 +5535,34 @@ impl Tenant {

    /// How much local storage would this tenant like to have?  It can cope with
    /// less than this (via eviction and on-demand downloads), but this function enables
-    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
+    /// the TenantShard to advertise how much storage it would prefer to have to provide fast I/O
    /// by keeping important things on local disk.
    ///
    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
    /// than they report here, due to layer eviction.  Tenants with many active branches may
    /// actually use more than they report here.
    pub(crate) fn local_storage_wanted(&self) -> u64 {
-        1000
+        let timelines = self.timelines.lock().unwrap();
+
+        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
+        // reflects the observation that on tenants with multiple large branches, typically only one
+        // of them is used actively enough to occupy space on disk.
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .max()
+            .unwrap_or(0)
    }

    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
    /// manifest in `Self::remote_tenant_manifest`.
    ///
    /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
-    /// changing any `Tenant` state that's included in the manifest, consider making the manifest
+    /// changing any `TenantShard` state that's included in the manifest, consider making the manifest
    /// the authoritative source of data with an API that automatically uploads on changes. Revisit
    /// this when the manifest is more widely used and we have a better idea of the data model.
    pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
-        // Multiple tasks may call this function concurrently after mutating the Tenant runtime
+        // Multiple tasks may call this function concurrently after mutating the TenantShard runtime
        // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
        // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
        // simple coalescing mechanism.
@@ -5518,11 +5635,16 @@ async fn run_initdb(
    );

    let _permit = {
-       
+        let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
        INIT_DB_SEMAPHORE.acquire().await
    };

+    CONCURRENT_INITDBS.inc();
+    scopeguard::defer! {
+        CONCURRENT_INITDBS.dec();
+    }

+    let _timer = INITDB_RUN_TIME.start_timer();
    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
        superuser: &conf.superuser,
        locale: &conf.locale,
@@ -5708,7 +5830,7 @@ pub(crate) mod harness {
            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }

-        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+        pub(crate) async fn load(&self) -> (Arc<TenantShard>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
                .with_scope_unit_test();
            (
@@ -5723,10 +5845,10 @@ pub(crate) mod harness {
        pub(crate) async fn do_try_load(
            &self,
            ctx: &RequestContext,
-        ) -> anyhow::Result<Arc<Tenant>> {
+        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

-            let tenant = Arc::new(Tenant::new(
+            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
                self.conf,
                AttachedTenantConf::try_from(LocationConf::attached_single(
@@ -5942,7 +6064,7 @@ mod tests {
    #[cfg(feature = "testing")]
    #[allow(clippy::too_many_arguments)]
    async fn randomize_timeline(
-        tenant: &Arc<Tenant>,
+        tenant: &Arc<TenantShard>,
        new_timeline_id: TimelineId,
        pg_version: u32,
        spec: TestTimelineSpecification,
@@ -6832,7 +6954,7 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        lsn: Lsn,
@@ -6844,7 +6966,7 @@ mod tests {
    }

    async fn bulk_insert_maybe_compact_gc(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
@@ -7754,7 +7876,7 @@ mod tests {
            let (tline, _ctx) = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                .await?;
-            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
+            // Leave the timeline ID in [`TenantShard::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
                .shutdown(super::timeline::ShutdownMode::Hard)
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -15,21 +15,23 @@
 //! len >= 128: 1CCCXXXX XXXXXXXX XXXXXXXX XXXXXXXX
 //!
 use std::cmp::min;
-use std::io::Error;

+use anyhow::Context;
 use async_compression::Level;
 use bytes::{BufMut, BytesMut};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use tokio::io::AsyncWriteExt;
-use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice};
+use tokio_epoll_uring::IoBuf;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;

 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::IoBufferMut;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
+use crate::virtual_file::owned_buffers_io::write::{BufferedWriter, FlushTaskError};
+use crate::virtual_file::owned_buffers_io::write::{BufferedWriterShutdownMode, OwnedAsyncWriter};

 #[derive(Copy, Clone, Debug)]
 pub struct CompressionInfo {
@@ -50,12 +52,9 @@ pub struct Header {

 impl Header {
    /// Decodes a header from a byte slice.
-    pub fn decode(bytes: &[u8]) -> Result<Self, std::io::Error> {
+    pub fn decode(bytes: &[u8]) -> anyhow::Result<Self> {
        let Some(&first_header_byte) = bytes.first() else {
-            return Err(std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                "zero-length blob header",
-            ));
+            anyhow::bail!("zero-length blob header");
        };

        // If the first bit is 0, this is just a 1-byte length prefix up to 128 bytes.
@@ -69,12 +68,13 @@ impl Header {

        // Otherwise, this is a 4-byte header containing compression information and length.
        const HEADER_LEN: usize = 4;
-        let mut header_buf: [u8; HEADER_LEN] = bytes[0..HEADER_LEN].try_into().map_err(|_| {
-            std::io::Error::new(
-                std::io::ErrorKind::InvalidData,
-                format!("blob header too short: {bytes:?}"),
-            )
-        })?;
+        // Take the prefix with a checked `get` rather than indexing: `bytes[0..HEADER_LEN]`
+        // panics on inputs shorter than the header, which would make the "too short" error
+        // below unreachable.
+        let mut header_buf: [u8; HEADER_LEN] = bytes
+            .get(..HEADER_LEN)
+            .and_then(|prefix| prefix.try_into().ok())
+            .ok_or_else(|| anyhow::anyhow!("blob header too short: {bytes:?}"))?;

        // TODO: verify the compression bits and convert to an enum.
        let compression_bits = header_buf[0] & LEN_COMPRESSION_BIT_MASK;
@@ -94,6 +90,16 @@ impl Header {
    }
 }

+#[derive(Debug, thiserror::Error)]
+pub enum WriteBlobError {
+    #[error(transparent)]
+    Flush(FlushTaskError),
+    #[error("blob too large ({len} bytes)")]
+    BlobTooLarge { len: usize },
+    #[error(transparent)]
+    WriteBlobRaw(anyhow::Error),
+}
+
 impl BlockCursor<'_> {
    /// Read a blob into a new buffer.
    pub async fn read_blob(
@@ -213,143 +219,64 @@ pub(super) const BYTE_UNCOMPRESSED: u8 = 0x80;
 pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;

-/// A wrapper of `VirtualFile` that allows users to write blobs.
+/// A wrapper around a [`BufferedWriter`] that allows users to write blobs.
-///
-/// If a `BlobWriter` is dropped, the internal buffer will be
-/// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
-/// manually before dropping.
-pub struct BlobWriter<const BUFFERED: bool> {
-    inner: VirtualFile,
-    offset: u64,
-    /// A buffer to save on write calls, only used if BUFFERED=true
-    buf: Vec<u8>,
+pub struct BlobWriter<W> {
    /// We do tiny writes for the length headers; they need to be in an owned buffer;
    io_buf: Option<BytesMut>,
+    writer: BufferedWriter<IoBufferMut, W>,
+    offset: u64,
 }

-impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
+impl<W> BlobWriter<W>
+where
+    W: OwnedAsyncWriter + std::fmt::Debug + Send + Sync + 'static,
+{
+    /// See [`BufferedWriter`] struct-level doc comment for semantics of `start_offset`.
    pub fn new(
-        inner: VirtualFile,
+        file: W,
        start_offset: u64,
-        _gate: &utils::sync::gate::Gate,
-        _cancel: CancellationToken,
-        _ctx: &RequestContext,
-    ) -> Self {
-        Self {
-            inner,
-            offset: start_offset,
-            buf: Vec::with_capacity(Self::CAPACITY),
+        gate: &utils::sync::gate::Gate,
+        cancel: CancellationToken,
+        ctx: &RequestContext,
+        flush_task_span: tracing::Span,
+    ) -> anyhow::Result<Self> {
+        Ok(Self {
            io_buf: Some(BytesMut::new()),
-        }
+            writer: BufferedWriter::new(
+                file,
+                start_offset,
+                || IoBufferMut::with_capacity(Self::CAPACITY),
+                gate.enter()?,
+                cancel,
+                ctx,
+                flush_task_span,
+            ),
+            offset: start_offset,
+        })
    }

    pub fn size(&self) -> u64 {
        self.offset
    }

-    const CAPACITY: usize = if BUFFERED { 64 * 1024 } else { 0 };
+    const CAPACITY: usize = 64 * 1024;

-    /// Writes the given buffer directly to the underlying `VirtualFile`.
-    /// You need to make sure that the internal buffer is empty, otherwise
-    /// data will be written in wrong order.
-    #[inline(always)]
-    async fn write_all_unbuffered<Buf: IoBuf + Send>(
-        &mut self,
-        src_buf: FullSlice<Buf>,
-        ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, Result<(), Error>) {
-        let (src_buf, res) = self.inner.write_all(src_buf, ctx).await;
-        let nbytes = match res {
-            Ok(nbytes) => nbytes,
-            Err(e) => return (src_buf, Err(e)),
-        };
-        self.offset += nbytes as u64;
-        (src_buf, Ok(()))
-    }
-
-    #[inline(always)]
-    /// Flushes the internal buffer to the underlying `VirtualFile`.
-    pub async fn flush_buffer(&mut self, ctx: &RequestContext) -> Result<(), Error> {
-        let buf = std::mem::take(&mut self.buf);
-        let (slice, res) = self.inner.write_all(buf.slice_len(), ctx).await;
-        res?;
-        let mut buf = slice.into_raw_slice().into_inner();
-        buf.clear();
-        self.buf = buf;
-        Ok(())
-    }
-
-    #[inline(always)]
-    /// Writes as much of `src_buf` into the internal buffer as it fits
-    fn write_into_buffer(&mut self, src_buf: &[u8]) -> usize {
-        let remaining = Self::CAPACITY - self.buf.len();
-        let to_copy = src_buf.len().min(remaining);
-        self.buf.extend_from_slice(&src_buf[..to_copy]);
-        self.offset += to_copy as u64;
-        to_copy
-    }
-
-    /// Internal, possibly buffered, write function
+    /// Writes `src_buf` to the file at the current offset.
    async fn write_all<Buf: IoBuf + Send>(
        &mut self,
        src_buf: FullSlice<Buf>,
        ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, Result<(), Error>) {
-        let src_buf = src_buf.into_raw_slice();
-        let src_buf_bounds = src_buf.bounds();
-        let restore = move |src_buf_slice: Slice<_>| {
-            FullSlice::must_new(Slice::from_buf_bounds(
-                src_buf_slice.into_inner(),
-                src_buf_bounds,
-            ))
-        };
+    ) -> (FullSlice<Buf>, Result<(), FlushTaskError>) {
+        let res = self
+            .writer
+            // TODO: why are we taking a FullSlice if we're going to pass a borrow downstack?
+            // Can remove all the complexity around owned buffers upstack
+            .write_buffered_borrowed(&src_buf, ctx)
+            .await
+            .map(|len| {
+                self.offset += len as u64;
+            });

-        if !BUFFERED {
-            assert!(self.buf.is_empty());
-            return self
-                .write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
-                .await;
-        }
-        let remaining = Self::CAPACITY - self.buf.len();
-        let src_buf_len = src_buf.bytes_init();
-        if src_buf_len == 0 {
-            return (restore(src_buf), Ok(()));
-        }
-        let mut src_buf = src_buf.slice(0..src_buf_len);
-        // First try to copy as much as we can into the buffer
-        if remaining > 0 {
-            let copied = self.write_into_buffer(&src_buf);
-            src_buf = src_buf.slice(copied..);
-        }
-        // Then, if the buffer is full, flush it out
-        if self.buf.len() == Self::CAPACITY {
-            if let Err(e) = self.flush_buffer(ctx).await {
-                return (restore(src_buf), Err(e));
-            }
-        }
-        // Finally, write the tail of src_buf:
-        // If it wholly fits into the buffer without
-        // completely filling it, then put it there.
-        // If not, write it out directly.
-        let src_buf = if !src_buf.is_empty() {
-            assert_eq!(self.buf.len(), 0);
-            if src_buf.len() < Self::CAPACITY {
-                let copied = self.write_into_buffer(&src_buf);
-                // We just verified above that src_buf fits into our internal buffer.
-                assert_eq!(copied, src_buf.len());
-                restore(src_buf)
-            } else {
-                let (src_buf, res) = self
-                    .write_all_unbuffered(FullSlice::must_new(src_buf), ctx)
-                    .await;
-                if let Err(e) = res {
-                    return (src_buf, Err(e));
-                }
-                src_buf
-            }
-        } else {
-            restore(src_buf)
-        };
-        (src_buf, Ok(()))
+        (src_buf, res)
    }

    /// Write a blob of data. Returns the offset that it was written to,
@@ -358,7 +285,7 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        &mut self,
        srcbuf: FullSlice<Buf>,
        ctx: &RequestContext,
-    ) -> (FullSlice<Buf>, Result<u64, Error>) {
+    ) -> (FullSlice<Buf>, Result<u64, WriteBlobError>) {
        let (buf, res) = self
            .write_blob_maybe_compressed(srcbuf, ctx, ImageCompressionAlgorithm::Disabled)
            .await;
@@ -372,7 +299,10 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        srcbuf: FullSlice<Buf>,
        ctx: &RequestContext,
        algorithm: ImageCompressionAlgorithm,
-    ) -> (FullSlice<Buf>, Result<(u64, CompressionInfo), Error>) {
+    ) -> (
+        FullSlice<Buf>,
+        Result<(u64, CompressionInfo), WriteBlobError>,
+    ) {
        let offset = self.offset;
        let mut compression_info = CompressionInfo {
            written_compressed: false,
@@ -388,14 +318,16 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
            if len < 128 {
                // Short blob. Write a 1-byte length header
                io_buf.put_u8(len as u8);
-                (self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
+                let (slice, res) = self.write_all(io_buf.slice_len(), ctx).await;
+                let res = res.map_err(WriteBlobError::Flush);
+                ((slice, res), srcbuf)
            } else {
                // Write a 4-byte length header
                if len > MAX_SUPPORTED_BLOB_LEN {
                    return (
                        (
                            io_buf.slice_len(),
-                            Err(Error::other(format!("blob too large ({len} bytes)"))),
+                            Err(WriteBlobError::BlobTooLarge { len }),
                        ),
                        srcbuf,
                    );
@@ -429,7 +361,9 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
                assert_eq!(len_buf[0] & 0xf0, 0);
                len_buf[0] |= high_bit_mask;
                io_buf.extend_from_slice(&len_buf[..]);
-                (self.write_all(io_buf.slice_len(), ctx).await, srcbuf)
+                let (slice, res) = self.write_all(io_buf.slice_len(), ctx).await;
+                let res = res.map_err(WriteBlobError::Flush);
+                ((slice, res), srcbuf)
            }
        }
        .await;
@@ -444,33 +378,49 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        } else {
            self.write_all(srcbuf, ctx).await
        };
+        let res = res.map_err(WriteBlobError::Flush);
        (srcbuf, res.map(|_| (offset, compression_info)))
    }
-}

-impl BlobWriter<true> {
-    /// Access the underlying `VirtualFile`.
-    ///
-    /// This function flushes the internal buffer before giving access
-    /// to the underlying `VirtualFile`.
-    pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<VirtualFile, Error> {
-        self.flush_buffer(ctx).await?;
-        Ok(self.inner)
+    /// Writes a raw blob containing both header and data, returning its offset.
+    pub(crate) async fn write_blob_raw<Buf: IoBuf + Send>(
+        &mut self,
+        raw_with_header: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<u64, WriteBlobError>) {
+        // Verify the header, to ensure we don't write invalid/corrupt data.
+        let header = match Header::decode(&raw_with_header)
+            .context("decoding blob header")
+            .map_err(WriteBlobError::WriteBlobRaw)
+        {
+            Ok(header) => header,
+            Err(err) => return (raw_with_header, Err(err)),
+        };
+        if raw_with_header.len() != header.total_len() {
+            let header_total_len = header.total_len();
+            let raw_len = raw_with_header.len();
+            return (
+                raw_with_header,
+                Err(WriteBlobError::WriteBlobRaw(anyhow::anyhow!(
+                    "header length mismatch: {header_total_len} != {raw_len}"
+                ))),
+            );
+        }
+
+        let offset = self.offset;
+        let (raw_with_header, result) = self.write_all(raw_with_header, ctx).await;
+        let result = result.map_err(WriteBlobError::Flush);
+        (raw_with_header, result.map(|_| offset))
    }

-    /// Access the underlying `VirtualFile`.
-    ///
-    /// Unlike [`into_inner`](Self::into_inner), this doesn't flush
-    /// the internal buffer before giving access.
-    pub fn into_inner_no_flush(self) -> VirtualFile {
-        self.inner
-    }
-}
-
-impl BlobWriter<false> {
-    /// Access the underlying `VirtualFile`.
-    pub fn into_inner(self) -> VirtualFile {
-        self.inner
+    /// Finish this blob writer and return the underlying `W`.
+    pub async fn shutdown(
+        self,
+        mode: BufferedWriterShutdownMode,
+        ctx: &RequestContext,
+    ) -> Result<W, FlushTaskError> {
+        let (_, file) = self.writer.shutdown(mode, ctx).await?;
+        Ok(file)
    }
 }

@@ -479,21 +429,25 @@ pub(crate) mod tests {
    use camino::Utf8PathBuf;
    use camino_tempfile::Utf8TempDir;
    use rand::{Rng, SeedableRng};
+    use tracing::info_span;

    use super::*;
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::BlockReaderRef;
+    use crate::virtual_file;
+    use crate::virtual_file::TempVirtualFile;
+    use crate::virtual_file::VirtualFile;

-    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
-        round_trip_test_compressed::<BUFFERED>(blobs, false).await
+    async fn round_trip_test(blobs: &[Vec<u8>]) -> anyhow::Result<()> {
+        round_trip_test_compressed(blobs, false).await
    }

-    pub(crate) async fn write_maybe_compressed<const BUFFERED: bool>(
+    pub(crate) async fn write_maybe_compressed(
        blobs: &[Vec<u8>],
        compression: bool,
        ctx: &RequestContext,
-    ) -> Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>), Error> {
+    ) -> anyhow::Result<(Utf8TempDir, Utf8PathBuf, Vec<u64>)> {
        let temp_dir = camino_tempfile::tempdir()?;
        let pathbuf = temp_dir.path().join("file");
        let gate = utils::sync::gate::Gate::default();
@@ -502,8 +456,19 @@ pub(crate) mod tests {
        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
-            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
+            let file = TempVirtualFile::new(
+                VirtualFile::open_with_options_v2(
+                    pathbuf.as_path(),
+                    virtual_file::OpenOptions::new()
+                        .create_new(true)
+                        .write(true),
+                    ctx,
+                )
+                .await?,
+                gate.enter()?,
+            );
+            let mut wtr =
+                BlobWriter::new(file, 0, &gate, cancel.clone(), ctx, info_span!("test")).unwrap();
            for blob in blobs.iter() {
                let (_, res) = if compression {
                    let res = wtr
@@ -520,26 +485,28 @@ pub(crate) mod tests {
                let offs = res?;
                offsets.push(offs);
            }
-            // Write out one page worth of zeros so that we can
-            // read again with read_blk
-            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await;
-            let offs = res?;
-            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(ctx).await?;
-        }
+            let file = wtr
+                .shutdown(
+                    BufferedWriterShutdownMode::ZeroPadToNextMultiple(PAGE_SZ),
+                    ctx,
+                )
+                .await?;
+            file.disarm_into_inner()
+        };
        Ok((temp_dir, pathbuf, offsets))
    }

-    async fn round_trip_test_compressed<const BUFFERED: bool>(
+    async fn round_trip_test_compressed(
        blobs: &[Vec<u8>],
        compression: bool,
-    ) -> Result<(), Error> {
+    ) -> anyhow::Result<()> {
        let ctx =
            RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error).with_scope_unit_test();
        let (_temp_dir, pathbuf, offsets) =
-            write_maybe_compressed::<BUFFERED>(blobs, compression, &ctx).await?;
+            write_maybe_compressed(blobs, compression, &ctx).await?;

-        let file = VirtualFile::open(pathbuf, &ctx).await?;
+        println!("Done writing!");
+        let file = VirtualFile::open_v2(pathbuf, &ctx).await?;
        let rdr = BlockReaderRef::VirtualFile(&file);
        let rdr = BlockCursor::new_with_compression(rdr, compression);
        for (idx, (blob, offset)) in blobs.iter().zip(offsets.iter()).enumerate() {
@@ -558,30 +525,27 @@ pub(crate) mod tests {
    }

    #[tokio::test]
-    async fn test_one() -> Result<(), Error> {
+    async fn test_one() -> anyhow::Result<()> {
        let blobs = &[vec![12, 21, 22]];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
+        round_trip_test(blobs).await?;
        Ok(())
    }

    #[tokio::test]
-    async fn test_hello_simple() -> Result<(), Error> {
+    async fn test_hello_simple() -> anyhow::Result<()> {
        let blobs = &[
            vec![0, 1, 2, 3],
            b"Hello, World!".to_vec(),
            Vec::new(),
            b"foobar".to_vec(),
        ];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
-        round_trip_test_compressed::<false>(blobs, true).await?;
-        round_trip_test_compressed::<true>(blobs, true).await?;
+        round_trip_test(blobs).await?;
+        round_trip_test_compressed(blobs, true).await?;
        Ok(())
    }

    #[tokio::test]
-    async fn test_really_big_array() -> Result<(), Error> {
+    async fn test_really_big_array() -> anyhow::Result<()> {
        let blobs = &[
            b"test".to_vec(),
            random_array(10 * PAGE_SZ),
@@ -590,25 +554,22 @@ pub(crate) mod tests {
            vec![0xf3; 24 * PAGE_SZ],
            b"foobar".to_vec(),
        ];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
-        round_trip_test_compressed::<false>(blobs, true).await?;
-        round_trip_test_compressed::<true>(blobs, true).await?;
+        round_trip_test(blobs).await?;
+        round_trip_test_compressed(blobs, true).await?;
        Ok(())
    }

    #[tokio::test]
-    async fn test_arrays_inc() -> Result<(), Error> {
+    async fn test_arrays_inc() -> anyhow::Result<()> {
        let blobs = (0..PAGE_SZ / 8)
            .map(|v| random_array(v * 16))
            .collect::<Vec<_>>();
-        round_trip_test::<false>(&blobs).await?;
-        round_trip_test::<true>(&blobs).await?;
+        round_trip_test(&blobs).await?;
        Ok(())
    }

    #[tokio::test]
-    async fn test_arrays_random_size() -> Result<(), Error> {
+    async fn test_arrays_random_size() -> anyhow::Result<()> {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
        let blobs = (0..1024)
            .map(|_| {
@@ -620,20 +581,18 @@ pub(crate) mod tests {
                random_array(sz.into())
            })
            .collect::<Vec<_>>();
-        round_trip_test::<false>(&blobs).await?;
-        round_trip_test::<true>(&blobs).await?;
+        round_trip_test(&blobs).await?;
        Ok(())
    }

    #[tokio::test]
-    async fn test_arrays_page_boundary() -> Result<(), Error> {
+    async fn test_arrays_page_boundary() -> anyhow::Result<()> {
        let blobs = &[
            random_array(PAGE_SZ - 4),
            random_array(PAGE_SZ - 4),
            random_array(PAGE_SZ - 4),
        ];
-        round_trip_test::<false>(blobs).await?;
-        round_trip_test::<true>(blobs).await?;
+        round_trip_test(blobs).await?;
        Ok(())
    }
 }
--- a/pageserver/src/tenant/block_io.rs
+++ b/pageserver/src/tenant/block_io.rs
@@ -4,14 +4,12 @@

 use std::ops::Deref;

-use bytes::Bytes;
-
 use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner};
 use crate::context::RequestContext;
 use crate::page_cache::{self, FileId, PAGE_SZ, PageReadGuard, PageWriteGuard, ReadBufResult};
 #[cfg(test)]
 use crate::virtual_file::IoBufferMut;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::{IoBuffer, VirtualFile};

 /// This is implemented by anything that can read 8 kB (PAGE_SZ)
 /// blocks, using the page cache
@@ -247,17 +245,17 @@ pub trait BlockWriter {
    /// 'buf' must be of size PAGE_SZ. Returns the block number the page was
    /// written to.
    ///
-    fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error>;
+    fn write_blk(&mut self, buf: IoBuffer) -> Result<u32, std::io::Error>;
 }

 ///
 /// A simple in-memory buffer of blocks.
 ///
 pub struct BlockBuf {
-    pub blocks: Vec<Bytes>,
+    pub blocks: Vec<IoBuffer>,
 }
 impl BlockWriter for BlockBuf {
-    fn write_blk(&mut self, buf: Bytes) -> Result<u32, std::io::Error> {
+    fn write_blk(&mut self, buf: IoBuffer) -> Result<u32, std::io::Error> {
        assert!(buf.len() == PAGE_SZ);
        let blknum = self.blocks.len();
        self.blocks.push(buf);
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -25,7 +25,7 @@ use std::{io, result};

 use async_stream::try_stream;
 use byteorder::{BE, ReadBytesExt};
-use bytes::{BufMut, Bytes, BytesMut};
+use bytes::BufMut;
 use either::Either;
 use futures::{Stream, StreamExt};
 use hex;
@@ -34,6 +34,7 @@ use tracing::error;

 use crate::context::RequestContext;
 use crate::tenant::block_io::{BlockReader, BlockWriter};
+use crate::virtual_file::{IoBuffer, IoBufferMut, owned_buffers_io::write::Buffer};

 // The maximum size of a value stored in the B-tree. 5 bytes is enough currently.
 pub const VALUE_SZ: usize = 5;
@@ -787,12 +788,12 @@ impl<const L: usize> BuildNode<L> {
    ///
    /// Serialize the node to on-disk format.
    ///
-    fn pack(&self) -> Bytes {
+    fn pack(&self) -> IoBuffer {
        assert!(self.keys.len() == self.num_children as usize * self.suffix_len);
        assert!(self.values.len() == self.num_children as usize * VALUE_SZ);
        assert!(self.num_children > 0);

-        let mut buf = BytesMut::new();
+        let mut buf = IoBufferMut::with_capacity(PAGE_SZ);

        buf.put_u16(self.num_children);
        buf.put_u8(self.level);
@@ -805,7 +806,7 @@ impl<const L: usize> BuildNode<L> {
        assert!(buf.len() == self.size);

        assert!(buf.len() <= PAGE_SZ);
-        buf.resize(PAGE_SZ, 0);
+        buf.extend_with(0, PAGE_SZ - buf.len());
        buf.freeze()
    }

@@ -839,7 +840,7 @@ pub(crate) mod tests {

    #[derive(Clone, Default)]
    pub(crate) struct TestDisk {
-        blocks: Vec<Bytes>,
+        blocks: Vec<IoBuffer>,
    }
    impl TestDisk {
        fn new() -> Self {
@@ -857,7 +858,7 @@ pub(crate) mod tests {
        }
    }
    impl BlockWriter for &mut TestDisk {
-        fn write_blk(&mut self, buf: Bytes) -> io::Result<u32> {
+        fn write_blk(&mut self, buf: IoBuffer) -> io::Result<u32> {
            let blknum = self.blocks.len();
            self.blocks.push(buf);
            Ok(blknum as u32)
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -12,6 +12,7 @@ use tokio_epoll_uring::{BoundedBuf, Slice};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info_span};
 use utils::id::TimelineId;
+use utils::sync::gate::GateGuard;

 use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
@@ -21,16 +22,33 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
 use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
 use crate::virtual_file::owned_buffers_io::write::{Buffer, FlushTaskError};
-use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io};
+use crate::virtual_file::{self, IoBufferMut, TempVirtualFile, VirtualFile, owned_buffers_io};
+
+use self::owned_buffers_io::write::OwnedAsyncWriter;

 pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
    bytes_written: u64,
-    buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
-    _gate_guard: utils::sync::gate::GateGuard,
+    file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
+    buffered_writer: BufferedWriter,
+}
+
+type BufferedWriter = owned_buffers_io::write::BufferedWriter<
+    IoBufferMut,
+    TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
+>;
+
+/// A TempVirtualFile that is co-owned by the [`EphemeralFile`] and [`BufferedWriter`].
+///
+/// (Actually [`BufferedWriter`] internally is just a client to a background flush task.
+/// The co-ownership is between [`EphemeralFile`] and that flush task.)
+///
+/// Co-ownership allows us to serve reads for data that has already been flushed by the [`BufferedWriter`].
+#[derive(Debug, Clone)]
+struct TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    inner: Arc<TempVirtualFile>,
 }

 const TAIL_SZ: usize = 64 * 1024;
@@ -44,9 +62,12 @@ impl EphemeralFile {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<EphemeralFile> {
-        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
+        // TempVirtualFile requires us to never reuse a filename while an old
+        // instance of TempVirtualFile created with that filename is not done dropping yet.
+        // So, we use a monotonic counter to disambiguate the filenames.
+        static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
        let filename_disambiguator =
-            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        let filename = conf
            .timeline_path(&tenant_shard_id, &timeline_id)
@@ -54,16 +75,17 @@ impl EphemeralFile {
                "ephemeral-{filename_disambiguator}"
            )));

-        let file = Arc::new(
+        let file = TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter::new(
            VirtualFile::open_with_options_v2(
                &filename,
                virtual_file::OpenOptions::new()
+                    .create_new(true)
                    .read(true)
-                    .write(true)
-                    .create(true),
+                    .write(true),
                ctx,
            )
            .await?,
+            gate.enter()?,
        );

        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
@@ -73,37 +95,60 @@ impl EphemeralFile {
            _timeline_id: timeline_id,
            page_cache_file_id,
            bytes_written: 0,
-            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
+            file: file.clone(),
+            buffered_writer: BufferedWriter::new(
                file,
+                0,
                || IoBufferMut::with_capacity(TAIL_SZ),
                gate.enter()?,
                cancel.child_token(),
                ctx,
                info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
            ),
-            _gate_guard: gate.enter()?,
        })
    }
 }

-impl Drop for EphemeralFile {
-    fn drop(&mut self) {
-        // unlink the file
-        // we are clear to do this, because we have entered a gate
-        let path = self.buffered_writer.as_inner().path();
-        let res = std::fs::remove_file(path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!("could not remove ephemeral file '{path}': {e}");
-            }
+impl TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    fn new(file: VirtualFile, gate_guard: GateGuard) -> Self {
+        Self {
+            inner: Arc::new(TempVirtualFile::new(file, gate_guard)),
        }
    }
 }

+impl OwnedAsyncWriter for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    fn write_all_at<Buf: owned_buffers_io::io_buf_aligned::IoBufAligned + Send>(
+        &self,
+        buf: owned_buffers_io::io_buf_ext::FullSlice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> impl std::future::Future<
+        Output = (
+            owned_buffers_io::io_buf_ext::FullSlice<Buf>,
+            std::io::Result<()>,
+        ),
+    > + Send {
+        self.inner.write_all_at(buf, offset, ctx)
+    }
+
+    fn set_len(
+        &self,
+        len: u64,
+        ctx: &RequestContext,
+    ) -> impl Future<Output = std::io::Result<()>> + Send {
+        self.inner.set_len(len, ctx)
+    }
+}
+
+impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    type Target = VirtualFile;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum EphemeralFileWriteError {
    #[error("{0}")]
@@ -262,9 +307,9 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);

        let dst = if written_range.len() > 0 {
-            let file: &VirtualFile = self.buffered_writer.as_inner();
            let bounds = dst.bounds();
-            let slice = file
+            let slice = self
+                .file
                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
                .await?;
            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
@@ -456,7 +501,7 @@ mod tests {
            assert_eq!(&buf, &content[range]);
        }

-        let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
+        let file_contents = std::fs::read(file.file.path()).unwrap();
        assert!(file_contents == content[0..cap * 2]);

        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
@@ -489,7 +534,7 @@ mod tests {
        // assert the state is as this test expects it to be
        let load_io_buf_res = file.load_to_io_buf(&ctx).await.unwrap();
        assert_eq!(&load_io_buf_res[..], &content[0..cap * 2 + cap / 2]);
-        let md = file.buffered_writer.as_inner().path().metadata().unwrap();
+        let md = file.file.path().metadata().unwrap();
        assert_eq!(
            md.len(),
            2 * cap.into_u64(),
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -564,8 +564,9 @@ mod tests {
            Lsn(0),
            Lsn(0),
            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
+            // Updating this version to 17 will cause the test to fail at the
+            // next assert_eq!().
+            16,
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,7 +44,7 @@ use crate::controller_upcall_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
-use crate::metrics::TENANT_MANAGER as METRICS;
+use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
@@ -52,7 +52,9 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{
+    AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState,
+};
 use crate::virtual_file::MaybeFatalIo;
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

@@ -67,7 +69,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 /// having a properly acquired generation (Secondary doesn't need a generation)
 #[derive(Clone)]
 pub(crate) enum TenantSlot {
-    Attached(Arc<Tenant>),
+    Attached(Arc<TenantShard>),
    Secondary(Arc<SecondaryTenant>),
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
@@ -86,7 +88,7 @@ impl std::fmt::Debug for TenantSlot {

 impl TenantSlot {
    /// Return the `Tenant` in this slot if attached, else None
-    fn get_attached(&self) -> Option<&Arc<Tenant>> {
+    fn get_attached(&self) -> Option<&Arc<TenantShard>> {
        match self {
            Self::Attached(t) => Some(t),
            Self::Secondary(_) => None,
@@ -164,7 +166,7 @@ impl TenantStartupMode {
 /// Result type for looking up a TenantId to a specific shard
 pub(crate) enum ShardResolveResult {
    NotFound,
-    Found(Arc<Tenant>),
+    Found(Arc<TenantShard>),
    // Wait for this barrrier, then query again
    InProgress(utils::completion::Barrier),
 }
@@ -173,7 +175,7 @@ impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
    /// None is returned.
-    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<TenantShard>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
@@ -410,7 +412,7 @@ fn load_tenant_config(
        return None;
    }

-    Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
+    Some(TenantShard::load_tenant_config(conf, &tenant_shard_id))
 }

 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -519,7 +521,7 @@ pub async fn init_tenant_mgr(
        tenant_configs.len(),
        conf.concurrent_tenant_warmup.initial_permits()
    );
-
+    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

    // Accumulate futures for writing tenant configs, so that we can execute in parallel
    let mut config_write_futs = Vec::new();
@@ -606,7 +608,8 @@ pub async fn init_tenant_mgr(
        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
        config_write_futs.push(async move {
-            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
+            let r =
+                TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
            (tenant_shard_id, location_conf, r)
        });
    }
@@ -694,7 +697,7 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Result<Arc<TenantShard>, GlobalShutDown> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -706,7 +709,7 @@ fn tenant_spawn(
            .unwrap()
    );

-    Tenant::spawn(
+    TenantShard::spawn(
        conf,
        tenant_shard_id,
        resources,
@@ -883,12 +886,12 @@ impl TenantManager {
    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
    /// undergoing a state change (i.e. slot is InProgress).
    ///
-    /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
-    /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
+    /// The returned TenantShard is not guaranteed to be active: check its status after obtaining it, or
+    /// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it.
    pub(crate) fn get_attached_tenant_shard(
        &self,
        tenant_shard_id: TenantShardId,
-    ) -> Result<Arc<Tenant>, GetTenantError> {
+    ) -> Result<Arc<TenantShard>, GetTenantError> {
        let locked = self.tenants.read().unwrap();

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
@@ -937,12 +940,12 @@ impl TenantManager {
        flush: Option<Duration>,
        mut spawn_mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
+    ) -> Result<Option<Arc<TenantShard>>, UpsertLocationError> {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

        enum FastPathModified {
-            Attached(Arc<Tenant>),
+            Attached(Arc<TenantShard>),
            Secondary(Arc<SecondaryTenant>),
        }

@@ -999,9 +1002,13 @@ impl TenantManager {
        // phase of writing config and/or waiting for flush, before returning.
        match fast_path_taken {
            Some(FastPathModified::Attached(tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                TenantShard::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id,
+                    &new_location_config,
+                )
+                .await
+                .fatal_err("write tenant shard config");

                // Transition to AttachedStale means we may well hold a valid generation
                // still, and have been requested to go stale as part of a migration.  If
@@ -1030,9 +1037,13 @@ impl TenantManager {
                return Ok(Some(tenant));
            }
            Some(FastPathModified::Secondary(_secondary_tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                TenantShard::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id,
+                    &new_location_config,
+                )
+                .await
+                .fatal_err("write tenant shard config");

                return Ok(None);
            }
@@ -1122,7 +1133,7 @@ impl TenantManager {
        // Before activating either secondary or attached mode, persist the
        // configuration, so that on restart we will re-attach (or re-start
        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+        TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
            .await
            .fatal_err("write tenant shard config");

@@ -1262,7 +1273,7 @@ impl TenantManager {

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+        let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?;

        if drop_cache {
            tracing::info!("Dropping local file cache");
@@ -1297,7 +1308,7 @@ impl TenantManager {
        Ok(())
    }

-    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
+    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<TenantShard>> {
        let locked = self.tenants.read().unwrap();
        match &*locked {
            TenantsMap::Initializing => Vec::new(),
@@ -1446,7 +1457,7 @@ impl TenantManager {
    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
    pub(crate) async fn shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
@@ -1476,7 +1487,7 @@ impl TenantManager {

    pub(crate) async fn do_shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
@@ -1703,7 +1714,7 @@ impl TenantManager {
    /// For each resident layer in the parent shard, we will hard link it into all of the child shards.
    async fn shard_split_hardlink(
        &self,
-        parent_shard: &Tenant,
+        parent_shard: &TenantShard,
        child_shards: Vec<TenantShardId>,
    ) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_id();
@@ -1988,7 +1999,7 @@ impl TenantManager {
            }

            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)
+            let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)
                .map_err(|e| Error::DetachReparent(e.into()))?;

            let shard_identity = config.shard;
@@ -2177,7 +2188,9 @@ impl TenantManager {
                        // we would use if not doing any eviction.
                        progress.bytes_total
                    } else {
-                        42
+                        // In the absence of heatmap info, assume that the secondary location simply
+                        // needs as much space as it is currently using.
+                        secondary.resident_size_metric.get()
                    }
                }
            }
@@ -2528,7 +2541,7 @@ impl SlotGuard {
                Ok(())
            }
            None => {
-              
+                METRICS.unexpected_errors.inc();
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during tenant upsert, this is a bug."
@@ -2538,7 +2551,7 @@ impl SlotGuard {
                ))
            }
            Some(slot) => {
-               
+                METRICS.unexpected_errors.inc();
                error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug.  Contents: {:?}", slot);
                Err(TenantSlotUpsertError::InternalError(
                    "Unexpected contents of TenantSlot".into(),
@@ -2619,7 +2632,7 @@ impl Drop for SlotGuard {
        match m.entry(self.tenant_shard_id) {
            Entry::Occupied(mut entry) => {
                if !matches!(entry.get(), TenantSlot::InProgress(_)) {
-                    
+                    METRICS.unexpected_errors.inc();
                    error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug.  Contents: {:?}", entry.get());
                }

@@ -2634,7 +2647,7 @@ impl Drop for SlotGuard {
                }
            }
            Entry::Vacant(_) => {
-                
+                METRICS.unexpected_errors.inc();
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during SlotGuard drop, this is a bug."
@@ -2694,7 +2707,7 @@ fn tenant_map_acquire_slot_impl(
    mode: TenantSlotAcquireMode,
 ) -> Result<SlotGuard, TenantSlotError> {
    use TenantSlotAcquireMode::*;
-  
+    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -133,7 +133,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`].
+//!   [`TenantShard::timeline_init_and_sync`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -171,7 +171,7 @@
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
 //!
-//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
+//! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

 pub(crate) mod download;
@@ -223,8 +223,9 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
-    MeasureRemoteOp, 
-    RemoteOpFileKind, RemoteOpKind, 
+    MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+    RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
+    RemoteTimelineClientMetricsCallTrackSize,
 };
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token};
 use crate::tenant::metadata::TimelineMetadata;
@@ -356,6 +357,8 @@ pub(crate) struct RemoteTimelineClient {

    upload_queue: Mutex<UploadQueue>,

+    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
+
    storage_impl: GenericRemoteStorage,

    deletion_queue_client: DeletionQueueClient,
@@ -402,6 +405,10 @@ impl RemoteTimelineClient {
            storage_impl: remote_storage,
            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &tenant_shard_id,
+                &timeline_id,
+            )),
            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
            cancel: CancellationToken::new(),
        }
@@ -590,13 +597,21 @@ impl RemoteTimelineClient {
            .map_err(|_| UploadQueueNotReadyError)
    }

-    fn update_remote_physical_size_gauge(&self, _current_remote_index_part: Option<&IndexPart>) {
-
-        
+    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
+        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
+            current_remote_index_part
+                .layer_metadata
+                .values()
+                .map(|ilmd| ilmd.file_size)
+                .sum()
+        } else {
+            0
+        };
+        self.metrics.remote_physical_size_gauge.set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-  0
+        self.metrics.remote_physical_size_gauge.get()
    }

    //
@@ -611,6 +626,13 @@ impl RemoteTimelineClient {
        &self,
        cancel: &CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );

        let (index_part, index_generation, index_last_modified) = download::download_index_part(
            &self.storage_impl,
@@ -623,7 +645,7 @@ impl RemoteTimelineClient {
            Option::<TaskKind>::None,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
-        
+            Arc::clone(&self.metrics),
        )
        .await?;

@@ -698,7 +720,13 @@ impl RemoteTimelineClient {
        ctx: &RequestContext,
    ) -> Result<u64, DownloadError> {
        let downloaded_size = {
-        
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
@@ -715,11 +743,13 @@ impl RemoteTimelineClient {
                Some(ctx.task_kind()),
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
-          
+                Arc::clone(&self.metrics),
            )
            .await?
        };

+        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
+        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);

        Ok(downloaded_size)
    }
@@ -997,6 +1027,7 @@ impl RemoteTimelineClient {
        let op = UploadOp::UploadMetadata {
            uploaded: Box::new(index_part.clone()),
        };
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

@@ -1234,6 +1265,7 @@ impl RemoteTimelineClient {
        );

        let op = UploadOp::UploadLayer(layer, metadata, None);
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -1410,6 +1442,7 @@ impl RemoteTimelineClient {
        let op = UploadOp::Delete(Delete {
            layers: with_metadata,
        });
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -2147,7 +2180,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
-                      
+                        Arc::clone(&self.metrics),
                    )
                    .await
                }
@@ -2164,7 +2197,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
-                    
+                        Arc::clone(&self.metrics),
                    )
                    .await;
                    if res.is_ok() {
@@ -2310,7 +2343,10 @@ impl RemoteTimelineClient {
                    upload_queue.clean.1 = Some(task.task_id);

                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-                   
+                    self.metrics
+                        .projected_remote_consistent_lsn_gauge
+                        .set(lsn.0);
+
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -2351,6 +2387,64 @@ impl RemoteTimelineClient {
                .await;
        }

+        self.metric_end(&task.op);
+        for coalesced_op in &task.coalesced_ops {
+            self.metric_end(coalesced_op);
+        }
+    }
+
+    fn metric_impl(
+        &self,
+        op: &UploadOp,
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
+        let res = match op {
+            UploadOp::UploadLayer(_, m, _) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
+            ),
+            UploadOp::UploadMetadata { .. } => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(_delete) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
+            UploadOp::Barrier(..) | UploadOp::Shutdown => {
+                // we do not account these
+                return None;
+            }
+        };
+        Some(res)
+    }
+
+    fn metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
+        guard.will_decrement_manually(); // in metric_end(), see right below
+    }
+
+    fn metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

    /// Close the upload queue for new operations and cancel queued operations.
@@ -2430,6 +2524,7 @@ impl RemoteTimelineClient {

                // Tear down queued ops
                for op in qi.queued_operations.into_iter() {
+                    self.metric_end(&op);
                    // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                    // which is exactly what we want to happen.
                    drop(op);
@@ -2648,7 +2743,7 @@ mod tests {
    use crate::tenant::config::AttachmentMode;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::layer::local_layer_path;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
        format!("contents for {name}").into()
@@ -2701,7 +2796,7 @@ mod tests {

    struct TestSetup {
        harness: TenantHarness,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
    }
@@ -2739,6 +2834,10 @@ mod tests {
                storage_impl: self.harness.remote_storage.clone(),
                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &self.harness.tenant_shard_id,
+                    &TIMELINE_ID,
+                )),
                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                cancel: CancellationToken::new(),
            })
@@ -2965,7 +3064,99 @@ mod tests {
        );
    }

-    
+    /// Samples the started/finished byte counters for layer uploads before scheduling an
+    /// upload, right after scheduling it, and after it completed, and checks the deltas.
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
+        // One sample of both counters, kept as the `Option`s the metrics getters return.
+        #[derive(Debug, PartialEq, Clone, Copy)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        impl std::ops::Add for BytesStartedFinished {
+            type Output = Self;
+            fn add(self, rhs: Self) -> Self::Output {
+                // `None` on the left-hand side stays `None`; `None` on the right adds 0.
+                let started = self.started.map(|lhs| lhs + rhs.started.unwrap_or(0));
+                let finished = self.finished.map(|lhs| lhs + rhs.finished.unwrap_or(0));
+                Self { started, finished }
+            }
+        }
+
+        // Setup
+        let TestSetup {
+            harness,
+            tenant: _tenant,
+            timeline,
+            ..
+        } = TestSetup::new("metrics").await.unwrap();
+        let client = &timeline.remote_client;
+
+        let layer_name: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let layer_path = local_layer_path(
+            harness.conf,
+            &timeline.tenant_shard_id,
+            &timeline.timeline_id,
+            &layer_name,
+            &harness.generation,
+        );
+        let content = dummy_contents("foo");
+        let content_len = content.len();
+        std::fs::write(&layer_path, &content).unwrap();
+
+        let metadata =
+            LayerFileMetadata::new(content_len as u64, harness.generation, harness.shard);
+        let layer = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            layer_path,
+            layer_name.clone(),
+            metadata,
+        );
+
+        let sample_counters = || {
+            let (file_kind, op_kind) = (RemoteOpFileKind::Layer, RemoteOpKind::Upload);
+            let metrics = &client.metrics;
+            BytesStartedFinished {
+                started: metrics
+                    .get_bytes_started_counter_value(&file_kind, &op_kind)
+                    .map(|v| v.try_into().unwrap()),
+                finished: metrics
+                    .get_bytes_finished_counter_value(&file_kind, &op_kind)
+                    .map(|v| v.try_into().unwrap()),
+            }
+        };
+
+        // Test
+        tracing::info!("now doing actual test");
+
+        let before_schedule = sample_counters();
+
+        client.schedule_layer_file_upload(layer.clone()).unwrap();
+
+        let after_schedule = sample_counters();
+
+        client.wait_completion().await.unwrap();
+
+        let after_completion = sample_counters();
+
+        // Validate
+
+        // assert that the _finished metric is created eagerly so that subtractions work on first sample
+        let scheduled_delta = BytesStartedFinished {
+            started: Some(content_len),
+            finished: Some(0),
+        };
+        assert_eq!(after_schedule, before_schedule + scheduled_delta);
+
+        let completed_delta = BytesStartedFinished {
+            started: Some(content_len),
+            finished: Some(content_len),
+        };
+        assert_eq!(after_completion, before_schedule + completed_delta);
+    }
+
    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
        // An empty IndexPart, just sufficient to ensure deserialization will succeed
        let example_index_part = IndexPart::example();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,6 +6,7 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::str::FromStr;
+use std::sync::atomic::AtomicU64;
 use std::time::SystemTime;

 use anyhow::{Context, anyhow};
@@ -15,7 +16,7 @@ use remote_storage::{
    DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
 };
 use tokio::fs::{self, File, OpenOptions};
-use tokio::io::{AsyncSeekExt, AsyncWriteExt};
+use tokio::io::AsyncSeekExt;
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
@@ -40,7 +41,10 @@ use crate::span::{
 use crate::tenant::Generation;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
-use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error};
+use crate::virtual_file;
+use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
+use crate::virtual_file::{IoBufferMut, MaybeFatalIo, VirtualFile};
+use crate::virtual_file::{TempVirtualFile, owned_buffers_io};

 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
@@ -72,21 +76,34 @@ pub async fn download_layer_file<'a>(
        layer_metadata.generation,
    );

-    // Perform a rename inspired by durable_rename from file_utils.c.
-    // The sequence:
-    //     write(tmp)
-    //     fsync(tmp)
-    //     rename(tmp, new)
-    //     fsync(new)
-    //     fsync(parent)
-    // For more context about durable_rename check this email from postgres mailing list:
-    // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
-    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
-
-    let bytes_amount = download_retry(
+    let (bytes_amount, temp_file) = download_retry(
        || async {
-            download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
+            // TempVirtualFile requires us to never reuse a filename while an old
+            // instance of TempVirtualFile created with that filename is not done dropping yet.
+            // So, we use a monotonic counter to disambiguate the filenames.
+            static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
+            let filename_disambiguator =
+                NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+
+            let temp_file_path = path_with_suffix_extension(
+                local_path,
+                &format!("{filename_disambiguator:x}.{TEMP_DOWNLOAD_EXTENSION}"),
+            );
+
+            let temp_file = TempVirtualFile::new(
+                VirtualFile::open_with_options_v2(
+                    &temp_file_path,
+                    virtual_file::OpenOptions::new()
+                        .create_new(true)
+                        .write(true),
+                    ctx,
+                )
+                .await
+                .with_context(|| format!("create a temp file for layer download: {temp_file_path}"))
+                .map_err(DownloadError::Other)?,
+                gate.enter().map_err(|_| DownloadError::Cancelled)?,
+            );
+            download_object(storage, &remote_path, temp_file, gate, cancel, ctx).await
        },
        &format!("download {remote_path:?}"),
        cancel,
@@ -96,7 +113,8 @@ pub async fn download_layer_file<'a>(
    let expected = layer_metadata.file_size;
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
-            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
+            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {:?}",
+            temp_file.path()
        )));
    }

@@ -106,11 +124,28 @@ pub async fn download_layer_file<'a>(
        )))
    });

-    fs::rename(&temp_file_path, &local_path)
+    // Try rename before disarming the temp file.
+    // That way, if rename fails for whatever reason, we clean up the temp file on the return path.
+
+    fs::rename(temp_file.path(), &local_path)
        .await
        .with_context(|| format!("rename download layer file to {local_path}"))
        .map_err(DownloadError::Other)?;

+    // The temp file's VirtualFile points to the temp_file_path which we moved above.
+    // Drop it immediately, it's invalid.
+    // This will get better in https://github.com/neondatabase/neon/issues/11692
+    let _: VirtualFile = temp_file.disarm_into_inner();
+    // NB: The gate guard that was stored in `temp_file` is dropped but we continue
+    // to operate on the renamed file and on the parent timeline directory.
+    // Those operations are safe to do because higher-level code is holding another gate guard:
+    // - attached mode: the download task spawned by struct Layer is holding the gate guard
+    // - secondary mode: The TenantDownloader::download holds the gate open
+
+    // The rename above is not durable yet.
+    // It doesn't matter for crash consistency because pageserver startup deletes temp
+    // files and we'll re-download on demand if necessary.
+
    // We use fatal_err() below because the after the rename above,
    // the in-memory state of the filesystem already has the layer file in its final place,
    // and subsequent pageserver code could think it's durable while it really isn't.
@@ -146,147 +181,64 @@ pub async fn download_layer_file<'a>(
 async fn download_object(
    storage: &GenericRemoteStorage,
    src_path: &RemotePath,
-    dst_path: &Utf8PathBuf,
-    #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
+    destination_file: TempVirtualFile,
+    gate: &utils::sync::gate::Gate,
    cancel: &CancellationToken,
-    #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
-) -> Result<u64, DownloadError> {
-    let res = match crate::virtual_file::io_engine::get() {
-        crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
-        crate::virtual_file::io_engine::IoEngine::StdFs => {
-            async {
-                let destination_file = tokio::fs::File::create(dst_path)
-                    .await
-                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
-                    .map_err(DownloadError::Other)?;
+    ctx: &RequestContext,
+) -> Result<(u64, TempVirtualFile), DownloadError> {
+    let mut download = storage
+        .download(src_path, &DownloadOpts::default(), cancel)
+        .await?;

-                let download = storage
-                    .download(src_path, &DownloadOpts::default(), cancel)
-                    .await?;
+    pausable_failpoint!("before-downloading-layer-stream-pausable");

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
+    let dst_path = destination_file.path().to_owned(); // file is moved into the writer below
+    let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
+        destination_file,
+        0, // NOTE(review): presumably the writer's start offset — confirm
+        || IoBufferMut::with_capacity(super::BUFFER_SIZE),
+        gate.enter().map_err(|_| DownloadError::Cancelled)?,
+        cancel.child_token(),
+        ctx,
+        tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path),
+    );

-                let mut buf_writer =
-                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
-
-                let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
-
-                let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
-                buf_writer.flush().await?;
-
-                let mut destination_file = buf_writer.into_inner();
-
-                // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
-                // A file will not be closed immediately when it goes out of scope if there are any IO operations
-                // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
-                // you should call flush before dropping it.
-                //
-                // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
-                // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
-                // But for additional safety lets check/wait for any pending operations.
-                destination_file
-                    .flush()
-                    .await
-                    .maybe_fatal_err("download_object sync_all")
-                    .with_context(|| format!("flush source file at {dst_path}"))
-                    .map_err(DownloadError::Other)?;
-
-                // not using sync_data because it can lose file size update
-                destination_file
-                    .sync_all()
-                    .await
-                    .maybe_fatal_err("download_object sync_all")
-                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
-                    .map_err(DownloadError::Other)?;
-
-                Ok(bytes_amount)
-            }
+    // Copy the stream into the buffered writer chunk by chunk, then shut the writer down.
+    // TODO: use writev once tokio-epoll-uring supports it (the stream has chunks_vectored()).
+    let (bytes_amount, destination_file) = async {
+        while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await {
+            let chunk = match res {
+                Ok(chunk) => chunk,
+                Err(e) => return Err(DownloadError::from(e)),
+            };
+            buffered
+                .write_buffered_borrowed(&chunk, ctx)
+                .await
+                .map_err(|e| match e {
+                    FlushTaskError::Cancelled => DownloadError::Cancelled,
+                })?;
+        }
+        buffered
+            .shutdown(
+                owned_buffers_io::write::BufferedWriterShutdownMode::PadThenTruncate,
+                ctx,
+            )
            .await
-        }
-        #[cfg(target_os = "linux")]
-        crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
-            use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
-            use std::sync::Arc;
-
-            use crate::virtual_file::{IoBufferMut, owned_buffers_io};
-            async {
-                let destination_file = Arc::new(
-                    VirtualFile::create(dst_path, ctx)
-                        .await
-                        .with_context(|| {
-                            format!("create a destination file for layer '{dst_path}'")
-                        })
-                        .map_err(DownloadError::Other)?,
-                );
-
-                let mut download = storage
-                    .download(src_path, &DownloadOpts::default(), cancel)
-                    .await?;
-
-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
-                let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
-                    destination_file,
-                    || IoBufferMut::with_capacity(super::BUFFER_SIZE),
-                    gate.enter().map_err(|_| DownloadError::Cancelled)?,
-                    cancel.child_token(),
-                    ctx,
-                    tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path),
-                );
-
-                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
-                // There's chunks_vectored() on the stream.
-                let (bytes_amount, destination_file) = async {
-                    while let Some(res) =
-                        futures::StreamExt::next(&mut download.download_stream).await
-                    {
-                        let chunk = match res {
-                            Ok(chunk) => chunk,
-                            Err(e) => return Err(DownloadError::from(e)),
-                        };
-                        buffered
-                            .write_buffered_borrowed(&chunk, ctx)
-                            .await
-                            .map_err(|e| match e {
-                                FlushTaskError::Cancelled => DownloadError::Cancelled,
-                            })?;
-                    }
-                    let inner = buffered
-                        .flush_and_into_inner(ctx)
-                        .await
-                        .map_err(|e| match e {
-                            FlushTaskError::Cancelled => DownloadError::Cancelled,
-                        })?;
-                    Ok(inner)
-                }
-                .await?;
-
-                // not using sync_data because it can lose file size update
-                destination_file
-                    .sync_all()
-                    .await
-                    .maybe_fatal_err("download_object sync_all")
-                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
-                    .map_err(DownloadError::Other)?;
-
-                Ok(bytes_amount)
-            }
-            .await
-        }
-    };
-
-    // in case the download failed, clean up
-    match res {
-        Ok(bytes_amount) => Ok(bytes_amount),
-        Err(e) => {
-            if let Err(e) = tokio::fs::remove_file(dst_path).await {
-                if e.kind() != std::io::ErrorKind::NotFound {
-                    on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
-                }
-            }
-            Err(e)
-        }
+            .map_err(|e| match e {
+                FlushTaskError::Cancelled => DownloadError::Cancelled,
+            })
    }
+    .await?;
+
+    // fsync with sync_all, not sync_data, because sync_data can lose the file size update
+    destination_file
+        .sync_all()
+        .await
+        .maybe_fatal_err("download_object sync_all")
+        .with_context(|| format!("failed to fsync source file at {dst_path}"))
+        .map_err(DownloadError::Other)?;
+
+    Ok((bytes_amount, destination_file))
 }

 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
@@ -452,7 +404,7 @@ async fn do_download_index_part(
 /// generation (normal case when migrating/restarting).  Only if both of these return 404 do we fall back
 /// to listing objects.
 ///
-/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
+/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]`
 /// * `what`: for logging, what object are we downloading
 /// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
 /// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
--- a/Show More
+++ b/Show More