no merge: intentionally break benchmarks for verifying CI

CI(build-tools): bump packages in build-tools image (#11697)
## Problem

`cargo-deny` 0.16.2 spits a bunch of warnings like:

```
warning[index-failure]: unable to check for yanked crates
```

The issue is fixed in the latest version of `cargo-deny` (0.18.2). And while we're here, let's bump all the packages we have in the `build-tools` image.

## Summary of changes

- bump cargo-hakari to 0.9.36
- bump cargo-deny to 0.18.2
- bump cargo-hack to 0.6.36
- bump cargo-nextest to 0.9.94
- bump diesel_cli to 2.2.9
- bump s5cmd to 2.3.0
- bump mold to 2.37.1
- bump python to 3.11.12
2026-05-12 18:50:37 +00:00 · 2025-04-25 11:29:33 +02:00 · 2025-04-24 14:13:04 +00:00 · 2025-04-24 13:51:09 +00:00 · 2025-04-24 13:07:57 +00:00 · 2025-04-24 13:02:31 +00:00
174 changed files with 4660 additions and 2367 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,7 +19,7 @@
 !pageserver/
 !pgxn/
 !proxy/
-!object_storage/
+!endpoint_storage/
 !storage_scrubber/
 !safekeeper/
 !storage_broker/
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -113,8 +113,6 @@ runs:
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
-        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
-        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
        RERUN_FAILED: ${{ inputs.rerun_failed }}
        PG_VERSION: ${{ inputs.pg_version }}
        SANITIZERS: ${{ inputs.sanitizers }}
@@ -135,6 +133,7 @@ runs:
        fi

        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+        echo "PERF_REPORT_DIR=${PERF_REPORT_DIR}" >> ${GITHUB_ENV}
        rm -rf $PERF_REPORT_DIR

        TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
@@ -211,11 +210,12 @@ runs:
          --verbose \
          -rA $TEST_SELECTION $EXTRA_PARAMS

-        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
-          export REPORT_FROM="$PERF_REPORT_DIR"
-          export REPORT_TO="$PLATFORM"
-          scripts/generate_and_push_perf_report.sh
-        fi
+    - name: Upload performance report
+      if: ${{ !cancelled() && inputs.save_perf_report == 'true' }}
+      shell: bash -euxo pipefail {0}
+      run: |
+        export REPORT_FROM="${PERF_REPORT_DIR}"
+        scripts/generate_and_push_perf_report.sh

    - name: Upload compatibility snapshot
      # Note, that we use `github.base_ref` which is a target branch for a PR
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -272,10 +272,13 @@ jobs:
          # run pageserver tests with different settings
          for get_vectored_concurrent_io in sequential sidecar-task; do
            for io_engine in std-fs tokio-epoll-uring ; do
-              NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                ${cov_prefix} \
-                cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+                for io_mode in buffered direct direct-rw ; do
+                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \
+                  ${cov_prefix} \
+                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+              done
            done
          done

@@ -346,7 +349,7 @@ jobs:
      contents: read
      statuses: write
    needs: [ build-neon ]
-    runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }}
    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
@@ -392,6 +395,7 @@ jobs:
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

      # Temporary disable this step until we figure out why it's so flaky
--- a/.github/workflows/_meta.yml
+++ b/.github/workflows/_meta.yml
@@ -165,5 +165,5 @@ jobs:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CURRENT_SHA: ${{ github.sha }}
        run: |
-          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
+          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
          echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,6 +323,8 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
          SYNC_BETWEEN_TESTS: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
@@ -1236,7 +1238,7 @@ jobs:
        env:
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          TIMEOUT=1800 # 30 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
+          TIMEOUT=5400 # 90 minutes, usually it takes ~2-3 minutes, but if runners are busy, it might take longer
          INTERVAL=15 # try each N seconds

          last_status="" # a variable to carry the last status of the "build-and-upload-extensions" context
--- a/.github/workflows/fast-forward.yml
+++ b/.github/workflows/fast-forward.yml
@@ -27,15 +27,17 @@ jobs:
      - name: Fast forwarding
        uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
        # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
-        if: ${{ github.event.pull_request.mergeable_state  == 'clean' }}
+        if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
        with:
          merge: true
          comment: on-error
          github_token: ${{ secrets.CI_ACCESS_TOKEN }}

      - name: Comment if mergeable_state is not clean
-        if: ${{ github.event.pull_request.mergeable_state  != 'clean' }}
+        if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          gh pr comment ${{ github.event.pull_request.number }} \
            --repo "${GITHUB_REPOSITORY}" \
-            --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."
+            --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`."
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -30,7 +30,7 @@ permissions:
  statuses: write # require for posting a status update

 env:
-  DEFAULT_PG_VERSION: 16
+  DEFAULT_PG_VERSION: 17
  PLATFORM: neon-captest-new
  AWS_DEFAULT_REGION: eu-central-1

@@ -42,6 +42,8 @@ jobs:
      github-event-name: ${{ github.event_name }}

  build-build-tools-image:
+    permissions:
+      packages: write
    needs: [ check-permissions ]
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit
--- a/.github/workflows/random-ops-test.yml
+++ b/.github/workflows/random-ops-test.yml
@@ -0,0 +1,93 @@
+name: Random Operations Test
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │  ┌───────────── hour (0 - 23)
+    #          │  │  ┌───────────── day of the month (1 - 31)
+    #          │  │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │  │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '23 */2 * * *' # runs every 2 hours
+  workflow_dispatch:
+    inputs:
+      random_seed:
+        type: number
+        description: 'The random seed'
+        required: false
+        default: 0
+      num_operations:
+        type: number
+        description: "The number of operations to test"
+        default: 250
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+permissions: {}
+
+env:
+  DEFAULT_PG_VERSION: 16
+  PLATFORM: neon-captest-new
+  AWS_DEFAULT_REGION: eu-central-1
+
+jobs:
+  run-random-rests:
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+    runs-on: small
+    permissions:
+      id-token: write
+      statuses: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        pg-version: [16, 17]
+
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+      - name: Run tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: remote
+          test_selection: random_ops
+          run_in_parallel: false
+          extra_params: -m remote_cluster
+          pg_version: ${{ matrix.pg-version }}
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+          RANDOM_SEED: ${{ inputs.random_seed }}
+          NUM_OPERATIONS: ${{ inputs.num_operations }}
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -40,7 +40,7 @@ dependencies = [
 "getrandom 0.2.11",
 "once_cell",
 "version_check",
- "zerocopy",
+ "zerocopy 0.7.31",
 ]

 [[package]]
@@ -2037,6 +2037,33 @@ dependencies = [
 "zeroize",
 ]

+[[package]]
+name = "endpoint_storage"
+version = "0.0.1"
+dependencies = [
+ "anyhow",
+ "axum",
+ "axum-extra",
+ "camino",
+ "camino-tempfile",
+ "futures",
+ "http-body-util",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "prometheus",
+ "rand 0.8.5",
+ "remote_storage",
+ "serde",
+ "serde_json",
+ "test-log",
+ "tokio",
+ "tokio-util",
+ "tower 0.5.2",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "enum-map"
 version = "2.5.0"
@@ -3998,33 +4025,6 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "object_storage"
-version = "0.0.1"
-dependencies = [
- "anyhow",
- "axum",
- "axum-extra",
- "camino",
- "camino-tempfile",
- "futures",
- "http-body-util",
- "itertools 0.10.5",
- "jsonwebtoken",
- "prometheus",
- "rand 0.8.5",
- "remote_storage",
- "serde",
- "serde_json",
- "test-log",
- "tokio",
- "tokio-util",
- "tower 0.5.2",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -4285,6 +4285,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "pageserver_compaction",
+ "pem",
 "pin-project-lite",
 "postgres-protocol",
 "postgres-types",
@@ -4352,6 +4353,7 @@ dependencies = [
 "humantime-serde",
 "itertools 0.10.5",
 "nix 0.27.1",
+ "once_cell",
 "postgres_backend",
 "postgres_ffi",
 "rand 0.8.5",
@@ -4413,9 +4415,9 @@ dependencies = [

 [[package]]
 name = "papaya"
-version = "0.2.0"
+version = "0.2.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "aab21828b6b5952fdadd6c377728ffae53ec3a21b2febc47319ab65741f7e2fd"
+checksum = "6827e3fc394523c21d4464d02c0bb1c19966ea4a58a9844ad6d746214179d2bc"
 dependencies = [
 "equivalent",
 "seize",
@@ -5202,7 +5204,7 @@ dependencies = [
 "walkdir",
 "workspace_hack",
 "x509-cert",
- "zerocopy",
+ "zerocopy 0.8.24",
 ]

 [[package]]
@@ -5592,7 +5594,7 @@ dependencies = [
 "wasm-bindgen-futures",
 "wasm-streams",
 "web-sys",
- "webpki-roots 0.26.1",
+ "webpki-roots",
 "winreg",
 ]

@@ -6000,6 +6002,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "parking_lot 0.12.1",
+ "pem",
 "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
@@ -6192,13 +6195,13 @@ checksum = "224e328af6e080cddbab3c770b1cf50f0351ba0577091ef2410c3951d835ff87"

 [[package]]
 name = "sentry"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02"
+checksum = "255914a8e53822abd946e2ce8baa41d4cded6b8e938913b7f7b9da5b7ab44335"
 dependencies = [
 "httpdate",
 "reqwest",
- "rustls 0.21.12",
+ "rustls 0.23.18",
 "sentry-backtrace",
 "sentry-contexts",
 "sentry-core",
@@ -6206,14 +6209,14 @@ dependencies = [
 "sentry-tracing",
 "tokio",
 "ureq",
- "webpki-roots 0.25.2",
+ "webpki-roots",
 ]

 [[package]]
 name = "sentry-backtrace"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "a79194074f34b0cbe5dd33896e5928bbc6ab63a889bd9df2264af5acb186921e"
+checksum = "00293cd332a859961f24fd69258f7e92af736feaeb91020cff84dac4188a4302"
 dependencies = [
 "backtrace",
 "once_cell",
@@ -6223,9 +6226,9 @@ dependencies = [

 [[package]]
 name = "sentry-contexts"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eba8870c5dba2bfd9db25c75574a11429f6b95957b0a78ac02e2970dd7a5249a"
+checksum = "961990f9caa76476c481de130ada05614cd7f5aa70fb57c2142f0e09ad3fb2aa"
 dependencies = [
 "hostname",
 "libc",
@@ -6237,9 +6240,9 @@ dependencies = [

 [[package]]
 name = "sentry-core"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "46a75011ea1c0d5c46e9e57df03ce81f5c7f0a9e199086334a1f9c0a541e0826"
+checksum = "1a6409d845707d82415c800290a5d63be5e3df3c2e417b0997c60531dfbd35ef"
 dependencies = [
 "once_cell",
 "rand 0.8.5",
@@ -6250,9 +6253,9 @@ dependencies = [

 [[package]]
 name = "sentry-panic"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2eaa3ecfa3c8750c78dcfd4637cfa2598b95b52897ed184b4dc77fcf7d95060d"
+checksum = "609b1a12340495ce17baeec9e08ff8ed423c337c1a84dffae36a178c783623f3"
 dependencies = [
 "sentry-backtrace",
 "sentry-core",
@@ -6260,9 +6263,9 @@ dependencies = [

 [[package]]
 name = "sentry-tracing"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "f715932bf369a61b7256687c6f0554141b7ce097287e30e3f7ed6e9de82498fe"
+checksum = "49f4e86402d5c50239dc7d8fd3f6d5e048221d5fcb4e026d8d50ab57fe4644cb"
 dependencies = [
 "sentry-backtrace",
 "sentry-core",
@@ -6272,9 +6275,9 @@ dependencies = [

 [[package]]
 name = "sentry-types"
-version = "0.32.3"
+version = "0.37.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4519c900ce734f7a0eb7aba0869dfb225a7af8820634a7dd51449e3b093cfb7c"
+checksum = "3d3f117b8755dbede8260952de2aeb029e20f432e72634e8969af34324591631"
 dependencies = [
 "debugid",
 "hex",
@@ -6708,8 +6711,6 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "async-stream",
- "aws-config",
- "aws-sdk-s3",
 "camino",
 "chrono",
 "clap",
@@ -7798,7 +7799,7 @@ dependencies = [
 "rustls 0.23.18",
 "rustls-pki-types",
 "url",
- "webpki-roots 0.26.1",
+ "webpki-roots",
 ]

 [[package]]
@@ -8166,12 +8167,6 @@ dependencies = [
 "wasm-bindgen",
 ]

-[[package]]
-name = "webpki-roots"
-version = "0.25.2"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "14247bb57be4f377dfb94c72830b8ce8fc6beac03cf4bf7b9732eadd414123fc"
-
 [[package]]
 name = "webpki-roots"
 version = "0.26.1"
@@ -8479,6 +8474,8 @@ dependencies = [
 "regex-syntax 0.8.2",
 "reqwest",
 "rustls 0.23.18",
+ "rustls-pki-types",
+ "rustls-webpki 0.102.8",
 "scopeguard",
 "sec1 0.7.3",
 "serde",
@@ -8507,7 +8504,6 @@ dependencies = [
 "tracing-log",
 "url",
 "uuid",
- "zerocopy",
 "zeroize",
 "zstd",
 "zstd-safe",
@@ -8611,8 +8607,16 @@ version = "0.7.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d"
 dependencies = [
- "byteorder",
- "zerocopy-derive",
+ "zerocopy-derive 0.7.31",
+]
+
+[[package]]
+name = "zerocopy"
+version = "0.8.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2586fea28e186957ef732a5f8b3be2da217d65c5969d4b1e17f973ebbe876879"
+dependencies = [
+ "zerocopy-derive 0.8.24",
 ]

 [[package]]
@@ -8626,6 +8630,17 @@ dependencies = [
 "syn 2.0.100",
 ]

+[[package]]
+name = "zerocopy-derive"
+version = "0.8.24"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a996a8f63c5c4448cd959ac1bab0aaa3306ccfd060472f85943ee0750f0169be"
+dependencies = [
+ "proc-macro2",
+ "quote",
+ "syn 2.0.100",
+]
+
 [[package]]
 name = "zerofrom"
 version = "0.1.5"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,7 +40,7 @@ members = [
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
-    "object_storage",
+    "endpoint_storage",
 ]

 [workspace.package]
@@ -164,7 +164,7 @@ scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
 send-future = "0.1.0"
-sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.37", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
@@ -220,7 +220,7 @@ uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 rustls-native-certs = "0.8"
 whoami = "1.5.1"
-zerocopy = { version = "0.7", features = ["derive"] }
+zerocopy = { version = "0.8", features = ["derive", "simd"] }
 json-structural-diff = { version = "0.2.0" }
 x509-cert = { version = "0.2.5" }

--- a/4
+++ b/4
@@ -89,7 +89,7 @@ RUN set -e \
      --bin storage_broker  \
      --bin storage_controller  \
      --bin proxy  \
-      --bin object_storage \
+      --bin endpoint_storage \
      --bin neon_local \
      --bin storage_scrubber \
      --locked --release
@@ -122,7 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage    /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin

--- a/README.md
+++ b/README.md
@@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:

 ```sh
-DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest
 ```

 ## Flamegraphs
--- a/build-tools.Dockerfile
+++ b/build-tools.Dockerfile
@@ -173,7 +173,7 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
    && rm -rf protoc.zip protoc

 # s5cmd
-ENV S5CMD_VERSION=2.2.2
+ENV S5CMD_VERSION=2.3.0
 RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
    && chmod +x s5cmd \
    && mv s5cmd /usr/local/bin/s5cmd
@@ -206,7 +206,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws
    && rm awscliv2.zip

 # Mold: A Modern Linker
-ENV MOLD_VERSION=v2.34.1
+ENV MOLD_VERSION=v2.37.1
 RUN set -e \
    && git clone https://github.com/rui314/mold.git \
    && mkdir mold/build \
@@ -268,7 +268,7 @@ WORKDIR /home/nonroot
 RUN echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /home/nonroot/.curlrc

 # Python
-ENV PYTHON_VERSION=3.11.10 \
+ENV PYTHON_VERSION=3.11.12 \
    PYENV_ROOT=/home/nonroot/.pyenv \
    PATH=/home/nonroot/.pyenv/shims:/home/nonroot/.pyenv/bin:/home/nonroot/.poetry/bin:$PATH
 RUN set -e \
@@ -296,12 +296,12 @@ ENV RUSTC_VERSION=1.86.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 ARG RUSTFILT_VERSION=0.2.1
-ARG CARGO_HAKARI_VERSION=0.9.33
-ARG CARGO_DENY_VERSION=0.16.2
-ARG CARGO_HACK_VERSION=0.6.33
-ARG CARGO_NEXTEST_VERSION=0.9.85
+ARG CARGO_HAKARI_VERSION=0.9.36
+ARG CARGO_DENY_VERSION=0.18.2
+ARG CARGO_HACK_VERSION=0.6.36
+ARG CARGO_NEXTEST_VERSION=0.9.94
 ARG CARGO_CHEF_VERSION=0.1.71
-ARG CARGO_DIESEL_CLI_VERSION=2.2.6
+ARG CARGO_DIESEL_CLI_VERSION=2.2.9
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
 	chmod +x rustup-init && \
 	./rustup-init -y --default-toolchain ${RUSTC_VERSION} && \
--- a/clippy.toml
+++ b/clippy.toml
@@ -12,3 +12,5 @@ disallowed-macros = [
    # cannot disallow this, because clippy finds used from tokio macros
    #"tokio::pin",
 ]
+
+allow-unwrap-in-tests = true
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1677,7 +1677,7 @@ RUN set -e \
    && apt clean && rm -rf /var/lib/apt/lists/*

 # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-ENV PGBOUNCER_TAG=pgbouncer_1_22_1
+ENV PGBOUNCER_TAG=pgbouncer_1_24_1
 RUN set -e \
    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
    && cd pgbouncer \
--- a/compute/patches/pg_anon.patch
+++ b/compute/patches/pg_anon.patch
@@ -1,265 +0,0 @@
-commit 00aa659afc9c7336ab81036edec3017168aabf40
-Author: Heikki Linnakangas <heikki@neon.tech>
-Date:   Tue Nov 12 16:59:19 2024 +0200
-
-    Temporarily disable test that depends on timezone
-
-diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
-index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
-+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
-@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
-  ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
- (1 row)
- 
-SELECT anon.generalize_tstzrange('19041107','millennium');
-                      generalize_tstzrange                       
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-   generalize_daterange   
-diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
-index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
-+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
-@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
- SELECT anon.generalize_tstzrange('19041107','year');
- SELECT anon.generalize_tstzrange('19041107','decade');
- SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- 
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-
-commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Fri May 31 06:34:26 2024 +0000
-
-    These alternative expected files were added to consider the neon features
-
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-new file mode 100644
-index 0000000..2539cfd
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-@@ -0,0 +1,101 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE mallory_the_masked_user;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.anonymize_table('t1');
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ERROR:  Only supersusers can start the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT * FROM mask.t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  SELECT * FROM public.t1;
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  Only supersusers can stop the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-new file mode 100644
-index 0000000..8b090fe
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-@@ -0,0 +1,104 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE oscar_the_owner;
-+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE oscar_the_owner;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+SELECT anon.anonymize_table('t1');
-+ anonymize_table 
-+-----------------
-+ t
-+(1 row)
-+
-+SELECT * FROM t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+UPDATE t1 SET t='test' WHERE i=1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+SELECT * FROM t1;
-+ i |  t   
-+---+------
-+ 1 | test
-+(1 row)
-+
-+--SELECT * FROM mask.t1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  permission denied for schema mask
-+CONTEXT:  SQL statement "DROP VIEW mask.t1;"
-+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
-+SQL statement "SELECT anon.mask_drop_view(oid)
-+  FROM pg_catalog.pg_class
-+  WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
-+  AND relkind IN ('r','p','f')"
-+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
--- a/compute/patches/pg_repack.patch
+++ b/compute/patches/pg_repack.patch
@@ -11,6 +11,14 @@ index bf6edcb..89b4c7f 100644
 
 USE_PGXS = 1	# use pgxs if not in contrib directory
 PGXS := $(shell $(PG_CONFIG) --pgxs)
+diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
+index 9f2e171..f6e4f8d 100644
+--- a/regress/expected/init-extension.out
+++ b/regress/expected/init-extension.out
+@@ -1,3 +1,2 @@
+ SET client_min_messages = warning;
+ CREATE EXTENSION pg_repack;
+-RESET client_min_messages;
 diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
 index 8d0a94e..63b68bf 100644
 --- a/regress/expected/nosuper.out
@@ -42,6 +50,14 @@ index 8d0a94e..63b68bf 100644
 INFO: repacking table "public.tbl_cluster"
 ERROR: query failed: ERROR:  current transaction is aborted, commands ignored until end of transaction block
 DETAIL: query was: RESET lock_timeout
+diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
+index 9f2e171..f6e4f8d 100644
+--- a/regress/sql/init-extension.sql
+++ b/regress/sql/init-extension.sql
+@@ -1,3 +1,2 @@
+ SET client_min_messages = warning;
+ CREATE EXTENSION pg_repack;
+-RESET client_min_messages;
 diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
 index 072f0fa..dbe60f8 100644
 --- a/regress/sql/nosuper.sql
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -57,24 +57,13 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

-// Compatibility hack: if the control plane specified any remote-ext-config
-// use the default value for extension storage proxy gateway.
-// Remove this once the control plane is updated to pass the gateway URL
-fn parse_remote_ext_config(arg: &str) -> Result<String> {
-    if arg.starts_with("http") {
-        Ok(arg.trim_end_matches('/').to_string())
-    } else {
-        Ok("http://pg-ext-s3-gateway".to_string())
-    }
-}
-
 #[derive(Parser)]
 #[command(rename_all = "kebab-case")]
 struct Cli {
    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
    pub pgbin: String,

-    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
+    #[arg(short = 'r', long)]
    pub remote_ext_config: Option<String>,

    /// The port to bind the external listening HTTP server to. Clients running
@@ -116,9 +105,7 @@ struct Cli {
    #[arg(long)]
    pub set_disk_quota_for_fs: Option<String>,

-    // TODO(tristan957): remove alias after compatibility tests are no longer
-    // an issue
-    #[arg(short = 'c', long, alias = "spec-path")]
+    #[arg(short = 'c', long)]
    pub config: Option<OsString>,

    #[arg(short = 'i', long, group = "compute-id")]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -11,9 +11,7 @@ use std::{env, fs};
 use anyhow::{Context, Result};
 use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
-use compute_api::responses::{
-    ActivityKind, ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus,
-};
+use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
 use compute_api::spec::{
    ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
 };
@@ -134,10 +132,6 @@ pub struct ComputeState {
    /// Timestamp of the last Postgres activity. It could be `None` if
    /// compute wasn't used since start.
    pub last_active: Option<DateTime<Utc>>,
-    /// Timestamp of the last client's activity. Unlike `last_active` it doesn't take into account
-    /// baclkground activity: autovacuum, LR,...
-    pub last_active_query: Option<DateTime<Utc>>,
-    pub last_activity_kind: Option<ActivityKind>,
    pub error: Option<String>,

    /// Compute spec. This can be received from the CLI or - more likely -
@@ -165,8 +159,6 @@ impl ComputeState {
            start_time: Utc::now(),
            status: ComputeStatus::Empty,
            last_active: None,
-            last_active_query: None,
-            last_activity_kind: None,
            error: None,
            pspec: None,
            startup_span: None,
@@ -649,7 +641,26 @@ impl ComputeNode {

                let log_directory_path = Path::new(&self.params.pgdata).join("log");
                let log_directory_path = log_directory_path.to_string_lossy().to_string();
-                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Add project_id,endpoint_id tag to identify the logs.
+                //
+                // These ids are passed from cplane,
+                // for backwards compatibility (old computes that don't have them),
+                // we set them to None.
+                // TODO: Clean up this code when all computes have them.
+                // Build the `<project_id>/<endpoint_id>` tag. A missing id is rendered
+                // as the literal `None`; every arm uses the same `/` separator so the
+                // tag has one consistent shape regardless of which id is absent.
+                let tag: Option<String> = match (
+                    pspec.spec.project_id.as_deref(),
+                    pspec.spec.endpoint_id.as_deref(),
+                ) {
+                    (Some(project_id), Some(endpoint_id)) => {
+                        Some(format!("{project_id}/{endpoint_id}"))
+                    }
+                    (Some(project_id), None) => Some(format!("{project_id}/None")),
+                    (None, Some(endpoint_id)) => Some(format!("None/{endpoint_id}")),
+                    (None, None) => None,
+                };
+
+                configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;

                // Launch a background task to clean up the audit logs
                launch_pgaudit_gc(log_directory_path);
@@ -1696,22 +1707,13 @@ impl ComputeNode {
    }

    /// Update the `last_active` in the shared state, but ensure that it's a more recent one.
-    pub fn update_last_active(
-        &self,
-        last_active: Option<DateTime<Utc>>,
-        activity_kind: ActivityKind,
-    ) {
+    pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
        let mut state = self.state.lock().unwrap();
        // NB: `Some(<DateTime>)` is always greater than `None`.
        if last_active > state.last_active {
            state.last_active = last_active;
            debug!("set the last compute activity time to: {:?}", last_active);
        }
-        if activity_kind == ActivityKind::Query && last_active > state.last_active_query {
-            state.last_active_query = last_active;
-            debug!("set the last user's activity time to: {:?}", last_active);
-        }
-        state.last_activity_kind = Some(activity_kind);
    }

    // Look for core dumps and collect backtraces.
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -30,8 +30,6 @@ impl From<&ComputeState> for ComputeStatusResponse {
                .map(|pspec| pspec.timeline_id.to_string()),
            status: state.status,
            last_active: state.last_active,
-            last_active_query: state.last_active_query,
-            last_activity_kind: state.last_activity_kind,
            error: state.error.clone(),
        }
    }
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -1,8 +1,8 @@
-use metrics::core::{AtomicF64, Collector, GenericGauge};
+use metrics::core::{AtomicF64, AtomicU64, Collector, GenericCounter, GenericGauge};
 use metrics::proto::MetricFamily;
 use metrics::{
-    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter_vec,
-    register_int_gauge_vec, register_uint_gauge_vec,
+    IntCounterVec, IntGaugeVec, UIntGaugeVec, register_gauge, register_int_counter,
+    register_int_counter_vec, register_int_gauge_vec, register_uint_gauge_vec,
 };
 use once_cell::sync::Lazy;

@@ -81,6 +81,22 @@ pub(crate) static COMPUTE_CTL_UP: Lazy<IntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

+/// Gauge with the duration, in ms, of the current Postgres outage as last
+/// measured by the compute monitor. `ComputeMonitor::report_down()` sets it to
+/// the time elapsed since the last successful check, and
+/// `ComputeMonitor::report_up()` resets it to 0.
+pub(crate) static PG_CURR_DOWNTIME_MS: Lazy<GenericGauge<AtomicF64>> = Lazy::new(|| {
+    register_gauge!(
+        "compute_pg_current_downtime_ms",
+        "Non-cumulative duration of Postgres downtime in ms; resets after successful check",
+    )
+    .expect("failed to define a metric")
+});
+
+/// Counter accumulating Postgres downtime in ms. On every failed check
+/// `ComputeMonitor::report_down()` adds the time elapsed since the previous
+/// monitor iteration; it is never reset by the monitor.
+pub(crate) static PG_TOTAL_DOWNTIME_MS: Lazy<GenericCounter<AtomicU64>> = Lazy::new(|| {
+    register_int_counter!(
+        "compute_pg_downtime_ms_total",
+        "Cumulative duration of Postgres downtime in ms",
+    )
+    .expect("failed to define a metric")
+});
+
 pub fn collect() -> Vec<MetricFamily> {
    let mut metrics = COMPUTE_CTL_UP.collect();
    metrics.extend(INSTALLED_EXTENSIONS.collect());
@@ -88,5 +104,7 @@ pub fn collect() -> Vec<MetricFamily> {
    metrics.extend(REMOTE_EXT_REQUESTS_TOTAL.collect());
    metrics.extend(DB_MIGRATION_FAILED.collect());
    metrics.extend(AUDIT_LOG_DIR_SIZE.collect());
+    metrics.extend(PG_CURR_DOWNTIME_MS.collect());
+    metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
    metrics
 }
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,207 +3,297 @@ use std::thread;
 use std::time::Duration;

 use chrono::{DateTime, Utc};
-use compute_api::responses::{ActivityKind, ComputeStatus};
+use compute_api::responses::ComputeStatus;
 use compute_api::spec::ComputeFeature;
 use postgres::{Client, NoTls};
-use tracing::{debug, error, info, warn};
+use tracing::{Level, error, info, instrument, span};

 use crate::compute::ComputeNode;
+use crate::metrics::{PG_CURR_DOWNTIME_MS, PG_TOTAL_DOWNTIME_MS};

 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

-// Spin in a loop and figure out the last activity time in the Postgres.
-// Then update it in the shared state. This function never errors out.
-// NB: the only expected panic is at `Mutex` unwrap(), all other errors
-// should be handled gracefully.
-fn watch_compute_activity(compute: &ComputeNode) {
-    // Suppose that `connstr` doesn't change
-    let connstr = compute.params.connstr.clone();
-    let conf = compute.get_conn_conf(Some("compute_ctl:activity_monitor"));
+struct ComputeMonitor {
+    compute: Arc<ComputeNode>,

-    // During startup and configuration we connect to every Postgres database,
-    // but we don't want to count this as some user activity. So wait until
-    // the compute fully started before monitoring activity.
-    wait_for_postgres_start(compute);
+    /// The moment when Postgres had some activity,
+    /// that should prevent compute from being suspended.
+    last_active: Option<DateTime<Utc>>,

-    // Define `client` outside of the loop to reuse existing connection if it's active.
-    let mut client = conf.connect(NoTls);
+    /// The moment when we last tried to check Postgres.
+    last_checked: DateTime<Utc>,
+    /// The last moment we did a successful Postgres check.
+    last_up: DateTime<Utc>,

-    let mut sleep = false;
-    let mut prev_active_time: Option<f64> = None;
-    let mut prev_sessions: Option<i64> = None;
+    /// Only used for internal statistics change tracking
+    /// between monitor runs and can be outdated.
+    active_time: Option<f64>,
+    /// Only used for internal statistics change tracking
+    /// between monitor runs and can be outdated.
+    sessions: Option<i64>,

-    if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-        info!("starting experimental activity monitor for {}", connstr);
-    } else {
-        info!("starting activity monitor for {}", connstr);
+    /// Use experimental statistics-based activity monitor. It's no longer
+    /// 'experimental' per se, as it's enabled for everyone, but we still
+    /// keep the flag as an option to turn it off in some cases if it will
+    /// misbehave.
+    experimental: bool,
+}
+
+impl ComputeMonitor {
+    /// Record that Postgres is currently unreachable or failing checks.
+    ///
+    /// Sets the current-downtime gauge to the time elapsed since `last_up`
+    /// and adds the time elapsed since `last_checked` to the cumulative
+    /// downtime counter.
+    ///
+    /// Both values are wall-clock differences and `Utc::now()` is not
+    /// monotonic, so they can be negative if the system clock steps
+    /// backwards. They are clamped at zero: casting a negative `i64` to
+    /// `u64` would wrap around and add a huge bogus value to a counter
+    /// that can never be decremented again.
+    fn report_down(&self) {
+        let now = Utc::now();
+
+        // Calculate and report current downtime
+        // (since the last time Postgres was up)
+        let downtime_ms = now
+            .signed_duration_since(self.last_up)
+            .num_milliseconds()
+            .max(0);
+        PG_CURR_DOWNTIME_MS.set(downtime_ms as f64);
+
+        // Calculate and update total downtime
+        // (cumulative duration of Postgres downtime in ms)
+        let inc_ms = now
+            .signed_duration_since(self.last_checked)
+            .num_milliseconds()
+            .max(0);
+        PG_TOTAL_DOWNTIME_MS.inc_by(inc_ms as u64);
    }

-    loop {
-        // We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
-        // But skip the first sleep, so we can connect to Postgres immediately.
-        if sleep {
-            // Should be outside of the mutex lock to allow others to read while we sleep.
-            thread::sleep(MONITOR_CHECK_INTERVAL);
-        } else {
-            sleep = true;
-        }
+    /// Record a successful Postgres check: remember the current time as the
+    /// last moment Postgres was up and zero the current-downtime gauge.
+    fn report_up(&mut self) {
+        let checked_at = Utc::now();
+        self.last_up = checked_at;
+        PG_CURR_DOWNTIME_MS.set(0.0);
+    }

-        match &mut client {
-            Ok(cli) => {
-                if cli.is_closed() {
-                    info!("connection to Postgres is closed, trying to reconnect");
+    /// Render the downtime metrics and the last-up timestamp as one string,
+    /// used as a field on the monitor's log messages.
+    fn downtime_info(&self) -> String {
+        let total_ms = PG_TOTAL_DOWNTIME_MS.get();
+        let current_ms = PG_CURR_DOWNTIME_MS.get();
+        let last_up = self.last_up;
+
+        format!(
+            "total_ms: {}, current_ms: {}, last_up: {}",
+            total_ms, current_ms, last_up
+        )
+    }

-                    // Connection is closed, reconnect and try again.
-                    client = conf.connect(NoTls);
-                    continue;
-                }
+    /// Spin in a loop and figure out the last activity time in the Postgres.
+    /// Then update it in the shared state. This function never errors out.
+    /// NB: the only expected panic is at `Mutex` unwrap(), all other errors
+    /// should be handled gracefully.
+    #[instrument(skip_all)]
+    pub fn run(&mut self) {
+        // Suppose that `connstr` doesn't change
+        let connstr = self.compute.params.connstr.clone();
+        let conf = self
+            .compute
+            .get_conn_conf(Some("compute_ctl:compute_monitor"));

-                // This is a new logic, only enable if the feature flag is set.
-                // TODO: remove this once we are sure that it works OR drop it altogether.
-                if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
-                    // First, check if the total active time or sessions across all databases has changed.
-                    // If it did, it means that user executed some queries. In theory, it can even go down if
-                    // some databases were dropped, but it's still a user activity.
-                    match get_database_stats(cli) {
-                        Ok((active_time, sessions)) => {
-                            let mut detected_activity = false;
+        // During startup and configuration we connect to every Postgres database,
+        // but we don't want to count this as some user activity. So wait until
+        // the compute fully started before monitoring activity.
+        wait_for_postgres_start(&self.compute);

-                            prev_active_time = match prev_active_time {
-                                Some(prev_active_time) => {
-                                    if active_time != prev_active_time {
-                                        detected_activity = true;
-                                    }
-                                    Some(active_time)
-                                }
-                                None => Some(active_time),
-                            };
-                            prev_sessions = match prev_sessions {
-                                Some(prev_sessions) => {
-                                    if sessions != prev_sessions {
-                                        detected_activity = true;
-                                    }
-                                    Some(sessions)
-                                }
-                                None => Some(sessions),
-                            };
+        // Define `client` outside of the loop to reuse existing connection if it's active.
+        let mut client = conf.connect(NoTls);

-                            if detected_activity {
-                                // Update the last active time and continue, we don't need to
-                                // check backends state change.
-                                compute.update_last_active(Some(Utc::now()), ActivityKind::Query);
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            error!("could not get database statistics: {}", e);
-                            continue;
-                        }
-                    }
-                }
+        info!("starting compute monitor for {}", connstr);

-                // Second, if database statistics is the same, check all backends state change,
-                // maybe there is some with more recent activity. `get_backends_state_change()`
-                // can return None or stale timestamp, so it's `compute.update_last_active()`
-                // responsibility to check if the new timestamp is more recent than the current one.
-                // This helps us to discover new sessions, that did nothing yet.
-                match get_backends_state_change(cli) {
-                    Ok(last_active) => {
-                        compute.update_last_active(last_active, ActivityKind::Query);
-                    }
-                    Err(e) => {
-                        error!("could not get backends state change: {}", e);
-                    }
-                }
-
-                // Finally, if there are existing (logical) walsenders, do not suspend.
-                //
-                // walproposer doesn't currently show up in pg_stat_replication,
-                // but protect if it will be
-                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
-                match cli.query_one(ws_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_ws) => {
-                            if num_ws > 0 {
-                                compute.update_last_active(
-                                    Some(Utc::now()),
-                                    ActivityKind::LogicalReplication,
-                                );
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse walsenders count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of walsenders: {:?}", e);
-                        continue;
-                    }
-                }
-                //
-                // Don't suspend compute if there is an active logical replication subscription
-                //
-                // `where pid is not null` – to filter out read only computes and subscription on branches
-                //
-                let logical_subscriptions_query =
-                    "select count(*) from pg_stat_subscription where pid is not null;";
-                match cli.query_one(logical_subscriptions_query, &[]) {
-                    Ok(row) => match row.try_get::<&str, i64>("count") {
-                        Ok(num_subscribers) => {
-                            if num_subscribers > 0 {
-                                compute.update_last_active(
-                                    Some(Utc::now()),
-                                    ActivityKind::LogicalReplication,
-                                );
-                                continue;
-                            }
-                        }
-                        Err(e) => {
-                            warn!("failed to parse `pg_stat_subscription` count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!(
-                            "failed to get list of active logical replication subscriptions: {:?}",
-                            e
+        loop {
+            match &mut client {
+                Ok(cli) => {
+                    if cli.is_closed() {
+                        info!(
+                            downtime_info = self.downtime_info(),
+                            "connection to Postgres is closed, trying to reconnect"
                        );
-                        continue;
-                    }
-                }
-                //
-                // Do not suspend compute if autovacuum is running
-                //
-                let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
-                match cli.query_one(autovacuum_count_query, &[]) {
-                    Ok(r) => match r.try_get::<&str, i64>("count") {
-                        Ok(num_workers) => {
-                            if num_workers > 0 {
-                                compute
-                                    .update_last_active(Some(Utc::now()), ActivityKind::Autovacuum);
-                                continue;
+                        self.report_down();
+
+                        // Connection is closed, reconnect and try again.
+                        client = conf.connect(NoTls);
+                    } else {
+                        match self.check(cli) {
+                            Ok(_) => {
+                                self.report_up();
+                                self.compute.update_last_active(self.last_active);
+                            }
+                            Err(e) => {
+                                // Although we have many places where we can return errors in `check()`,
+                                // normally it shouldn't happen. I.e., we will likely return error if
+                                // connection got broken, query timed out, Postgres returned invalid data, etc.
+                                // In all such cases it's suspicious, so let's report this as downtime.
+                                self.report_down();
+                                error!(
+                                    downtime_info = self.downtime_info(),
+                                    "could not check Postgres: {}", e
+                                );
+
+                                // Reconnect to Postgres just in case. During tests, I noticed
+                                // that queries in `check()` can fail with `connection closed`,
+                                // but `cli.is_closed()` above doesn't detect it. Even if old
+                                // connection is still alive, it will be dropped when we reassign
+                                // `client` to a new connection.
+                                client = conf.connect(NoTls);
                            }
                        }
-                        Err(e) => {
-                            warn!("failed to parse autovacuum workers count: {:?}", e);
-                            continue;
-                        }
-                    },
-                    Err(e) => {
-                        warn!("failed to get list of autovacuum workers: {:?}", e);
-                        continue;
                    }
                }
-            }
-            Err(e) => {
-                debug!("could not connect to Postgres: {}, retrying", e);
+                Err(e) => {
+                    info!(
+                        downtime_info = self.downtime_info(),
+                        "could not connect to Postgres: {}, retrying", e
+                    );
+                    self.report_down();

-                // Establish a new connection and try again.
-                client = conf.connect(NoTls);
+                    // Establish a new connection and try again.
+                    client = conf.connect(NoTls);
+                }
+            }
+
+            // Reset the `last_checked` timestamp and sleep before the next iteration.
+            self.last_checked = Utc::now();
+            thread::sleep(MONITOR_CHECK_INTERVAL);
+        }
+    }
+
+    /// Run one round of activity probes against Postgres over `cli`.
+    ///
+    /// The probes run in a fixed order: database statistics (only when
+    /// `self.experimental` is set), backend state changes, walsenders,
+    /// logical replication subscriptions, autovacuum workers. The first probe
+    /// that shows activity updates `self.last_active` and returns `Ok(())`
+    /// without running the remaining ones. If no probe shows activity,
+    /// `self.last_active` is left untouched and `Ok(())` is returned.
+    ///
+    /// Returns `Err` as soon as any query fails or its result cannot be
+    /// parsed; later probes are not attempted in that case.
+    #[instrument(skip_all)]
+    fn check(&mut self, cli: &mut Client) -> anyhow::Result<()> {
+        // Statistics-based activity detection, only used if the feature flag is set.
+        // TODO: remove this once we are sure that it works OR drop it altogether.
+        if self.experimental {
+            // Check if the total active time or sessions across all databases has changed.
+            // If it did, it means that user executed some queries. In theory, it can even go down if
+            // some databases were dropped, but it's still user activity.
+            match get_database_stats(cli) {
+                Ok((active_time, sessions)) => {
+                    let mut detected_activity = false;
+
+                    // The first sample only seeds the stored values; a change can
+                    // be detected starting from the second sample.
+                    if let Some(prev_active_time) = self.active_time {
+                        if active_time != prev_active_time {
+                            detected_activity = true;
+                        }
+                    }
+                    self.active_time = Some(active_time);
+
+                    if let Some(prev_sessions) = self.sessions {
+                        if sessions != prev_sessions {
+                            detected_activity = true;
+                        }
+                    }
+                    self.sessions = Some(sessions);
+
+                    if detected_activity {
+                        // Update the last active time and return early, we don't need to
+                        // check backends state change.
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    return Err(anyhow::anyhow!("could not get database statistics: {}", e));
+                }
            }
        }
+
+        // If database statistics are the same, check all backends for state changes.
+        // Maybe there are some with more recent activity. `get_backends_state_change()`
+        // can return None or a stale timestamp, so it is only accepted when it is more
+        // recent than the `self.last_active` we already have.
+        // This helps us to discover new sessions that have not done anything yet.
+        match get_backends_state_change(cli) {
+            Ok(last_active) => match (last_active, self.last_active) {
+                (Some(last_active), Some(prev_last_active)) => {
+                    if last_active > prev_last_active {
+                        self.last_active = Some(last_active);
+                        return Ok(());
+                    }
+                }
+                (Some(last_active), None) => {
+                    self.last_active = Some(last_active);
+                    return Ok(());
+                }
+                // No timestamp from the backends: fall through to the next probes.
+                _ => {}
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "could not get backends state change: {}",
+                    e
+                ));
+            }
+        }
+
+        // If there are existing (logical) walsenders, do not suspend.
+        //
+        // N.B. walproposer doesn't currently show up in pg_stat_replication,
+        // but protect if it will.
+        const WS_COUNT_QUERY: &str =
+            "select count(*) from pg_stat_replication where application_name != 'walproposer';";
+        match cli.query_one(WS_COUNT_QUERY, &[]) {
+            Ok(r) => match r.try_get::<&str, i64>("count") {
+                Ok(num_ws) => {
+                    if num_ws > 0 {
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    let err: anyhow::Error = e.into();
+                    return Err(err.context("failed to parse walsenders count"));
+                }
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!("failed to get list of walsenders: {}", e));
+            }
+        }
+
+        // Don't suspend compute if there is an active logical replication subscription
+        //
+        // `where pid is not null` – to filter out read only computes and subscription on branches
+        const LOGICAL_SUBSCRIPTIONS_QUERY: &str =
+            "select count(*) from pg_stat_subscription where pid is not null;";
+        match cli.query_one(LOGICAL_SUBSCRIPTIONS_QUERY, &[]) {
+            Ok(row) => match row.try_get::<&str, i64>("count") {
+                Ok(num_subscribers) => {
+                    if num_subscribers > 0 {
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    }
+                }
+                Err(e) => {
+                    return Err(anyhow::anyhow!(
+                        "failed to parse 'pg_stat_subscription' count: {}",
+                        e
+                    ));
+                }
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "failed to get list of active logical replication subscriptions: {}",
+                    e
+                ));
+            }
+        }
+
+        // Do not suspend compute if autovacuum is running
+        const AUTOVACUUM_COUNT_QUERY: &str =
+            "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
+        match cli.query_one(AUTOVACUUM_COUNT_QUERY, &[]) {
+            Ok(r) => match r.try_get::<&str, i64>("count") {
+                Ok(num_workers) => {
+                    if num_workers > 0 {
+                        self.last_active = Some(Utc::now());
+                        return Ok(());
+                    };
+                }
+                Err(e) => {
+                    return Err(anyhow::anyhow!(
+                        "failed to parse autovacuum workers count: {}",
+                        e
+                    ));
+                }
+            },
+            Err(e) => {
+                return Err(anyhow::anyhow!(
+                    "failed to get list of autovacuum workers: {}",
+                    e
+                ));
+            }
+        }
+
+        // No probe found activity: the check itself succeeded, `last_active` is unchanged.
+        Ok(())
    }
 }

@@ -322,9 +412,24 @@ fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime
 /// Launch a separate compute monitor thread and return its `JoinHandle`.
 pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
    let compute = Arc::clone(compute);
+    let experimental = compute.has_feature(ComputeFeature::ActivityMonitorExperimental);
+    let now = Utc::now();
+    let mut monitor = ComputeMonitor {
+        compute,
+        last_active: None,
+        last_checked: now,
+        last_up: now,
+        active_time: None,
+        sessions: None,
+        experimental,
+    };

+    let span = span!(Level::INFO, "compute_monitor");
    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&compute))
+        .spawn(move || {
+            let _enter = span.enter();
+            monitor.run();
+        })
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -50,13 +50,13 @@ fn restart_rsyslog() -> Result<()> {

 pub fn configure_audit_rsyslog(
    log_directory: String,
-    tag: &str,
+    tag: Option<String>,
    remote_endpoint: &str,
 ) -> Result<()> {
    let config_content: String = format!(
        include_str!("config_template/compute_audit_rsyslog_template.conf"),
        log_directory = log_directory,
-        tag = tag,
+        tag = tag.unwrap_or_default(),
        remote_endpoint = remote_endpoint
    );

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -18,12 +18,11 @@ use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage};
 use control_plane::local_env::{
-    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    ObjectStorageConf, SafekeeperConf,
+    EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf,
+    NeonLocalInitPageserverConf, SafekeeperConf,
 };
-use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
-use control_plane::object_storage::ObjectStorage;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
@@ -63,7 +62,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: u32 = 16;
+const DEFAULT_PG_VERSION: u32 = 17;

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -93,7 +92,7 @@ enum NeonLocalCmd {
    #[command(subcommand)]
    Safekeeper(SafekeeperCmd),
    #[command(subcommand)]
-    ObjectStorage(ObjectStorageCmd),
+    EndpointStorage(EndpointStorageCmd),
    #[command(subcommand)]
    Endpoint(EndpointCmd),
    #[command(subcommand)]
@@ -460,14 +459,14 @@ enum SafekeeperCmd {

 #[derive(clap::Subcommand)]
 #[clap(about = "Manage object storage")]
-enum ObjectStorageCmd {
-    Start(ObjectStorageStartCmd),
-    Stop(ObjectStorageStopCmd),
+enum EndpointStorageCmd {
+    Start(EndpointStorageStartCmd),
+    Stop(EndpointStorageStopCmd),
 }

 #[derive(clap::Args)]
 #[clap(about = "Start object storage")]
-struct ObjectStorageStartCmd {
+struct EndpointStorageStartCmd {
    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
@@ -475,7 +474,7 @@ struct ObjectStorageStartCmd {

 #[derive(clap::Args)]
 #[clap(about = "Stop object storage")]
-struct ObjectStorageStopCmd {
+struct EndpointStorageStopCmd {
    #[arg(value_enum, default_value = "fast")]
    #[clap(
        short = 'm',
@@ -797,7 +796,9 @@ fn main() -> Result<()> {
            }
            NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
            NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
-            NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
+            NeonLocalCmd::EndpointStorage(subcmd) => {
+                rt.block_on(handle_endpoint_storage(&subcmd, env))
+            }
            NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
            NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
        };
@@ -1014,8 +1015,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                    }
                })
                .collect(),
-            object_storage: ObjectStorageConf {
-                port: OBJECT_STORAGE_DEFAULT_PORT,
+            endpoint_storage: EndpointStorageConf {
+                port: ENDPOINT_STORAGE_DEFAULT_PORT,
            },
            pg_distrib_dir: None,
            neon_distrib_dir: None,
@@ -1735,12 +1736,15 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
-    use ObjectStorageCmd::*;
-    let storage = ObjectStorage::from_env(env);
+async fn handle_endpoint_storage(
+    subcmd: &EndpointStorageCmd,
+    env: &local_env::LocalEnv,
+) -> Result<()> {
+    use EndpointStorageCmd::*;
+    let storage = EndpointStorage::from_env(env);

    // In tests like test_forward_compatibility or test_graceful_cluster_restart
-    // old neon binaries (without object_storage) are present
+    // old neon binaries (without endpoint_storage) are present
    if !storage.bin.exists() {
        eprintln!(
            "{} binary not found. Ignore if this is a compatibility test",
@@ -1750,13 +1754,13 @@ async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::Local
    }

    match subcmd {
-        Start(ObjectStorageStartCmd { start_timeout }) => {
+        Start(EndpointStorageStartCmd { start_timeout }) => {
            if let Err(e) = storage.start(start_timeout).await {
-                eprintln!("object_storage start failed: {e}");
+                eprintln!("endpoint_storage start failed: {e}");
                exit(1);
            }
        }
-        Stop(ObjectStorageStopCmd { stop_mode }) => {
+        Stop(EndpointStorageStopCmd { stop_mode }) => {
            let immediate = match stop_mode {
                StopMode::Fast => false,
                StopMode::Immediate => true,
@@ -1866,10 +1870,10 @@ async fn handle_start_all_impl(
        }

        js.spawn(async move {
-            ObjectStorage::from_env(env)
+            EndpointStorage::from_env(env)
                .start(&retry_timeout)
                .await
-                .map_err(|e| e.context("start object_storage"))
+                .map_err(|e| e.context("start endpoint_storage"))
        });
    })();

@@ -1968,9 +1972,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    let storage = ObjectStorage::from_env(env);
+    let storage = EndpointStorage::from_env(env);
    if let Err(e) = storage.stop(immediate) {
-        eprintln!("object_storage stop failed: {:#}", e);
+        eprintln!("endpoint_storage stop failed: {:#}", e);
    }

    for ps_conf in &env.pageservers {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -766,10 +766,6 @@ impl Endpoint {
            }
        };

-        // TODO(tristan957): Remove the write to spec.json after compatibility
-        // tests work themselves out
-        let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
        let config_path = self.endpoint_path().join("config.json");
        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;

@@ -779,16 +775,6 @@ impl Endpoint {
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;

-        // TODO(tristan957): Remove when compatibility tests are no longer an
-        // issue
-        let old_compute_ctl = {
-            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-            let help_output = cmd.arg("--help").output()?;
-            let help_output = String::from_utf8_lossy(&help_output.stdout);
-
-            !help_output.contains("--config")
-        };
-
        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{}'", conn_str);
@@ -807,19 +793,8 @@ impl Endpoint {
        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
-        // TODO(tristan957): Change this to --config when compatibility tests
-        // are no longer an issue
-        .args([
-            "--spec-path",
-            self.endpoint_path()
-                .join(if old_compute_ctl {
-                    "spec.json"
-                } else {
-                    "config.json"
-                })
-                .to_str()
-                .unwrap(),
-        ])
+        .arg("--config")
+        .arg(self.endpoint_path().join("config.json").as_os_str())
        .args([
            "--pgbin",
            self.env
--- a/control_plane/src/endpoint_storage.rs
+++ b/control_plane/src/endpoint_storage.rs
@@ -1,34 +1,33 @@
 use crate::background_process::{self, start_process, stop_process};
 use crate::local_env::LocalEnv;
-use anyhow::anyhow;
 use anyhow::{Context, Result};
 use camino::Utf8PathBuf;
 use std::io::Write;
 use std::time::Duration;

 /// Directory within .neon which will be used by default for LocalFs remote storage.
-pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
-pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
+pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage";
+pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993;

-pub struct ObjectStorage {
+pub struct EndpointStorage {
    pub bin: Utf8PathBuf,
    pub data_dir: Utf8PathBuf,
    pub pemfile: Utf8PathBuf,
    pub port: u16,
 }

-impl ObjectStorage {
-    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
-        ObjectStorage {
-            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
-            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
+impl EndpointStorage {
+    pub fn from_env(env: &LocalEnv) -> EndpointStorage {
+        EndpointStorage {
+            bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(),
+            data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(),
            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
-            port: env.object_storage.port,
+            port: env.endpoint_storage.port,
        }
    }

    fn config_path(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.json")
+        self.data_dir.join("endpoint_storage.json")
    }

    fn listen_addr(&self) -> Utf8PathBuf {
@@ -49,7 +48,7 @@ impl ObjectStorage {
        let cfg = Cfg {
            listen: self.listen_addr(),
            pemfile: parent.join(self.pemfile.clone()),
-            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
+            local_path: parent.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR),
            r#type: "LocalFs".to_string(),
        };
        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
@@ -59,24 +58,19 @@ impl ObjectStorage {
    }

    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
-        println!("Starting s3 proxy at {}", self.listen_addr());
+        println!("Starting endpoint_storage at {}", self.listen_addr());
        std::io::stdout().flush().context("flush stdout")?;

        let process_status_check = || async {
-            tokio::time::sleep(Duration::from_millis(500)).await;
-            let res = reqwest::Client::new()
-                .get(format!("http://{}/metrics", self.listen_addr()))
-                .send()
-                .await;
-            match res {
-                Ok(response) if response.status().is_success() => Ok(true),
-                Ok(_) => Err(anyhow!("Failed to query /metrics")),
-                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
+            let res = reqwest::Client::new().get(format!("http://{}/metrics", self.listen_addr()));
+            match res.send().await {
+                Ok(res) => Ok(res.status().is_success()),
+                Err(_) => Ok(false),
            }
        };

        let res = start_process(
-            "object_storage",
+            "endpoint_storage",
            &self.data_dir.clone().into_std_path_buf(),
            &self.bin.clone().into_std_path_buf(),
            vec![self.config_path().to_string()],
@@ -94,14 +88,14 @@ impl ObjectStorage {
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        stop_process(immediate, "object_storage", &self.pid_file())
+        stop_process(immediate, "endpoint_storage", &self.pid_file())
    }

    fn log_file(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.log")
+        self.data_dir.join("endpoint_storage.log")
    }

    fn pid_file(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.pid")
+        self.data_dir.join("endpoint_storage.pid")
    }
 }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,8 +9,8 @@
 mod background_process;
 pub mod broker;
 pub mod endpoint;
+pub mod endpoint_storage;
 pub mod local_env;
-pub mod object_storage;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -19,11 +19,11 @@ use serde::{Deserialize, Serialize};
 use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};

-use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
+use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage};
 use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 //
 // This data structures represents neon_local CLI config
@@ -72,7 +72,7 @@ pub struct LocalEnv {

    pub safekeepers: Vec<SafekeeperConf>,

-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,

    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
@@ -110,7 +110,7 @@ pub struct OnDiskConfig {
    )]
    pub pageservers: Vec<PageServerConf>,
    pub safekeepers: Vec<SafekeeperConf>,
-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub control_plane_compute_hook_api: Option<Url>,
@@ -144,7 +144,7 @@ pub struct NeonLocalInitConf {
    pub storage_controller: Option<NeonStorageControllerConf>,
    pub pageservers: Vec<NeonLocalInitPageserverConf>,
    pub safekeepers: Vec<SafekeeperConf>,
-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub generate_local_ssl_certs: bool,
@@ -152,7 +152,7 @@ pub struct NeonLocalInitConf {

 #[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
-pub struct ObjectStorageConf {
+pub struct EndpointStorageConf {
    pub port: u16,
 }

@@ -413,8 +413,8 @@ impl LocalEnv {
        self.pg_dir(pg_version, "lib")
    }

-    pub fn object_storage_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("object_storage")
+    pub fn endpoint_storage_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("endpoint_storage")
    }

    pub fn pageserver_bin(&self) -> PathBuf {
@@ -450,8 +450,8 @@ impl LocalEnv {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

-    pub fn object_storage_data_dir(&self) -> PathBuf {
-        self.base_data_dir.join("object_storage")
+    pub fn endpoint_storage_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("endpoint_storage")
    }

    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
@@ -615,7 +615,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: _,
                branch_name_mappings,
                generate_local_ssl_certs,
-                object_storage,
+                endpoint_storage,
            } = on_disk_config;
            LocalEnv {
                base_data_dir: repopath.to_owned(),
@@ -632,7 +632,7 @@ impl LocalEnv {
                control_plane_hooks_api,
                branch_name_mappings,
                generate_local_ssl_certs,
-                object_storage,
+                endpoint_storage,
            }
        };

@@ -742,7 +742,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: None,
                branch_name_mappings: self.branch_name_mappings.clone(),
                generate_local_ssl_certs: self.generate_local_ssl_certs,
-                object_storage: self.object_storage.clone(),
+                endpoint_storage: self.endpoint_storage.clone(),
            },
        )
    }
@@ -849,7 +849,7 @@ impl LocalEnv {
            control_plane_api,
            generate_local_ssl_certs,
            control_plane_hooks_api,
-            object_storage,
+            endpoint_storage,
        } = conf;

        // Find postgres binaries.
@@ -901,7 +901,7 @@ impl LocalEnv {
            control_plane_hooks_api,
            branch_name_mappings: Default::default(),
            generate_local_ssl_certs,
-            object_storage,
+            endpoint_storage,
        };

        if generate_local_ssl_certs {
@@ -929,13 +929,13 @@ impl LocalEnv {
                .context("pageserver init failed")?;
        }

-        ObjectStorage::from_env(&env)
+        EndpointStorage::from_env(&env)
            .init()
            .context("object storage init failed")?;

-        // setup remote remote location for default LocalFs remote storage
+        // setup remote location for default LocalFs remote storage
        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
-        std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
+        std::fs::create_dir_all(env.base_data_dir.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR))?;

        env.persist_config()
    }
--- a/deny.toml
+++ b/deny.toml
@@ -45,9 +45,7 @@ allow = [
    "ISC",
    "MIT",
    "MPL-2.0",
-    "OpenSSL",
    "Unicode-3.0",
-    "Zlib",
 ]
 confidence-threshold = 0.8
 exceptions = [
@@ -56,14 +54,6 @@ exceptions = [
    { allow = ["Zlib"], name = "const_format", version = "*" },
 ]

-[[licenses.clarify]]
-name = "ring"
-version = "*"
-expression = "MIT AND ISC AND OpenSSL"
-license-files = [
-    { path = "LICENSE", hash = 0xbd0eed23 }
-]
-
 [licenses.private]
 ignore = true
 registries = []
@@ -116,7 +106,11 @@ name = "openssl"
 unknown-registry = "warn"
 unknown-git = "warn"
 allow-registry = ["https://github.com/rust-lang/crates.io-index"]
-allow-git = []
+allow-git = [
+    # Crate pinned to commit in origin repo due to opentelemetry version.
+    # TODO: Remove this once crate is fetched from crates.io again.
+    "https://github.com/mattiapenati/tower-otel",
+]

 [sources.allow-org]
 github = [
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -81,19 +81,9 @@ sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}

 cat ${CONFIG_FILE}

-# TODO(tristan957): Remove these workarounds for backwards compatibility after
-# the next compute release. That includes these next few lines and the
-# --spec-path in the compute_ctl invocation.
-if compute_ctl --help | grep --quiet -- '--config'; then
-  SPEC_PATH="$CONFIG_FILE"
-else
-  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
-  SPEC_PATH=/tmp/spec.json
-fi
-
 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-$RANDOM"                          \
-     --spec-path "$SPEC_PATH"
+     --config "$CONFIG_FILE"
--- a/endpoint_storage/Cargo.toml
+++ b/endpoint_storage/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "object_storage"
+name = "endpoint_storage"
 version = "0.0.1"
 edition.workspace = true
 license.workspace = true
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -2,7 +2,7 @@ use anyhow::anyhow;
 use axum::body::{Body, Bytes};
 use axum::response::{IntoResponse, Response};
 use axum::{Router, http::StatusCode};
-use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
+use endpoint_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
 use remote_storage::TimeoutOrCancel;
 use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
 use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
@@ -46,12 +46,12 @@ async fn metrics() -> Result {

 async fn get(S3Path { path }: S3Path, state: State) -> Result {
    info!(%path, "downloading");
-    let download_err = |e| {
-        if let DownloadError::NotFound = e {
-            info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
+    let download_err = |err| {
+        if let DownloadError::NotFound = err {
+            info!(%path, %err, "downloading"); // 404 is not an issue of _this_ service
            return not_found(&path);
        }
-        internal_error(e, &path, "downloading")
+        internal_error(err, &path, "downloading")
    };
    let cancel = state.cancel.clone();
    let opts = &DownloadOpts::default();
@@ -249,7 +249,7 @@ mod tests {
        };

        let proxy = Storage {
-            auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
+            auth: endpoint_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
            storage,
            cancel: cancel.clone(),
            max_upload_file_limit: usize::MAX,
@@ -343,14 +343,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
    fn token() -> String {
-        let claims = object_storage::Claims {
+        let claims = endpoint_storage::Claims {
            tenant_id: TENANT_ID,
            timeline_id: TIMELINE_ID,
            endpoint_id: ENDPOINT_ID.into(),
            exp: u64::MAX,
        };
        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
-        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
        jsonwebtoken::encode(&header, &claims, &key).unwrap()
    }

@@ -364,7 +364,10 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
            vec![ENDPOINT_ID, "ep-ololo"]
        )
-        .skip(1);
+        // The first combination is a fully valid path; the second is valid for GET,
+        // since read paths may name a different endpoint if tenant and timeline match
+        // (needed for prewarming RO->RW replica).
+        .skip(2);

        for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
            info!(%uri, %method, %tenant, %timeline, %endpoint);
@@ -475,6 +478,16 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        requests_chain(chain.into_iter(), |_| token()).await;
    }

+    #[testlog(tokio::test)]
+    async fn read_other_endpoint_data() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/other_endpoint/key");
+        let chain = vec![
+            (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
+            (uri.clone(), "PUT", "", StatusCode::UNAUTHORIZED, false),
+        ];
+        requests_chain(chain.into_iter(), |_| token()).await;
+    }
+
    fn delete_prefix_token(uri: &str) -> String {
        use serde::Serialize;
        let parts = uri.split("/").collect::<Vec<&str>>();
@@ -482,7 +495,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        struct PrefixClaims {
            tenant_id: TenantId,
            timeline_id: Option<TimelineId>,
-            endpoint_id: Option<object_storage::EndpointId>,
+            endpoint_id: Option<endpoint_storage::EndpointId>,
            exp: u64,
        }
        let claims = PrefixClaims {
@@ -492,7 +505,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            exp: u64::MAX,
        };
        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
-        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
        jsonwebtoken::encode(&header, &claims, &key).unwrap()
    }

--- a/endpoint_storage/src/lib.rs
+++ b/endpoint_storage/src/lib.rs
@@ -169,10 +169,19 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
            .auth
            .decode(bearer.token())
            .map_err(|e| bad_request(e, "decoding token"))?;
+
+        // Read paths may have different endpoint ids. For readonly -> readwrite replica
+        // prewarming, endpoint must read other endpoint's data.
+        let endpoint_id = if parts.method == axum::http::Method::GET {
+            claims.endpoint_id.clone()
+        } else {
+            path.endpoint_id.clone()
+        };
+
        let route = Claims {
            tenant_id: path.tenant_id,
            timeline_id: path.timeline_id,
-            endpoint_id: path.endpoint_id.clone(),
+            endpoint_id,
            exp: claims.exp,
        };
        if route != claims {
--- a/endpoint_storage/src/main.rs
+++ b/endpoint_storage/src/main.rs
@@ -1,4 +1,4 @@
-//! `object_storage` is a service which provides API for uploading and downloading
+//! `endpoint_storage` is a service which provides API for uploading and downloading
 //! files. It is used by compute and control plane for accessing LFC prewarm data.
 //! This service is deployed either as a separate component or as part of compute image
 //! for large computes.
@@ -33,7 +33,7 @@ async fn main() -> anyhow::Result<()> {

    let config: String = std::env::args().skip(1).take(1).collect();
    if config.is_empty() {
-        anyhow::bail!("Usage: object_storage config.json")
+        anyhow::bail!("Usage: endpoint_storage config.json")
    }
    info!("Reading config from {config}");
    let config = std::fs::read_to_string(config.clone())?;
@@ -41,7 +41,7 @@ async fn main() -> anyhow::Result<()> {
    info!("Reading pemfile from {}", config.pemfile.clone());
    let pemfile = std::fs::read(config.pemfile.clone())?;
    info!("Loading public key from {}", config.pemfile.clone());
-    let auth = object_storage::JwtAuth::new(&pemfile)?;
+    let auth = endpoint_storage::JwtAuth::new(&pemfile)?;

    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
    info!("listening on {}", listener.local_addr().unwrap());
@@ -50,7 +50,7 @@ async fn main() -> anyhow::Result<()> {
    let cancel = tokio_util::sync::CancellationToken::new();
    app::check_storage_permissions(&storage, cancel.clone()).await?;

-    let proxy = std::sync::Arc::new(object_storage::Storage {
+    let proxy = std::sync::Arc::new(endpoint_storage::Storage {
        auth,
        storage,
        cancel: cancel.clone(),
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -56,22 +56,9 @@ pub struct ComputeStatusResponse {
    pub status: ComputeStatus,
    #[serde(serialize_with = "rfc3339_serialize")]
    pub last_active: Option<DateTime<Utc>>,
-    pub last_active_query: Option<DateTime<Utc>>,
-    pub last_activity_kind: Option<ActivityKind>,
    pub error: Option<String>,
 }

-#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum ActivityKind {
-    // Client's query is executed
-    Query,
-    // Logical replication  is active (subscription or publication)
-    LogicalReplication,
-    // Autovacuum is active
-    Autovacuum,
-}
-
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -242,13 +242,22 @@ impl RemoteExtSpec {

        match self.extension_data.get(real_ext_name) {
            Some(_ext_data) => {
+                // Architecture names follow the Go naming convention (amd64/arm64) due to Kubernetes.
+
+                let arch = match std::env::consts::ARCH {
+                    "x86_64" => "amd64",
+                    "aarch64" => "arm64",
+                    arch => arch,
+                };
+
                // Construct the path to the extension archive
-                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
+                // BUILD_TAG/ARCH/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
                //
                // Keep it in sync with path generation in
                // https://github.com/neondatabase/build-custom-extensions/tree/main
-                let archive_path_str =
-                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+                let archive_path_str = format!(
+                    "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
+                );
                Ok((
                    real_ext_name.to_string(),
                    RemotePath::from_string(&archive_path_str)?,
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -35,6 +35,7 @@ nix = {workspace = true, optional = true}
 reqwest.workspace = true
 rand.workspace = true
 tracing-utils.workspace = true
+once_cell.workspace = true

 [dev-dependencies]
 bincode.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -181,6 +181,7 @@ pub struct ConfigToml {
    pub generate_unarchival_heatmap: Option<bool>,
    pub tracing: Option<Tracing>,
    pub enable_tls_page_service_api: bool,
+    pub dev_mode: bool,
 }

 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -657,6 +658,7 @@ impl Default for ConfigToml {
            generate_unarchival_heatmap: None,
            tracing: None,
            enable_tls_page_service_api: false,
+            dev_mode: false,
        }
    }
 }
@@ -682,10 +684,10 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;

    // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
-    // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
-    // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
-    // with this config, we can get a maximum peak compaction usage of 9 GB.
-    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
+    // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
+    // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
+    // compaction usage of 15360MB.
+    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
    // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
    // read amp.
    pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
@@ -702,8 +704,11 @@ pub mod tenant_conf_defaults {
    // Relevant: https://github.com/neondatabase/neon/issues/3394
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
-    // layer creation will end immediately. Set to 0 to disable.
+    // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
+    // without looking at the exact number of L0 layers.
+    // It was expected to have the following behavior:
+    // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // > layer creation will end immediately. Set to 0 to disable.
    pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -320,6 +320,35 @@ pub struct TimelineCreateRequest {
    pub mode: TimelineCreateRequestMode,
 }

+impl TimelineCreateRequest {
+    /// Short static label for the creation mode of this request: one of
+    /// `"branch"`, `"import"` or `"bootstrap"`.
+    pub fn mode_tag(&self) -> &'static str {
+        use TimelineCreateRequestMode as Mode;
+
+        match self.mode {
+            Mode::Bootstrap { .. } => "bootstrap",
+            Mode::ImportPgdata { .. } => "import",
+            Mode::Branch { .. } => "branch",
+        }
+    }
+
+    /// Whether this request uses the `ImportPgdata` creation mode.
+    pub fn is_import(&self) -> bool {
+        use TimelineCreateRequestMode as Mode;
+
+        matches!(self.mode, Mode::ImportPgdata { .. })
+    }
+}
+
+/// Import state of a single shard: still running, finished, or failed
+/// (the `Error` variant carries a `String`).
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
+pub enum ShardImportStatus {
+    InProgress,
+    Done,
+    Error(String),
+}
+
+impl ShardImportStatus {
+    /// Returns `true` for `Done` and `Error`, i.e. for every state except
+    /// `InProgress`.
+    pub fn is_terminal(&self) -> bool {
+        match self {
+            Self::Done | Self::Error(_) => true,
+            Self::InProgress => false,
+        }
+    }
+}
+
 /// Storage controller specific extensions to [`TimelineInfo`].
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TimelineCreateResponseStorcon {
@@ -1817,8 +1846,34 @@ pub mod virtual_file {
    }

    impl IoMode {
-        pub const fn preferred() -> Self {
-            Self::Buffered
+        /// Returns the I/O mode to use when none has been configured.
+        ///
+        /// In non-test builds this is always [`IoMode::Buffered`]. In test builds
+        /// the mode is read once from the environment and cached.
+        pub fn preferred() -> Self {
+            if !cfg!(test) {
+                return IoMode::Buffered;
+            }
+
+            // The default behavior when running Rust unit tests without any further
+            // flags is to use the newest behavior available on the platform (Direct
+            // on Linux, Buffered elsewhere).
+            // CI sets the environment variable below to run the unit tests in all
+            // the different modes.
+            // NB: the Python regression & perf tests have their own defaults management
+            // that writes pageserver.toml; they do not use this variable.
+            // NOTE(review): `cfg!(test)` is only set while compiling this crate's own
+            // test harness; confirm that unit tests of dependent crates get the
+            // intended default.
+            use once_cell::sync::Lazy;
+            static CACHED: Lazy<IoMode> = Lazy::new(|| {
+                #[cfg(target_os = "linux")]
+                let fallback = IoMode::Direct;
+                #[cfg(not(target_os = "linux"))]
+                let fallback = IoMode::Buffered;
+
+                utils::env::var_serde_json_string(
+                    "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
+                )
+                .unwrap_or(fallback)
+            });
+            *CACHED
        }
    }

--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -4,10 +4,10 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use utils::id::NodeId;
+use utils::id::{NodeId, TimelineId};

 use crate::controller_api::NodeRegisterRequest;
-use crate::models::LocationConfigMode;
+use crate::models::{LocationConfigMode, ShardImportStatus};
 use crate::shard::TenantShardId;

 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
@@ -62,3 +62,10 @@ pub struct ValidateResponseTenant {
    pub id: TenantShardId,
    pub valid: bool,
 }
+
+/// Payload carrying the [`ShardImportStatus`] of one timeline on one tenant shard.
+///
+/// NOTE(review): presumably sent by the pageserver as an upcall to the control
+/// plane API; confirm the endpoint against the caller.
+#[derive(Serialize, Deserialize)]
+pub struct PutTimelineImportStatusRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
+    pub status: ShardImportStatus,
+}
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -14,8 +14,9 @@ use anyhow::{Context, Result};
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
 use azure_storage::StorageCredentials;
-use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::blob::operations::GetBlobBuilder;
+use azure_storage_blobs::blob::{Blob, CopyStatus};
+use azure_storage_blobs::container::operations::ListBlobsBuilder;
 use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
 use bytes::Bytes;
 use futures::FutureExt;
@@ -253,53 +254,15 @@ impl AzureBlobStorage {
        download
    }

-    async fn permit(
-        &self,
-        kind: RequestKind,
-        cancel: &CancellationToken,
-    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
-        let acquire = self.concurrency_limiter.acquire(kind);
-
-        tokio::select! {
-            permit = acquire => Ok(permit.expect("never closed")),
-            _ = cancel.cancelled() => Err(Cancelled),
-        }
-    }
-
-    pub fn container_name(&self) -> &str {
-        &self.container_name
-    }
-}
-
-fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
-    let mut res = Metadata::new();
-    for (k, v) in metadata.0.into_iter() {
-        res.insert(k, v);
-    }
-    res
-}
-
-fn to_download_error(error: azure_core::Error) -> DownloadError {
-    if let Some(http_err) = error.as_http_error() {
-        match http_err.status() {
-            StatusCode::NotFound => DownloadError::NotFound,
-            StatusCode::NotModified => DownloadError::Unmodified,
-            StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)),
-            _ => DownloadError::Other(anyhow::Error::new(error)),
-        }
-    } else {
-        DownloadError::Other(error.into())
-    }
-}
-
-impl RemoteStorage for AzureBlobStorage {
-    fn list_streaming(
+    fn list_streaming_for_fn<T: Default + ListingCollector>(
        &self,
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
-    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        request_kind: RequestKind,
+        customize_builder: impl Fn(ListBlobsBuilder) -> ListBlobsBuilder,
+    ) -> impl Stream<Item = Result<T, DownloadError>> {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix.map(|p| self.relative_path_to_name(p)).or_else(|| {
            self.prefix_in_container.clone().map(|mut s| {
@@ -311,7 +274,7 @@ impl RemoteStorage for AzureBlobStorage {
        });

        async_stream::stream! {
-            let _permit = self.permit(RequestKind::List, cancel).await?;
+            let _permit = self.permit(request_kind, cancel).await?;

            let mut builder = self.client.list_blobs();

@@ -327,6 +290,8 @@ impl RemoteStorage for AzureBlobStorage {
                builder = builder.max_results(MaxResults::new(limit));
            }

+            builder = customize_builder(builder);
+
            let mut next_marker = None;

            let mut timeout_try_cnt = 1;
@@ -382,26 +347,20 @@ impl RemoteStorage for AzureBlobStorage {
                    break;
                };

-                let mut res = Listing::default();
+                let mut res = T::default();
                next_marker = entry.continuation();
                let prefix_iter = entry
                    .blobs
                    .prefixes()
                    .map(|prefix| self.name_to_relative_path(&prefix.name));
-                res.prefixes.extend(prefix_iter);
+                res.add_prefixes(self, prefix_iter);

                let blob_iter = entry
                    .blobs
-                    .blobs()
-                    .map(|k| ListingObject{
-                        key: self.name_to_relative_path(&k.name),
-                        last_modified: k.properties.last_modified.into(),
-                        size: k.properties.content_length,
-                    }
-                );
+                    .blobs();

                for key in blob_iter {
-                    res.keys.push(key);
+                    res.add_blob(self, key);

                    if let Some(mut mk) = max_keys {
                        assert!(mk > 0);
@@ -423,6 +382,128 @@ impl RemoteStorage for AzureBlobStorage {
        }
    }

+    /// Waits for a concurrency-limiter permit for a request of the given `kind`.
+    ///
+    /// Returns `Err(Cancelled)` if `cancel` fires before a permit is acquired.
+    async fn permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
+        let acquire = self.concurrency_limiter.acquire(kind);
+
+        // Race the acquisition against cancellation; whichever completes first wins.
+        tokio::select! {
+            permit = acquire => Ok(permit.expect("never closed")),
+            _ = cancel.cancelled() => Err(Cancelled),
+        }
+    }
+
+    /// Returns the container name stored in this storage handle.
+    pub fn container_name(&self) -> &str {
+        &self.container_name
+    }
+}
+
+/// Accumulator for one page of an Azure blob listing.
+///
+/// `list_streaming_for_fn` creates a `Default` value per page and feeds it the
+/// page's prefixes and blobs through these two hooks.
+trait ListingCollector {
+    /// Receives the prefixes of the current page, already converted to [`RemotePath`]s.
+    fn add_prefixes(&mut self, abs: &AzureBlobStorage, prefix_it: impl Iterator<Item = RemotePath>);
+    /// Receives one blob entry of the current page.
+    fn add_blob(&mut self, abs: &AzureBlobStorage, blob: &Blob);
+}
+
+/// Plain listings keep every prefix and turn each blob into a [`ListingObject`].
+impl ListingCollector for Listing {
+    fn add_prefixes(
+        &mut self,
+        _storage: &AzureBlobStorage,
+        new_prefixes: impl Iterator<Item = RemotePath>,
+    ) {
+        self.prefixes.extend(new_prefixes);
+    }
+
+    fn add_blob(&mut self, storage: &AzureBlobStorage, blob: &Blob) {
+        let key = storage.name_to_relative_path(&blob.name);
+        let properties = &blob.properties;
+        let object = ListingObject {
+            key,
+            last_modified: properties.last_modified.into(),
+            size: properties.content_length,
+        };
+        self.keys.push(object);
+    }
+}
+
+/// Version listings ignore prefixes and record one [`crate::Version`] per blob.
+impl ListingCollector for crate::VersionListing {
+    fn add_prefixes(
+        &mut self,
+        _storage: &AzureBlobStorage,
+        _new_prefixes: impl Iterator<Item = RemotePath>,
+    ) {
+        // Prefixes are not part of a version listing; intentionally dropped.
+    }
+
+    fn add_blob(&mut self, storage: &AzureBlobStorage, blob: &Blob) {
+        // Panics if the blob entry carries no version ID.
+        let raw_id = blob.version_id.clone().expect("didn't find version ID");
+        let version = crate::Version {
+            key: storage.name_to_relative_path(&blob.name),
+            last_modified: blob.properties.last_modified.into(),
+            kind: crate::VersionKind::Version(crate::VersionId(raw_id)),
+        };
+        self.versions.push(version);
+    }
+}
+
+/// Copies every key/value pair of `metadata` into a fresh Azure [`Metadata`].
+fn to_azure_metadata(metadata: StorageMetadata) -> Metadata {
+    let mut azure_metadata = Metadata::new();
+    metadata.0.into_iter().for_each(|(key, value)| {
+        azure_metadata.insert(key, value);
+    });
+    azure_metadata
+}
+
+/// Maps an Azure SDK error onto [`DownloadError`].
+///
+/// HTTP 404 becomes `NotFound`, 304 becomes `Unmodified`, 400 becomes
+/// `BadInput`; every other error (HTTP or not) becomes `Other`.
+fn to_download_error(error: azure_core::Error) -> DownloadError {
+    // Extract the status first so that `error` can be moved into the result below.
+    let status = error.as_http_error().map(|http_err| http_err.status());
+    match status {
+        Some(StatusCode::NotFound) => DownloadError::NotFound,
+        Some(StatusCode::NotModified) => DownloadError::Unmodified,
+        Some(StatusCode::BadRequest) => DownloadError::BadInput(anyhow::Error::new(error)),
+        Some(_) => DownloadError::Other(anyhow::Error::new(error)),
+        None => DownloadError::Other(error.into()),
+    }
+}
+
+impl RemoteStorage for AzureBlobStorage {
+    /// Streams plain listings (prefixes and keys) page by page.
+    ///
+    /// Thin wrapper around `list_streaming_for_fn` that collects into [`Listing`]
+    /// and leaves the list-blobs request builder untouched.
+    fn list_streaming(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> impl Stream<Item = Result<Listing, DownloadError>> {
+        // Plain listings need no extra request options.
+        let customize_builder = |builder| builder;
+        // Account for this as a regular list request, as the code did before the
+        // listing logic was shared with `list_versions`; `ListVersions` is reserved
+        // for version listings.
+        let kind = RequestKind::List;
+        self.list_streaming_for_fn(prefix, mode, max_keys, cancel, kind, customize_builder)
+    }
+
+    /// Lists all blob versions matching the arguments and returns them as a
+    /// single [`crate::VersionListing`].
+    ///
+    /// Pages are produced by `list_streaming_for_fn` with version listing enabled
+    /// and concatenated in arrival order. The first page error aborts the listing.
+    /// A stream that yields no page results in an empty listing rather than a panic.
+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> std::result::Result<crate::VersionListing, DownloadError> {
+        // Ask Azure to include every blob version in the listing.
+        let customize_builder = |builder: ListBlobsBuilder| builder.include_versions(true);
+        let kind = RequestKind::ListVersions;
+
+        let mut stream = std::pin::pin!(self.list_streaming_for_fn(
+            prefix,
+            mode,
+            max_keys,
+            cancel,
+            kind,
+            customize_builder
+        ));
+        let mut combined = crate::VersionListing::default();
+        while let Some(page) = stream.next().await {
+            let page: crate::VersionListing = page?;
+            combined.versions.extend(page.versions);
+        }
+        Ok(combined)
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
@@ -532,7 +613,12 @@ impl RemoteStorage for AzureBlobStorage {
        let mut builder = blob_client.get();

        if let Some(ref etag) = opts.etag {
-            builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()))
+            builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string()));
+        }
+
+        if let Some(ref version_id) = opts.version_id {
+            let version_id = azure_storage_blobs::prelude::VersionId::new(version_id.0.clone());
+            builder = builder.blob_versioning(version_id);
        }

        if let Some((start, end)) = opts.byte_range() {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -176,6 +176,32 @@ pub struct Listing {
    pub keys: Vec<ListingObject>,
 }

+/// Result of a version listing: a flat list of [`Version`] entries.
+#[derive(Default)]
+pub struct VersionListing {
+    pub versions: Vec<Version>,
+}
+
+/// One entry of a [`VersionListing`]: either a concrete object version or a
+/// deletion marker for `key`, as told by `kind`.
+pub struct Version {
+    pub key: RemotePath,
+    pub last_modified: SystemTime,
+    pub kind: VersionKind,
+}
+
+impl Version {
+    /// Returns the version ID of a real object version, or `None` when this
+    /// entry is a deletion marker.
+    pub fn version_id(&self) -> Option<&VersionId> {
+        let VersionKind::Version(id) = &self.kind else {
+            return None;
+        };
+        Some(id)
+    }
+}
+
+/// Distinguishes deletion markers from real object versions; only the latter
+/// carry a [`VersionId`].
+#[derive(Debug)]
+pub enum VersionKind {
+    DeletionMarker,
+    Version(VersionId),
+}
+
 /// Options for downloads. The default value is a plain GET.
 pub struct DownloadOpts {
    /// If given, returns [`DownloadError::Unmodified`] if the object still has
@@ -186,6 +212,8 @@ pub struct DownloadOpts {
    /// The end of the byte range to download, or unbounded. Must be after the
    /// start bound.
    pub byte_end: Bound<u64>,
+    /// Optionally request a specific version of a key
+    pub version_id: Option<VersionId>,
    /// Indicate whether we're downloading something small or large: this indirectly controls
    /// timeouts: for something like an index/manifest/heatmap, we should time out faster than
    /// for layer files
@@ -197,12 +225,16 @@ pub enum DownloadKind {
    Small,
 }

+/// Version identifier of an object, stored as the string the storage backend
+/// reported.
+#[derive(Debug, Clone)]
+pub struct VersionId(pub String);
+
 impl Default for DownloadOpts {
    fn default() -> Self {
        Self {
            etag: Default::default(),
            byte_start: Bound::Unbounded,
            byte_end: Bound::Unbounded,
+            version_id: None,
            kind: DownloadKind::Large,
        }
    }
@@ -295,6 +327,14 @@ pub trait RemoteStorage: Send + Sync + 'static {
        Ok(combined)
    }

+    /// Lists object versions (and, where the backend reports them, deletion
+    /// markers) and returns them as one [`VersionListing`].
+    ///
+    /// NOTE(review): the handling of `max_keys` is backend specific; the S3
+    /// implementation returns an error once the limit is reached instead of
+    /// truncating the result. Confirm the intended contract.
+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<VersionListing, DownloadError>;
+
    /// Obtain metadata information about an object.
    async fn head_object(
        &self,
@@ -475,6 +515,22 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    // See [`RemoteStorage::list_versions`].
+    // Forwards the call unchanged to whichever backend variant `self` holds.
+    pub async fn list_versions<'a>(
+        &'a self,
+        prefix: Option<&'a RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &'a CancellationToken,
+    ) -> Result<VersionListing, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list_versions(prefix, mode, max_keys, cancel).await,
+        }
+    }
+
    // See [`RemoteStorage::head_object`].
    pub async fn head_object(
        &self,
@@ -727,6 +783,7 @@ impl ConcurrencyLimiter {
            RequestKind::Copy => &self.write,
            RequestKind::TimeTravel => &self.write,
            RequestKind::Head => &self.read,
+            RequestKind::ListVersions => &self.read,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -445,6 +445,16 @@ impl RemoteStorage for LocalFs {
        }
    }

+    /// Version listing is not supported by the local filesystem backend.
+    ///
+    /// Always returns `DownloadError::Other`. This method is reachable through
+    /// `GenericRemoteStorage::list_versions`, so it reports an error instead of
+    /// panicking the calling task.
+    async fn list_versions(
+        &self,
+        _prefix: Option<&RemotePath>,
+        _mode: ListingMode,
+        _max_keys: Option<NonZeroU32>,
+        _cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        Err(DownloadError::Other(anyhow::anyhow!(
+            "list_versions is not supported by the local filesystem storage"
+        )))
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -14,6 +14,7 @@ pub(crate) enum RequestKind {
    Copy = 4,
    TimeTravel = 5,
    Head = 6,
+    ListVersions = 7,
 }

 use RequestKind::*;
@@ -29,6 +30,7 @@ impl RequestKind {
            Copy => "copy_object",
            TimeTravel => "time_travel_recover",
            Head => "head_object",
+            ListVersions => "list_versions",
        }
    }
    const fn as_index(&self) -> usize {
@@ -36,7 +38,10 @@ impl RequestKind {
    }
 }

-const REQUEST_KIND_COUNT: usize = 7;
+// Request kinds in index order: `build_with` asserts that each entry's
+// `as_index()` equals its position in this slice, so a new `RequestKind`
+// variant must be appended here in discriminant order.
+const REQUEST_KIND_LIST: &[RequestKind] =
+    &[Get, Put, Delete, List, Copy, TimeTravel, Head, ListVersions];
+
+// Derived from the list above so the two cannot drift apart.
+const REQUEST_KIND_COUNT: usize = REQUEST_KIND_LIST.len();
 pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);

 impl<C> RequestTyped<C> {
@@ -45,12 +50,11 @@ impl<C> RequestTyped<C> {
    }

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
-        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
+        let mut it = REQUEST_KIND_LIST.iter();
        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
-            f(next)
+            f(*next)
        });

        if let Some(next) = it.next() {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -21,9 +21,8 @@ use aws_sdk_s3::config::{AsyncSleep, IdentityCache, Region, SharedAsyncSleep};
 use aws_sdk_s3::error::SdkError;
 use aws_sdk_s3::operation::get_object::GetObjectError;
 use aws_sdk_s3::operation::head_object::HeadObjectError;
-use aws_sdk_s3::types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass};
+use aws_sdk_s3::types::{Delete, ObjectIdentifier, StorageClass};
 use aws_smithy_async::rt::sleep::TokioSleep;
-use aws_smithy_types::DateTime;
 use aws_smithy_types::body::SdkBody;
 use aws_smithy_types::byte_stream::ByteStream;
 use aws_smithy_types::date_time::ConversionError;
@@ -46,7 +45,7 @@ use crate::support::PermitCarrying;
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
    MAX_KEYS_PER_DELETE_S3, REMOTE_STORAGE_PREFIX_SEPARATOR, RemotePath, RemoteStorage,
-    TimeTravelError, TimeoutOrCancel,
+    TimeTravelError, TimeoutOrCancel, Version, VersionId, VersionKind, VersionListing,
 };

 /// AWS S3 storage.
@@ -66,6 +65,7 @@ struct GetObjectRequest {
    key: String,
    etag: Option<String>,
    range: Option<String>,
+    version_id: Option<String>,
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
@@ -251,6 +251,7 @@ impl S3Bucket {
            .get_object()
            .bucket(request.bucket)
            .key(request.key)
+            .set_version_id(request.version_id)
            .set_range(request.range);

        if let Some(etag) = request.etag {
@@ -405,6 +406,124 @@ impl S3Bucket {
        Ok(())
    }

+    /// Lists all object versions and deletion markers below `prefix` (or below
+    /// `prefix_in_bucket` when no prefix is given), following the pagination
+    /// markers until the listing is complete.
+    ///
+    /// * `_permit` - not used by the body; it only requires the caller to hold a
+    ///   concurrency permit for the duration of the call.
+    /// * `mode` - with `ListingMode::WithDelimiter` the request sets the prefix
+    ///   separator as delimiter.
+    /// * `max_keys` - if given, the call fails with `DownloadError::Other` once a
+    ///   non-final page brings the collected entry count to this limit; the
+    ///   result is never truncated.
+    ///
+    /// Each page request is retried with backoff; cancellation is treated as
+    /// permanent and surfaces as `DownloadError::Cancelled`. Entries lacking a
+    /// key, version id or modification time are reported as
+    /// `DownloadError::Other` instead of panicking.
+    async fn list_versions_with_permit(
+        &self,
+        _permit: &tokio::sync::SemaphorePermit<'_>,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        // get the passed prefix or if it is not set use prefix_in_bucket value
+        let prefix = prefix
+            .map(|p| self.relative_path_to_s3_object(p))
+            .or_else(|| self.prefix_in_bucket.clone());
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |e: &_| matches!(e, DownloadError::Cancelled);
+
+        let mut key_marker = None;
+        let mut version_id_marker = None;
+        let mut versions_and_deletes = Vec::new();
+
+        loop {
+            let response = backoff::retry(
+                || async {
+                    let mut request = self
+                        .client
+                        .list_object_versions()
+                        .bucket(self.bucket_name.clone())
+                        .set_prefix(prefix.clone())
+                        .set_key_marker(key_marker.clone())
+                        .set_version_id_marker(version_id_marker.clone());
+
+                    if let ListingMode::WithDelimiter = mode {
+                        request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+                    }
+
+                    let op = request.send();
+
+                    tokio::select! {
+                        res = op => res.map_err(|e| DownloadError::Other(e.into())),
+                        _ = cancel.cancelled() => Err(DownloadError::Cancelled),
+                    }
+                },
+                is_permanent,
+                warn_threshold,
+                max_retries,
+                "listing object versions",
+                cancel,
+            )
+            .await
+            .ok_or_else(|| DownloadError::Cancelled)
+            .and_then(|x| x)?;
+
+            tracing::trace!(
+                "  Got List response version_id_marker={:?}, key_marker={:?}",
+                response.version_id_marker,
+                response.key_marker
+            );
+            // An incomplete entry is turned into an error that flows through
+            // `process_results` below, rather than panicking on a bad response.
+            let versions = response
+                .versions
+                .unwrap_or_default()
+                .into_iter()
+                .map(|version| -> anyhow::Result<Version> {
+                    let key = version
+                        .key
+                        .ok_or_else(|| anyhow::anyhow!("object version without a key"))?;
+                    let version_id = version.version_id.ok_or_else(|| {
+                        anyhow::anyhow!("object version of key={key} without a version id")
+                    })?;
+                    let last_modified = version.last_modified.ok_or_else(|| {
+                        anyhow::anyhow!("object version of key={key} without last_modified")
+                    })?;
+                    let last_modified = SystemTime::try_from(last_modified)?;
+                    Ok(Version {
+                        key: self.s3_object_to_relative_path(&key),
+                        last_modified,
+                        kind: crate::VersionKind::Version(VersionId(version_id)),
+                    })
+                });
+            let deletes = response
+                .delete_markers
+                .unwrap_or_default()
+                .into_iter()
+                .map(|marker| -> anyhow::Result<Version> {
+                    let key = marker
+                        .key
+                        .ok_or_else(|| anyhow::anyhow!("delete marker without a key"))?;
+                    let last_modified = marker.last_modified.ok_or_else(|| {
+                        anyhow::anyhow!("delete marker of key={key} without last_modified")
+                    })?;
+                    let last_modified = SystemTime::try_from(last_modified)?;
+                    Ok(Version {
+                        key: self.s3_object_to_relative_path(&key),
+                        last_modified,
+                        kind: crate::VersionKind::DeletionMarker,
+                    })
+                });
+            itertools::process_results(versions.chain(deletes), |n_vds| {
+                versions_and_deletes.extend(n_vds)
+            })
+            .map_err(DownloadError::Other)?;
+            fn none_if_empty(v: Option<String>) -> Option<String> {
+                v.filter(|v| !v.is_empty())
+            }
+            version_id_marker = none_if_empty(response.next_version_id_marker);
+            key_marker = none_if_empty(response.next_key_marker);
+            if version_id_marker.is_none() {
+                // The final response is not supposed to be truncated
+                if response.is_truncated.unwrap_or_default() {
+                    return Err(DownloadError::Other(anyhow::anyhow!(
+                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
+                    )));
+                }
+                break;
+            }
+            // Only reached when more pages remain: give up rather than keep
+            // growing the in-memory list past the caller's limit.
+            if let Some(max_keys) = max_keys {
+                if versions_and_deletes.len() >= max_keys.get().try_into().unwrap() {
+                    return Err(DownloadError::Other(anyhow::anyhow!("too many versions")));
+                }
+            }
+        }
+        Ok(VersionListing {
+            versions: versions_and_deletes,
+        })
+    }
+
    pub fn bucket_name(&self) -> &str {
        &self.bucket_name
    }
@@ -621,6 +740,19 @@ impl RemoteStorage for S3Bucket {
        }
    }

+    /// Acquires a `ListVersions` permit and delegates to
+    /// `list_versions_with_permit`, holding the permit for the whole listing.
+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        let permit = self.permit(RequestKind::ListVersions, cancel).await?;
+        let listing = self
+            .list_versions_with_permit(&permit, prefix, mode, max_keys, cancel)
+            .await?;
+        Ok(listing)
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
@@ -801,6 +933,7 @@ impl RemoteStorage for S3Bucket {
                key: self.relative_path_to_s3_object(from),
                etag: opts.etag.as_ref().map(|e| e.to_string()),
                range: opts.byte_range_header(),
+                version_id: opts.version_id.as_ref().map(|v| v.0.to_owned()),
            },
            cancel,
        )
@@ -845,94 +978,25 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::TimeTravel;
        let permit = self.permit(kind, cancel).await?;

-        let timestamp = DateTime::from(timestamp);
-        let done_if_after = DateTime::from(done_if_after);
-
        tracing::trace!("Target time: {timestamp:?}, done_if_after {done_if_after:?}");

-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let prefix = prefix
-            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone());
+        // Limit the number of versions deletions, mostly so that we don't
+        // keep requesting forever if the list is too long, as we'd put the
+        // list in RAM.
+        // Building a list of 100k entries that reaches the limit roughly takes
+        // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
+        const COMPLEXITY_LIMIT: Option<NonZeroU32> = NonZeroU32::new(100_000);

-        let warn_threshold = 3;
-        let max_retries = 10;
-        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
-
-        let mut key_marker = None;
-        let mut version_id_marker = None;
-        let mut versions_and_deletes = Vec::new();
-
-        loop {
-            let response = backoff::retry(
-                || async {
-                    let op = self
-                        .client
-                        .list_object_versions()
-                        .bucket(self.bucket_name.clone())
-                        .set_prefix(prefix.clone())
-                        .set_key_marker(key_marker.clone())
-                        .set_version_id_marker(version_id_marker.clone())
-                        .send();
-
-                    tokio::select! {
-                        res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
-                        _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
-                    }
-                },
-                is_permanent,
-                warn_threshold,
-                max_retries,
-                "listing object versions for time_travel_recover",
-                cancel,
-            )
+        let mode = ListingMode::NoDelimiter;
+        let version_listing = self
+            .list_versions_with_permit(&permit, prefix, mode, COMPLEXITY_LIMIT, cancel)
            .await
-            .ok_or_else(|| TimeTravelError::Cancelled)
-            .and_then(|x| x)?;
-
-            tracing::trace!(
-                "  Got List response version_id_marker={:?}, key_marker={:?}",
-                response.version_id_marker,
-                response.key_marker
-            );
-            let versions = response
-                .versions
-                .unwrap_or_default()
-                .into_iter()
-                .map(VerOrDelete::from_version);
-            let deletes = response
-                .delete_markers
-                .unwrap_or_default()
-                .into_iter()
-                .map(VerOrDelete::from_delete_marker);
-            itertools::process_results(versions.chain(deletes), |n_vds| {
-                versions_and_deletes.extend(n_vds)
-            })
-            .map_err(TimeTravelError::Other)?;
-            fn none_if_empty(v: Option<String>) -> Option<String> {
-                v.filter(|v| !v.is_empty())
-            }
-            version_id_marker = none_if_empty(response.next_version_id_marker);
-            key_marker = none_if_empty(response.next_key_marker);
-            if version_id_marker.is_none() {
-                // The final response is not supposed to be truncated
-                if response.is_truncated.unwrap_or_default() {
-                    return Err(TimeTravelError::Other(anyhow::anyhow!(
-                        "Received truncated ListObjectVersions response for prefix={prefix:?}"
-                    )));
-                }
-                break;
-            }
-            // Limit the number of versions deletions, mostly so that we don't
-            // keep requesting forever if the list is too long, as we'd put the
-            // list in RAM.
-            // Building a list of 100k entries that reaches the limit roughly takes
-            // 40 seconds, and roughly corresponds to tenants of 2 TiB physical size.
-            const COMPLEXITY_LIMIT: usize = 100_000;
-            if versions_and_deletes.len() >= COMPLEXITY_LIMIT {
-                return Err(TimeTravelError::TooManyVersions);
-            }
-        }
+            .map_err(|err| match err {
+                DownloadError::Other(e) => TimeTravelError::Other(e),
+                DownloadError::Cancelled => TimeTravelError::Cancelled,
+                other => TimeTravelError::Other(other.into()),
+            })?;
+        let versions_and_deletes = version_listing.versions;

        tracing::info!(
            "Built list for time travel with {} versions and deletions",
@@ -948,24 +1012,26 @@ impl RemoteStorage for S3Bucket {
        let mut vds_for_key = HashMap::<_, Vec<_>>::new();

        for vd in &versions_and_deletes {
-            let VerOrDelete {
-                version_id, key, ..
-            } = &vd;
-            if version_id == "null" {
+            let Version { key, .. } = &vd;
+            let version_id = vd.version_id().map(|v| v.0.as_str());
+            if version_id == Some("null") {
                return Err(TimeTravelError::Other(anyhow!(
                    "Received ListVersions response for key={key} with version_id='null', \
                    indicating either disabled versioning, or legacy objects with null version id values"
                )));
            }
-            tracing::trace!(
-                "Parsing version key={key} version_id={version_id} kind={:?}",
-                vd.kind
-            );
+            tracing::trace!("Parsing version key={key} kind={:?}", vd.kind);

            vds_for_key.entry(key).or_default().push(vd);
        }
+
+        let warn_threshold = 3;
+        let max_retries = 10;
+        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);
+
        for (key, versions) in vds_for_key {
            let last_vd = versions.last().unwrap();
+            let key = self.relative_path_to_s3_object(key);
            if last_vd.last_modified > done_if_after {
                tracing::trace!("Key {key} has version later than done_if_after, skipping");
                continue;
@@ -990,11 +1056,11 @@ impl RemoteStorage for S3Bucket {
                do_delete = true;
            } else {
                match &versions[version_to_restore_to - 1] {
-                    VerOrDelete {
-                        kind: VerOrDeleteKind::Version,
-                        version_id,
+                    Version {
+                        kind: VersionKind::Version(version_id),
                        ..
                    } => {
+                        let version_id = &version_id.0;
                        tracing::trace!("Copying old version {version_id} for {key}...");
                        // Restore the state to the last version by copying
                        let source_id =
@@ -1006,7 +1072,7 @@ impl RemoteStorage for S3Bucket {
                                    .client
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
-                                    .key(key)
+                                    .key(&key)
                                    .set_storage_class(self.upload_storage_class.clone())
                                    .copy_source(&source_id)
                                    .send();
@@ -1027,8 +1093,8 @@ impl RemoteStorage for S3Bucket {
                        .and_then(|x| x)?;
                        tracing::info!(%version_id, %key, "Copied old version in S3");
                    }
-                    VerOrDelete {
-                        kind: VerOrDeleteKind::DeleteMarker,
+                    Version {
+                        kind: VersionKind::DeletionMarker,
                        ..
                    } => {
                        do_delete = true;
@@ -1036,7 +1102,7 @@ impl RemoteStorage for S3Bucket {
                }
            };
            if do_delete {
-                if matches!(last_vd.kind, VerOrDeleteKind::DeleteMarker) {
+                if matches!(last_vd.kind, VersionKind::DeletionMarker) {
                    // Key has since been deleted (but there was some history), no need to do anything
                    tracing::trace!("Key {key} already deleted, skipping.");
                } else {
@@ -1064,62 +1130,6 @@ impl RemoteStorage for S3Bucket {
    }
 }

-// Save RAM and only store the needed data instead of the entire ObjectVersion/DeleteMarkerEntry
-struct VerOrDelete {
-    kind: VerOrDeleteKind,
-    last_modified: DateTime,
-    version_id: String,
-    key: String,
-}
-
-#[derive(Debug)]
-enum VerOrDeleteKind {
-    Version,
-    DeleteMarker,
-}
-
-impl VerOrDelete {
-    fn with_kind(
-        kind: VerOrDeleteKind,
-        last_modified: Option<DateTime>,
-        version_id: Option<String>,
-        key: Option<String>,
-    ) -> anyhow::Result<Self> {
-        let lvk = (last_modified, version_id, key);
-        let (Some(last_modified), Some(version_id), Some(key)) = lvk else {
-            anyhow::bail!(
-                "One (or more) of last_modified, key, and id is None. \
-            Is versioning enabled in the bucket? last_modified={:?}, version_id={:?}, key={:?}",
-                lvk.0,
-                lvk.1,
-                lvk.2,
-            );
-        };
-        Ok(Self {
-            kind,
-            last_modified,
-            version_id,
-            key,
-        })
-    }
-    fn from_version(v: ObjectVersion) -> anyhow::Result<Self> {
-        Self::with_kind(
-            VerOrDeleteKind::Version,
-            v.last_modified,
-            v.version_id,
-            v.key,
-        )
-    }
-    fn from_delete_marker(v: DeleteMarkerEntry) -> anyhow::Result<Self> {
-        Self::with_kind(
-            VerOrDeleteKind::DeleteMarker,
-            v.last_modified,
-            v.version_id,
-            v.key,
-        )
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use std::num::NonZeroUsize;
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -139,6 +139,20 @@ impl RemoteStorage for UnreliableWrapper {
        self.inner.list(prefix, mode, max_keys, cancel).await
    }

+    async fn list_versions(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<crate::VersionListing, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
+            .map_err(DownloadError::Other)?;
+        self.inner
+            .list_versions(prefix, mode, max_keys, cancel)
+            .await
+    }
+
    async fn head_object(
        &self,
        key: &RemotePath,
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -78,6 +78,7 @@ metrics.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TODO refactor that
 pageserver_compaction.workspace = true
+pem.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
--- a/pageserver/benches/bench_ingest.rs
+++ b/pageserver/benches/bench_ingest.rs
@@ -11,6 +11,7 @@ use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::storage_layer::InMemoryLayer;
 use pageserver::{page_cache, virtual_file};
 use pageserver_api::key::Key;
+use pageserver_api::models::virtual_file::IoMode;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::value::Value;
 use tokio_util::sync::CancellationToken;
@@ -21,13 +22,14 @@ use wal_decoder::serialized_batch::SerializedValueBatch;
 // A very cheap hash for generating non-sequential keys.
 fn murmurhash32(mut h: u32) -> u32 {
    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
+    h = h.wrapping_mul(0x85ebca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2ae35);
    h ^= h >> 16;
    h
 }

+#[derive(serde::Serialize, Clone, Copy, Debug)]
 enum KeyLayout {
    /// Sequential unique keys
    Sequential,
@@ -37,6 +39,7 @@ enum KeyLayout {
    RandomReuse(u32),
 }

+#[derive(serde::Serialize, Clone, Copy, Debug)]
 enum WriteDelta {
    Yes,
    No,
@@ -138,12 +141,15 @@ async fn ingest(
 /// Wrapper to instantiate a tokio runtime
 fn ingest_main(
    conf: &'static PageServerConf,
+    io_mode: IoMode,
    put_size: usize,
    put_count: usize,
    key_layout: KeyLayout,
    write_delta: WriteDelta,
 ) {
-    let runtime = tokio::runtime::Builder::new_current_thread()
+    pageserver::virtual_file::set_io_mode(io_mode);
+
+    let runtime = tokio::runtime::Builder::new_multi_thread()
        .enable_all()
        .build()
        .unwrap();
@@ -174,93 +180,207 @@ fn criterion_benchmark(c: &mut Criterion) {
    virtual_file::init(
        16384,
        virtual_file::io_engine_for_bench(),
+        // immaterial, each `ingest_main` invocation below overrides this
        conf.virtual_file_io_mode,
+        // without actually doing syncs, buffered writes have an unfair advantage over direct IO writes
        virtual_file::SyncMode::Sync,
    );
    page_cache::init(conf.page_cache_size);

-    {
-        let mut group = c.benchmark_group("ingest-small-values");
-        let put_size = 100usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
-        group.sample_size(10);
-        group.bench_function("ingest 128MB/100b seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Random,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b rand-1024keys", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::RandomReuse(0x3ff),
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/100b seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
-        });
+    #[derive(serde::Serialize)]
+    struct ExplodedParameters {
+        io_mode: IoMode,
+        volume_mib: usize,
+        key_size: usize,
+        key_layout: KeyLayout,
+        write_delta: WriteDelta,
    }
-
-    {
-        let mut group = c.benchmark_group("ingest-big-values");
-        let put_size = 8192usize;
-        let put_count = 128 * 1024 * 1024 / put_size;
-        group.throughput(criterion::Throughput::Bytes((put_size * put_count) as u64));
+    #[derive(Clone)]
+    struct HandPickedParameters {
+        volume_mib: usize,
+        key_size: usize,
+        key_layout: KeyLayout,
+        write_delta: WriteDelta,
+    }
+    let expect = vec![
+        // Small values (100b) tests
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::Random,
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::RandomReuse(0x3ff),
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 100,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::No,
+        },
+        // Large values (8k) tests
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 8192,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::Yes,
+        },
+        HandPickedParameters {
+            volume_mib: 128,
+            key_size: 8192,
+            key_layout: KeyLayout::Sequential,
+            write_delta: WriteDelta::No,
+        },
+    ];
+    let exploded_parameters = {
+        let mut out = Vec::new();
+        for io_mode in [
+            IoMode::Buffered,
+            #[cfg(target_os = "linux")]
+            IoMode::Direct,
+        ] {
+            for param in expect.clone() {
+                let HandPickedParameters {
+                    volume_mib,
+                    key_size,
+                    key_layout,
+                    write_delta,
+                } = param;
+                out.push(ExplodedParameters {
+                    io_mode,
+                    volume_mib,
+                    key_size,
+                    key_layout,
+                    write_delta,
+                });
+            }
+        }
+        out
+    };
+    impl ExplodedParameters {
+        fn benchmark_id(&self) -> String {
+            let ExplodedParameters {
+                io_mode,
+                volume_mib,
+                key_size,
+                key_layout,
+                write_delta,
+            } = self;
+            format!(
+                "io_mode={io_mode:?} volume_mib={volume_mib:?} key_size_bytes={key_size:?} key_layout={key_layout:?} write_delta={write_delta:?}"
+            )
+        }
+    }
+    let mut group = c.benchmark_group("ingest");
+    for params in exploded_parameters {
+        let id = params.benchmark_id();
+        let ExplodedParameters {
+            io_mode,
+            volume_mib,
+            key_size,
+            key_layout,
+            write_delta,
+        } = params;
+        let put_count = volume_mib * 1024 * 1024 / key_size;
+        group.throughput(criterion::Throughput::Bytes((key_size * put_count) as u64));
        group.sample_size(10);
-        group.bench_function("ingest 128MB/8k seq", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::Yes,
-                )
-            })
-        });
-        group.bench_function("ingest 128MB/8k seq, no delta", |b| {
-            b.iter(|| {
-                ingest_main(
-                    conf,
-                    put_size,
-                    put_count,
-                    KeyLayout::Sequential,
-                    WriteDelta::No,
-                )
-            })
+        group.bench_function(id, |b| {
+            b.iter(|| ingest_main(conf, io_mode, key_size, put_count, key_layout, write_delta))
        });
    }
 }

 criterion_group!(benches, criterion_benchmark);
 criterion_main!(benches);
+
+/*
+cargo bench --bench bench_ingest
+
+im4gn.2xlarge:
+
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.8491 s 1.8540 s 1.8592 s]
+                        thrpt:  [68.847 MiB/s 69.039 MiB/s 69.222 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [2.6976 s 2.7123 s 2.7286 s]
+                        thrpt:  [46.911 MiB/s 47.193 MiB/s 47.450 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
+                        time:   [1.7433 s 1.7510 s 1.7600 s]
+                        thrpt:  [72.729 MiB/s 73.099 MiB/s 73.423 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [499.63 ms 500.07 ms 500.46 ms]
+                        thrpt:  [255.77 MiB/s 255.96 MiB/s 256.19 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [456.97 ms 459.61 ms 461.92 ms]
+                        thrpt:  [277.11 MiB/s 278.50 MiB/s 280.11 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [158.82 ms 159.16 ms 159.56 ms]
+                        thrpt:  [802.22 MiB/s 804.24 MiB/s 805.93 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.8856 s 1.8997 s 1.9179 s]
+                        thrpt:  [66.740 MiB/s 67.380 MiB/s 67.882 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [2.7468 s 2.7625 s 2.7785 s]
+                        thrpt:  [46.068 MiB/s 46.335 MiB/s 46.600 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes
+                        time:   [1.7689 s 1.7726 s 1.7767 s]
+                        thrpt:  [72.045 MiB/s 72.208 MiB/s 72.363 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [497.64 ms 498.60 ms 499.67 ms]
+                        thrpt:  [256.17 MiB/s 256.72 MiB/s 257.21 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [493.72 ms 505.07 ms 518.03 ms]
+                        thrpt:  [247.09 MiB/s 253.43 MiB/s 259.26 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [267.76 ms 267.85 ms 267.96 ms]
+                        thrpt:  [477.69 MiB/s 477.88 MiB/s 478.03 MiB/s]
+
+Hetzner AX102:
+
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.0683 s 1.1006 s 1.1386 s]
+                        thrpt:  [112.42 MiB/s 116.30 MiB/s 119.82 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [1.5719 s 1.6012 s 1.6228 s]
+                        thrpt:  [78.877 MiB/s 79.938 MiB/s 81.430 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Y...
+                        time:   [1.1095 s 1.1331 s 1.1580 s]
+                        thrpt:  [110.53 MiB/s 112.97 MiB/s 115.37 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [303.20 ms 307.83 ms 311.90 ms]
+                        thrpt:  [410.39 MiB/s 415.81 MiB/s 422.16 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [406.34 ms 429.37 ms 451.63 ms]
+                        thrpt:  [283.42 MiB/s 298.11 MiB/s 315.00 MiB/s]
+ingest/io_mode=Buffered volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [134.01 ms 135.78 ms 137.48 ms]
+                        thrpt:  [931.03 MiB/s 942.68 MiB/s 955.12 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=Yes
+                        time:   [1.0406 s 1.0580 s 1.0772 s]
+                        thrpt:  [118.83 MiB/s 120.98 MiB/s 123.00 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Random write_delta=Yes
+                        time:   [1.5059 s 1.5339 s 1.5625 s]
+                        thrpt:  [81.920 MiB/s 83.448 MiB/s 84.999 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=RandomReuse(1023) write_delta=Yes
+                        time:   [1.0714 s 1.0934 s 1.1161 s]
+                        thrpt:  [114.69 MiB/s 117.06 MiB/s 119.47 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=100 key_layout=Sequential write_delta=No
+                        time:   [262.68 ms 265.14 ms 267.71 ms]
+                        thrpt:  [478.13 MiB/s 482.76 MiB/s 487.29 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=Yes
+                        time:   [375.19 ms 393.80 ms 411.40 ms]
+                        thrpt:  [311.14 MiB/s 325.04 MiB/s 341.16 MiB/s]
+ingest/io_mode=Direct volume_mib=128 key_size_bytes=8192 key_layout=Sequential write_delta=No
+                        time:   [123.02 ms 123.85 ms 124.66 ms]
+                        thrpt:  [1.0027 GiB/s 1.0093 GiB/s 1.0161 GiB/s]
+*/
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -419,6 +419,23 @@ impl Client {
        }
    }

+    pub async fn timeline_detail(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+    ) -> Result<TimelineInfo> {
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        );
+
+        self.request(Method::GET, &uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
    pub async fn timeline_archival_config(
        &self,
        tenant_shard_id: TenantShardId,
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -68,6 +68,13 @@ pub(crate) struct Args {
    targets: Option<Vec<TenantTimelineId>>,
 }

+/// State shared by all clients
+#[derive(Debug)]
+struct SharedState {
+    start_work_barrier: tokio::sync::Barrier,
+    live_stats: LiveStats,
+}
+
 #[derive(Debug, Default)]
 struct LiveStats {
    completed_requests: AtomicU64,
@@ -240,24 +247,26 @@ async fn main_impl(
        all_ranges
    };

-    let live_stats = Arc::new(LiveStats::default());
-
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = args.num_clients.get() * timelines.len();
    let num_main_impl = 1;

-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_live_stats_dump + num_work_sender_tasks + num_main_impl,
-    ));
+    let shared_state = Arc::new(SharedState {
+        start_work_barrier: tokio::sync::Barrier::new(
+            num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        ),
+        live_stats: LiveStats::default(),
+    });
+    let cancel = CancellationToken::new();

+    let ss = shared_state.clone();
    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
        async move {
-            start_work_barrier.wait().await;
+            ss.start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let stats = &ss.live_stats;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let missed = stats.missed.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
@@ -270,14 +279,12 @@ async fn main_impl(
        }
    });

-    let cancel = CancellationToken::new();
-
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
-        let live_stats = live_stats.clone();
-        let start_work_barrier = start_work_barrier.clone();
+        let ss = shared_state.clone();
+        let cancel = cancel.clone();
        let ranges: Vec<KeyRange> = all_ranges
            .iter()
            .filter(|r| r.timeline == worker_id.timeline)
@@ -287,85 +294,8 @@ async fn main_impl(
            rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
                .unwrap();

-        let cancel = cancel.clone();
        Box::pin(async move {
-            let client =
-                pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-                    .await
-                    .unwrap();
-            let mut client = client
-                .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
-                .await
-                .unwrap();
-
-            start_work_barrier.wait().await;
-            let client_start = Instant::now();
-            let mut ticks_processed = 0;
-            let mut inflight = VecDeque::new();
-            while !cancel.is_cancelled() {
-                // Detect if a request took longer than the RPS rate
-                if let Some(period) = &rps_period {
-                    let periods_passed_until_now =
-                        usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
-                            .unwrap();
-
-                    if periods_passed_until_now > ticks_processed {
-                        live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
-                    }
-                    ticks_processed = periods_passed_until_now;
-                }
-
-                while inflight.len() < args.queue_depth.get() {
-                    let start = Instant::now();
-                    let req = {
-                        let mut rng = rand::thread_rng();
-                        let r = &ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        assert!(key.is_rel_block_key());
-                        let (rel_tag, block_no) = key
-                            .to_rel_block()
-                            .expect("we filter non-rel-block keys out above");
-                        PagestreamGetPageRequest {
-                            hdr: PagestreamRequest {
-                                reqid: 0,
-                                request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                                    Lsn::MAX
-                                } else {
-                                    r.timeline_lsn
-                                },
-                                not_modified_since: r.timeline_lsn,
-                            },
-                            rel: rel_tag,
-                            blkno: block_no,
-                        }
-                    };
-                    client.getpage_send(req).await.unwrap();
-                    inflight.push_back(start);
-                }
-
-                let start = inflight.pop_front().unwrap();
-                client.getpage_recv().await.unwrap();
-                let end = Instant::now();
-                live_stats.request_done();
-                ticks_processed += 1;
-                STATS.with(|stats| {
-                    stats
-                        .borrow()
-                        .lock()
-                        .unwrap()
-                        .observe(end.duration_since(start))
-                        .unwrap();
-                });
-
-                if let Some(period) = &rps_period {
-                    let next_at = client_start
-                        + Duration::from_micros(
-                            (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
-                        );
-                    tokio::time::sleep_until(next_at.into()).await;
-                }
-            }
+            client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
        })
    };

@@ -387,7 +317,7 @@ async fn main_impl(
    };

    info!("waiting for everything to become ready");
-    start_work_barrier.wait().await;
+    shared_state.start_work_barrier.wait().await;
    info!("work started");
    if let Some(runtime) = args.runtime {
        tokio::time::sleep(runtime.into()).await;
@@ -416,3 +346,91 @@ async fn main_impl(

    anyhow::Ok(())
 }
+
+async fn client_libpq(
+    args: &Args,
+    worker_id: WorkerId,
+    shared_state: Arc<SharedState>,
+    cancel: CancellationToken,
+    rps_period: Option<Duration>,
+    ranges: Vec<KeyRange>,
+    weights: rand::distributions::weighted::WeightedIndex<i128>,
+) {
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();
+    let mut client = client
+        .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
+        .await
+        .unwrap();
+
+    shared_state.start_work_barrier.wait().await;
+    let client_start = Instant::now();
+    let mut ticks_processed = 0;
+    let mut inflight = VecDeque::new();
+    while !cancel.is_cancelled() {
+        // Detect if a request took longer than the RPS rate
+        if let Some(period) = &rps_period {
+            let periods_passed_until_now =
+                usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
+
+            if periods_passed_until_now > ticks_processed {
+                shared_state
+                    .live_stats
+                    .missed((periods_passed_until_now - ticks_processed) as u64);
+            }
+            ticks_processed = periods_passed_until_now;
+        }
+
+        while inflight.len() < args.queue_depth.get() {
+            let start = Instant::now();
+            let req = {
+                let mut rng = rand::thread_rng();
+                let r = &ranges[weights.sample(&mut rng)];
+                let key: i128 = rng.gen_range(r.start..r.end);
+                let key = Key::from_i128(key);
+                assert!(key.is_rel_block_key());
+                let (rel_tag, block_no) = key
+                    .to_rel_block()
+                    .expect("we filter non-rel-block keys out above");
+                PagestreamGetPageRequest {
+                    hdr: PagestreamRequest {
+                        reqid: 0,
+                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
+                            Lsn::MAX
+                        } else {
+                            r.timeline_lsn
+                        },
+                        not_modified_since: r.timeline_lsn,
+                    },
+                    rel: rel_tag,
+                    blkno: block_no,
+                }
+            };
+            client.getpage_send(req).await.unwrap();
+            inflight.push_back(start);
+        }
+
+        let start = inflight.pop_front().unwrap();
+        client.getpage_recv().await.unwrap();
+        let end = Instant::now();
+        shared_state.live_stats.request_done();
+        ticks_processed += 1;
+        STATS.with(|stats| {
+            stats
+                .borrow()
+                .lock()
+                .unwrap()
+                .observe(end.duration_since(start))
+                .unwrap();
+        });
+
+        if let Some(period) = &rps_period {
+            let next_at = client_start
+                + Duration::from_micros(
+                    (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
+                );
+            tokio::time::sleep_until(next_at.into()).await;
+        }
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -416,8 +416,18 @@ fn start_pageserver(
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
        .block_on(async {
+            let tls_config = storage_broker::ClientTlsConfig::new().ca_certificates(
+                conf.ssl_ca_certs
+                    .iter()
+                    .map(pem::encode)
+                    .map(storage_broker::Certificate::from_pem),
+            );
            // Note: we do not attempt connecting here (but validate endpoints sanity).
-            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
+            storage_broker::connect(
+                conf.broker_endpoint.clone(),
+                conf.broker_keepalive_interval,
+                tls_config,
+            )
        })
        .with_context(|| {
            format!(
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -17,9 +17,10 @@ use once_cell::sync::OnceCell;
 use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
+use pem::Pem;
 use postgres_backend::AuthType;
 use remote_storage::{RemotePath, RemoteStorageConfig};
-use reqwest::{Certificate, Url};
+use reqwest::Url;
 use storage_broker::Uri;
 use utils::id::{NodeId, TimelineId};
 use utils::logging::{LogFormat, SecretString};
@@ -67,8 +68,8 @@ pub struct PageServerConf {
    /// Period to reload certificate and private key from files.
    /// Default: 60s.
    pub ssl_cert_reload_period: Duration,
-    /// Trusted root CA certificates to use in https APIs.
-    pub ssl_ca_certs: Vec<Certificate>,
+    /// Trusted root CA certificates to use in https APIs in PEM format.
+    pub ssl_ca_certs: Vec<Pem>,

    /// Current availability zone. Used for traffic metrics.
    pub availability_zone: Option<String>,
@@ -118,13 +119,13 @@ pub struct PageServerConf {
    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
    pub concurrent_tenant_warmup: ConfigurableSemaphore,

-    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
+    /// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
-    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
+    /// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
+    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -224,6 +225,11 @@ pub struct PageServerConf {
    /// Does not force TLS: the client negotiates TLS usage during the handshake.
    /// Uses key and certificate from ssl_key_file/ssl_cert_file.
    pub enable_tls_page_service_api: bool,
+
+    /// Run in development mode, which disables certain safety checks
+    /// such as authentication requirements for HTTP and PostgreSQL APIs.
+    /// This is insecure and should only be used in development environments.
+    pub dev_mode: bool,
 }

 /// Token for authentication to safekeepers
@@ -397,6 +403,7 @@ impl PageServerConf {
            generate_unarchival_heatmap,
            tracing,
            enable_tls_page_service_api,
+            dev_mode,
        } = config_toml;

        let mut conf = PageServerConf {
@@ -448,6 +455,7 @@ impl PageServerConf {
            get_vectored_concurrent_io,
            tracing,
            enable_tls_page_service_api,
+            dev_mode,

            // ------------------------------------------------------------
            // fields that require additional validation or custom handling
@@ -497,7 +505,10 @@ impl PageServerConf {
            ssl_ca_certs: match ssl_ca_file {
                Some(ssl_ca_file) => {
                    let buf = std::fs::read(ssl_ca_file)?;
-                    Certificate::from_pem_bundle(&buf)?
+                    pem::parse_many(&buf)?
+                        .into_iter()
+                        .filter(|pem| pem.tag() == "CERTIFICATE")
+                        .collect()
                }
                None => Vec::new(),
            },
@@ -588,10 +599,10 @@ impl ConfigurableSemaphore {
    /// Initializse using a non-zero amount of permits.
    ///
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
-    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
+    /// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
+    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{LogicalSizeCalculationCause, TenantShard};

 mod disk_cache;
 mod metrics;
@@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker(
    }
 }

-async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) {
    const CAUSE: LogicalSizeCalculationCause =
        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -175,9 +175,9 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// [`Tenant::remote_size`]
+    /// [`TenantShard::remote_size`]
    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    /// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size
    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
        MetricsKey {
            tenant_id,
@@ -199,9 +199,9 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
    /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
        MetricsKey {
@@ -254,7 +254,7 @@ pub(super) async fn collect_all_metrics(

 async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
 where
-    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
+    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::TenantShard>)>,
 {
    let mut current_metrics: Vec<NewRawMetric> = Vec::new();

@@ -263,7 +263,9 @@ where
    while let Some((tenant_id, tenant)) = tenants.next().await {
        let mut tenant_resident_size = 0;

-        for timeline in tenant.list_timelines() {
+        let timelines = tenant.list_timelines();
+        let timelines_len = timelines.len();
+        for timeline in timelines {
            let timeline_id = timeline.timeline_id;

            match TimelineSnapshot::collect(&timeline, ctx) {
@@ -289,6 +291,11 @@ where
            tenant_resident_size += timeline.resident_physical_size();
        }

+        if timelines_len == 0 {
+            // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
+            tenant_resident_size = 1;
+        }
+
        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
        snap.to_metrics(tenant_id, Utc::now(), cache, &mut current_metrics);
    }
@@ -308,7 +315,7 @@ impl TenantSnapshot {
    ///
    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
    /// cannot just list timelines here.
-    fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
+    fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
        TenantSnapshot {
            resident_size,
            remote_size: t.remote_size(),
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -3,17 +3,19 @@ use std::collections::HashMap;
 use futures::Future;
 use pageserver_api::config::NodeMetadata;
 use pageserver_api::controller_api::{AvailabilityZone, NodeRegisterRequest};
+use pageserver_api::models::ShardImportStatus;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::upcall_api::{
-    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
-    ValidateRequestTenant, ValidateResponse,
+    PutTimelineImportStatusRequest, ReAttachRequest, ReAttachResponse, ReAttachResponseTenant,
+    ValidateRequest, ValidateRequestTenant, ValidateResponse,
 };
+use reqwest::Certificate;
 use serde::Serialize;
 use serde::de::DeserializeOwned;
 use tokio_util::sync::CancellationToken;
 use url::Url;
 use utils::generation::Generation;
-use utils::id::NodeId;
+use utils::id::{NodeId, TimelineId};
 use utils::{backoff, failpoint_support};

 use crate::config::PageServerConf;
@@ -45,6 +47,12 @@ pub trait StorageControllerUpcallApi {
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
+    fn put_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        status: ShardImportStatus,
+    ) -> impl Future<Output = Result<(), RetryForeverError>> + Send;
 }

 impl StorageControllerUpcallClient {
@@ -76,8 +84,8 @@ impl StorageControllerUpcallClient {
            client = client.default_headers(headers);
        }

-        for ssl_ca_cert in &conf.ssl_ca_certs {
-            client = client.add_root_certificate(ssl_ca_cert.clone());
+        for cert in &conf.ssl_ca_certs {
+            client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
        }

        Ok(Some(Self {
@@ -272,4 +280,30 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {

        Ok(result.into_iter().collect())
    }
+
+    /// Send a shard import status to the storage controller.
+    ///
+    /// The implementation must have at-least-once delivery semantics.
+    /// To this end, we retry the request until it succeeds. If the pageserver
+    /// restarts or crashes, the shard import will start again from the beginning.
+    #[tracing::instrument(skip_all)] // so that warning logs from retry_http_forever have context
+    async fn put_timeline_import_status(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        status: ShardImportStatus,
+    ) -> Result<(), RetryForeverError> {
+        let url = self
+            .base_url
+            .join("timeline_import_status")
+            .expect("Failed to build path");
+
+        let request = PutTimelineImportStatusRequest {
+            tenant_shard_id,
+            timeline_id,
+            status,
+        };
+
+        self.retry_http_forever(&url, request).await
+    }
 }
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -787,6 +787,15 @@ mod test {

            Ok(result)
        }
+
+        async fn put_timeline_import_status(
+            &self,
+            _tenant_shard_id: TenantShardId,
+            _timeline_id: TimelineId,
+            _status: pageserver_api::models::ShardImportStatus,
+        ) -> Result<(), RetryForeverError> {
+            unimplemented!() // test stub; panics if ever called
+        }
    }

    async fn setup(test_name: &str) -> anyhow::Result<TestSetup> {
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler(
        &ShardParameters::default(),
    );

-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

@@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler(
        &ShardParameters::default(),
    );

-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1086,7 +1086,7 @@ pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
    .expect("Failed to register metric")
 });

-/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
+/// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things
 /// like how long it took to load.
 ///
 /// Note that these are process-global metrics, _not_ per-tenant metrics.  Per-tenant
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -76,7 +76,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
 use crate::{basebackup, timed_after_cancellation};

-/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
+/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::TenantShard`] which
 /// is not yet in state [`TenantState::Active`].
 ///
 /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -158,7 +158,7 @@ pub struct TenantSharedResources {
    pub l0_flush_global_state: L0FlushGlobalState,
 }

-/// A [`Tenant`] is really an _attached_ tenant.  The configuration
+/// A [`TenantShard`] is really an _attached_ tenant.  The configuration
 /// for an attached tenant is a subset of the [`LocationConf`], represented
 /// in this struct.
 #[derive(Clone)]
@@ -245,7 +245,7 @@ pub(crate) enum SpawnMode {
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
-pub struct Tenant {
+pub struct TenantShard {
    // Global pageserver config parameters
    pub conf: &'static PageServerConf,

@@ -267,7 +267,7 @@ pub struct Tenant {
    shard_identity: ShardIdentity,

    /// The remote storage generation, used to protect S3 objects from split-brain.
-    /// Does not change over the lifetime of the [`Tenant`] object.
+    /// Does not change over the lifetime of the [`TenantShard`] object.
    ///
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
@@ -309,7 +309,7 @@ pub struct Tenant {
    // Access to global deletion queue for when this tenant wants to schedule a deletion
    deletion_queue_client: DeletionQueueClient,

-    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
+    /// Cached logical sizes updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,

@@ -337,12 +337,12 @@ pub struct Tenant {
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,

-    // Users of the Tenant such as the page service must take this Gate to avoid
-    // trying to use a Tenant which is shutting down.
+    // Users of the TenantShard such as the page service must take this Gate to avoid
+    // trying to use a TenantShard which is shutting down.
    pub(crate) gate: Gate,

    /// Throttle applied at the top of [`Timeline::get`].
-    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
+    /// All [`TenantShard::timelines`] of a given [`TenantShard`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) pagestream_throttle: Arc<throttle::Throttle>,

    pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
@@ -362,7 +362,7 @@ pub struct Tenant {

    l0_flush_global_state: L0FlushGlobalState,
 }
-impl std::fmt::Debug for Tenant {
+impl std::fmt::Debug for TenantShard {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
    }
@@ -841,7 +841,7 @@ impl Debug for SetStoppingError {
    }
 }

-/// Arguments to [`Tenant::create_timeline`].
+/// Arguments to [`TenantShard::create_timeline`].
 ///
 /// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
 /// is `None`, the result of the timeline create call is not deterministic.
@@ -876,7 +876,7 @@ pub(crate) struct CreateTimelineParamsImportPgdata {
    pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

-/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in  [`Tenant::start_creating_timeline`] in  [`Tenant::start_creating_timeline`].
+/// What is used to determine idempotency of a [`TenantShard::create_timeline`] call in [`TenantShard::start_creating_timeline`].
 ///
 /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
 ///
@@ -914,7 +914,7 @@ pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
    idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

-/// What is returned by [`Tenant::start_creating_timeline`].
+/// What is returned by [`TenantShard::start_creating_timeline`].
 #[must_use]
 enum StartCreatingTimelineResult {
    CreateGuard(TimelineCreateGuard),
@@ -943,13 +943,13 @@ struct TimelineInitAndSyncNeedsSpawnImportPgdata {
    guard: TimelineCreateGuard,
 }

-/// What is returned by [`Tenant::create_timeline`].
+/// What is returned by [`TenantShard::create_timeline`].
 enum CreateTimelineResult {
    Created(Arc<Timeline>),
    Idempotent(Arc<Timeline>),
-    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
+    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`TenantShard::timelines`] when
    /// we return this result, nor will this concrete object ever be added there.
-    /// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
+    /// Cf method comment on [`TenantShard::create_timeline_import_pgdata`].
    ImportSpawned(Arc<Timeline>),
 }

@@ -1082,7 +1082,7 @@ pub(crate) enum LoadConfigError {
    NotFound(Utf8PathBuf),
 }

-impl Tenant {
+impl TenantShard {
    /// Yet another helper for timeline initialization.
    ///
    /// - Initializes the Timeline struct and inserts it into the tenant's hash map
@@ -1303,7 +1303,7 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
+    ) -> Result<Arc<TenantShard>, GlobalShutDown> {
        let wal_redo_manager =
            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;

@@ -1317,7 +1317,7 @@ impl Tenant {
        let attach_mode = attached_conf.location.attach_mode;
        let generation = attached_conf.location.generation;

-        let tenant = Arc::new(Tenant::new(
+        let tenant = Arc::new(TenantShard::new(
            TenantState::Attaching,
            conf,
            attached_conf,
@@ -1334,7 +1334,7 @@ impl Tenant {
        let attach_gate_guard = tenant
            .gate
            .enter()
-            .expect("We just created the Tenant: nothing else can have shut it down yet");
+            .expect("We just created the TenantShard: nothing else can have shut it down yet");

        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
@@ -1362,7 +1362,7 @@ impl Tenant {
                    }
                }

-                fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
+                fn make_broken_or_stopping(t: &TenantShard, err: anyhow::Error) {
                    t.state.send_modify(|state| match state {
                        // TODO: the old code alluded to DeleteTenantFlow sometimes setting
                        // TenantState::Stopping before we get here, but this may be outdated.
@@ -1627,7 +1627,7 @@ impl Tenant {
    /// No background tasks are started as part of this routine.
    ///
    async fn attach(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -1957,7 +1957,7 @@ impl Tenant {
    }

    async fn load_timelines_metadata(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        timeline_ids: HashSet<TimelineId>,
        remote_storage: &GenericRemoteStorage,
        heatmap: Option<(HeatMapTenant, std::time::Instant)>,
@@ -2028,7 +2028,7 @@ impl Tenant {
    }

    fn load_timeline_metadata(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        timeline_id: TimelineId,
        remote_storage: GenericRemoteStorage,
        previous_heatmap: Option<PreviousHeatmap>,
@@ -2429,14 +2429,14 @@ impl Tenant {
    /// This is used by tests & import-from-basebackup.
    ///
    /// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
-    /// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
+    /// a state that will fail [`TenantShard::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
    ///
    /// The caller is responsible for getting the timeline into a state that will be accepted
-    /// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
+    /// by [`TenantShard::load_remote_timeline`] / [`TenantShard::attach`].
    /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
-    /// to the [`Tenant::timelines`].
+    /// to the [`TenantShard::timelines`].
    ///
-    /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
+    /// Tests should use `TenantShard::create_test_timeline` to set up the minimum required metadata keys.
    pub(crate) async fn create_empty_timeline(
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
@@ -2584,7 +2584,7 @@ impl Tenant {
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        params: CreateTimelineParams,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
@@ -2751,13 +2751,13 @@ impl Tenant {
        Ok(activated_timeline)
    }

-    /// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
+    /// The returned [`Arc<Timeline>`] is NOT in the [`TenantShard::timelines`] map until the import
    /// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
-    /// [`Tenant::timelines`] map when the import completes.
+    /// [`TenantShard::timelines`] map when the import completes.
    /// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
    /// for the response.
    async fn create_timeline_import_pgdata(
-        self: &Arc<Tenant>,
+        self: &Arc<Self>,
        params: CreateTimelineParamsImportPgdata,
        activate: ActivateTimelineArgs,
        ctx: &RequestContext,
@@ -2854,7 +2854,7 @@ impl Tenant {

    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
    async fn create_timeline_import_pgdata_task(
-        self: Arc<Tenant>,
+        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
@@ -2882,7 +2882,7 @@ impl Tenant {
    }

    async fn create_timeline_import_pgdata_task_impl(
-        self: Arc<Tenant>,
+        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
@@ -2899,10 +2899,10 @@ impl Tenant {
        // Reload timeline from remote.
        // This proves that the remote state is attachable, and it reuses the code.
        //
-        // TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
+        // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown.
        // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
-        // But our activate() call might launch new background tasks after Tenant::shutdown
-        // already went past shutting down the Tenant::timelines, which this timeline here is no part of.
+        // But our activate() call might launch new background tasks after TenantShard::shutdown
+        // already went past shutting down the TenantShard::timelines, which this timeline here is no part of.
        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
        // down while bootstrapping/branching + activating), but, the race condition is much more likely
        // to manifest because of the long runtime of this import task.
@@ -2917,7 +2917,7 @@ impl Tenant {
        // };
        let timeline_id = timeline.timeline_id;

-        // load from object storage like Tenant::attach does
+        // load from object storage like TenantShard::attach does
        let resources = self.build_timeline_resources(timeline_id);
        let index_part = resources
            .remote_client
@@ -3938,7 +3938,7 @@ enum ActivateTimelineArgs {
    No,
 }

-impl Tenant {
+impl TenantShard {
    pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
        self.tenant_conf.load().tenant_conf.clone()
    }
@@ -4096,7 +4096,7 @@ impl Tenant {
        update: F,
    ) -> anyhow::Result<pageserver_api::models::TenantConfig> {
        // Use read-copy-update in order to avoid overwriting the location config
-        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // state if this races with [`TenantShard::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.
@@ -4219,7 +4219,7 @@ impl Tenant {
        Ok((timeline, timeline_ctx))
    }

-    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
+    /// [`TenantShard::shutdown`] must be called before dropping the returned [`TenantShard`] object
    /// to ensure proper cleanup of background tasks and metrics.
    //
    // Allow too_many_arguments because a constructor's argument list naturally grows with the
@@ -4235,7 +4235,7 @@ impl Tenant {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
-    ) -> Tenant {
+    ) -> TenantShard {
        debug_assert!(
            !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
        );
@@ -4295,7 +4295,7 @@ impl Tenant {
            }
        });

-        Tenant {
+        TenantShard {
            tenant_shard_id,
            shard_identity,
            generation: attached_conf.location.generation,
@@ -4330,7 +4330,7 @@ impl Tenant {
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            pagestream_throttle: Arc::new(throttle::Throttle::new(
-                Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
+                TenantShard::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
            )),
            pagestream_throttle_metrics: Arc::new(
                crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
@@ -4466,11 +4466,11 @@ impl Tenant {

        // Perform GC for each timeline.
        //
-        // Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the
+        // Note that we don't hold the `TenantShard::gc_cs` lock here because we don't want to delay the
        // branch creation task, which requires the GC lock. A GC iteration can run concurrently
        // with branch creation.
        //
-        // See comments in [`Tenant::branch_timeline`] for more information about why branch
+        // See comments in [`TenantShard::branch_timeline`] for more information about why branch
        // creation task can run concurrently with timeline's GC iteration.
        for timeline in gc_timelines {
            if cancel.is_cancelled() {
@@ -4500,7 +4500,7 @@ impl Tenant {

    /// Refreshes the Timeline::gc_info for all timelines, returning the
    /// vector of timelines which have [`Timeline::get_last_record_lsn`] past
-    /// [`Tenant::get_gc_horizon`].
+    /// [`TenantShard::get_gc_horizon`].
    ///
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
    pub(crate) async fn refresh_gc_info(
@@ -5499,7 +5499,7 @@ impl Tenant {
            }
        }

-        // The flushes we did above were just writes, but the Tenant might have had
+        // The flushes we did above were just writes, but the TenantShard might have had
        // pending deletions as well from recent compaction/gc: we want to flush those
        // as well.  This requires flushing the global delete queue.  This is cheap
        // because it's typically a no-op.
@@ -5517,7 +5517,7 @@ impl Tenant {

    /// How much local storage would this tenant like to have?  It can cope with
    /// less than this (via eviction and on-demand downloads), but this function enables
-    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
+    /// the TenantShard to advertise how much storage it would prefer to have to provide fast I/O
    /// by keeping important things on local disk.
    ///
    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
@@ -5540,11 +5540,11 @@ impl Tenant {
    /// manifest in `Self::remote_tenant_manifest`.
    ///
    /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
-    /// changing any `Tenant` state that's included in the manifest, consider making the manifest
+    /// changing any `TenantShard` state that's included in the manifest, consider making the manifest
    /// the authoritative source of data with an API that automatically uploads on changes. Revisit
    /// this when the manifest is more widely used and we have a better idea of the data model.
    pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
-        // Multiple tasks may call this function concurrently after mutating the Tenant runtime
+        // Multiple tasks may call this function concurrently after mutating the TenantShard runtime
        // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
        // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
        // simple coalescing mechanism.
@@ -5812,7 +5812,7 @@ pub(crate) mod harness {
            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }

-        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+        pub(crate) async fn load(&self) -> (Arc<TenantShard>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
                .with_scope_unit_test();
            (
@@ -5827,10 +5827,10 @@ pub(crate) mod harness {
        pub(crate) async fn do_try_load(
            &self,
            ctx: &RequestContext,
-        ) -> anyhow::Result<Arc<Tenant>> {
+        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

-            let tenant = Arc::new(Tenant::new(
+            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
                self.conf,
                AttachedTenantConf::try_from(LocationConf::attached_single(
@@ -6046,7 +6046,7 @@ mod tests {
    #[cfg(feature = "testing")]
    #[allow(clippy::too_many_arguments)]
    async fn randomize_timeline(
-        tenant: &Arc<Tenant>,
+        tenant: &Arc<TenantShard>,
        new_timeline_id: TimelineId,
        pg_version: u32,
        spec: TestTimelineSpecification,
@@ -6936,7 +6936,7 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        lsn: Lsn,
@@ -6948,7 +6948,7 @@ mod tests {
    }

    async fn bulk_insert_maybe_compact_gc(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
@@ -7858,7 +7858,7 @@ mod tests {
            let (tline, _ctx) = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                .await?;
-            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
+            // Leave the timeline ID in [`TenantShard::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
                .shutdown(super::timeline::ShutdownMode::Hard)
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -28,7 +28,7 @@ use tracing::warn;
 use crate::context::RequestContext;
 use crate::page_cache::PAGE_SZ;
 use crate::tenant::block_io::BlockCursor;
-use crate::virtual_file::VirtualFile;
+use crate::virtual_file::TempVirtualFile;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};

 #[derive(Copy, Clone, Debug)]
@@ -218,7 +218,7 @@ pub(super) const BYTE_ZSTD: u8 = BYTE_UNCOMPRESSED | 0x10;
 /// discarded. You need to call [`flush_buffer`](Self::flush_buffer)
 /// manually before dropping.
 pub struct BlobWriter<const BUFFERED: bool> {
-    inner: VirtualFile,
+    inner: TempVirtualFile,
    offset: u64,
    /// A buffer to save on write calls, only used if BUFFERED=true
    buf: Vec<u8>,
@@ -228,7 +228,7 @@ pub struct BlobWriter<const BUFFERED: bool> {

 impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    pub fn new(
-        inner: VirtualFile,
+        inner: TempVirtualFile,
        start_offset: u64,
        _gate: &utils::sync::gate::Gate,
        _cancel: CancellationToken,
@@ -476,30 +476,17 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
    }
 }

-impl BlobWriter<true> {
-    /// Access the underlying `VirtualFile`.
+impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
+    /// Finish this blob writer and return the underlying [`TempVirtualFile`].
    ///
-    /// This function flushes the internal buffer before giving access
-    /// to the underlying `VirtualFile`.
-    pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<VirtualFile, Error> {
-        self.flush_buffer(ctx).await?;
+    /// If there is an internal buffer (depends on `BUFFERED`), it will
+    /// be flushed before this method returns.
+    pub async fn into_inner(mut self, ctx: &RequestContext) -> Result<TempVirtualFile, Error> {
+        if BUFFERED {
+            self.flush_buffer(ctx).await?;
+        }
        Ok(self.inner)
    }
-
-    /// Access the underlying `VirtualFile`.
-    ///
-    /// Unlike [`into_inner`](Self::into_inner), this doesn't flush
-    /// the internal buffer before giving access.
-    pub fn into_inner_no_flush(self) -> VirtualFile {
-        self.inner
-    }
-}
-
-impl BlobWriter<false> {
-    /// Access the underlying `VirtualFile`.
-    pub fn into_inner(self) -> VirtualFile {
-        self.inner
-    }
 }

 #[cfg(test)]
@@ -512,6 +499,7 @@ pub(crate) mod tests {
    use crate::context::DownloadBehavior;
    use crate::task_mgr::TaskKind;
    use crate::tenant::block_io::BlockReaderRef;
+    use crate::virtual_file::VirtualFile;

    async fn round_trip_test<const BUFFERED: bool>(blobs: &[Vec<u8>]) -> Result<(), Error> {
        round_trip_test_compressed::<BUFFERED>(blobs, false).await
@@ -530,7 +518,10 @@ pub(crate) mod tests {
        // Write part (in block to drop the file)
        let mut offsets = Vec::new();
        {
-            let file = VirtualFile::create(pathbuf.as_path(), ctx).await?;
+            let file = TempVirtualFile::new(
+                VirtualFile::create(pathbuf.as_path(), ctx).await?,
+                gate.enter().unwrap(),
+            );
            let mut wtr = BlobWriter::<BUFFERED>::new(file, 0, &gate, cancel.clone(), ctx);
            for blob in blobs.iter() {
                let (_, res) = if compression {
@@ -553,7 +544,9 @@ pub(crate) mod tests {
            let (_, res) = wtr.write_blob(vec![0; PAGE_SZ].slice_len(), ctx).await;
            let offs = res?;
            println!("Writing final blob at offs={offs}");
-            wtr.flush_buffer(ctx).await?;
+
+            let file = wtr.into_inner(ctx).await?;
+            file.disarm_into_inner();
        }
        Ok((temp_dir, pathbuf, offsets))
    }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -12,6 +12,7 @@ use tokio_epoll_uring::{BoundedBuf, Slice};
 use tokio_util::sync::CancellationToken;
 use tracing::{error, info_span};
 use utils::id::TimelineId;
+use utils::sync::gate::GateGuard;

 use crate::assert_u64_eq_usize::{U64IsUsize, UsizeIsU64};
 use crate::config::PageServerConf;
@@ -21,16 +22,33 @@ use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File;
 use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut;
 use crate::virtual_file::owned_buffers_io::slice::SliceMutExt;
 use crate::virtual_file::owned_buffers_io::write::{Buffer, FlushTaskError};
-use crate::virtual_file::{self, IoBufferMut, VirtualFile, owned_buffers_io};
+use crate::virtual_file::{self, IoBufferMut, TempVirtualFile, VirtualFile, owned_buffers_io};
+
+use self::owned_buffers_io::write::OwnedAsyncWriter;

 pub struct EphemeralFile {
    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
    page_cache_file_id: page_cache::FileId,
    bytes_written: u64,
-    buffered_writer: owned_buffers_io::write::BufferedWriter<IoBufferMut, VirtualFile>,
-    /// Gate guard is held on as long as we need to do operations in the path (delete on drop)
-    _gate_guard: utils::sync::gate::GateGuard,
+    file: TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
+    buffered_writer: BufferedWriter,
+}
+
+type BufferedWriter = owned_buffers_io::write::BufferedWriter<
+    IoBufferMut,
+    TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter,
+>;
+
+/// A TempVirtualFile that is co-owned by the [`EphemeralFile`] and [`BufferedWriter`].
+///
+/// (Actually [`BufferedWriter`] internally is just a client to a background flush task.
+/// The co-ownership is between [`EphemeralFile`] and that flush task.)
+///
+/// Co-ownership allows us to serve reads for data that has already been flushed by the [`BufferedWriter`].
+#[derive(Debug, Clone)]
+struct TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    inner: Arc<TempVirtualFile>,
 }

 const TAIL_SZ: usize = 64 * 1024;
@@ -44,9 +62,12 @@ impl EphemeralFile {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> anyhow::Result<EphemeralFile> {
-        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
+        // TempVirtualFile requires us to never reuse a filename while an old
+        // instance of TempVirtualFile created with that filename is not done dropping yet.
+        // So, we use a monotonic counter to disambiguate the filenames.
+        static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
        let filename_disambiguator =
-            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+            NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        let filename = conf
            .timeline_path(&tenant_shard_id, &timeline_id)
@@ -54,7 +75,7 @@ impl EphemeralFile {
                "ephemeral-{filename_disambiguator}"
            )));

-        let file = Arc::new(
+        let file = TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter::new(
            VirtualFile::open_with_options_v2(
                &filename,
                virtual_file::OpenOptions::new()
@@ -64,6 +85,7 @@ impl EphemeralFile {
                ctx,
            )
            .await?,
+            gate.enter()?,
        );

        let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore
@@ -73,7 +95,8 @@ impl EphemeralFile {
            _timeline_id: timeline_id,
            page_cache_file_id,
            bytes_written: 0,
-            buffered_writer: owned_buffers_io::write::BufferedWriter::new(
+            file: file.clone(),
+            buffered_writer: BufferedWriter::new(
                file,
                || IoBufferMut::with_capacity(TAIL_SZ),
                gate.enter()?,
@@ -81,29 +104,42 @@ impl EphemeralFile {
                ctx,
                info_span!(parent: None, "ephemeral_file_buffered_writer", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), timeline_id=%timeline_id, path = %filename),
            ),
-            _gate_guard: gate.enter()?,
        })
    }
 }

-impl Drop for EphemeralFile {
-    fn drop(&mut self) {
-        // unlink the file
-        // we are clear to do this, because we have entered a gate
-        let path = self.buffered_writer.as_inner().path();
-        let res = std::fs::remove_file(path);
-        if let Err(e) = res {
-            if e.kind() != std::io::ErrorKind::NotFound {
-                // just never log the not found errors, we cannot do anything for them; on detach
-                // the tenant directory is already gone.
-                //
-                // not found files might also be related to https://github.com/neondatabase/neon/issues/2442
-                error!("could not remove ephemeral file '{path}': {e}");
-            }
+impl TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    fn new(file: VirtualFile, gate_guard: GateGuard) -> Self {
+        Self {
+            inner: Arc::new(TempVirtualFile::new(file, gate_guard)),
        }
    }
 }

+impl OwnedAsyncWriter for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    fn write_all_at<Buf: owned_buffers_io::io_buf_aligned::IoBufAligned + Send>(
+        &self,
+        buf: owned_buffers_io::io_buf_ext::FullSlice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> impl std::future::Future<
+        Output = (
+            owned_buffers_io::io_buf_ext::FullSlice<Buf>,
+            std::io::Result<()>,
+        ),
+    > + Send {
+        self.inner.write_all_at(buf, offset, ctx)
+    }
+}
+
+impl std::ops::Deref for TempVirtualFileCoOwnedByEphemeralFileAndBufferedWriter {
+    type Target = VirtualFile;
+
+    fn deref(&self) -> &Self::Target {
+        &self.inner
+    }
+}
+
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum EphemeralFileWriteError {
    #[error("{0}")]
@@ -262,9 +298,9 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral
        let mutable_range = Range(std::cmp::max(start, submitted_offset), end);

        let dst = if written_range.len() > 0 {
-            let file: &VirtualFile = self.buffered_writer.as_inner();
            let bounds = dst.bounds();
-            let slice = file
+            let slice = self
+                .file
                .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx)
                .await?;
            Slice::from_buf_bounds(Slice::into_inner(slice), bounds)
@@ -456,7 +492,7 @@ mod tests {
            assert_eq!(&buf, &content[range]);
        }

-        let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap();
+        let file_contents = std::fs::read(file.file.path()).unwrap();
        assert!(file_contents == content[0..cap * 2]);

        let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap();
@@ -489,7 +525,7 @@ mod tests {
        // assert the state is as this test expects it to be
        let load_io_buf_res = file.load_to_io_buf(&ctx).await.unwrap();
        assert_eq!(&load_io_buf_res[..], &content[0..cap * 2 + cap / 2]);
-        let md = file.buffered_writer.as_inner().path().metadata().unwrap();
+        let md = file.file.path().metadata().unwrap();
        assert_eq!(
            md.len(),
            2 * cap.into_u64(),
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -564,8 +564,9 @@ mod tests {
            Lsn(0),
            Lsn(0),
            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
+            // Updating this version to 17 will cause the test to fail at the
+            // next assert_eq!().
+            16,
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -52,7 +52,9 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{
+    AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState,
+};
 use crate::virtual_file::MaybeFatalIo;
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

@@ -67,7 +69,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 /// having a properly acquired generation (Secondary doesn't need a generation)
 #[derive(Clone)]
 pub(crate) enum TenantSlot {
-    Attached(Arc<Tenant>),
+    Attached(Arc<TenantShard>),
    Secondary(Arc<SecondaryTenant>),
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
@@ -86,7 +88,7 @@ impl std::fmt::Debug for TenantSlot {

 impl TenantSlot {
    /// Return the `Tenant` in this slot if attached, else None
-    fn get_attached(&self) -> Option<&Arc<Tenant>> {
+    fn get_attached(&self) -> Option<&Arc<TenantShard>> {
        match self {
            Self::Attached(t) => Some(t),
            Self::Secondary(_) => None,
@@ -164,7 +166,7 @@ impl TenantStartupMode {
 /// Result type for looking up a TenantId to a specific shard
 pub(crate) enum ShardResolveResult {
    NotFound,
-    Found(Arc<Tenant>),
+    Found(Arc<TenantShard>),
    // Wait for this barrrier, then query again
    InProgress(utils::completion::Barrier),
 }
@@ -173,7 +175,7 @@ impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
    /// None is returned.
-    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<TenantShard>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
@@ -410,7 +412,7 @@ fn load_tenant_config(
        return None;
    }

-    Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
+    Some(TenantShard::load_tenant_config(conf, &tenant_shard_id))
 }

 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -606,7 +608,8 @@ pub async fn init_tenant_mgr(
        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
        config_write_futs.push(async move {
-            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
+            let r =
+                TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
            (tenant_shard_id, location_conf, r)
        });
    }
@@ -694,7 +697,7 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Result<Arc<TenantShard>, GlobalShutDown> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -706,7 +709,7 @@ fn tenant_spawn(
            .unwrap()
    );

-    Tenant::spawn(
+    TenantShard::spawn(
        conf,
        tenant_shard_id,
        resources,
@@ -883,12 +886,12 @@ impl TenantManager {
    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
    /// undergoing a state change (i.e. slot is InProgress).
    ///
-    /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
-    /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
+    /// The returned TenantShard is not guaranteed to be active: check its status after obtaining it, or
+    /// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it.
    pub(crate) fn get_attached_tenant_shard(
        &self,
        tenant_shard_id: TenantShardId,
-    ) -> Result<Arc<Tenant>, GetTenantError> {
+    ) -> Result<Arc<TenantShard>, GetTenantError> {
        let locked = self.tenants.read().unwrap();

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
@@ -937,12 +940,12 @@ impl TenantManager {
        flush: Option<Duration>,
        mut spawn_mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
+    ) -> Result<Option<Arc<TenantShard>>, UpsertLocationError> {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

        enum FastPathModified {
-            Attached(Arc<Tenant>),
+            Attached(Arc<TenantShard>),
            Secondary(Arc<SecondaryTenant>),
        }

@@ -999,9 +1002,13 @@ impl TenantManager {
        // phase of writing config and/or waiting for flush, before returning.
        match fast_path_taken {
            Some(FastPathModified::Attached(tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                TenantShard::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id,
+                    &new_location_config,
+                )
+                .await
+                .fatal_err("write tenant shard config");

                // Transition to AttachedStale means we may well hold a valid generation
                // still, and have been requested to go stale as part of a migration.  If
@@ -1030,9 +1037,13 @@ impl TenantManager {
                return Ok(Some(tenant));
            }
            Some(FastPathModified::Secondary(_secondary_tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                TenantShard::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id,
+                    &new_location_config,
+                )
+                .await
+                .fatal_err("write tenant shard config");

                return Ok(None);
            }
@@ -1122,7 +1133,7 @@ impl TenantManager {
        // Before activating either secondary or attached mode, persist the
        // configuration, so that on restart we will re-attach (or re-start
        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+        TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
            .await
            .fatal_err("write tenant shard config");

@@ -1262,7 +1273,7 @@ impl TenantManager {

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+        let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?;

        if drop_cache {
            tracing::info!("Dropping local file cache");
@@ -1297,7 +1308,7 @@ impl TenantManager {
        Ok(())
    }

-    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
+    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<TenantShard>> {
        let locked = self.tenants.read().unwrap();
        match &*locked {
            TenantsMap::Initializing => Vec::new(),
@@ -1446,7 +1457,7 @@ impl TenantManager {
    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
    pub(crate) async fn shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
@@ -1476,7 +1487,7 @@ impl TenantManager {

    pub(crate) async fn do_shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
@@ -1703,7 +1714,7 @@ impl TenantManager {
    /// For each resident layer in the parent shard, we will hard link it into all of the child shards.
    async fn shard_split_hardlink(
        &self,
-        parent_shard: &Tenant,
+        parent_shard: &TenantShard,
        child_shards: Vec<TenantShardId>,
    ) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_id();
@@ -1988,7 +1999,7 @@ impl TenantManager {
            }

            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)
+            let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)
                .map_err(|e| Error::DetachReparent(e.into()))?;

            let shard_identity = config.shard;
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -133,7 +133,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`].
+//!   [`TenantShard::timeline_init_and_sync`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -171,7 +171,7 @@
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
 //!
-//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
+//! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

 pub(crate) mod download;
@@ -2743,7 +2743,7 @@ mod tests {
    use crate::tenant::config::AttachmentMode;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::layer::local_layer_path;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
        format!("contents for {name}").into()
@@ -2796,7 +2796,7 @@ mod tests {

    struct TestSetup {
        harness: TenantHarness,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
    }
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -6,6 +6,7 @@
 use std::collections::HashSet;
 use std::future::Future;
 use std::str::FromStr;
+use std::sync::atomic::AtomicU64;
 use std::time::SystemTime;

 use anyhow::{Context, anyhow};
@@ -15,7 +16,7 @@ use remote_storage::{
    DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath,
 };
 use tokio::fs::{self, File, OpenOptions};
-use tokio::io::{AsyncSeekExt, AsyncWriteExt};
+use tokio::io::AsyncSeekExt;
 use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
@@ -40,7 +41,10 @@ use crate::span::{
 use crate::tenant::Generation;
 use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerName;
-use crate::virtual_file::{MaybeFatalIo, VirtualFile, on_fatal_io_error};
+use crate::virtual_file;
+use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
+use crate::virtual_file::{IoBufferMut, MaybeFatalIo, VirtualFile};
+use crate::virtual_file::{TempVirtualFile, owned_buffers_io};

 ///
 /// If 'metadata' is given, we will validate that the downloaded file's size matches that
@@ -72,21 +76,36 @@ pub async fn download_layer_file<'a>(
        layer_metadata.generation,
    );

-    // Perform a rename inspired by durable_rename from file_utils.c.
-    // The sequence:
-    //     write(tmp)
-    //     fsync(tmp)
-    //     rename(tmp, new)
-    //     fsync(new)
-    //     fsync(parent)
-    // For more context about durable_rename check this email from postgres mailing list:
-    // https://www.postgresql.org/message-id/56583BDD.9060302@2ndquadrant.com
-    // If pageserver crashes the temp file will be deleted on startup and re-downloaded.
-    let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION);
-
-    let bytes_amount = download_retry(
+    let (bytes_amount, temp_file) = download_retry(
        || async {
-            download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await
+            // TempVirtualFile requires us to never reuse a filename while an old
+            // instance of TempVirtualFile created with that filename is not done dropping yet.
+            // So, we use a monotonic counter to disambiguate the filenames.
+            static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
+            let filename_disambiguator =
+                NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
+
+            let temp_file_path = path_with_suffix_extension(
+                local_path,
+                &format!("{filename_disambiguator:x}.{TEMP_DOWNLOAD_EXTENSION}"),
+            );
+
+            let temp_file = TempVirtualFile::new(
+                // Not _v2 yet which is sensitive to virtual_file_io_mode.
+                // That'll happen in PR https://github.com/neondatabase/neon/pull/11558
+                VirtualFile::open_with_options(
+                    &temp_file_path,
+                    virtual_file::OpenOptions::new()
+                        .create_new(true)
+                        .write(true),
+                    ctx,
+                )
+                .await
+                .with_context(|| format!("create a temp file for layer download: {temp_file_path}"))
+                .map_err(DownloadError::Other)?,
+                gate.enter().map_err(|_| DownloadError::Cancelled)?,
+            );
+            download_object(storage, &remote_path, temp_file, gate, cancel, ctx).await
        },
        &format!("download {remote_path:?}"),
        cancel,
@@ -96,7 +115,8 @@ pub async fn download_layer_file<'a>(
    let expected = layer_metadata.file_size;
    if expected != bytes_amount {
        return Err(DownloadError::Other(anyhow!(
-            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {temp_file_path:?}",
+            "According to layer file metadata should have downloaded {expected} bytes but downloaded {bytes_amount} bytes into file {:?}",
+            temp_file.path()
        )));
    }

@@ -106,11 +126,28 @@ pub async fn download_layer_file<'a>(
        )))
    });

-    fs::rename(&temp_file_path, &local_path)
+    // Try rename before disarming the temp file.
+    // That way, if rename fails for whatever reason, we clean up the temp file on the return path.
+
+    fs::rename(temp_file.path(), &local_path)
        .await
        .with_context(|| format!("rename download layer file to {local_path}"))
        .map_err(DownloadError::Other)?;

+    // The temp file's VirtualFile points to the temp_file_path which we moved above.
+    // Drop it immediately, it's invalid.
+    // This will get better in https://github.com/neondatabase/neon/issues/11692
+    let _: VirtualFile = temp_file.disarm_into_inner();
+    // NB: The gate guard that was stored in `temp_file` is dropped here, but we continue
+    // to operate on the renamed file and on the parent timeline directory.
+    // Those operations are safe to do because higher-level code is holding another gate guard:
+    // - attached mode: the download task spawned by struct Layer is holding the gate guard
+    // - secondary mode: The TenantDownloader::download holds the gate open
+
+    // The rename above is not durable yet.
+    // It doesn't matter for crash consistency because pageserver startup deletes temp
+    // files and we'll re-download on demand if necessary.
+
    // We use fatal_err() below because the after the rename above,
    // the in-memory state of the filesystem already has the layer file in its final place,
    // and subsequent pageserver code could think it's durable while it really isn't.
@@ -146,147 +183,58 @@ pub async fn download_layer_file<'a>(
 async fn download_object(
    storage: &GenericRemoteStorage,
    src_path: &RemotePath,
-    dst_path: &Utf8PathBuf,
-    #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate,
+    destination_file: TempVirtualFile,
+    gate: &utils::sync::gate::Gate,
    cancel: &CancellationToken,
-    #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext,
-) -> Result<u64, DownloadError> {
-    let res = match crate::virtual_file::io_engine::get() {
-        crate::virtual_file::io_engine::IoEngine::NotSet => panic!("unset"),
-        crate::virtual_file::io_engine::IoEngine::StdFs => {
-            async {
-                let destination_file = tokio::fs::File::create(dst_path)
-                    .await
-                    .with_context(|| format!("create a destination file for layer '{dst_path}'"))
-                    .map_err(DownloadError::Other)?;
+    ctx: &RequestContext,
+) -> Result<(u64, TempVirtualFile), DownloadError> {
+    let mut download = storage
+        .download(src_path, &DownloadOpts::default(), cancel)
+        .await?;

-                let download = storage
-                    .download(src_path, &DownloadOpts::default(), cancel)
-                    .await?;
+    pausable_failpoint!("before-downloading-layer-stream-pausable");

-                pausable_failpoint!("before-downloading-layer-stream-pausable");
+    let dst_path = destination_file.path().to_owned();
+    let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
+        destination_file,
+        || IoBufferMut::with_capacity(super::BUFFER_SIZE),
+        gate.enter().map_err(|_| DownloadError::Cancelled)?,
+        cancel.child_token(),
+        ctx,
+        tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path),
+    );

-                let mut buf_writer =
-                    tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
-
-                let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
-
-                let bytes_amount = tokio::io::copy_buf(&mut reader, &mut buf_writer).await?;
-                buf_writer.flush().await?;
-
-                let mut destination_file = buf_writer.into_inner();
-
-                // Tokio doc here: https://docs.rs/tokio/1.17.0/tokio/fs/struct.File.html states that:
-                // A file will not be closed immediately when it goes out of scope if there are any IO operations
-                // that have not yet completed. To ensure that a file is closed immediately when it is dropped,
-                // you should call flush before dropping it.
-                //
-                // From the tokio code I see that it waits for pending operations to complete. There shouldt be any because
-                // we assume that `destination_file` file is fully written. I e there is no pending .write(...).await operations.
-                // But for additional safety lets check/wait for any pending operations.
-                destination_file
-                    .flush()
-                    .await
-                    .maybe_fatal_err("download_object sync_all")
-                    .with_context(|| format!("flush source file at {dst_path}"))
-                    .map_err(DownloadError::Other)?;
-
-                // not using sync_data because it can lose file size update
-                destination_file
-                    .sync_all()
-                    .await
-                    .maybe_fatal_err("download_object sync_all")
-                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
-                    .map_err(DownloadError::Other)?;
-
-                Ok(bytes_amount)
-            }
-            .await
-        }
-        #[cfg(target_os = "linux")]
-        crate::virtual_file::io_engine::IoEngine::TokioEpollUring => {
-            use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
-            use std::sync::Arc;
-
-            use crate::virtual_file::{IoBufferMut, owned_buffers_io};
-            async {
-                let destination_file = Arc::new(
-                    VirtualFile::create(dst_path, ctx)
-                        .await
-                        .with_context(|| {
-                            format!("create a destination file for layer '{dst_path}'")
-                        })
-                        .map_err(DownloadError::Other)?,
-                );
-
-                let mut download = storage
-                    .download(src_path, &DownloadOpts::default(), cancel)
-                    .await?;
-
-                pausable_failpoint!("before-downloading-layer-stream-pausable");
-
-                let mut buffered = owned_buffers_io::write::BufferedWriter::<IoBufferMut, _>::new(
-                    destination_file,
-                    || IoBufferMut::with_capacity(super::BUFFER_SIZE),
-                    gate.enter().map_err(|_| DownloadError::Cancelled)?,
-                    cancel.child_token(),
-                    ctx,
-                    tracing::info_span!(parent: None, "download_object_buffered_writer", %dst_path),
-                );
-
-                // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
-                // There's chunks_vectored() on the stream.
-                let (bytes_amount, destination_file) = async {
-                    while let Some(res) =
-                        futures::StreamExt::next(&mut download.download_stream).await
-                    {
-                        let chunk = match res {
-                            Ok(chunk) => chunk,
-                            Err(e) => return Err(DownloadError::from(e)),
-                        };
-                        buffered
-                            .write_buffered_borrowed(&chunk, ctx)
-                            .await
-                            .map_err(|e| match e {
-                                FlushTaskError::Cancelled => DownloadError::Cancelled,
-                            })?;
-                    }
-                    let inner = buffered
-                        .flush_and_into_inner(ctx)
-                        .await
-                        .map_err(|e| match e {
-                            FlushTaskError::Cancelled => DownloadError::Cancelled,
-                        })?;
-                    Ok(inner)
-                }
-                .await?;
-
-                // not using sync_data because it can lose file size update
-                destination_file
-                    .sync_all()
-                    .await
-                    .maybe_fatal_err("download_object sync_all")
-                    .with_context(|| format!("failed to fsync source file at {dst_path}"))
-                    .map_err(DownloadError::Other)?;
-
-                Ok(bytes_amount)
-            }
-            .await
-        }
-    };
-
-    // in case the download failed, clean up
-    match res {
-        Ok(bytes_amount) => Ok(bytes_amount),
-        Err(e) => {
-            if let Err(e) = tokio::fs::remove_file(dst_path).await {
-                if e.kind() != std::io::ErrorKind::NotFound {
-                    on_fatal_io_error(&e, &format!("Removing temporary file {dst_path}"));
-                }
-            }
-            Err(e)
+    // TODO: use vectored write (writev) once supported by tokio-epoll-uring.
+    // There's chunks_vectored() on the stream.
+    let (bytes_amount, destination_file) = async {
+        while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await {
+            let chunk = match res {
+                Ok(chunk) => chunk,
+                Err(e) => return Err(DownloadError::from(e)),
+            };
+            buffered
+                .write_buffered_borrowed(&chunk, ctx)
+                .await
+                .map_err(|e| match e {
+                    FlushTaskError::Cancelled => DownloadError::Cancelled,
+                })?;
        }
+        let inner = buffered.shutdown(ctx).await.map_err(|e| match e {
+            FlushTaskError::Cancelled => DownloadError::Cancelled,
+        })?;
+        Ok(inner)
    }
+    .await?;
+
+    // not using sync_data because it can lose file size update
+    destination_file
+        .sync_all()
+        .await
+        .maybe_fatal_err("download_object sync_all")
+        .with_context(|| format!("failed to fsync source file at {dst_path}"))
+        .map_err(DownloadError::Other)?;
+
+    Ok((bytes_amount, destination_file))
 }

 const TEMP_DOWNLOAD_EXTENSION: &str = "temp_download";
@@ -452,7 +400,7 @@ async fn do_download_index_part(
 /// generation (normal case when migrating/restarting).  Only if both of these return 404 do we fall back
 /// to listing objects.
 ///
-/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
+/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]`
 /// * `what`: for logging, what object are we downloading
 /// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
 /// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -646,7 +646,7 @@ enum UpdateError {
    NoData,
    #[error("Insufficient local storage space")]
    NoSpace,
-    #[error("Failed to download")]
+    #[error("Failed to download: {0}")]
    DownloadError(DownloadError),
    #[error(transparent)]
    Deserialize(#[from] serde_json::Error),
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -21,7 +21,7 @@ use super::scheduler::{
 use super::{CommandRequest, SecondaryTenantError, UploadCommand};
 use crate::TEMP_FILE_SUFFIX;
 use crate::metrics::SECONDARY_MODE;
-use crate::tenant::Tenant;
+use crate::tenant::TenantShard;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::mgr::{GetTenantError, TenantManager};
 use crate::tenant::remote_timeline_client::remote_heatmap_path;
@@ -74,7 +74,7 @@ impl RunningJob for WriteInProgress {
 }

 struct UploadPending {
-    tenant: Arc<Tenant>,
+    tenant: Arc<TenantShard>,
    last_upload: Option<LastUploadState>,
    target_time: Option<Instant>,
    period: Option<Duration>,
@@ -106,7 +106,7 @@ impl scheduler::Completion for WriteComplete {
 struct UploaderTenantState {
    // This Weak only exists to enable culling idle instances of this type
    // when the Tenant has been deallocated.
-    tenant: Weak<Tenant>,
+    tenant: Weak<TenantShard>,

    /// Digest of the serialized heatmap that we last successfully uploaded
    last_upload_state: Option<LastUploadState>,
@@ -357,7 +357,7 @@ struct LastUploadState {
 /// of the object we would have uploaded.
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
-    tenant: &Arc<Tenant>,
+    tenant: &Arc<TenantShard>,
    last_upload: Option<LastUploadState>,
 ) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
    debug_assert_current_span_has_tenant_id();
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -360,7 +360,7 @@ where

    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
    ///
-    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
+    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::TenantShard`] or [`crate::tenant::secondary::SecondaryTenant`]
    ///
    /// This function resets the pending list: it is assumed that the caller may change their mind about
    /// which tenants need work between calls to schedule_iteration.
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -12,7 +12,7 @@ use tracing::*;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

-use super::{GcError, LogicalSizeCalculationCause, Tenant};
+use super::{GcError, LogicalSizeCalculationCause, TenantShard};
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::{MaybeOffloaded, Timeline};
@@ -156,7 +156,7 @@ pub struct TimelineInputs {
 ///   initdb_lsn  branchpoints*  next_pitr_cutoff  latest
 /// ```
 pub(super) async fn gather_inputs(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    limit: &Arc<Semaphore>,
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -34,6 +34,7 @@ use std::ops::Range;
 use std::os::unix::fs::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
+use std::sync::atomic::AtomicU64;

 use anyhow::{Context, Result, bail, ensure};
 use camino::{Utf8Path, Utf8PathBuf};
@@ -45,8 +46,6 @@ use pageserver_api::keyspace::KeySpace;
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
 use pageserver_api::value::Value;
-use rand::Rng;
-use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_epoll_uring::IoBuf;
@@ -74,6 +73,7 @@ use crate::tenant::vectored_blob_io::{
    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
+use crate::virtual_file::TempVirtualFile;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt};
 use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile};
 use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -288,19 +288,20 @@ impl DeltaLayer {
        key_start: Key,
        lsn_range: &Range<Lsn>,
    ) -> Utf8PathBuf {
-        let rand_string: String = rand::thread_rng()
-            .sample_iter(&Alphanumeric)
-            .take(8)
-            .map(char::from)
-            .collect();
+        // TempVirtualFile requires us to never reuse a filename while an old
+        // instance of TempVirtualFile created with that filename is not done dropping yet.
+        // So, we use a monotonic counter to disambiguate the filenames.
+        static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
+        let filename_disambiguator =
+            NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        conf.timeline_path(tenant_shard_id, timeline_id)
            .join(format!(
-                "{}-XXX__{:016X}-{:016X}.{}.{}",
+                "{}-XXX__{:016X}-{:016X}.{:x}.{}",
                key_start,
                u64::from(lsn_range.start),
                u64::from(lsn_range.end),
-                rand_string,
+                filename_disambiguator,
                TEMP_FILE_SUFFIX,
            ))
    }
@@ -421,7 +422,7 @@ impl DeltaLayerWriterInner {
        let path =
            DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);

-        let mut file = VirtualFile::create(&path, ctx).await?;
+        let mut file = TempVirtualFile::new(VirtualFile::create(&path, ctx).await?, gate.enter()?);
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
@@ -515,22 +516,6 @@ impl DeltaLayerWriterInner {
        self,
        key_end: Key,
        ctx: &RequestContext,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let temp_path = self.path.clone();
-        let result = self.finish0(key_end, ctx).await;
-        if let Err(ref e) = result {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
-            }
-        }
-        result
-    }
-
-    async fn finish0(
-        self,
-        key_end: Key,
-        ctx: &RequestContext,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

@@ -598,6 +583,10 @@ impl DeltaLayerWriterInner {

        trace!("created delta layer {}", self.path);

+        // The gate guard stored in `file` is dropped. Callers (e.g. flush loop or compaction)
+        // keep the gate open also, so that it's safe for them to rename the file to its final destination.
+        file.disarm_into_inner();
+
        Ok((desc, self.path))
    }
 }
@@ -726,17 +715,6 @@ impl DeltaLayerWriter {
    }
 }

-impl Drop for DeltaLayerWriter {
-    fn drop(&mut self) {
-        if let Some(inner) = self.inner.take() {
-            // We want to remove the virtual file here, so it's fine to not
-            // having completely flushed unwritten data.
-            let vfile = inner.blob_writer.into_inner_no_flush();
-            vfile.remove();
-        }
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub enum RewriteSummaryError {
    #[error("magic mismatch")]
@@ -1609,8 +1587,8 @@ pub(crate) mod test {
    use bytes::Bytes;
    use itertools::MinMaxResult;
    use pageserver_api::value::Value;
-    use rand::RngCore;
    use rand::prelude::{SeedableRng, SliceRandom, StdRng};
+    use rand::{Rng, RngCore};

    use super::*;
    use crate::DEFAULT_PG_VERSION;
@@ -1620,7 +1598,7 @@ pub(crate) mod test {
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    /// Construct an index for a fictional delta layer and and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -2209,7 +2187,7 @@ pub(crate) mod test {
    }

    pub(crate) async fn produce_delta_layer(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        tline: &Arc<Timeline>,
        mut deltas: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -32,6 +32,7 @@ use std::ops::Range;
 use std::os::unix::prelude::FileExt;
 use std::str::FromStr;
 use std::sync::Arc;
+use std::sync::atomic::AtomicU64;

 use anyhow::{Context, Result, bail, ensure};
 use bytes::Bytes;
@@ -43,8 +44,6 @@ use pageserver_api::key::{DBDIR_KEY, KEY_SIZE, Key};
 use pageserver_api::keyspace::KeySpace;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_api::value::Value;
-use rand::Rng;
-use rand::distributions::Alphanumeric;
 use serde::{Deserialize, Serialize};
 use tokio::sync::OnceCell;
 use tokio_stream::StreamExt;
@@ -72,6 +71,7 @@ use crate::tenant::vectored_blob_io::{
    BlobFlag, BufView, StreamingVectoredReadPlanner, VectoredBlobReader, VectoredRead,
    VectoredReadPlanner,
 };
+use crate::virtual_file::TempVirtualFile;
 use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt;
 use crate::virtual_file::{self, IoBufferMut, MaybeFatalIo, VirtualFile};
 use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX};
@@ -252,14 +252,18 @@ impl ImageLayer {
        tenant_shard_id: TenantShardId,
        fname: &ImageLayerName,
    ) -> Utf8PathBuf {
-        let rand_string: String = rand::thread_rng()
-            .sample_iter(&Alphanumeric)
-            .take(8)
-            .map(char::from)
-            .collect();
+        // TempVirtualFile requires us to never reuse a filename while an old
+        // instance of TempVirtualFile created with that filename is not done dropping yet.
+        // So, we use a monotonic counter to disambiguate the filenames.
+        static NEXT_TEMP_DISAMBIGUATOR: AtomicU64 = AtomicU64::new(1);
+        let filename_disambiguator =
+            NEXT_TEMP_DISAMBIGUATOR.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        conf.timeline_path(&tenant_shard_id, &timeline_id)
-            .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
+            .join(format!(
+                "{fname}.{:x}.{TEMP_FILE_SUFFIX}",
+                filename_disambiguator
+            ))
    }

    ///
@@ -773,7 +777,7 @@ impl ImageLayerWriterInner {
            },
        );
        trace!("creating image layer {}", path);
-        let mut file = {
+        let mut file = TempVirtualFile::new(
            VirtualFile::open_with_options(
                &path,
                virtual_file::OpenOptions::new()
@@ -781,8 +785,9 @@ impl ImageLayerWriterInner {
                    .create_new(true),
                ctx,
            )
-            .await?
-        };
+            .await?,
+            gate.enter()?,
+        );
        // make room for the header block
        file.seek(SeekFrom::Start(PAGE_SZ as u64)).await?;
        let blob_writer = BlobWriter::new(file, PAGE_SZ as u64, gate, cancel, ctx);
@@ -896,25 +901,6 @@ impl ImageLayerWriterInner {
        self,
        ctx: &RequestContext,
        end_key: Option<Key>,
-    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
-        let temp_path = self.path.clone();
-        let result = self.finish0(ctx, end_key).await;
-        if let Err(ref e) = result {
-            tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}");
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing");
-            }
-        }
-        result
-    }
-
-    ///
-    /// Finish writing the image layer.
-    ///
-    async fn finish0(
-        self,
-        ctx: &RequestContext,
-        end_key: Option<Key>,
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

@@ -932,7 +918,7 @@ impl ImageLayerWriterInner {
            crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
        };

-        let mut file = self.blob_writer.into_inner();
+        let mut file = self.blob_writer.into_inner(ctx).await?;

        // Write out the index
        file.seek(SeekFrom::Start(index_start_blk as u64 * PAGE_SZ as u64))
@@ -1000,6 +986,10 @@ impl ImageLayerWriterInner {

        trace!("created image layer {}", self.path);

+        // The gate guard stored in `file` is dropped. Callers (e.g. flush loop or compaction)
+        // keep the gate open also, so that it's safe for them to rename the file to its final destination.
+        file.disarm_into_inner();
+
        Ok((desc, self.path))
    }
 }
@@ -1125,14 +1115,6 @@ impl ImageLayerWriter {
    }
 }

-impl Drop for ImageLayerWriter {
-    fn drop(&mut self) {
-        if let Some(inner) = self.inner.take() {
-            inner.blob_writer.into_inner().remove();
-        }
-    }
-}
-
 pub struct ImageLayerIterator<'a> {
    image_layer: &'a ImageLayerInner,
    ctx: &'a RequestContext,
@@ -1228,7 +1210,7 @@ mod test {
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    #[tokio::test]
    async fn image_layer_rewrite() {
@@ -1410,7 +1392,7 @@ mod test {
    }

    async fn produce_image_layer(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        tline: &Arc<Timeline>,
        mut images: Vec<(Key, Bytes)>,
        lsn: Lsn,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::compaction::CompactionOutcome;
-use crate::tenant::{Tenant, TenantState};
+use crate::tenant::{TenantShard, TenantState};

 /// Semaphore limiting concurrent background tasks (across all tenants).
 ///
@@ -117,7 +117,7 @@ pub(crate) async fn acquire_concurrency_permit(
 }

 /// Start per tenant background loops: compaction, GC, and ingest housekeeping.
-pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
+pub fn start_background_loops(tenant: &Arc<TenantShard>, can_start: Option<&Barrier>) {
    let tenant_shard_id = tenant.tenant_shard_id;

    task_mgr::spawn(
@@ -198,7 +198,7 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
 }

 /// Compaction task's main loop.
-async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn compaction_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
    const BASE_BACKOFF_SECS: f64 = 1.0;
    const MAX_BACKOFF_SECS: f64 = 300.0;
    const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10);
@@ -348,7 +348,7 @@ pub(crate) fn log_compaction_error(
 }

 /// GC task's main loop.
-async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn gc_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
    let mut error_run = 0; // consecutive errors

@@ -432,7 +432,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 }

 /// Tenant housekeeping's main loop.
-async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn tenant_housekeeping_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
    let mut last_throttle_flag_reset_at = Instant::now();
    loop {
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
@@ -483,7 +483,7 @@ async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken

 /// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
 async fn wait_for_active_tenant(
-    tenant: &Arc<Tenant>,
+    tenant: &Arc<TenantShard>,
    cancel: &CancellationToken,
 ) -> ControlFlow<()> {
    if tenant.current_state() == TenantState::Active {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -412,7 +412,7 @@ pub struct Timeline {
    /// Timeline deletion will acquire both compaction and gc locks in whatever order.
    gc_lock: tokio::sync::Mutex<()>,

-    /// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
+    /// Cloned from [`super::TenantShard::pagestream_throttle`] on construction.
    pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,

    /// Size estimator for aux file v2
@@ -1285,6 +1285,10 @@ impl Timeline {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if query.is_empty() {
+            return Ok(BTreeMap::default());
+        }
+
        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
            Some(ReadPath::new(
                query.total_keyspace(),
@@ -2065,7 +2069,7 @@ impl Timeline {

    pub(crate) fn activate(
        self: &Arc<Self>,
-        parent: Arc<crate::tenant::Tenant>,
+        parent: Arc<crate::tenant::TenantShard>,
        broker_client: BrokerClientChannel,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
@@ -3325,7 +3329,7 @@ impl Timeline {
        //     (1) and (4)
        // TODO: this is basically a no-op now, should we remove it?
        self.remote_client.schedule_barrier()?;
-        // Tenant::create_timeline will wait for these uploads to happen before returning, or
+        // TenantShard::create_timeline will wait for these uploads to happen before returning, or
        // on retry.

        // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
@@ -5754,7 +5758,7 @@ impl Timeline {
    /// from our ancestor to be branches of this timeline.
    pub(crate) async fn prepare_to_detach_from_ancestor(
        self: &Arc<Timeline>,
-        tenant: &crate::tenant::Tenant,
+        tenant: &crate::tenant::TenantShard,
        options: detach_ancestor::Options,
        behavior: DetachBehavior,
        ctx: &RequestContext,
@@ -5773,7 +5777,7 @@ impl Timeline {
    /// resetting the tenant.
    pub(crate) async fn detach_from_ancestor_and_reparent(
        self: &Arc<Timeline>,
-        tenant: &crate::tenant::Tenant,
+        tenant: &crate::tenant::TenantShard,
        prepared: detach_ancestor::PreparedTimelineDetach,
        ancestor_timeline_id: TimelineId,
        ancestor_lsn: Lsn,
@@ -5797,7 +5801,7 @@ impl Timeline {
    /// The tenant must've been reset if ancestry was modified previously (in tenant manager).
    pub(crate) async fn complete_detaching_timeline_ancestor(
        self: &Arc<Timeline>,
-        tenant: &crate::tenant::Tenant,
+        tenant: &crate::tenant::TenantShard,
        attempt: detach_ancestor::Attempt,
        ctx: &RequestContext,
    ) -> Result<(), detach_ancestor::Error> {
@@ -6859,14 +6863,14 @@ impl Timeline {
    /// Persistently blocks gc for `Manual` reason.
    ///
    /// Returns true if no such block existed before, false otherwise.
-    pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
+    pub(crate) async fn block_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<bool> {
        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
        tenant.gc_block.insert(self, GcBlockingReason::Manual).await
    }

    /// Persistently unblocks gc for `Manual` reason.
-    pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
+    pub(crate) async fn unblock_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<()> {
        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
        tenant.gc_block.remove(self, GcBlockingReason::Manual).await
@@ -6884,8 +6888,8 @@ impl Timeline {

    /// Force create an image layer and place it into the layer map.
    ///
-    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
+    /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
    /// placed into the layer map in one run AND be validated.
    #[cfg(test)]
    pub(super) async fn force_create_image_layer(
@@ -6941,8 +6945,8 @@ impl Timeline {

    /// Force create a delta layer and place it into the layer map.
    ///
-    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
+    /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
    /// placed into the layer map in one run AND be validated.
    #[cfg(test)]
    pub(super) async fn force_create_delta_layer(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -77,7 +77,7 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5;
 /// shard split, which gets expensive for large tenants.
 const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3;

-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+#[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize)]
 pub struct GcCompactionJobId(pub usize);

 impl std::fmt::Display for GcCompactionJobId {
@@ -105,6 +105,50 @@ pub enum GcCompactionQueueItem {
    Notify(GcCompactionJobId, Option<Lsn>),
 }

+/// Statistics for one gc-compaction meta job, which is made up of several sub-compaction jobs.
+#[derive(Debug, Clone, Serialize, Default)]
+pub struct GcCompactionMetaStatistics {
+    /// Number of sub-compaction jobs the meta job was split into when it was scheduled.
+    pub total_sub_compaction_jobs: usize,
+    /// Number of sub-compaction jobs that returned an error.
+    pub failed_sub_compaction_jobs: usize,
+    /// Number of sub-compaction jobs that completed without an error.
+    pub succeeded_sub_compaction_jobs: usize,
+    /// Total file size of the layers starting at or below `below_lsn` when the job was scheduled.
+    pub before_compaction_layer_size: u64,
+    /// Total file size of the layers starting at or below `below_lsn` when the job was finalized.
+    pub after_compaction_layer_size: u64,
+    /// Wall-clock time (UTC) at which the meta job was scheduled.
+    pub start_time: Option<chrono::DateTime<chrono::Utc>>,
+    /// Wall-clock time (UTC) at which `finalize` ran; `None` until then.
+    pub end_time: Option<chrono::DateTime<chrono::Utc>>,
+    /// Seconds elapsed between `start_time` and `end_time`, as computed by `finalize`.
+    pub duration_secs: f64,
+    /// Id of the meta job; the statistics are logged when the job with this id is notified.
+    pub meta_job_id: GcCompactionJobId,
+    /// Layers whose LSN range starts at or below this LSN are counted in the size statistics.
+    pub below_lsn: Lsn,
+    /// Retention ratio: `after_compaction_layer_size / (before_compaction_layer_size + 1)`.
+    pub retention_ratio: f64,
+}
+
+impl GcCompactionMetaStatistics {
+    /// Stamps `end_time` with the current time and derives `duration_secs` and
+    /// `retention_ratio` from the values recorded so far.
+    fn finalize(&mut self) {
+        let end_time = chrono::Utc::now();
+        // `duration_secs` is only updated when the clock moved forward since `start_time`.
+        let started = self.start_time.filter(|start| end_time > *start);
+        if let Some(elapsed) = started.and_then(|start| (end_time - start).to_std().ok()) {
+            self.duration_secs = elapsed.as_secs_f64();
+        }
+        // The `+ 1.0` keeps the divisor non-zero when the "before" size is 0.
+        self.retention_ratio = self.after_compaction_layer_size as f64
+            / (self.before_compaction_layer_size as f64 + 1.0);
+        self.end_time = Some(end_time);
+    }
+}
+
 impl GcCompactionQueueItem {
    pub fn into_compact_info_resp(
        self,
@@ -142,6 +186,7 @@ struct GcCompactionQueueInner {
    queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
    guards: HashMap<GcCompactionJobId, GcCompactionGuardItems>,
    last_id: GcCompactionJobId,
+    meta_statistics: Option<GcCompactionMetaStatistics>,
 }

 impl GcCompactionQueueInner {
@@ -173,6 +218,7 @@ impl GcCompactionQueue {
                queued: VecDeque::new(),
                guards: HashMap::new(),
                last_id: GcCompactionJobId(0),
+                meta_statistics: None,
            }),
            consumer_lock: tokio::sync::Mutex::new(()),
        }
@@ -357,6 +403,23 @@ impl GcCompactionQueue {
        Ok(())
    }

+    /// Returns the summed file size of all historic layers whose LSN range starts at or
+    /// below `lsn`. The guard from `timeline.layers.read()` is held while scanning.
+    async fn collect_layer_below_lsn(
+        &self,
+        timeline: &Arc<Timeline>,
+        lsn: Lsn,
+    ) -> Result<u64, CompactionError> {
+        let guard = timeline.layers.read().await;
+        let layer_map = guard.layer_map()?;
+        // Sum lazily; there is no need to collect the layer descriptors into a `Vec` first.
+        Ok(layer_map
+            .iter_historic_layers()
+            .filter(|layer| layer.lsn_range.start <= lsn)
+            .map(|layer| layer.file_size())
+            .sum())
+    }
+
    /// Notify the caller the job has finished and unblock GC.
    fn notify_and_unblock(&self, id: GcCompactionJobId) {
        info!("compaction job id={} finished", id);
@@ -366,6 +429,16 @@ impl GcCompactionQueue {
                let _ = tx.send(());
            }
        }
+        if let Some(ref meta_statistics) = guard.meta_statistics {
+            if meta_statistics.meta_job_id == id {
+                if let Ok(stats) = serde_json::to_string(&meta_statistics) {
+                    info!(
+                        "gc-compaction meta statistics for job id = {}: {}",
+                        id, stats
+                    );
+                }
+            }
+        }
    }

    fn clear_running_job(&self) {
@@ -405,7 +478,11 @@ impl GcCompactionQueue {
            let mut pending_tasks = Vec::new();
            // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate.
            // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN.
-            let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max();
+            let expected_l2_lsn = jobs
+                .iter()
+                .map(|job| job.compact_lsn_range.end)
+                .max()
+                .unwrap();
            for job in jobs {
                // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
                // until we do further refactors to allow directly call `compact_with_gc`.
@@ -430,9 +507,13 @@ impl GcCompactionQueue {
            if !auto {
                pending_tasks.push(GcCompactionQueueItem::Notify(id, None));
            } else {
-                pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn));
+                pending_tasks.push(GcCompactionQueueItem::Notify(id, Some(expected_l2_lsn)));
            }

+            let layer_size = self
+                .collect_layer_below_lsn(timeline, expected_l2_lsn)
+                .await?;
+
            {
                let mut guard = self.inner.lock().unwrap();
                let mut tasks = Vec::new();
@@ -444,7 +525,16 @@ impl GcCompactionQueue {
                for item in tasks {
                    guard.queued.push_front(item);
                }
+                guard.meta_statistics = Some(GcCompactionMetaStatistics {
+                    meta_job_id: id,
+                    start_time: Some(chrono::Utc::now()),
+                    before_compaction_layer_size: layer_size,
+                    below_lsn: expected_l2_lsn,
+                    total_sub_compaction_jobs: jobs_len,
+                    ..Default::default()
+                });
            }
+
            info!(
                "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs",
                jobs_len
@@ -573,6 +663,10 @@ impl GcCompactionQueue {
                    Err(err) => {
                        warn!(%err, "failed to run gc-compaction subcompaction job");
                        self.clear_running_job();
+                        let mut guard = self.inner.lock().unwrap();
+                        if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                            meta_statistics.failed_sub_compaction_jobs += 1;
+                        }
                        return Err(err);
                    }
                };
@@ -582,8 +676,34 @@ impl GcCompactionQueue {
                    // we need to clean things up before returning from the function.
                    yield_for_l0 = true;
                }
+                {
+                    let mut guard = self.inner.lock().unwrap();
+                    if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                        meta_statistics.succeeded_sub_compaction_jobs += 1;
+                    }
+                }
            }
            GcCompactionQueueItem::Notify(id, l2_lsn) => {
+                let below_lsn = {
+                    let mut guard = self.inner.lock().unwrap();
+                    if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                        meta_statistics.below_lsn
+                    } else {
+                        Lsn::INVALID
+                    }
+                };
+                let layer_size = if below_lsn != Lsn::INVALID {
+                    self.collect_layer_below_lsn(timeline, below_lsn).await?
+                } else {
+                    0
+                };
+                {
+                    let mut guard = self.inner.lock().unwrap();
+                    if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                        meta_statistics.after_compaction_layer_size = layer_size;
+                        meta_statistics.finalize();
+                    }
+                }
                self.notify_and_unblock(id);
                if let Some(l2_lsn) = l2_lsn {
                    let current_l2_lsn = timeline
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -18,8 +18,8 @@ use crate::tenant::remote_timeline_client::{
    PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
 };
 use crate::tenant::{
-    CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError,
-    Timeline, TimelineOrOffloaded,
+    CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, TenantManifestError,
+    TenantShard, Timeline, TimelineOrOffloaded,
 };
 use crate::virtual_file::MaybeFatalIo;

@@ -113,7 +113,7 @@ pub(super) async fn delete_local_timeline_directory(
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`make_timeline_delete_guard`]
 async fn remove_maybe_offloaded_timeline_from_tenant(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline: &TimelineOrOffloaded,
    _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
@@ -192,7 +192,7 @@ impl DeleteTimelineFlow {
    // error out if some of the shutdown tasks have already been completed!
    #[instrument(skip_all)]
    pub async fn run(
-        tenant: &Arc<Tenant>,
+        tenant: &Arc<TenantShard>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -288,7 +288,7 @@ impl DeleteTimelineFlow {
    /// Shortcut to create Timeline in stopping state and spawn deletion task.
    #[instrument(skip_all, fields(%timeline_id))]
    pub(crate) async fn resume_deletion(
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
@@ -338,7 +338,7 @@ impl DeleteTimelineFlow {
    fn schedule_background(
        guard: DeletionGuard,
        conf: &'static PageServerConf,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline: TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) {
@@ -381,7 +381,7 @@ impl DeleteTimelineFlow {
    async fn background(
        mut guard: DeletionGuard,
        conf: &PageServerConf,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
@@ -435,7 +435,7 @@ pub(super) enum TimelineDeleteGuardKind {
 }

 pub(super) fn make_timeline_delete_guard(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline_id: TimelineId,
    guard_kind: TimelineDeleteGuardKind,
 ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -23,7 +23,7 @@ use super::layer_manager::LayerManager;
 use super::{FlushLayerError, Timeline};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::TaskKind;
-use crate::tenant::Tenant;
+use crate::tenant::TenantShard;
 use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
 use crate::tenant::storage_layer::layer::local_layer_path;
 use crate::tenant::storage_layer::{
@@ -265,7 +265,7 @@ async fn generate_tombstone_image_layer(
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    behavior: DetachBehavior,
    options: Options,
    ctx: &RequestContext,
@@ -590,7 +590,7 @@ pub(super) async fn prepare(

 async fn start_new_attempt(
    detached: &Timeline,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
 ) -> Result<Attempt, Error> {
@@ -611,7 +611,7 @@ async fn start_new_attempt(

 async fn continue_with_blocked_gc(
    detached: &Timeline,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
 ) -> Result<Attempt, Error> {
@@ -622,7 +622,7 @@ async fn continue_with_blocked_gc(

 fn obtain_exclusive_attempt(
    detached: &Timeline,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
 ) -> Result<Attempt, Error> {
@@ -655,7 +655,7 @@ fn obtain_exclusive_attempt(

 fn reparented_direct_children(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
 ) -> Result<HashSet<TimelineId>, Error> {
    let mut all_direct_children = tenant
        .timelines
@@ -950,7 +950,7 @@ impl DetachingAndReparenting {
 /// See [`Timeline::detach_from_ancestor_and_reparent`].
 pub(super) async fn detach_and_reparent(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    prepared: PreparedTimelineDetach,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
@@ -1184,7 +1184,7 @@ pub(super) async fn detach_and_reparent(

 pub(super) async fn complete(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    mut attempt: Attempt,
    _ctx: &RequestContext,
 ) -> Result<(), Error> {
@@ -1258,7 +1258,7 @@ where
 }

 fn check_no_archived_children_of_ancestor(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    detached: &Arc<Timeline>,
    ancestor: &Arc<Timeline>,
    ancestor_lsn: Lsn,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -33,7 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::storage_layer::LayerVisibilityHint;
 use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
 use crate::tenant::timeline::EvictionError;
-use crate::tenant::{LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{LogicalSizeCalculationCause, TenantShard};

 #[derive(Default)]
 pub struct EvictionTaskTimelineState {
@@ -48,7 +48,7 @@ pub struct EvictionTaskTenantState {
 impl Timeline {
    pub(super) fn launch_eviction_task(
        self: &Arc<Self>,
-        parent: Arc<Tenant>,
+        parent: Arc<TenantShard>,
        background_tasks_can_start: Option<&completion::Barrier>,
    ) {
        let self_clone = Arc::clone(self);
@@ -75,7 +75,7 @@ impl Timeline {
    }

    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
+    async fn eviction_task(self: Arc<Self>, tenant: Arc<TenantShard>) {
        // acquire the gate guard only once within a useful span
        let Ok(guard) = self.gate.enter() else {
            return;
@@ -118,7 +118,7 @@ impl Timeline {
    #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
    async fn eviction_iteration(
        self: &Arc<Self>,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        policy: &EvictionPolicy,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -175,7 +175,7 @@ impl Timeline {

    async fn eviction_iteration_threshold(
        self: &Arc<Self>,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -309,7 +309,7 @@ impl Timeline {
    /// disk usage based eviction task.
    async fn imitiate_only(
        self: &Arc<Self>,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -363,7 +363,7 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -499,7 +499,7 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_synthetic_size_calculation_worker(
        &self,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) {
--- a/pageserver/src/tenant/timeline/import_pgdata.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata.rs
@@ -1,20 +1,21 @@
 use std::sync::Arc;

 use anyhow::{Context, bail};
+use pageserver_api::models::ShardImportStatus;
 use remote_storage::RemotePath;
 use tokio_util::sync::CancellationToken;
-use tracing::{Instrument, info, info_span};
+use tracing::info;
 use utils::lsn::Lsn;

 use super::Timeline;
 use crate::context::RequestContext;
+use crate::controller_upcall_client::{StorageControllerUpcallApi, StorageControllerUpcallClient};
 use crate::tenant::metadata::TimelineMetadata;

 mod flow;
 mod importbucket_client;
 mod importbucket_format;
 pub(crate) mod index_part_format;
-pub(crate) mod upcall_api;

 pub async fn doit(
    timeline: &Arc<Timeline>,
@@ -34,23 +35,6 @@ pub async fn doit(

    let storage = importbucket_client::new(timeline.conf, &location, cancel.clone()).await?;

-    info!("get spec early so we know we'll be able to upcall when done");
-    let Some(spec) = storage.get_spec().await? else {
-        bail!("spec not found")
-    };
-
-    let upcall_client =
-        upcall_api::Client::new(timeline.conf, cancel.clone()).context("create upcall client")?;
-
-    //
-    // send an early progress update to clean up k8s job early and generate potentially useful logs
-    //
-    info!("send early progress update");
-    upcall_client
-        .send_progress_until_success(&spec)
-        .instrument(info_span!("early_progress_update"))
-        .await?;
-
    let status_prefix = RemotePath::from_string("status").unwrap();

    //
@@ -176,7 +160,21 @@ pub async fn doit(

        //
        // Communicate that shard is done.
+        // Ensure at-least-once delivery of the upcall to storage controller
+        // before we mark the task as done and never come here again.
        //
+        let storcon_client = StorageControllerUpcallClient::new(timeline.conf, &cancel)?
+            .expect("storcon configured");
+        storcon_client
+            .put_timeline_import_status(
+                timeline.tenant_shard_id,
+                timeline.timeline_id,
+                // TODO(vlad): What about import errors?
+                ShardImportStatus::Done,
+            )
+            .await
+            .map_err(|_err| anyhow::anyhow!("Shut down while putting timeline import status"))?;
+
        storage
            .put_json(
                &shard_status_key,
@@ -186,16 +184,6 @@ pub async fn doit(
            .context("put shard status")?;
    }

-    //
-    // Ensure at-least-once deliver of the upcall to cplane
-    // before we mark the task as done and never come here again.
-    //
-    info!("send final progress update");
-    upcall_client
-        .send_progress_until_success(&spec)
-        .instrument(info_span!("final_progress_update"))
-        .await?;
-
    //
    // Mark as done in index_part.
    // This makes subsequent timeline loads enter the normal load code path
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_client.rs
@@ -13,7 +13,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, instrument};
 use utils::lsn::Lsn;

-use super::{importbucket_format, index_part_format};
+use super::index_part_format;
 use crate::assert_u64_eq_usize::U64IsUsize;
 use crate::config::PageServerConf;

@@ -173,12 +173,6 @@ impl RemoteStorageWrapper {
        res
    }

-    pub async fn get_spec(&self) -> Result<Option<importbucket_format::Spec>, anyhow::Error> {
-        self.get_json(&RemotePath::from_string("spec.json").unwrap())
-            .await
-            .context("get spec")
-    }
-
    #[instrument(level = tracing::Level::DEBUG, skip_all, fields(%path))]
    pub async fn get_json<T: DeserializeOwned>(
        &self,
@@ -244,7 +238,8 @@ impl RemoteStorageWrapper {
                            kind: DownloadKind::Large,
                            etag: None,
                            byte_start: Bound::Included(start_inclusive),
-                            byte_end: Bound::Excluded(end_exclusive)
+                            byte_end: Bound::Excluded(end_exclusive),
+                            version_id: None,
                        },
                        &self.cancel)
                    .await?;
--- a/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/importbucket_format.rs
@@ -11,10 +11,3 @@ pub struct ShardStatus {
    pub done: bool,
    // TODO: remaining fields
 }
-
-// TODO: dedupe with fast_import code
-#[derive(Deserialize, Serialize, Debug, Clone, PartialEq, Eq)]
-pub struct Spec {
-    pub project_id: String,
-    pub branch_id: String,
-}
--- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
@@ -1,124 +0,0 @@
-//! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate.
-use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt};
-use reqwest::Method;
-use serde::{Deserialize, Serialize};
-use tokio_util::sync::CancellationToken;
-use tracing::error;
-
-use super::importbucket_format::Spec;
-use crate::config::PageServerConf;
-
-pub struct Client {
-    base_url: String,
-    authorization_header: Option<String>,
-    client: reqwest::Client,
-    cancel: CancellationToken,
-}
-
-pub type Result<T> = std::result::Result<T, Error>;
-
-#[derive(Serialize, Deserialize, Debug)]
-struct ImportProgressRequest {
-    // no fields yet, not sure if there every will be any
-}
-
-#[derive(Serialize, Deserialize, Debug)]
-struct ImportProgressResponse {
-    // we don't care
-}
-
-impl Client {
-    pub fn new(conf: &PageServerConf, cancel: CancellationToken) -> anyhow::Result<Self> {
-        let Some(ref base_url) = conf.import_pgdata_upcall_api else {
-            anyhow::bail!("import_pgdata_upcall_api is not configured")
-        };
-        let mut http_client = reqwest::Client::builder();
-        for cert in &conf.ssl_ca_certs {
-            http_client = http_client.add_root_certificate(cert.clone());
-        }
-        let http_client = http_client.build()?;
-
-        Ok(Self {
-            base_url: base_url.to_string(),
-            client: http_client,
-            cancel,
-            authorization_header: conf
-                .import_pgdata_upcall_api_token
-                .as_ref()
-                .map(|secret_string| secret_string.get_contents())
-                .map(|jwt| format!("Bearer {jwt}")),
-        })
-    }
-
-    fn start_request<U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-    ) -> reqwest::RequestBuilder {
-        let req = self.client.request(method, uri);
-        if let Some(value) = &self.authorization_header {
-            req.header(reqwest::header::AUTHORIZATION, value)
-        } else {
-            req
-        }
-    }
-
-    async fn request_noerror<B: serde::Serialize, U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-        body: B,
-    ) -> Result<reqwest::Response> {
-        self.start_request(method, uri)
-            .json(&body)
-            .send()
-            .await
-            .map_err(Error::ReceiveBody)
-    }
-
-    async fn request<B: serde::Serialize, U: reqwest::IntoUrl>(
-        &self,
-        method: Method,
-        uri: U,
-        body: B,
-    ) -> Result<reqwest::Response> {
-        let res = self.request_noerror(method, uri, body).await?;
-        let response = res.error_from_body().await?;
-        Ok(response)
-    }
-
-    pub async fn send_progress_once(&self, spec: &Spec) -> Result<()> {
-        let url = format!(
-            "{}/projects/{}/branches/{}/import_progress",
-            self.base_url, spec.project_id, spec.branch_id
-        );
-        let ImportProgressResponse {} = self
-            .request(Method::POST, url, &ImportProgressRequest {})
-            .await?
-            .json()
-            .await
-            .map_err(Error::ReceiveBody)?;
-        Ok(())
-    }
-
-    pub async fn send_progress_until_success(&self, spec: &Spec) -> anyhow::Result<()> {
-        loop {
-            match self.send_progress_once(spec).await {
-                Ok(()) => return Ok(()),
-                Err(Error::Cancelled) => return Err(anyhow::anyhow!("cancelled")),
-                Err(err) => {
-                    error!(?err, "error sending progress, retrying");
-                    if tokio::time::timeout(
-                        std::time::Duration::from_secs(10),
-                        self.cancel.cancelled(),
-                    )
-                    .await
-                    .is_ok()
-                    {
-                        anyhow::bail!("cancelled while sending early progress update");
-                    }
-                }
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -8,7 +8,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
 use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard};
 use crate::tenant::{
-    DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
+    DeleteTimelineError, OffloadedTimeline, TenantManifestError, TenantShard, TimelineOrOffloaded,
 };

 #[derive(thiserror::Error, Debug)]
@@ -33,7 +33,7 @@ impl From<TenantManifestError> for OffloadError {
 }

 pub(crate) async fn offload_timeline(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline: &Arc<Timeline>,
 ) -> Result<(), OffloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
@@ -123,7 +123,7 @@ pub(crate) async fn offload_timeline(
 ///
 /// Returns the strong count of the timeline `Arc`
 fn remove_timeline_from_tenant(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline: &Timeline,
    _: &DeletionGuard, // using it as a witness
 ) -> usize {
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -15,17 +15,19 @@ use super::Timeline;
 use crate::context::RequestContext;
 use crate::import_datadir;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded};
+use crate::tenant::{
+    CreateTimelineError, CreateTimelineIdempotency, TenantShard, TimelineOrOffloaded,
+};

 /// A timeline with some of its files on disk, being initialized.
 /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
 /// its local files are removed.  If we crash while this class exists, then the timeline's local
-/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
+/// state is cleaned up during [`TenantShard::clean_up_timelines`], because the timeline's content isn't in remote storage.
 ///
 /// The caller is responsible for proper timeline data filling before the final init.
 #[must_use]
 pub struct UninitializedTimeline<'t> {
-    pub(crate) owning_tenant: &'t Tenant,
+    pub(crate) owning_tenant: &'t TenantShard,
    timeline_id: TimelineId,
    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
    /// Whether we spawned the inner Timeline's tasks such that we must later shut it down
@@ -35,7 +37,7 @@ pub struct UninitializedTimeline<'t> {

 impl<'t> UninitializedTimeline<'t> {
    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
+        owning_tenant: &'t TenantShard,
        timeline_id: TimelineId,
        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
    ) -> Self {
@@ -156,7 +158,7 @@ impl<'t> UninitializedTimeline<'t> {
    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
        mut self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
@@ -227,17 +229,17 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
        }
    }
-    // Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other
+    // Having cleaned up, we can release this TimelineId in `[TenantShard::timelines_creating]` to allow other
    // timeline creation attempts under this TimelineId to proceed
    drop(create_guard);
 }

 /// A guard for timeline creations in process: as long as this object exists, the timeline ID
-/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
+/// is kept in `[TenantShard::timelines_creating]` to exclude concurrent attempts to create the same timeline.
 #[must_use]
 pub(crate) struct TimelineCreateGuard {
    pub(crate) _tenant_gate_guard: GateGuard,
-    pub(crate) owning_tenant: Arc<Tenant>,
+    pub(crate) owning_tenant: Arc<TenantShard>,
    pub(crate) timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
    pub(crate) idempotency: CreateTimelineIdempotency,
@@ -263,7 +265,7 @@ pub(crate) enum TimelineExclusionError {

 impl TimelineCreateGuard {
    pub(crate) fn new(
-        owning_tenant: &Arc<Tenant>,
+        owning_tenant: &Arc<TenantShard>,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
        idempotency: CreateTimelineIdempotency,
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -25,29 +25,31 @@ use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlig
 use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut};
 use owned_buffers_io::io_buf_ext::FullSlice;
 use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT;
-pub use pageserver_api::models::virtual_file as api;
 use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
 use tokio::time::Instant;
 use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice};

+use self::owned_buffers_io::write::OwnedAsyncWriter;
 use crate::assert_u64_eq_usize::UsizeIsU64;
 use crate::context::RequestContext;
 use crate::metrics::{STORAGE_IO_TIME_METRIC, StorageIoOperation};
 use crate::page_cache::{PAGE_SZ, PageWriteGuard};
-pub(crate) mod io_engine;
+
+pub(crate) use api::IoMode;
+pub(crate) use io_engine::IoEngineKind;
 pub use io_engine::{
    FeatureTestResult as IoEngineFeatureTestResult, feature_test as io_engine_feature_test,
    io_engine_for_bench,
 };
-mod metadata;
-mod open_options;
-pub(crate) use api::IoMode;
-pub(crate) use io_engine::IoEngineKind;
 pub(crate) use metadata::Metadata;
 pub(crate) use open_options::*;
+pub use pageserver_api::models::virtual_file as api;
+pub use temporary::TempVirtualFile;

-use self::owned_buffers_io::write::OwnedAsyncWriter;
-
+pub(crate) mod io_engine;
+mod metadata;
+mod open_options;
+mod temporary;
 pub(crate) mod owned_buffers_io {
    //! Abstractions for IO with owned buffers.
    //!
@@ -1366,9 +1368,10 @@ pub(crate) type IoBuffer = AlignedBuffer<ConstAlign<{ get_io_buffer_alignment()
 pub(crate) type IoPageSlice<'a> =
    AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;

-static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
+static IO_MODE: once_cell::sync::Lazy<AtomicU8> =
+    once_cell::sync::Lazy::new(|| AtomicU8::new(IoMode::preferred() as u8));

-pub(crate) fn set_io_mode(mode: IoMode) {
+pub fn set_io_mode(mode: IoMode) {
    IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);
 }

--- a/pageserver/src/virtual_file/owned_buffers_io/write.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs
@@ -1,5 +1,4 @@
 mod flush;
-use std::sync::Arc;

 pub(crate) use flush::FlushControl;
 use flush::FlushHandle;
@@ -41,7 +40,6 @@ pub trait OwnedAsyncWriter {
 // TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput,
 // since we would avoid copying majority of the data into the internal buffer.
 pub struct BufferedWriter<B: Buffer, W> {
-    writer: Arc<W>,
    /// Clone of the buffer that was last submitted to the flush loop.
    /// `None` if no flush request has been submitted, Some forever after.
    pub(super) maybe_flushed: Option<FullSlice<B::IoBuf>>,
@@ -72,7 +70,7 @@ where
    ///
    /// The `buf_new` function provides a way to initialize the owned buffers used by this writer.
    pub fn new(
-        writer: Arc<W>,
+        writer: W,
        buf_new: impl Fn() -> B,
        gate_guard: utils::sync::gate::GateGuard,
        cancel: CancellationToken,
@@ -80,7 +78,6 @@ where
        flush_task_span: tracing::Span,
    ) -> Self {
        Self {
-            writer: writer.clone(),
            mutable: Some(buf_new()),
            maybe_flushed: None,
            flush_handle: FlushHandle::spawn_new(
@@ -95,10 +92,6 @@ where
        }
    }

-    pub fn as_inner(&self) -> &W {
-        &self.writer
-    }
-
    /// Returns the number of bytes submitted to the background flush task.
    pub fn bytes_submitted(&self) -> u64 {
        self.bytes_submitted
@@ -116,20 +109,16 @@ where
    }

    #[cfg_attr(target_os = "macos", allow(dead_code))]
-    pub async fn flush_and_into_inner(
-        mut self,
-        ctx: &RequestContext,
-    ) -> Result<(u64, Arc<W>), FlushTaskError> {
+    pub async fn shutdown(mut self, ctx: &RequestContext) -> Result<(u64, W), FlushTaskError> {
        self.flush(ctx).await?;

        let Self {
            mutable: buf,
            maybe_flushed: _,
-            writer,
            mut flush_handle,
            bytes_submitted: bytes_amount,
        } = self;
-        flush_handle.shutdown().await?;
+        let writer = flush_handle.shutdown().await?;
        assert!(buf.is_some());
        Ok((bytes_amount, writer))
    }
@@ -329,7 +318,7 @@ mod tests {
    async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> {
        let ctx = test_ctx();
        let ctx = &ctx;
-        let recorder = Arc::new(RecorderWriter::default());
+        let recorder = RecorderWriter::default();
        let gate = utils::sync::gate::Gate::default();
        let cancel = CancellationToken::new();
        let mut writer = BufferedWriter::<_, RecorderWriter>::new(
@@ -350,7 +339,7 @@ mod tests {
        writer.write_buffered_borrowed(b"j", ctx).await?;
        writer.write_buffered_borrowed(b"klmno", ctx).await?;

-        let (_, recorder) = writer.flush_and_into_inner(ctx).await?;
+        let (_, recorder) = writer.shutdown(ctx).await?;
        assert_eq!(
            recorder.get_writes(),
            {
--- a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
+++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs
@@ -1,5 +1,4 @@
 use std::ops::ControlFlow;
-use std::sync::Arc;

 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, info, info_span, warn};
@@ -21,7 +20,7 @@ pub struct FlushHandleInner<Buf, W> {
    /// and receives recyled buffer.
    channel: duplex::mpsc::Duplex<FlushRequest<Buf>, FullSlice<Buf>>,
    /// Join handle for the background flush task.
-    join_handle: tokio::task::JoinHandle<Result<Arc<W>, FlushTaskError>>,
+    join_handle: tokio::task::JoinHandle<Result<W, FlushTaskError>>,
 }

 struct FlushRequest<Buf> {
@@ -120,7 +119,7 @@ where
    /// The queue depth is 1, and the passed-in `buf` seeds the queue depth.
    /// I.e., the passed-in buf is immediately available to the handle as a recycled buffer.
    pub fn spawn_new<B>(
-        file: Arc<W>,
+        file: W,
        buf: B,
        gate_guard: utils::sync::gate::GateGuard,
        cancel: CancellationToken,
@@ -183,7 +182,7 @@ where
    }

    /// Cleans up the channel, join the flush task.
-    pub async fn shutdown(&mut self) -> Result<Arc<W>, FlushTaskError> {
+    pub async fn shutdown(&mut self) -> Result<W, FlushTaskError> {
        let handle = self
            .inner
            .take()
@@ -207,7 +206,7 @@ pub struct FlushBackgroundTask<Buf, W> {
    /// and send back recycled buffer.
    channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
    /// A writter for persisting data to disk.
-    writer: Arc<W>,
+    writer: W,
    ctx: RequestContext,
    cancel: CancellationToken,
    /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk.
@@ -228,7 +227,7 @@ where
    /// Creates a new background flush task.
    fn new(
        channel: duplex::mpsc::Duplex<FullSlice<Buf>, FlushRequest<Buf>>,
-        file: Arc<W>,
+        file: W,
        gate_guard: utils::sync::gate::GateGuard,
        cancel: CancellationToken,
        ctx: RequestContext,
@@ -243,7 +242,7 @@ where
    }

    /// Runs the background flush task.
-    async fn run(mut self) -> Result<Arc<W>, FlushTaskError> {
+    async fn run(mut self) -> Result<W, FlushTaskError> {
        //  Exit condition: channel is closed and there is no remaining buffer to be flushed
        while let Some(request) = self.channel.recv().await {
            #[cfg(test)]
--- a/pageserver/src/virtual_file/temporary.rs
+++ b/pageserver/src/virtual_file/temporary.rs
@@ -0,0 +1,106 @@
+use tracing::error;
+use utils::sync::gate::GateGuard;
+
+use crate::context::RequestContext;
+
+use super::{
+    MaybeFatalIo, VirtualFile,
+    owned_buffers_io::{
+        io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice, write::OwnedAsyncWriter,
+    },
+};
+
+/// A wrapper around [`super::VirtualFile`] that deletes the file on drop.
+/// For use as a [`OwnedAsyncWriter`] in [`super::owned_buffers_io::write::BufferedWriter`].
+#[derive(Debug)]
+pub struct TempVirtualFile {
+    inner: Option<Inner>,
+}
+
+#[derive(Debug)]
+struct Inner {
+    file: VirtualFile,
+    /// The gate guard is held for as long as we may need to operate on the path (delete on drop).
+    _gate_guard: GateGuard,
+}
+
+impl OwnedAsyncWriter for TempVirtualFile {
+    fn write_all_at<Buf: IoBufAligned + Send>(
+        &self,
+        buf: FullSlice<Buf>,
+        offset: u64,
+        ctx: &RequestContext,
+    ) -> impl std::future::Future<Output = (FullSlice<Buf>, std::io::Result<()>)> + Send {
+        VirtualFile::write_all_at(self, buf, offset, ctx)
+    }
+}
+
+impl Drop for TempVirtualFile {
+    fn drop(&mut self) {
+        let Some(Inner { file, _gate_guard }) = self.inner.take() else {
+            return;
+        };
+        let path = file.path();
+        if let Err(e) =
+            std::fs::remove_file(path).maybe_fatal_err("failed to remove the virtual file")
+        {
+            error!(err=%e, path=%path, "failed to remove");
+        }
+        drop(_gate_guard);
+    }
+}
+
+impl std::ops::Deref for TempVirtualFile {
+    type Target = VirtualFile;
+
+    fn deref(&self) -> &Self::Target {
+        &self
+            .inner
+            .as_ref()
+            .expect("only None after into_inner or drop")
+            .file
+    }
+}
+
+impl std::ops::DerefMut for TempVirtualFile {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut self
+            .inner
+            .as_mut()
+            .expect("only None after into_inner or drop")
+            .file
+    }
+}
+
+impl TempVirtualFile {
+    /// The caller is responsible for ensuring that the path of `virtual_file` is not reused
+    /// until after this TempVirtualFile's `Drop` impl has completed.
+    /// Failure to do so will result in unlinking of the reused path by the original instance's Drop impl.
+    /// The best way to do so is by using a monotonic counter as a disambiguator.
+    /// TODO: centralize this disambiguator pattern inside this struct.
+    ///   => <https://github.com/neondatabase/neon/pull/11549#issuecomment-2824592831>
+    pub fn new(virtual_file: VirtualFile, gate_guard: GateGuard) -> Self {
+        Self {
+            inner: Some(Inner {
+                file: virtual_file,
+                _gate_guard: gate_guard,
+            }),
+        }
+    }
+
+    /// Dismantle this wrapper and return the underlying [`VirtualFile`].
+    /// This disables auto-unlinking functionality that is the essence of this wrapper.
+    ///
+    /// The gate guard is dropped as well; it is the caller's responsibility to ensure filesystem
+    /// operations after calls to this function are still gated by some other gate guard.
+    ///
+    /// TODO:
+    /// - centralize the common usage pattern of callers (sync_all(self), rename(self, dst), sync_all(dst.parent))
+    ///   => <https://github.com/neondatabase/neon/pull/11549#issuecomment-2824592831>
+    pub fn disarm_into_inner(mut self) -> VirtualFile {
+        self.inner
+            .take()
+            .expect("only None after into_inner or drop, and we are into_inner, and we consume")
+            .file
+    }
+}
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -803,7 +803,13 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)

 		case RELPERSISTENCE_TEMP:
 		case RELPERSISTENCE_UNLOGGED:
+#ifdef DEBUG_COMPARE_LOCAL
+			mdcreate(reln, forkNum, forkNum == INIT_FORKNUM || isRedo);
+			if (forkNum == MAIN_FORKNUM)
+				mdcreate(reln, INIT_FORKNUM, true);
+#else
 			mdcreate(reln, forkNum, isRedo);
+#endif
 			return;

 		default:
@@ -1973,6 +1979,10 @@ neon_start_unlogged_build(SMgrRelation reln)
 		case RELPERSISTENCE_UNLOGGED:
 			unlogged_build_rel = reln;
 			unlogged_build_phase = UNLOGGED_BUILD_NOT_PERMANENT;
+#ifdef DEBUG_COMPARE_LOCAL
+			if (!IsParallelWorker())
+				mdcreate(reln, INIT_FORKNUM, true);
+#endif
 			return;

 		default:
@@ -1995,12 +2005,14 @@ neon_start_unlogged_build(SMgrRelation reln)
 	 * FIXME: should we pass isRedo true to create the tablespace dir if it
 	 * doesn't exist? Is it needed?
 	 */
-#ifndef DEBUG_COMPARE_LOCAL
 	if (!IsParallelWorker())
+	{
+#ifndef DEBUG_COMPARE_LOCAL
 		mdcreate(reln, MAIN_FORKNUM, false);
 #else
-	mdcreate(reln, INIT_FORKNUM, false);
+		mdcreate(reln, INIT_FORKNUM, true);
 #endif
+	}
 }

 /*
@@ -2099,12 +2111,12 @@ neon_end_unlogged_build(SMgrRelation reln)
 #ifndef DEBUG_COMPARE_LOCAL
 			/* use isRedo == true, so that we drop it immediately */
 			mdunlink(rinfob, forknum, true);
-#else
-			mdunlink(rinfob, INIT_FORKNUM, true);
 #endif
 		}
+#ifdef DEBUG_COMPARE_LOCAL
+		mdunlink(rinfob, INIT_FORKNUM, true);
+#endif
 	}
-
 	unlogged_build_rel = NULL;
 	unlogged_build_phase = UNLOGGED_BUILD_NOT_IN_PROGRESS;
 }
--- a/proxy/src/auth/backend/jwt.rs
+++ b/proxy/src/auth/backend/jwt.rs
@@ -776,7 +776,6 @@ impl From<&jose_jwk::Key> for KeyType {
 }

 #[cfg(test)]
-#[expect(clippy::unwrap_used)]
 mod tests {
    use std::future::IntoFuture;
    use std::net::SocketAddr;
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -253,7 +253,6 @@ fn project_name_valid(name: &str) -> bool {
 }

 #[cfg(test)]
-#[expect(clippy::unwrap_used)]
 mod tests {
    use ComputeUserInfoParseError::*;
    use serde_json::json;
--- a/proxy/src/binary/pg_sni_router.rs
+++ b/proxy/src/binary/pg_sni_router.rs
@@ -258,7 +258,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
                "unexpected startup packet, rejecting connection"
            );
            stream
-                .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User)
+                .throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User, None)
                .await?
        }
    }
--- a/proxy/src/cache/endpoints.rs
+++ b/proxy/src/cache/endpoints.rs
@@ -259,7 +259,6 @@ impl EndpointsCache {
 }

 #[cfg(test)]
-#[expect(clippy::unwrap_used)]
 mod tests {
    use super::*;

--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -585,7 +585,6 @@ impl Cache for ProjectInfoCacheImpl {
 }

 #[cfg(test)]
-#[expect(clippy::unwrap_used)]
 mod tests {
    use super::*;
    use crate::scram::ServerSecret;
--- a/proxy/src/console_redirect_proxy.rs
+++ b/proxy/src/console_redirect_proxy.rs
@@ -222,7 +222,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
    {
        Ok(auth_result) => auth_result,
        Err(e) => {
-            return stream.throw_error(e).await?;
+            return stream.throw_error(e, Some(ctx)).await?;
        }
    };

@@ -238,7 +238,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        config.wake_compute_retry_config,
        &config.connect_to_compute,
    )
-    .or_else(|e| stream.throw_error(e))
+    .or_else(|e| stream.throw_error(e, Some(ctx)))
    .await?;

    let cancellation_handler_clone = Arc::clone(&cancellation_handler);
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -63,7 +63,7 @@ struct RequestContextInner {
    success: bool,
    pub(crate) cold_start_info: ColdStartInfo,
    pg_options: Option<StartupMessageParams>,
-    testodrome_query_id: Option<String>,
+    testodrome_query_id: Option<SmolStr>,

    // extra
    // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -219,7 +219,7 @@ impl RequestContext {
            for option in options_str.split_whitespace() {
                if option.starts_with("neon_query_id:") {
                    if let Some(value) = option.strip_prefix("neon_query_id:") {
-                        this.set_testodrome_id(value.to_string());
+                        this.set_testodrome_id(value.into());
                        break;
                    }
                }
@@ -272,7 +272,7 @@ impl RequestContext {
            .set_user_agent(user_agent);
    }

-    pub(crate) fn set_testodrome_id(&self, query_id: String) {
+    pub(crate) fn set_testodrome_id(&self, query_id: SmolStr) {
        self.0
            .try_lock()
            .expect("should not deadlock")
@@ -378,7 +378,7 @@ impl RequestContext {
            .accumulated()
    }

-    pub(crate) fn get_testodrome_id(&self) -> Option<String> {
+    pub(crate) fn get_testodrome_id(&self) -> Option<SmolStr> {
        self.0
            .try_lock()
            .expect("should not deadlock")
@@ -447,7 +447,7 @@ impl RequestContextInner {
        self.user = Some(user);
    }

-    fn set_testodrome_id(&mut self, query_id: String) {
+    fn set_testodrome_id(&mut self, query_id: SmolStr) {
        self.testodrome_query_id = Some(query_id);
    }

--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -416,7 +416,6 @@ async fn upload_parquet(
 }

 #[cfg(test)]
-#[expect(clippy::unwrap_used)]
 mod tests {
    use std::net::Ipv4Addr;
    use std::num::NonZeroUsize;
--- a/Show More
+++ b/Show More