drain instead of mark offline

Signed-off-by: Alex Chi Z <chi@neon.tech>
test(storcon): ensure essential scheduling mode attaches tenant to secondary
2026-05-18 21:50:37 +00:00 · 2025-04-23 16:03:55 -04:00 · 2025-04-23 15:57:37 -04:00 · 2025-04-23 14:51:08 +00:00 · 2025-04-23 14:03:19 +00:00 · 2025-04-23 12:18:30 +00:00
179 changed files with 8588 additions and 1738 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -19,7 +19,7 @@
 !pageserver/
 !pgxn/
 !proxy/
-!object_storage/
+!endpoint_storage/
 !storage_scrubber/
 !safekeeper/
 !storage_broker/
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -113,8 +113,6 @@ runs:
        TEST_OUTPUT: /tmp/test_output
        BUILD_TYPE: ${{ inputs.build_type }}
        COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
-        ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
-        ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
        RERUN_FAILED: ${{ inputs.rerun_failed }}
        PG_VERSION: ${{ inputs.pg_version }}
        SANITIZERS: ${{ inputs.sanitizers }}
@@ -135,6 +133,7 @@ runs:
        fi

        PERF_REPORT_DIR="$(realpath test_runner/perf-report-local)"
+        echo "PERF_REPORT_DIR=${PERF_REPORT_DIR}" >> ${GITHUB_ENV}
        rm -rf $PERF_REPORT_DIR

        TEST_SELECTION="test_runner/${{ inputs.test_selection }}"
@@ -211,11 +210,12 @@ runs:
          --verbose \
          -rA $TEST_SELECTION $EXTRA_PARAMS

-        if [[ "${{ inputs.save_perf_report }}" == "true" ]]; then
-          export REPORT_FROM="$PERF_REPORT_DIR"
-          export REPORT_TO="$PLATFORM"
-          scripts/generate_and_push_perf_report.sh
-        fi
+    - name: Upload performance report
+      if: ${{ !cancelled() && inputs.save_perf_report == 'true' }}
+      shell: bash -euxo pipefail {0}
+      run: |
+        export REPORT_FROM="${PERF_REPORT_DIR}"
+        scripts/generate_and_push_perf_report.sh

    - name: Upload compatibility snapshot
      # Note, that we use `github.base_ref` which is a target branch for a PR
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -272,10 +272,13 @@ jobs:
          # run pageserver tests with different settings
          for get_vectored_concurrent_io in sequential sidecar-task; do
            for io_engine in std-fs tokio-epoll-uring ; do
-              NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
-                NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
-                ${cov_prefix} \
-                cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+                for io_mode in buffered direct direct-rw ; do
+                  NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
+                  NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \
+                  ${cov_prefix} \
+                  cargo nextest run $CARGO_FLAGS $CARGO_FEATURES  -E 'package(pageserver)'
+              done
            done
          done

@@ -346,7 +349,7 @@ jobs:
      contents: read
      statuses: write
    needs: [ build-neon ]
-    runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
+    runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }}
    container:
      image: ${{ inputs.build-tools-image }}
      credentials:
@@ -392,6 +395,7 @@ jobs:
          BUILD_TAG: ${{ inputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
          USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}

      # Temporary disable this step until we figure out why it's so flaky
--- a/.github/workflows/_meta.yml
+++ b/.github/workflows/_meta.yml
@@ -165,5 +165,5 @@ jobs:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          CURRENT_SHA: ${{ github.sha }}
        run: |
-          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
+          RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from  RC PR!" | halt_error(1))')
          echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -323,6 +323,8 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
+          PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
+          PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
          SYNC_BETWEEN_TESTS: true
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones
--- a/.github/workflows/fast-forward.yml
+++ b/.github/workflows/fast-forward.yml
@@ -27,15 +27,17 @@ jobs:
      - name: Fast forwarding
        uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
        # See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
-        if: ${{ github.event.pull_request.mergeable_state  == 'clean' }}
+        if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
        with:
          merge: true
          comment: on-error
          github_token: ${{ secrets.CI_ACCESS_TOKEN }}

      - name: Comment if mergeable_state is not clean
-        if: ${{ github.event.pull_request.mergeable_state  != 'clean' }}
+        if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
+        env:
+          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
          gh pr comment ${{ github.event.pull_request.number }} \
            --repo "${GITHUB_REPOSITORY}" \
-            --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."
+            --body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`."
--- a/.github/workflows/pg-clients.yml
+++ b/.github/workflows/pg-clients.yml
@@ -30,7 +30,7 @@ permissions:
  statuses: write # require for posting a status update

 env:
-  DEFAULT_PG_VERSION: 16
+  DEFAULT_PG_VERSION: 17
  PLATFORM: neon-captest-new
  AWS_DEFAULT_REGION: eu-central-1

@@ -42,6 +42,8 @@ jobs:
      github-event-name: ${{ github.event_name }}

  build-build-tools-image:
+    permissions:
+      packages: write
    needs: [ check-permissions ]
    uses: ./.github/workflows/build-build-tools-image.yml
    secrets: inherit
--- a/.github/workflows/random-ops-test.yml
+++ b/.github/workflows/random-ops-test.yml
@@ -0,0 +1,93 @@
+name: Random Operations Test
+
+on:
+  schedule:
+    # * is a special character in YAML so you have to quote this string
+    #          ┌───────────── minute (0 - 59)
+    #          │  ┌───────────── hour (0 - 23)
+    #          │  │  ┌───────────── day of the month (1 - 31)
+    #          │  │  │ ┌───────────── month (1 - 12 or JAN-DEC)
+    #          │  │  │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
+    - cron:  '23 */2 * * *' # runs every 2 hours
+  workflow_dispatch:
+    inputs:
+      random_seed:
+        type: number
+        description: 'The random seed'
+        required: false
+        default: 0
+      num_operations:
+        type: number
+        description: "The number of operations to test"
+        default: 250
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+permissions: {}
+
+env:
+  DEFAULT_PG_VERSION: 16
+  PLATFORM: neon-captest-new
+  AWS_DEFAULT_REGION: eu-central-1
+
+jobs:
+  run-random-rests:
+    env:
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+    runs-on: small
+    permissions:
+      id-token: write
+      statuses: write
+
+    strategy:
+      fail-fast: false
+      matrix:
+        pg-version: [16, 17]
+
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+      - name: Download Neon artifact
+        uses: ./.github/actions/download
+        with:
+          name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+          path: /tmp/neon/
+          prefix: latest
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+      - name: Run tests
+        uses: ./.github/actions/run-python-test-set
+        with:
+          build_type: remote
+          test_selection: random_ops
+          run_in_parallel: false
+          extra_params: -m remote_cluster
+          pg_version: ${{ matrix.pg-version }}
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+          RANDOM_SEED: ${{ inputs.random_seed }}
+          NUM_OPERATIONS: ${{ inputs.num_operations }}
+
+      - name: Create Allure report
+        if: ${{ !cancelled() }}
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
+        with:
+          store-test-results-into-db: true
+          aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        env:
+          REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2037,6 +2037,33 @@ dependencies = [
 "zeroize",
 ]

+[[package]]
+name = "endpoint_storage"
+version = "0.0.1"
+dependencies = [
+ "anyhow",
+ "axum",
+ "axum-extra",
+ "camino",
+ "camino-tempfile",
+ "futures",
+ "http-body-util",
+ "itertools 0.10.5",
+ "jsonwebtoken",
+ "prometheus",
+ "rand 0.8.5",
+ "remote_storage",
+ "serde",
+ "serde_json",
+ "test-log",
+ "tokio",
+ "tokio-util",
+ "tower 0.5.2",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "enum-map"
 version = "2.5.0"
@@ -3998,33 +4025,6 @@ dependencies = [
 "memchr",
 ]

-[[package]]
-name = "object_storage"
-version = "0.0.1"
-dependencies = [
- "anyhow",
- "axum",
- "axum-extra",
- "camino",
- "camino-tempfile",
- "futures",
- "http-body-util",
- "itertools 0.10.5",
- "jsonwebtoken",
- "prometheus",
- "rand 0.8.5",
- "remote_storage",
- "serde",
- "serde_json",
- "test-log",
- "tokio",
- "tokio-util",
- "tower 0.5.2",
- "tracing",
- "utils",
- "workspace_hack",
-]
-
 [[package]]
 name = "once_cell"
 version = "1.20.2"
@@ -4285,6 +4285,7 @@ dependencies = [
 "pageserver_api",
 "pageserver_client",
 "pageserver_compaction",
+ "pem",
 "pin-project-lite",
 "postgres-protocol",
 "postgres-types",
@@ -4352,6 +4353,7 @@ dependencies = [
 "humantime-serde",
 "itertools 0.10.5",
 "nix 0.27.1",
+ "once_cell",
 "postgres_backend",
 "postgres_ffi",
 "rand 0.8.5",
@@ -6000,6 +6002,7 @@ dependencies = [
 "once_cell",
 "pageserver_api",
 "parking_lot 0.12.1",
+ "pem",
 "postgres-protocol",
 "postgres_backend",
 "postgres_ffi",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -40,7 +40,7 @@ members = [
    "libs/proxy/postgres-protocol2",
    "libs/proxy/postgres-types2",
    "libs/proxy/tokio-postgres2",
-    "object_storage",
+    "endpoint_storage",
 ]

 [workspace.package]
--- a/4
+++ b/4
@@ -89,7 +89,7 @@ RUN set -e \
      --bin storage_broker  \
      --bin storage_controller  \
      --bin proxy  \
-      --bin object_storage \
+      --bin endpoint_storage \
      --bin neon_local \
      --bin storage_scrubber \
      --locked --release
@@ -122,7 +122,7 @@ COPY --from=build --chown=neon:neon /home/nonroot/target/release/safekeeper
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_broker      /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_controller  /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/proxy               /usr/local/bin
-COPY --from=build --chown=neon:neon /home/nonroot/target/release/object_storage      /usr/local/bin
+COPY --from=build --chown=neon:neon /home/nonroot/target/release/endpoint_storage    /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/neon_local          /usr/local/bin
 COPY --from=build --chown=neon:neon /home/nonroot/target/release/storage_scrubber    /usr/local/bin

--- a/README.md
+++ b/README.md
@@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v
 testing locally, it is convenient to run just one set of permutations, like this:

 ```sh
-DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
+DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest
 ```

 ## Flamegraphs
--- a/clippy.toml
+++ b/clippy.toml
@@ -12,3 +12,5 @@ disallowed-macros = [
    # cannot disallow this, because clippy finds used from tokio macros
    #"tokio::pin",
 ]
+
+allow-unwrap-in-tests = true
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1677,7 +1677,7 @@ RUN set -e \
    && apt clean && rm -rf /var/lib/apt/lists/*

 # Use `dist_man_MANS=` to skip manpage generation (which requires python3/pandoc)
-ENV PGBOUNCER_TAG=pgbouncer_1_22_1
+ENV PGBOUNCER_TAG=pgbouncer_1_24_1
 RUN set -e \
    && git clone --recurse-submodules --depth 1 --branch ${PGBOUNCER_TAG} https://github.com/pgbouncer/pgbouncer.git pgbouncer \
    && cd pgbouncer \
--- a/compute/patches/pg_anon.patch
+++ b/compute/patches/pg_anon.patch
@@ -1,265 +0,0 @@
-commit 00aa659afc9c7336ab81036edec3017168aabf40
-Author: Heikki Linnakangas <heikki@neon.tech>
-Date:   Tue Nov 12 16:59:19 2024 +0200
-
-    Temporarily disable test that depends on timezone
-
-diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
-index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
-+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
-@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
-  ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
- (1 row)
- 
-SELECT anon.generalize_tstzrange('19041107','millennium');
-                      generalize_tstzrange                       
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-   generalize_daterange   
-diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
-index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
-+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
-@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
- SELECT anon.generalize_tstzrange('19041107','year');
- SELECT anon.generalize_tstzrange('19041107','decade');
- SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
-+-- temporarily disabled, see:
-+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
-+--SELECT anon.generalize_tstzrange('19041107','millennium');
- 
- -- generalize_daterange
- SELECT anon.generalize_daterange('19041107');
-
-commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
-Author: Alexey Masterov <alexeymasterov@neon.tech>
-Date:   Fri May 31 06:34:26 2024 +0000
-
-    These alternative expected files were added to consider the neon features
-
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-new file mode 100644
-index 0000000..2539cfd
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
-@@ -0,0 +1,101 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE mallory_the_masked_user;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.anonymize_table('t1');
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ERROR:  Only supersusers can start the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT * FROM mask.t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  SELECT * FROM public.t1;
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  Only supersusers can stop the dynamic masking engine.
-+CONTEXT:  PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE mallory_the_masked_user;
-+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
-diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-new file mode 100644
-index 0000000..8b090fe
--- /dev/null
-+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
-@@ -0,0 +1,104 @@
-+BEGIN;
-+CREATE EXTENSION anon CASCADE;
-+NOTICE:  installing required extension "pgcrypto"
-+SELECT anon.init();
-+ init 
-+------
-+ t
-+(1 row)
-+
-+CREATE ROLE oscar_the_owner;
-+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
-+CREATE ROLE mallory_the_masked_user;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
-+--
-+-- We're checking the owner's permissions
-+--
-+-- see
-+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
-+--
-+SET ROLE oscar_the_owner;
-+SELECT anon.pseudo_first_name(0) IS NOT NULL;
-+ ?column? 
-+----------
-+ t
-+(1 row)
-+
-+-- SHOULD FAIL
-+DO $$
-+BEGIN
-+  PERFORM anon.init();
-+  EXCEPTION WHEN insufficient_privilege
-+  THEN RAISE NOTICE 'insufficient_privilege';
-+END$$;
-+NOTICE:  insufficient_privilege
-+CREATE TABLE t1(i INT);
-+ALTER TABLE t1 ADD COLUMN t TEXT;
-+SECURITY LABEL FOR anon ON COLUMN t1.t
-+IS 'MASKED WITH VALUE NULL';
-+INSERT INTO t1 VALUES (1,'test');
-+SELECT anon.anonymize_table('t1');
-+ anonymize_table 
-+-----------------
-+ t
-+(1 row)
-+
-+SELECT * FROM t1;
-+ i | t 
-+---+---
-+ 1 | 
-+(1 row)
-+
-+UPDATE t1 SET t='test' WHERE i=1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_start_engine;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+ROLLBACK TO fail_start_engine;
-+RESET ROLE;
-+SELECT anon.start_dynamic_masking();
-+ start_dynamic_masking 
-+-----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+SELECT * FROM t1;
-+ i |  t   
-+---+------
-+ 1 | test
-+(1 row)
-+
-+--SELECT * FROM mask.t1;
-+-- SHOULD FAIL
-+SAVEPOINT fail_stop_engine;
-+SELECT anon.stop_dynamic_masking();
-+ERROR:  permission denied for schema mask
-+CONTEXT:  SQL statement "DROP VIEW mask.t1;"
-+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
-+SQL statement "SELECT anon.mask_drop_view(oid)
-+  FROM pg_catalog.pg_class
-+  WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
-+  AND relkind IN ('r','p','f')"
-+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
-+ROLLBACK TO fail_stop_engine;
-+RESET ROLE;
-+SELECT anon.stop_dynamic_masking();
-+NOTICE:  The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
-+ stop_dynamic_masking 
-+----------------------
-+ t
-+(1 row)
-+
-+SET ROLE oscar_the_owner;
-+-- SHOULD FAIL
-+SAVEPOINT fail_seclabel_on_role;
-+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
-+ERROR:  permission denied
-+DETAIL:  The current user must have the CREATEROLE attribute.
-+ROLLBACK TO fail_seclabel_on_role;
-+ROLLBACK;
--- a/compute/patches/pg_repack.patch
+++ b/compute/patches/pg_repack.patch
@@ -11,6 +11,14 @@ index bf6edcb..89b4c7f 100644
 
 USE_PGXS = 1	# use pgxs if not in contrib directory
 PGXS := $(shell $(PG_CONFIG) --pgxs)
+diff --git a/regress/expected/init-extension.out b/regress/expected/init-extension.out
+index 9f2e171..f6e4f8d 100644
+--- a/regress/expected/init-extension.out
+++ b/regress/expected/init-extension.out
+@@ -1,3 +1,2 @@
+ SET client_min_messages = warning;
+ CREATE EXTENSION pg_repack;
+-RESET client_min_messages;
 diff --git a/regress/expected/nosuper.out b/regress/expected/nosuper.out
 index 8d0a94e..63b68bf 100644
 --- a/regress/expected/nosuper.out
@@ -42,6 +50,14 @@ index 8d0a94e..63b68bf 100644
 INFO: repacking table "public.tbl_cluster"
 ERROR: query failed: ERROR:  current transaction is aborted, commands ignored until end of transaction block
 DETAIL: query was: RESET lock_timeout
+diff --git a/regress/sql/init-extension.sql b/regress/sql/init-extension.sql
+index 9f2e171..f6e4f8d 100644
+--- a/regress/sql/init-extension.sql
+++ b/regress/sql/init-extension.sql
+@@ -1,3 +1,2 @@
+ SET client_min_messages = warning;
+ CREATE EXTENSION pg_repack;
+-RESET client_min_messages;
 diff --git a/regress/sql/nosuper.sql b/regress/sql/nosuper.sql
 index 072f0fa..dbe60f8 100644
 --- a/regress/sql/nosuper.sql
--- a/compute/vm-image-spec-bookworm.yaml
+++ b/compute/vm-image-spec-bookworm.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute/vm-image-spec-bullseye.yaml
+++ b/compute/vm-image-spec-bullseye.yaml
@@ -22,7 +22,7 @@ commands:
  - name: local_proxy
    user: postgres
    sysvInitAction: respawn
-    shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
+    shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
  - name: postgres-exporter
    user: nobody
    sysvInitAction: respawn
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -57,24 +57,13 @@ use tracing::{error, info};
 use url::Url;
 use utils::failpoint_support;

-// Compatibility hack: if the control plane specified any remote-ext-config
-// use the default value for extension storage proxy gateway.
-// Remove this once the control plane is updated to pass the gateway URL
-fn parse_remote_ext_config(arg: &str) -> Result<String> {
-    if arg.starts_with("http") {
-        Ok(arg.trim_end_matches('/').to_string())
-    } else {
-        Ok("http://pg-ext-s3-gateway".to_string())
-    }
-}
-
 #[derive(Parser)]
 #[command(rename_all = "kebab-case")]
 struct Cli {
    #[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
    pub pgbin: String,

-    #[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
+    #[arg(short = 'r', long)]
    pub remote_ext_config: Option<String>,

    /// The port to bind the external listening HTTP server to. Clients running
@@ -116,9 +105,7 @@ struct Cli {
    #[arg(long)]
    pub set_disk_quota_for_fs: Option<String>,

-    // TODO(tristan957): remove alias after compatibility tests are no longer
-    // an issue
-    #[arg(short = 'c', long, alias = "spec-path")]
+    #[arg(short = 'c', long)]
    pub config: Option<OsString>,

    #[arg(short = 'i', long, group = "compute-id")]
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -641,7 +641,26 @@ impl ComputeNode {

                let log_directory_path = Path::new(&self.params.pgdata).join("log");
                let log_directory_path = log_directory_path.to_string_lossy().to_string();
-                configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
+
+                // Add project_id,endpoint_id tag to identify the logs.
+                //
+                // These ids are passed from cplane,
+                // for backwards compatibility (old computes that don't have them),
+                // we set them to None.
+                // TODO: Clean up this code when all computes have them.
+                let tag: Option<String> = match (
+                    pspec.spec.project_id.as_deref(),
+                    pspec.spec.endpoint_id.as_deref(),
+                ) {
+                    (Some(project_id), Some(endpoint_id)) => {
+                        Some(format!("{project_id}/{endpoint_id}"))
+                    }
+                    (Some(project_id), None) => Some(format!("{project_id}/None")),
+                    (None, Some(endpoint_id)) => Some(format!("None,{endpoint_id}")),
+                    (None, None) => None,
+                };
+
+                configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;

                // Launch a background task to clean up the audit logs
                launch_pgaudit_gc(log_directory_path);
--- a/compute_tools/src/http/extract/mod.rs
+++ b/compute_tools/src/http/extract/mod.rs
@@ -6,4 +6,5 @@ pub(crate) mod request_id;
 pub(crate) use json::Json;
 pub(crate) use path::Path;
 pub(crate) use query::Query;
+#[allow(unused)]
 pub(crate) use request_id::RequestId;
--- a/compute_tools/src/http/middleware/authorize.rs
+++ b/compute_tools/src/http/middleware/authorize.rs
@@ -13,7 +13,7 @@ use jsonwebtoken::{Algorithm, DecodingKey, TokenData, Validation, jwk::JwkSet};
 use tower_http::auth::AsyncAuthorizeRequest;
 use tracing::{debug, warn};

-use crate::http::{JsonResponse, extract::RequestId};
+use crate::http::JsonResponse;

 #[derive(Clone, Debug)]
 pub(in crate::http) struct Authorize {
@@ -52,18 +52,6 @@ impl AsyncAuthorizeRequest<Body> for Authorize {
        let validation = self.validation.clone();

        Box::pin(async move {
-            let request_id = request.extract_parts::<RequestId>().await.unwrap();
-
-            // TODO(tristan957): Remove this stanza after teaching neon_local
-            // and the regression tests to use a JWT + JWKS.
-            //
-            // https://github.com/neondatabase/neon/issues/11316
-            if cfg!(feature = "testing") {
-                warn!(%request_id, "Skipping compute_ctl authorization check");
-
-                return Ok(request);
-            }
-
            let TypedHeader(Authorization(bearer)) = request
                .extract_parts::<TypedHeader<Authorization<Bearer>>>()
                .await
--- a/compute_tools/src/rsyslog.rs
+++ b/compute_tools/src/rsyslog.rs
@@ -50,13 +50,13 @@ fn restart_rsyslog() -> Result<()> {

 pub fn configure_audit_rsyslog(
    log_directory: String,
-    tag: &str,
+    tag: Option<String>,
    remote_endpoint: &str,
 ) -> Result<()> {
    let config_content: String = format!(
        include_str!("config_template/compute_audit_rsyslog_template.conf"),
        log_directory = log_directory,
-        tag = tag,
+        tag = tag.unwrap_or("".to_string()),
        remote_endpoint = remote_endpoint
    );

--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -18,12 +18,11 @@ use anyhow::{Context, Result, anyhow, bail};
 use clap::Parser;
 use compute_api::spec::ComputeMode;
 use control_plane::endpoint::ComputeControlPlane;
+use control_plane::endpoint_storage::{ENDPOINT_STORAGE_DEFAULT_PORT, EndpointStorage};
 use control_plane::local_env::{
-    InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf, NeonLocalInitPageserverConf,
-    ObjectStorageConf, SafekeeperConf,
+    EndpointStorageConf, InitForceMode, LocalEnv, NeonBroker, NeonLocalInitConf,
+    NeonLocalInitPageserverConf, SafekeeperConf,
 };
-use control_plane::object_storage::OBJECT_STORAGE_DEFAULT_PORT;
-use control_plane::object_storage::ObjectStorage;
 use control_plane::pageserver::PageServerNode;
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::{
@@ -63,7 +62,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
 const DEFAULT_BRANCH_NAME: &str = "main";
 project_git_version!(GIT_VERSION);

-const DEFAULT_PG_VERSION: u32 = 16;
+const DEFAULT_PG_VERSION: u32 = 17;

 const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

@@ -93,7 +92,7 @@ enum NeonLocalCmd {
    #[command(subcommand)]
    Safekeeper(SafekeeperCmd),
    #[command(subcommand)]
-    ObjectStorage(ObjectStorageCmd),
+    EndpointStorage(EndpointStorageCmd),
    #[command(subcommand)]
    Endpoint(EndpointCmd),
    #[command(subcommand)]
@@ -460,14 +459,14 @@ enum SafekeeperCmd {

 #[derive(clap::Subcommand)]
 #[clap(about = "Manage object storage")]
-enum ObjectStorageCmd {
-    Start(ObjectStorageStartCmd),
-    Stop(ObjectStorageStopCmd),
+enum EndpointStorageCmd {
+    Start(EndpointStorageStartCmd),
+    Stop(EndpointStorageStopCmd),
 }

 #[derive(clap::Args)]
 #[clap(about = "Start object storage")]
-struct ObjectStorageStartCmd {
+struct EndpointStorageStartCmd {
    #[clap(short = 't', long, help = "timeout until we fail the command")]
    #[arg(default_value = "10s")]
    start_timeout: humantime::Duration,
@@ -475,7 +474,7 @@ struct ObjectStorageStartCmd {

 #[derive(clap::Args)]
 #[clap(about = "Stop object storage")]
-struct ObjectStorageStopCmd {
+struct EndpointStorageStopCmd {
    #[arg(value_enum, default_value = "fast")]
    #[clap(
        short = 'm',
@@ -797,7 +796,9 @@ fn main() -> Result<()> {
            }
            NeonLocalCmd::StorageBroker(subcmd) => rt.block_on(handle_storage_broker(&subcmd, env)),
            NeonLocalCmd::Safekeeper(subcmd) => rt.block_on(handle_safekeeper(&subcmd, env)),
-            NeonLocalCmd::ObjectStorage(subcmd) => rt.block_on(handle_object_storage(&subcmd, env)),
+            NeonLocalCmd::EndpointStorage(subcmd) => {
+                rt.block_on(handle_endpoint_storage(&subcmd, env))
+            }
            NeonLocalCmd::Endpoint(subcmd) => rt.block_on(handle_endpoint(&subcmd, env)),
            NeonLocalCmd::Mappings(subcmd) => handle_mappings(&subcmd, env),
        };
@@ -1014,8 +1015,8 @@ fn handle_init(args: &InitCmdArgs) -> anyhow::Result<LocalEnv> {
                    }
                })
                .collect(),
-            object_storage: ObjectStorageConf {
-                port: OBJECT_STORAGE_DEFAULT_PORT,
+            endpoint_storage: EndpointStorageConf {
+                port: ENDPOINT_STORAGE_DEFAULT_PORT,
            },
            pg_distrib_dir: None,
            neon_distrib_dir: None,
@@ -1544,7 +1545,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
            let jwt = endpoint.generate_jwt()?;

-            println!("{jwt}");
+            print!("{jwt}");
        }
    }

@@ -1735,12 +1736,15 @@ async fn handle_safekeeper(subcmd: &SafekeeperCmd, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::LocalEnv) -> Result<()> {
-    use ObjectStorageCmd::*;
-    let storage = ObjectStorage::from_env(env);
+async fn handle_endpoint_storage(
+    subcmd: &EndpointStorageCmd,
+    env: &local_env::LocalEnv,
+) -> Result<()> {
+    use EndpointStorageCmd::*;
+    let storage = EndpointStorage::from_env(env);

    // In tests like test_forward_compatibility or test_graceful_cluster_restart
-    // old neon binaries (without object_storage) are present
+    // old neon binaries (without endpoint_storage) are present
    if !storage.bin.exists() {
        eprintln!(
            "{} binary not found. Ignore if this is a compatibility test",
@@ -1750,13 +1754,13 @@ async fn handle_object_storage(subcmd: &ObjectStorageCmd, env: &local_env::Local
    }

    match subcmd {
-        Start(ObjectStorageStartCmd { start_timeout }) => {
+        Start(EndpointStorageStartCmd { start_timeout }) => {
            if let Err(e) = storage.start(start_timeout).await {
-                eprintln!("object_storage start failed: {e}");
+                eprintln!("endpoint_storage start failed: {e}");
                exit(1);
            }
        }
-        Stop(ObjectStorageStopCmd { stop_mode }) => {
+        Stop(EndpointStorageStopCmd { stop_mode }) => {
            let immediate = match stop_mode {
                StopMode::Fast => false,
                StopMode::Immediate => true,
@@ -1866,10 +1870,10 @@ async fn handle_start_all_impl(
        }

        js.spawn(async move {
-            ObjectStorage::from_env(env)
+            EndpointStorage::from_env(env)
                .start(&retry_timeout)
                .await
-                .map_err(|e| e.context("start object_storage"))
+                .map_err(|e| e.context("start endpoint_storage"))
        });
    })();

@@ -1968,9 +1972,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
        }
    }

-    let storage = ObjectStorage::from_env(env);
+    let storage = EndpointStorage::from_env(env);
    if let Err(e) = storage.stop(immediate) {
-        eprintln!("object_storage stop failed: {:#}", e);
+        eprintln!("endpoint_storage stop failed: {:#}", e);
    }

    for ps_conf in &env.pageservers {
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -766,10 +766,6 @@ impl Endpoint {
            }
        };

-        // TODO(tristan957): Remove the write to spec.json after compatibility
-        // tests work themselves out
-        let spec_path = self.endpoint_path().join("spec.json");
-        std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
        let config_path = self.endpoint_path().join("config.json");
        std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;

@@ -779,16 +775,6 @@ impl Endpoint {
            .append(true)
            .open(self.endpoint_path().join("compute.log"))?;

-        // TODO(tristan957): Remove when compatibility tests are no longer an
-        // issue
-        let old_compute_ctl = {
-            let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
-            let help_output = cmd.arg("--help").output()?;
-            let help_output = String::from_utf8_lossy(&help_output.stdout);
-
-            !help_output.contains("--config")
-        };
-
        // Launch compute_ctl
        let conn_str = self.connstr("cloud_admin", "postgres");
        println!("Starting postgres node at '{}'", conn_str);
@@ -807,19 +793,8 @@ impl Endpoint {
        ])
        .args(["--pgdata", self.pgdata().to_str().unwrap()])
        .args(["--connstr", &conn_str])
-        // TODO(tristan957): Change this to --config when compatibility tests
-        // are no longer an issue
-        .args([
-            "--spec-path",
-            self.endpoint_path()
-                .join(if old_compute_ctl {
-                    "spec.json"
-                } else {
-                    "config.json"
-                })
-                .to_str()
-                .unwrap(),
-        ])
+        .arg("--config")
+        .arg(self.endpoint_path().join("config.json").as_os_str())
        .args([
            "--pgbin",
            self.env
--- a/control_plane/src/endpoint_storage.rs
+++ b/control_plane/src/endpoint_storage.rs
@@ -1,34 +1,33 @@
 use crate::background_process::{self, start_process, stop_process};
 use crate::local_env::LocalEnv;
-use anyhow::anyhow;
 use anyhow::{Context, Result};
 use camino::Utf8PathBuf;
 use std::io::Write;
 use std::time::Duration;

 /// Directory within .neon which will be used by default for LocalFs remote storage.
-pub const OBJECT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/object_storage";
-pub const OBJECT_STORAGE_DEFAULT_PORT: u16 = 9993;
+pub const ENDPOINT_STORAGE_REMOTE_STORAGE_DIR: &str = "local_fs_remote_storage/endpoint_storage";
+pub const ENDPOINT_STORAGE_DEFAULT_PORT: u16 = 9993;

-pub struct ObjectStorage {
+pub struct EndpointStorage {
    pub bin: Utf8PathBuf,
    pub data_dir: Utf8PathBuf,
    pub pemfile: Utf8PathBuf,
    pub port: u16,
 }

-impl ObjectStorage {
-    pub fn from_env(env: &LocalEnv) -> ObjectStorage {
-        ObjectStorage {
-            bin: Utf8PathBuf::from_path_buf(env.object_storage_bin()).unwrap(),
-            data_dir: Utf8PathBuf::from_path_buf(env.object_storage_data_dir()).unwrap(),
+impl EndpointStorage {
+    pub fn from_env(env: &LocalEnv) -> EndpointStorage {
+        EndpointStorage {
+            bin: Utf8PathBuf::from_path_buf(env.endpoint_storage_bin()).unwrap(),
+            data_dir: Utf8PathBuf::from_path_buf(env.endpoint_storage_data_dir()).unwrap(),
            pemfile: Utf8PathBuf::from_path_buf(env.public_key_path.clone()).unwrap(),
-            port: env.object_storage.port,
+            port: env.endpoint_storage.port,
        }
    }

    fn config_path(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.json")
+        self.data_dir.join("endpoint_storage.json")
    }

    fn listen_addr(&self) -> Utf8PathBuf {
@@ -49,7 +48,7 @@ impl ObjectStorage {
        let cfg = Cfg {
            listen: self.listen_addr(),
            pemfile: parent.join(self.pemfile.clone()),
-            local_path: parent.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR),
+            local_path: parent.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR),
            r#type: "LocalFs".to_string(),
        };
        std::fs::create_dir_all(self.config_path().parent().unwrap())?;
@@ -59,24 +58,19 @@ impl ObjectStorage {
    }

    pub async fn start(&self, retry_timeout: &Duration) -> Result<()> {
-        println!("Starting s3 proxy at {}", self.listen_addr());
+        println!("Starting endpoint_storage at {}", self.listen_addr());
        std::io::stdout().flush().context("flush stdout")?;

        let process_status_check = || async {
-            tokio::time::sleep(Duration::from_millis(500)).await;
-            let res = reqwest::Client::new()
-                .get(format!("http://{}/metrics", self.listen_addr()))
-                .send()
-                .await;
-            match res {
-                Ok(response) if response.status().is_success() => Ok(true),
-                Ok(_) => Err(anyhow!("Failed to query /metrics")),
-                Err(e) => Err(anyhow!("Failed to check node status: {e}")),
+            let res = reqwest::Client::new().get(format!("http://{}/metrics", self.listen_addr()));
+            match res.send().await {
+                Ok(res) => Ok(res.status().is_success()),
+                Err(_) => Ok(false),
            }
        };

        let res = start_process(
-            "object_storage",
+            "endpoint_storage",
            &self.data_dir.clone().into_std_path_buf(),
            &self.bin.clone().into_std_path_buf(),
            vec![self.config_path().to_string()],
@@ -94,14 +88,14 @@ impl ObjectStorage {
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
-        stop_process(immediate, "object_storage", &self.pid_file())
+        stop_process(immediate, "endpoint_storage", &self.pid_file())
    }

    fn log_file(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.log")
+        self.data_dir.join("endpoint_storage.log")
    }

    fn pid_file(&self) -> Utf8PathBuf {
-        self.data_dir.join("object_storage.pid")
+        self.data_dir.join("endpoint_storage.pid")
    }
 }
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -9,8 +9,8 @@
 mod background_process;
 pub mod broker;
 pub mod endpoint;
+pub mod endpoint_storage;
 pub mod local_env;
-pub mod object_storage;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -19,11 +19,11 @@ use serde::{Deserialize, Serialize};
 use utils::auth::encode_from_key_file;
 use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};

-use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
+use crate::endpoint_storage::{ENDPOINT_STORAGE_REMOTE_STORAGE_DIR, EndpointStorage};
 use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
 use crate::safekeeper::SafekeeperNode;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 //
 // This data structures represents neon_local CLI config
@@ -72,7 +72,7 @@ pub struct LocalEnv {

    pub safekeepers: Vec<SafekeeperConf>,

-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,

    // Control plane upcall API for pageserver: if None, we will not run storage_controller  If set, this will
    // be propagated into each pageserver's configuration.
@@ -110,7 +110,7 @@ pub struct OnDiskConfig {
    )]
    pub pageservers: Vec<PageServerConf>,
    pub safekeepers: Vec<SafekeeperConf>,
-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub control_plane_compute_hook_api: Option<Url>,
@@ -144,7 +144,7 @@ pub struct NeonLocalInitConf {
    pub storage_controller: Option<NeonStorageControllerConf>,
    pub pageservers: Vec<NeonLocalInitPageserverConf>,
    pub safekeepers: Vec<SafekeeperConf>,
-    pub object_storage: ObjectStorageConf,
+    pub endpoint_storage: EndpointStorageConf,
    pub control_plane_api: Option<Url>,
    pub control_plane_hooks_api: Option<Url>,
    pub generate_local_ssl_certs: bool,
@@ -152,7 +152,7 @@ pub struct NeonLocalInitConf {

 #[derive(Serialize, Default, Deserialize, PartialEq, Eq, Clone, Debug)]
 #[serde(default)]
-pub struct ObjectStorageConf {
+pub struct EndpointStorageConf {
    pub port: u16,
 }

@@ -413,8 +413,8 @@ impl LocalEnv {
        self.pg_dir(pg_version, "lib")
    }

-    pub fn object_storage_bin(&self) -> PathBuf {
-        self.neon_distrib_dir.join("object_storage")
+    pub fn endpoint_storage_bin(&self) -> PathBuf {
+        self.neon_distrib_dir.join("endpoint_storage")
    }

    pub fn pageserver_bin(&self) -> PathBuf {
@@ -450,8 +450,8 @@ impl LocalEnv {
        self.base_data_dir.join("safekeepers").join(data_dir_name)
    }

-    pub fn object_storage_data_dir(&self) -> PathBuf {
-        self.base_data_dir.join("object_storage")
+    pub fn endpoint_storage_data_dir(&self) -> PathBuf {
+        self.base_data_dir.join("endpoint_storage")
    }

    pub fn get_pageserver_conf(&self, id: NodeId) -> anyhow::Result<&PageServerConf> {
@@ -615,7 +615,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: _,
                branch_name_mappings,
                generate_local_ssl_certs,
-                object_storage,
+                endpoint_storage,
            } = on_disk_config;
            LocalEnv {
                base_data_dir: repopath.to_owned(),
@@ -632,7 +632,7 @@ impl LocalEnv {
                control_plane_hooks_api,
                branch_name_mappings,
                generate_local_ssl_certs,
-                object_storage,
+                endpoint_storage,
            }
        };

@@ -742,7 +742,7 @@ impl LocalEnv {
                control_plane_compute_hook_api: None,
                branch_name_mappings: self.branch_name_mappings.clone(),
                generate_local_ssl_certs: self.generate_local_ssl_certs,
-                object_storage: self.object_storage.clone(),
+                endpoint_storage: self.endpoint_storage.clone(),
            },
        )
    }
@@ -849,7 +849,7 @@ impl LocalEnv {
            control_plane_api,
            generate_local_ssl_certs,
            control_plane_hooks_api,
-            object_storage,
+            endpoint_storage,
        } = conf;

        // Find postgres binaries.
@@ -901,7 +901,7 @@ impl LocalEnv {
            control_plane_hooks_api,
            branch_name_mappings: Default::default(),
            generate_local_ssl_certs,
-            object_storage,
+            endpoint_storage,
        };

        if generate_local_ssl_certs {
@@ -929,13 +929,13 @@ impl LocalEnv {
                .context("pageserver init failed")?;
        }

-        ObjectStorage::from_env(&env)
+        EndpointStorage::from_env(&env)
            .init()
            .context("object storage init failed")?;

        // setup remote remote location for default LocalFs remote storage
        std::fs::create_dir_all(env.base_data_dir.join(PAGESERVER_REMOTE_STORAGE_DIR))?;
-        std::fs::create_dir_all(env.base_data_dir.join(OBJECT_STORAGE_REMOTE_STORAGE_DIR))?;
+        std::fs::create_dir_all(env.base_data_dir.join(ENDPOINT_STORAGE_REMOTE_STORAGE_DIR))?;

        env.persist_config()
    }
--- a/docker-compose/README.md
+++ b/docker-compose/README.md
@@ -1,4 +1,3 @@
-
 # Example docker compose configuration

 The configuration in this directory is used for testing Neon docker images: it is
@@ -8,3 +7,13 @@ you can experiment with a miniature Neon system, use `cargo neon` rather than co
 This configuration does not start the storage controller, because the controller
 needs a way to reconfigure running computes, and no such thing exists in this setup.

+## Generating the JWKS for a compute
+
+```shell
+openssl genpkey -algorithm Ed25519 -out private-key.pem
+openssl pkey -in private-key.pem -pubout -out public-key.pem
+openssl pkey -pubin -inform pem -in public-key.pem -pubout -outform der -out public-key.der
+key="$(xxd -plain -cols 32 -s -32 public-key.der)"
+key_id="$(printf '%s' "$key" | sha256sum | awk '{ print $1 }' | basenc --base64url --wrap=0)"
+x="$(printf '%s' "$key" | basenc --base64url --wrap=0)"
+```
--- a/docker-compose/compute_wrapper/private-key.pem
+++ b/docker-compose/compute_wrapper/private-key.pem
@@ -0,0 +1,3 @@
+-----BEGIN PRIVATE KEY-----
+MC4CAQAwBQYDK2VwBCIEIOmnRbzt2AJ0d+S3aU1hiYOl/tXpvz1FmWBfwHYBgOma
+-----END PRIVATE KEY-----
--- a/docker-compose/compute_wrapper/public-key.der
+++ b/docker-compose/compute_wrapper/public-key.der
--- a/docker-compose/compute_wrapper/public-key.pem
+++ b/docker-compose/compute_wrapper/public-key.pem
@@ -0,0 +1,3 @@
+-----BEGIN PUBLIC KEY-----
+MCowBQYDK2VwAyEADY0al/U0bgB3+9fUGk+3PKWnsck9OyxN5DjHIN6Xep0=
+-----END PUBLIC KEY-----
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -81,19 +81,9 @@ sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}

 cat ${CONFIG_FILE}

-# TODO(tristan957): Remove these workarounds for backwards compatibility after
-# the next compute release. That includes these next few lines and the
-# --spec-path in the compute_ctl invocation.
-if compute_ctl --help | grep --quiet -- '--config'; then
-  SPEC_PATH="$CONFIG_FILE"
-else
-  jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
-  SPEC_PATH=/tmp/spec.json
-fi
-
 echo "Start compute node"
 /usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
     -C "postgresql://cloud_admin@localhost:55433/postgres"  \
     -b /usr/local/bin/postgres                              \
     --compute-id "compute-$RANDOM"                          \
-     --spec-path "$SPEC_PATH"
+     --config "$CONFIG_FILE"
--- a/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
+++ b/docker-compose/compute_wrapper/var/db/postgres/configs/config.json
@@ -142,7 +142,19 @@
    },
    "compute_ctl_config": {
        "jwks": {
-            "keys": []
+            "keys": [
+                {
+                    "use": "sig",
+                    "key_ops": [
+                        "verify"
+                    ],
+                    "alg": "EdDSA",
+                    "kid": "ZGIxMzAzOGY0YWQwODk2ODU1MTk1NzMxMDFkYmUyOWU2NzZkOWNjNjMyMGRkZGJjOWY0MjdjYWVmNzE1MjUyOAo=",
+                    "kty": "OKP",
+                    "crv": "Ed25519",
+                    "x": "MGQ4ZDFhOTdmNTM0NmUwMDc3ZmJkN2Q0MWE0ZmI3M2NhNWE3YjFjOTNkM2IyYzRkZTQzOGM3MjBkZTk3N2E5ZAo="
+                }
+            ]
        }
    }
 }
--- a/endpoint_storage/Cargo.toml
+++ b/endpoint_storage/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "object_storage"
+name = "endpoint_storage"
 version = "0.0.1"
 edition.workspace = true
 license.workspace = true
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -2,7 +2,7 @@ use anyhow::anyhow;
 use axum::body::{Body, Bytes};
 use axum::response::{IntoResponse, Response};
 use axum::{Router, http::StatusCode};
-use object_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
+use endpoint_storage::{PrefixS3Path, S3Path, Storage, bad_request, internal_error, not_found, ok};
 use remote_storage::TimeoutOrCancel;
 use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, RemotePath};
 use std::{sync::Arc, time::SystemTime, time::UNIX_EPOCH};
@@ -46,12 +46,12 @@ async fn metrics() -> Result {

 async fn get(S3Path { path }: S3Path, state: State) -> Result {
    info!(%path, "downloading");
-    let download_err = |e| {
-        if let DownloadError::NotFound = e {
-            info!(%path, %e, "downloading"); // 404 is not an issue of _this_ service
+    let download_err = |err| {
+        if let DownloadError::NotFound = err {
+            info!(%path, %err, "downloading"); // 404 is not an issue of _this_ service
            return not_found(&path);
        }
-        internal_error(e, &path, "downloading")
+        internal_error(err, &path, "downloading")
    };
    let cancel = state.cancel.clone();
    let opts = &DownloadOpts::default();
@@ -249,7 +249,7 @@ mod tests {
        };

        let proxy = Storage {
-            auth: object_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
+            auth: endpoint_storage::JwtAuth::new(TEST_PUB_KEY_ED25519).unwrap(),
            storage,
            cancel: cancel.clone(),
            max_upload_file_limit: usize::MAX,
@@ -343,14 +343,14 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        TimelineId::from_array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 7]);
    const ENDPOINT_ID: &str = "ep-winter-frost-a662z3vg";
    fn token() -> String {
-        let claims = object_storage::Claims {
+        let claims = endpoint_storage::Claims {
            tenant_id: TENANT_ID,
            timeline_id: TIMELINE_ID,
            endpoint_id: ENDPOINT_ID.into(),
            exp: u64::MAX,
        };
        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
-        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
        jsonwebtoken::encode(&header, &claims, &key).unwrap()
    }

@@ -364,7 +364,10 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            vec![TIMELINE_ID.to_string(), TimelineId::generate().to_string()],
            vec![ENDPOINT_ID, "ep-ololo"]
        )
-        .skip(1);
+        // first one is fully valid path, second path is valid for GET as
+        // read paths may have different endpoint if tenant and timeline matches
+        // (needed for prewarming RO->RW replica)
+        .skip(2);

        for ((uri, method), (tenant, timeline, endpoint)) in iproduct!(routes(), args) {
            info!(%uri, %method, %tenant, %timeline, %endpoint);
@@ -475,6 +478,16 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        requests_chain(chain.into_iter(), |_| token()).await;
    }

+    #[testlog(tokio::test)]
+    async fn read_other_endpoint_data() {
+        let uri = format!("/{TENANT_ID}/{TIMELINE_ID}/other_endpoint/key");
+        let chain = vec![
+            (uri.clone(), "GET", "", StatusCode::NOT_FOUND, false),
+            (uri.clone(), "PUT", "", StatusCode::UNAUTHORIZED, false),
+        ];
+        requests_chain(chain.into_iter(), |_| token()).await;
+    }
+
    fn delete_prefix_token(uri: &str) -> String {
        use serde::Serialize;
        let parts = uri.split("/").collect::<Vec<&str>>();
@@ -482,7 +495,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
        struct PrefixClaims {
            tenant_id: TenantId,
            timeline_id: Option<TimelineId>,
-            endpoint_id: Option<object_storage::EndpointId>,
+            endpoint_id: Option<endpoint_storage::EndpointId>,
            exp: u64,
        }
        let claims = PrefixClaims {
@@ -492,7 +505,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
            exp: u64::MAX,
        };
        let key = jsonwebtoken::EncodingKey::from_ed_pem(TEST_PRIV_KEY_ED25519).unwrap();
-        let header = jsonwebtoken::Header::new(object_storage::VALIDATION_ALGO);
+        let header = jsonwebtoken::Header::new(endpoint_storage::VALIDATION_ALGO);
        jsonwebtoken::encode(&header, &claims, &key).unwrap()
    }

--- a/endpoint_storage/src/lib.rs
+++ b/endpoint_storage/src/lib.rs
@@ -169,10 +169,19 @@ impl FromRequestParts<Arc<Storage>> for S3Path {
            .auth
            .decode(bearer.token())
            .map_err(|e| bad_request(e, "decoding token"))?;
+
+        // Read paths may have different endpoint ids. For readonly -> readwrite replica
+        // prewarming, endpoint must read other endpoint's data.
+        let endpoint_id = if parts.method == axum::http::Method::GET {
+            claims.endpoint_id.clone()
+        } else {
+            path.endpoint_id.clone()
+        };
+
        let route = Claims {
            tenant_id: path.tenant_id,
            timeline_id: path.timeline_id,
-            endpoint_id: path.endpoint_id.clone(),
+            endpoint_id,
            exp: claims.exp,
        };
        if route != claims {
--- a/endpoint_storage/src/main.rs
+++ b/endpoint_storage/src/main.rs
@@ -1,4 +1,4 @@
-//! `object_storage` is a service which provides API for uploading and downloading
+//! `endpoint_storage` is a service which provides API for uploading and downloading
 //! files. It is used by compute and control plane for accessing LFC prewarm data.
 //! This service is deployed either as a separate component or as part of compute image
 //! for large computes.
@@ -33,7 +33,7 @@ async fn main() -> anyhow::Result<()> {

    let config: String = std::env::args().skip(1).take(1).collect();
    if config.is_empty() {
-        anyhow::bail!("Usage: object_storage config.json")
+        anyhow::bail!("Usage: endpoint_storage config.json")
    }
    info!("Reading config from {config}");
    let config = std::fs::read_to_string(config.clone())?;
@@ -41,7 +41,7 @@ async fn main() -> anyhow::Result<()> {
    info!("Reading pemfile from {}", config.pemfile.clone());
    let pemfile = std::fs::read(config.pemfile.clone())?;
    info!("Loading public key from {}", config.pemfile.clone());
-    let auth = object_storage::JwtAuth::new(&pemfile)?;
+    let auth = endpoint_storage::JwtAuth::new(&pemfile)?;

    let listener = tokio::net::TcpListener::bind(config.listen).await.unwrap();
    info!("listening on {}", listener.local_addr().unwrap());
@@ -50,7 +50,7 @@ async fn main() -> anyhow::Result<()> {
    let cancel = tokio_util::sync::CancellationToken::new();
    app::check_storage_permissions(&storage, cancel.clone()).await?;

-    let proxy = std::sync::Arc::new(object_storage::Storage {
+    let proxy = std::sync::Arc::new(endpoint_storage::Storage {
        auth,
        storage,
        cancel: cancel.clone(),
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -242,13 +242,22 @@ impl RemoteExtSpec {

        match self.extension_data.get(real_ext_name) {
            Some(_ext_data) => {
+                // We have decided to use the Go naming convention due to Kubernetes.
+
+                let arch = match std::env::consts::ARCH {
+                    "x86_64" => "amd64",
+                    "aarch64" => "arm64",
+                    arch => arch,
+                };
+
                // Construct the path to the extension archive
                // BUILD_TAG/PG_MAJOR_VERSION/extensions/EXTENSION_NAME.tar.zst
                //
                // Keep it in sync with path generation in
                // https://github.com/neondatabase/build-custom-extensions/tree/main
-                let archive_path_str =
-                    format!("{build_tag}/{pg_major_version}/extensions/{real_ext_name}.tar.zst");
+                let archive_path_str = format!(
+                    "{build_tag}/{arch}/{pg_major_version}/extensions/{real_ext_name}.tar.zst"
+                );
                Ok((
                    real_ext_name.to_string(),
                    RemotePath::from_string(&archive_path_str)?,
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -76,7 +76,14 @@ pub fn gather() -> Vec<prometheus::proto::MetricFamily> {
    mfs
 }

-
+static DISK_IO_BYTES: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "libmetrics_disk_io_bytes_total",
+        "Bytes written and read from disk, grouped by the operation (read|write)",
+        &["io_operation"]
+    )
+    .expect("Failed to register disk i/o bytes int gauge vec")
+});

 static MAXRSS_KB: Lazy<IntGauge> = Lazy::new(|| {
    register_int_gauge!(
@@ -254,7 +261,12 @@ const BYTES_IN_BLOCK: i64 = 512;
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

-    
+    DISK_IO_BYTES
+        .with_label_values(&["read"])
+        .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
+    DISK_IO_BYTES
+        .with_label_values(&["write"])
+        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);

    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
    #[cfg(target_os = "macos")]
@@ -345,7 +357,10 @@ impl<P: Atomic> GenericCounterPairVec<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }

-    
+    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
+        res[0] = self.inc.remove_label_values(vals);
+        res[1] = self.dec.remove_label_values(vals);
+    }
 }

 impl<P: Atomic> GenericCounterPair<P> {
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -35,6 +35,7 @@ nix = {workspace = true, optional = true}
 reqwest.workspace = true
 rand.workspace = true
 tracing-utils.workspace = true
+once_cell.workspace = true

 [dev-dependencies]
 bincode.workspace = true
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -682,10 +682,10 @@ pub mod tenant_conf_defaults {
    pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;

    // This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
-    // 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
-    // be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
-    // with this config, we can get a maximum peak compaction usage of 9 GB.
-    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
+    // 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
+    // DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
+    // compaction usage of 15360MB.
+    pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
    // Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
    // read amp.
    pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
@@ -702,8 +702,11 @@ pub mod tenant_conf_defaults {
    // Relevant: https://github.com/neondatabase/neon/issues/3394
    pub const DEFAULT_GC_PERIOD: &str = "1 hr";
    pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
-    // If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
-    // layer creation will end immediately. Set to 0 to disable.
+    // Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
+    // without looking at the exact number of L0 layers.
+    // It was expected to have the following behavior:
+    // > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
+    // > layer creation will end immediately. Set to 0 to disable.
    pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
    pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
    pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1817,8 +1817,34 @@ pub mod virtual_file {
    }

    impl IoMode {
-        pub const fn preferred() -> Self {
-            Self::Buffered
+        pub fn preferred() -> Self {
+            // The default behavior when running Rust unit tests without any further
+            // flags is to use the newest behavior if available on the platform (Direct).
+            // The CI uses the following environment variable to unit tests for all
+            // different modes.
+            // NB: the Python regression & perf tests have their own defaults management
+            // that writes pageserver.toml; they do not use this variable.
+            if cfg!(test) {
+                use once_cell::sync::Lazy;
+                static CACHED: Lazy<IoMode> = Lazy::new(|| {
+                    utils::env::var_serde_json_string(
+                        "NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
+                    )
+                    .unwrap_or({
+                        #[cfg(target_os = "linux")]
+                        {
+                            IoMode::Direct
+                        }
+                        #[cfg(not(target_os = "linux"))]
+                        {
+                            IoMode::Buffered
+                        }
+                    })
+                });
+                *CACHED
+            } else {
+                IoMode::Buffered
+            }
        }
    }

--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -23,6 +23,7 @@ use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::{StreamExt, TryStreamExt};
 use http_types::{StatusCode, Url};
+use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
@@ -31,7 +32,7 @@ use utils::backoff::exponential_backoff_duration_seconds;
 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use crate::config::AzureConfig;
 use crate::error::Cancelled;
-use crate::metrics::RequestKind;
+use crate::metrics::{AttemptOutcome, RequestKind, start_measuring_requests};
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadKind, DownloadOpts, Listing, ListingMode,
    ListingObject, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel,
@@ -164,7 +165,7 @@ impl AzureBlobStorage {
        let mut last_modified = None;
        let mut metadata = HashMap::new();

-      
+        let started_at = start_measuring_requests(kind);

        let download = async {
            let response = builder
@@ -236,8 +237,19 @@ impl AzureBlobStorage {
                TimeoutOrCancel::Cancel => return Err(DownloadError::Cancelled),
            },
        };
-        
-      
+        let started_at = ScopeGuard::into_inner(started_at);
+        let outcome = match &download {
+            Ok(_) => AttemptOutcome::Ok,
+            // At this level in the stack 404 and 304 responses do not indicate an error.
+            // There's expected cases when a blob may not exist or hasn't been modified since
+            // the last get (e.g. probing for timeline indices and heatmap downloads).
+            // Callers should handle errors if they are unexpected.
+            Err(DownloadError::NotFound | DownloadError::Unmodified) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);
        download
    }

@@ -419,7 +431,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let blob_client = self.client.blob_client(self.relative_path_to_name(key));
        let properties_future = blob_client.get_properties().into_future();
@@ -431,9 +443,12 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(_inner) = &res {
-            
-         
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
        }

        let data = match res {
@@ -461,7 +476,7 @@ impl RemoteStorage for AzureBlobStorage {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let op = async {
            let blob_client = self.client.blob_client(self.relative_path_to_name(to));
@@ -494,7 +509,14 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-      
+        let outcome = match res {
+            Ok(_) => AttemptOutcome::Ok,
+            Err(_) => AttemptOutcome::Err,
+        };
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, outcome, started_at);

        res
    }
@@ -540,7 +562,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Delete;
        let _permit = self.permit(kind, cancel).await?;
-    
+        let started_at = start_measuring_requests(kind);

        let op = async {
            // TODO batch requests are not supported by the SDK
@@ -606,8 +628,10 @@ impl RemoteStorage for AzureBlobStorage {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
        res
    }

@@ -623,7 +647,7 @@ impl RemoteStorage for AzureBlobStorage {
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
        let _permit = self.permit(kind, cancel).await?;
-  
+        let started_at = start_measuring_requests(kind);

        let timeout = tokio::time::sleep(self.timeout);

@@ -677,8 +701,10 @@ impl RemoteStorage for AzureBlobStorage {
            },
        };

-        
-      
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
        res
    }

--- a/libs/remote_storage/src/metrics.rs
+++ b/libs/remote_storage/src/metrics.rs
@@ -1,7 +1,9 @@
+use metrics::{
+    Histogram, IntCounter, register_histogram_vec, register_int_counter, register_int_counter_vec,
+};
+use once_cell::sync::Lazy;

-
-
-
+pub(super) static BUCKET_METRICS: Lazy<BucketMetrics> = Lazy::new(Default::default);

 #[derive(Clone, Copy, Debug)]
 pub(crate) enum RequestKind {
@@ -14,9 +16,62 @@ pub(crate) enum RequestKind {
    Head = 6,
 }

-
+use RequestKind::*;
 use scopeguard::ScopeGuard;

+impl RequestKind {
+    const fn as_str(&self) -> &'static str {
+        match self {
+            Get => "get_object",
+            Put => "put_object",
+            Delete => "delete_object",
+            List => "list_objects",
+            Copy => "copy_object",
+            TimeTravel => "time_travel_recover",
+            Head => "head_object",
+        }
+    }
+    const fn as_index(&self) -> usize {
+        *self as usize
+    }
+}
+
+const REQUEST_KIND_COUNT: usize = 7;
+pub(crate) struct RequestTyped<C>([C; REQUEST_KIND_COUNT]);
+
+impl<C> RequestTyped<C> {
+    pub(crate) fn get(&self, kind: RequestKind) -> &C {
+        &self.0[kind.as_index()]
+    }
+
+    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
+        use RequestKind::*;
+        let mut it = [Get, Put, Delete, List, Copy, TimeTravel, Head].into_iter();
+        let arr = std::array::from_fn::<C, REQUEST_KIND_COUNT, _>(|index| {
+            let next = it.next().unwrap();
+            assert_eq!(index, next.as_index());
+            f(next)
+        });
+
+        if let Some(next) = it.next() {
+            panic!("unexpected {next:?}");
+        }
+
+        RequestTyped(arr)
+    }
+}
+
+impl RequestTyped<Histogram> {
+    pub(crate) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) {
+        self.get(kind).observe(started_at.elapsed().as_secs_f64())
+    }
+}
+
+pub(crate) struct PassFailCancelledRequestTyped<C> {
+    success: RequestTyped<C>,
+    fail: RequestTyped<C>,
+    cancelled: RequestTyped<C>,
+}

 #[derive(Debug, Clone, Copy)]
 pub(crate) enum AttemptOutcome {
@@ -34,22 +89,138 @@ impl<T, E> From<&Result<T, E>> for AttemptOutcome {
    }
 }

+impl AttemptOutcome {
+    pub(crate) fn as_str(&self) -> &'static str {
+        match self {
+            AttemptOutcome::Ok => "ok",
+            AttemptOutcome::Err => "err",
+            AttemptOutcome::Cancelled => "cancelled",
+        }
+    }
+}

+impl<C> PassFailCancelledRequestTyped<C> {
+    pub(crate) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C {
+        let target = match outcome {
+            AttemptOutcome::Ok => &self.success,
+            AttemptOutcome::Err => &self.fail,
+            AttemptOutcome::Cancelled => &self.cancelled,
+        };
+        target.get(kind)
+    }

+    fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self {
+        let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok));
+        let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err));
+        let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled));

+        PassFailCancelledRequestTyped {
+            success,
+            fail,
+            cancelled,
+        }
+    }
+}

+impl PassFailCancelledRequestTyped<Histogram> {
+    pub(crate) fn observe_elapsed(
+        &self,
+        kind: RequestKind,
+        outcome: impl Into<AttemptOutcome>,
+        started_at: std::time::Instant,
+    ) {
+        self.get(kind, outcome.into())
+            .observe(started_at.elapsed().as_secs_f64())
+    }
+}

-
-
-
-/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
-pub(crate) fn start_measuring_requests(
-    _kind: RequestKind,
+/// On drop (cancellation) count towards [`BucketMetrics::cancelled_waits`].
+pub(crate) fn start_counting_cancelled_wait(
+    kind: RequestKind,
 ) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
-    scopeguard::guard_on_success(std::time::Instant::now(), move |_started_at| {
-        
+    scopeguard::guard_on_success(std::time::Instant::now(), move |_| {
+        crate::metrics::BUCKET_METRICS
+            .cancelled_waits
+            .get(kind)
+            .inc()
    })
 }

+/// On drop (cancellation) add time to [`BucketMetrics::req_seconds`].
+pub(crate) fn start_measuring_requests(
+    kind: RequestKind,
+) -> ScopeGuard<std::time::Instant, impl FnOnce(std::time::Instant), scopeguard::OnSuccess> {
+    scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| {
+        crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+            kind,
+            AttemptOutcome::Cancelled,
+            started_at,
+        )
+    })
+}

+pub(crate) struct BucketMetrics {
+    /// Full request duration until successful completion, error or cancellation.
+    pub(crate) req_seconds: PassFailCancelledRequestTyped<Histogram>,
+    /// Total amount of seconds waited on queue.
+    pub(crate) wait_seconds: RequestTyped<Histogram>,

+    /// Track how many semaphore awaits were cancelled per request type.
+    ///
+    /// This is in case cancellations are happening more than expected.
+    pub(crate) cancelled_waits: RequestTyped<IntCounter>,
+
+    /// Total amount of deleted objects in batches or single requests.
+    pub(crate) deleted_objects_total: IntCounter,
+}
+
+impl Default for BucketMetrics {
+    fn default() -> Self {
+        // first bucket 100 microseconds to count requests that do not need to wait at all
+        // and get a permit immediately
+        let buckets = [0.0001, 0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0];
+
+        let req_seconds = register_histogram_vec!(
+            "remote_storage_s3_request_seconds",
+            "Seconds to complete a request",
+            &["request_type", "result"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| {
+            req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()])
+        });
+
+        let wait_seconds = register_histogram_vec!(
+            "remote_storage_s3_wait_seconds",
+            "Seconds rate limited",
+            &["request_type"],
+            buckets.to_vec(),
+        )
+        .unwrap();
+        let wait_seconds =
+            RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()]));
+
+        let cancelled_waits = register_int_counter_vec!(
+            "remote_storage_s3_cancelled_waits_total",
+            "Times a semaphore wait has been cancelled per request type",
+            &["request_type"],
+        )
+        .unwrap();
+        let cancelled_waits =
+            RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()]));
+
+        let deleted_objects_total = register_int_counter!(
+            "remote_storage_s3_deleted_objects_total",
+            "Amount of deleted objects in total",
+        )
+        .unwrap();
+
+        Self {
+            req_seconds,
+            wait_seconds,
+            cancelled_waits,
+            deleted_objects_total,
+        }
+    }
+}
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -41,7 +41,7 @@ use super::StorageMetadata;
 use crate::config::S3Config;
 use crate::error::Cancelled;
 pub(super) use crate::metrics::RequestKind;
-use crate::metrics::{AttemptOutcome, start_measuring_requests};
+use crate::metrics::{AttemptOutcome, start_counting_cancelled_wait, start_measuring_requests};
 use crate::support::PermitCarrying;
 use crate::{
    ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject,
@@ -199,7 +199,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
-       
+        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire(kind);

        let permit = tokio::select! {
@@ -207,8 +207,10 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);

        Ok(permit)
    }
@@ -218,7 +220,7 @@ impl S3Bucket {
        kind: RequestKind,
        cancel: &CancellationToken,
    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
-       
+        let started_at = start_counting_cancelled_wait(kind);
        let acquire = self.concurrency_limiter.acquire_owned(kind);

        let permit = tokio::select! {
@@ -226,8 +228,10 @@ impl S3Bucket {
            _ = cancel.cancelled() => return Err(Cancelled),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .wait_seconds
+            .observe_elapsed(kind, started_at);
        Ok(permit)
    }

@@ -269,7 +273,11 @@ impl S3Bucket {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::NotFound);
            }
            Err(SdkError::ServiceError(e))
@@ -279,11 +287,19 @@ impl S3Bucket {
                if e.raw().status().as_u16() == StatusCode::NotModified =>
            {
                // Count an unmodified file as a success.
-               
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::Unmodified);
            }
            Err(e) => {
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("download s3 object"),
@@ -330,11 +346,11 @@ impl S3Bucket {
        delete_objects: &[ObjectIdentifier],
        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-   
+        let kind = RequestKind::Delete;
        let mut cancel = std::pin::pin!(cancel.cancelled());

        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE_S3) {
-           
+            let started_at = start_measuring_requests(kind);

            let req = self
                .client
@@ -354,10 +370,15 @@ impl S3Bucket {
                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
            };

-          
-            
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, &resp, started_at);
+
            let resp = resp.context("request deletion")?;
-            
+            crate::metrics::BUCKET_METRICS
+                .deleted_objects_total
+                .inc_by(chunk.len() as u64);

            if let Some(errors) = resp.errors {
                // Log a bounded number of the errors within the response:
@@ -424,8 +445,8 @@ pin_project_lite::pin_project! {
    }

    impl<S> PinnedDrop for TimedDownload<S> {
-        fn drop(mut _this: Pin<&mut Self>) {
-           
+        fn drop(mut this: Pin<&mut Self>) {
+            crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at);
        }
    }
 }
@@ -490,7 +511,7 @@ impl RemoteStorage for S3Bucket {

            let mut continuation_token = None;
            'outer: loop {
-           
+                let started_at = start_measuring_requests(kind);

                // min of two Options, returning Some if one is value and another is
                // None (None is smaller than anything, so plain min doesn't work).
@@ -523,9 +544,11 @@ impl RemoteStorage for S3Bucket {
                    .context("Failed to list S3 prefixes")
                    .map_err(DownloadError::Other);

-               
+                let started_at = ScopeGuard::into_inner(started_at);

-                
+                crate::metrics::BUCKET_METRICS
+                    .req_seconds
+                    .observe_elapsed(kind, &response, started_at);

                let response = match response {
                    Ok(response) => response,
@@ -606,7 +629,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Head;
        let _permit = self.permit(kind, cancel).await?;

-     
+        let started_at = start_measuring_requests(kind);

        let head_future = self
            .client
@@ -625,18 +648,30 @@ impl RemoteStorage for S3Bucket {
        let res = res.map_err(|_e| DownloadError::Timeout)?;

        // do not incl. timeouts as errors in metrics but cancellations
-  
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
        let data = match res {
            Ok(object_output) => object_output,
            Err(SdkError::ServiceError(e)) if matches!(e.err(), HeadObjectError::NotFound(_)) => {
                // Count this in the AttemptOutcome::Ok bucket, because 404 is not
                // an error: we expect to sometimes fetch an object and find it missing,
                // e.g. when probing for timeline indices.
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Ok,
+                    started_at,
+                );
                return Err(DownloadError::NotFound);
            }
            Err(e) => {
-                
+                crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed(
+                    kind,
+                    AttemptOutcome::Err,
+                    started_at,
+                );

                return Err(DownloadError::Other(
                    anyhow::Error::new(e).context("s3 head object"),
@@ -669,7 +704,7 @@ impl RemoteStorage for S3Bucket {
        let kind = RequestKind::Put;
        let _permit = self.permit(kind, cancel).await?;

-      
+        let started_at = start_measuring_requests(kind);

        let body = StreamBody::new(from.map(|x| x.map(Frame::data)));
        let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body));
@@ -692,10 +727,12 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-        if let Ok(_inner) = &res {
+        if let Ok(inner) = &res {
            // do not incl. timeouts as errors in metrics but cancellations
-       
-            
+            let started_at = ScopeGuard::into_inner(started_at);
+            crate::metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
        }

        match res {
@@ -716,7 +753,7 @@ impl RemoteStorage for S3Bucket {

        let timeout = tokio::time::sleep(self.timeout);

-       
+        let started_at = start_measuring_requests(kind);

        // we need to specify bucket_name as a prefix
        let copy_source = format!(
@@ -740,8 +777,10 @@ impl RemoteStorage for S3Bucket {
            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
        };

-       
-        
+        let started_at = ScopeGuard::into_inner(started_at);
+        crate::metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);

        res?;

--- a/libs/utils/src/circuit_breaker.rs
+++ b/libs/utils/src/circuit_breaker.rs
@@ -1,6 +1,7 @@
 use std::fmt::Display;
 use std::time::{Duration, Instant};

+use metrics::IntCounter;

 /// Circuit breakers are for operations that are expensive and fallible.
 ///
@@ -53,7 +54,7 @@ impl CircuitBreaker {
        }
    }

-    pub fn fail<E>(&mut self,  error: E)
+    pub fn fail<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
@@ -63,18 +64,18 @@ impl CircuitBreaker {

        self.fail_count += 1;
        if self.broken_at.is_none() && self.fail_count >= self.fail_threshold {
-            self.break_circuit( error);
+            self.break_circuit(metric, error);
        }
    }

    /// Call this after successfully executing an operation
-    pub fn success(&mut self) {
+    pub fn success(&mut self, metric: &IntCounter) {
        self.fail_count = 0;
        if let Some(broken_at) = &self.broken_at {
            tracing::info!(breaker=%self.name, "Circuit breaker failure ended (was broken for {})",
                humantime::format_duration(broken_at.elapsed()));
            self.broken_at = None;
-          
+            metric.inc();
        }
    }

@@ -97,13 +98,13 @@ impl CircuitBreaker {
        }
    }

-    fn break_circuit<E>(&mut self,  error: E)
+    fn break_circuit<E>(&mut self, metric: &IntCounter, error: E)
    where
        E: Display,
    {
        self.broken_at = Some(Instant::now());
        tracing::error!(breaker=%self.name, "Circuit breaker broken!  Last error: {error}");
-        
+        metric.inc();
    }

    fn reset_circuit(&mut self) {
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -78,6 +78,7 @@ metrics.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true # for ResponseErrorMessageExt TOOD refactor that
 pageserver_compaction.workspace = true
+pem.workspace = true
 postgres_connection.workspace = true
 postgres_ffi.workspace = true
 pq_proto.workspace = true
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -68,6 +68,13 @@ pub(crate) struct Args {
    targets: Option<Vec<TenantTimelineId>>,
 }

+/// State shared by all clients
+#[derive(Debug)]
+struct SharedState {
+    start_work_barrier: tokio::sync::Barrier,
+    live_stats: LiveStats,
+}
+
 #[derive(Debug, Default)]
 struct LiveStats {
    completed_requests: AtomicU64,
@@ -240,24 +247,26 @@ async fn main_impl(
        all_ranges
    };

-    let live_stats = Arc::new(LiveStats::default());
-
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = args.num_clients.get() * timelines.len();
    let num_main_impl = 1;

-    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_live_stats_dump + num_work_sender_tasks + num_main_impl,
-    ));
+    let shared_state = Arc::new(SharedState {
+        start_work_barrier: tokio::sync::Barrier::new(
+            num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        ),
+        live_stats: LiveStats::default(),
+    });
+    let cancel = CancellationToken::new();

+    let ss = shared_state.clone();
    tokio::spawn({
-        let stats = Arc::clone(&live_stats);
-        let start_work_barrier = Arc::clone(&start_work_barrier);
        async move {
-            start_work_barrier.wait().await;
+            ss.start_work_barrier.wait().await;
            loop {
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+                let stats = &ss.live_stats;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
                let missed = stats.missed.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
@@ -270,14 +279,12 @@ async fn main_impl(
        }
    });

-    let cancel = CancellationToken::new();
-
    let rps_period = args
        .per_client_rate
        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
-        let live_stats = live_stats.clone();
-        let start_work_barrier = start_work_barrier.clone();
+        let ss = shared_state.clone();
+        let cancel = cancel.clone();
        let ranges: Vec<KeyRange> = all_ranges
            .iter()
            .filter(|r| r.timeline == worker_id.timeline)
@@ -287,85 +294,8 @@ async fn main_impl(
            rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
                .unwrap();

-        let cancel = cancel.clone();
        Box::pin(async move {
-            let client =
-                pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-                    .await
-                    .unwrap();
-            let mut client = client
-                .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
-                .await
-                .unwrap();
-
-            start_work_barrier.wait().await;
-            let client_start = Instant::now();
-            let mut ticks_processed = 0;
-            let mut inflight = VecDeque::new();
-            while !cancel.is_cancelled() {
-                // Detect if a request took longer than the RPS rate
-                if let Some(period) = &rps_period {
-                    let periods_passed_until_now =
-                        usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
-                            .unwrap();
-
-                    if periods_passed_until_now > ticks_processed {
-                        live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
-                    }
-                    ticks_processed = periods_passed_until_now;
-                }
-
-                while inflight.len() < args.queue_depth.get() {
-                    let start = Instant::now();
-                    let req = {
-                        let mut rng = rand::thread_rng();
-                        let r = &ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        assert!(key.is_rel_block_key());
-                        let (rel_tag, block_no) = key
-                            .to_rel_block()
-                            .expect("we filter non-rel-block keys out above");
-                        PagestreamGetPageRequest {
-                            hdr: PagestreamRequest {
-                                reqid: 0,
-                                request_lsn: if rng.gen_bool(args.req_latest_probability) {
-                                    Lsn::MAX
-                                } else {
-                                    r.timeline_lsn
-                                },
-                                not_modified_since: r.timeline_lsn,
-                            },
-                            rel: rel_tag,
-                            blkno: block_no,
-                        }
-                    };
-                    client.getpage_send(req).await.unwrap();
-                    inflight.push_back(start);
-                }
-
-                let start = inflight.pop_front().unwrap();
-                client.getpage_recv().await.unwrap();
-                let end = Instant::now();
-                live_stats.request_done();
-                ticks_processed += 1;
-                STATS.with(|stats| {
-                    stats
-                        .borrow()
-                        .lock()
-                        .unwrap()
-                        .observe(end.duration_since(start))
-                        .unwrap();
-                });
-
-                if let Some(period) = &rps_period {
-                    let next_at = client_start
-                        + Duration::from_micros(
-                            (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
-                        );
-                    tokio::time::sleep_until(next_at.into()).await;
-                }
-            }
+            client_libpq(args, worker_id, ss, cancel, rps_period, ranges, weights).await
        })
    };

@@ -387,7 +317,7 @@ async fn main_impl(
    };

    info!("waiting for everything to become ready");
-    start_work_barrier.wait().await;
+    shared_state.start_work_barrier.wait().await;
    info!("work started");
    if let Some(runtime) = args.runtime {
        tokio::time::sleep(runtime.into()).await;
@@ -416,3 +346,91 @@ async fn main_impl(

    anyhow::Ok(())
 }
+
+async fn client_libpq(
+    args: &Args,
+    worker_id: WorkerId,
+    shared_state: Arc<SharedState>,
+    cancel: CancellationToken,
+    rps_period: Option<Duration>,
+    ranges: Vec<KeyRange>,
+    weights: rand::distributions::weighted::WeightedIndex<i128>,
+) {
+    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+        .await
+        .unwrap();
+    let mut client = client
+        .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
+        .await
+        .unwrap();
+
+    shared_state.start_work_barrier.wait().await;
+    let client_start = Instant::now();
+    let mut ticks_processed = 0;
+    let mut inflight = VecDeque::new();
+    while !cancel.is_cancelled() {
+        // Detect if a request took longer than the RPS rate
+        if let Some(period) = &rps_period {
+            let periods_passed_until_now =
+                usize::try_from(client_start.elapsed().as_micros() / period.as_micros()).unwrap();
+
+            if periods_passed_until_now > ticks_processed {
+                shared_state
+                    .live_stats
+                    .missed((periods_passed_until_now - ticks_processed) as u64);
+            }
+            ticks_processed = periods_passed_until_now;
+        }
+
+        while inflight.len() < args.queue_depth.get() {
+            let start = Instant::now();
+            let req = {
+                let mut rng = rand::thread_rng();
+                let r = &ranges[weights.sample(&mut rng)];
+                let key: i128 = rng.gen_range(r.start..r.end);
+                let key = Key::from_i128(key);
+                assert!(key.is_rel_block_key());
+                let (rel_tag, block_no) = key
+                    .to_rel_block()
+                    .expect("we filter non-rel-block keys out above");
+                PagestreamGetPageRequest {
+                    hdr: PagestreamRequest {
+                        reqid: 0,
+                        request_lsn: if rng.gen_bool(args.req_latest_probability) {
+                            Lsn::MAX
+                        } else {
+                            r.timeline_lsn
+                        },
+                        not_modified_since: r.timeline_lsn,
+                    },
+                    rel: rel_tag,
+                    blkno: block_no,
+                }
+            };
+            client.getpage_send(req).await.unwrap();
+            inflight.push_back(start);
+        }
+
+        let start = inflight.pop_front().unwrap();
+        client.getpage_recv().await.unwrap();
+        let end = Instant::now();
+        shared_state.live_stats.request_done();
+        ticks_processed += 1;
+        STATS.with(|stats| {
+            stats
+                .borrow()
+                .lock()
+                .unwrap()
+                .observe(end.duration_since(start))
+                .unwrap();
+        });
+
+        if let Some(period) = &rps_period {
+            let next_at = client_start
+                + Duration::from_micros(
+                    (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
+                );
+            tokio::time::sleep_until(next_at.into()).await;
+        }
+    }
+}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -20,6 +20,7 @@ use pageserver::config::{PageServerConf, PageserverIdentity, ignored_fields};
 use pageserver::controller_upcall_client::StorageControllerUpcallClient;
 use pageserver::deletion_queue::DeletionQueue;
 use pageserver::disk_usage_eviction_task::{self, launch_disk_usage_global_eviction_task};
+use pageserver::metrics::{STARTUP_DURATION, STARTUP_IS_LOADING};
 use pageserver::task_mgr::{
    BACKGROUND_RUNTIME, COMPUTE_REQUEST_RUNTIME, MGMT_REQUEST_RUNTIME, WALRECEIVER_RUNTIME,
 };
@@ -320,9 +321,10 @@ where
    }
 }

-fn startup_checkpoint(started_at: Instant, _phase: &str, human_phase: &str) {
+fn startup_checkpoint(started_at: Instant, phase: &str, human_phase: &str) {
    let elapsed = started_at.elapsed();
    let secs = elapsed.as_secs_f64();
+    STARTUP_DURATION.with_label_values(&[phase]).set(secs);

    info!(
        elapsed_ms = elapsed.as_millis(),
@@ -353,7 +355,10 @@ fn start_pageserver(
    set_launch_timestamp_metric(launch_ts);
    #[cfg(target_os = "linux")]
    metrics::register_internal(Box::new(metrics::more_process_metrics::Collector::new())).unwrap();
-    
+    metrics::register_internal(Box::new(
+        pageserver::metrics::tokio_epoll_uring::Collector::new(),
+    ))
+    .unwrap();
    pageserver::preinitialize_metrics(conf, ignored);

    // If any failpoints were set from FAILPOINTS environment variable,
@@ -411,8 +416,18 @@ fn start_pageserver(
    // The storage_broker::connect call needs to happen inside a tokio runtime thread.
    let broker_client = WALRECEIVER_RUNTIME
        .block_on(async {
+            let tls_config = storage_broker::ClientTlsConfig::new().ca_certificates(
+                conf.ssl_ca_certs
+                    .iter()
+                    .map(pem::encode)
+                    .map(storage_broker::Certificate::from_pem),
+            );
            // Note: we do not attempt connecting here (but validate endpoints sanity).
-            storage_broker::connect(conf.broker_endpoint.clone(), conf.broker_keepalive_interval)
+            storage_broker::connect(
+                conf.broker_endpoint.clone(),
+                conf.broker_keepalive_interval,
+                tls_config,
+            )
        })
        .with_context(|| {
            format!(
@@ -497,6 +512,7 @@ fn start_pageserver(
    // Up to this point no significant I/O has been done: this should have been fast.  Record
    // duration prior to starting I/O intensive phase of startup.
    startup_checkpoint(started_startup_at, "initial", "Starting loading tenants");
+    STARTUP_IS_LOADING.set(1);

    // Startup staging or optimizing:
    //
@@ -572,6 +588,7 @@ fn start_pageserver(
                    "initial_tenant_load",
                    "Initial load completed",
                );
+                STARTUP_IS_LOADING.set(0);
            });

            let WaitForPhaseResult {
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -17,9 +17,10 @@ use once_cell::sync::OnceCell;
 use pageserver_api::config::{DiskUsageEvictionTaskConfig, MaxVectoredReadBytes};
 use pageserver_api::models::ImageCompressionAlgorithm;
 use pageserver_api::shard::TenantShardId;
+use pem::Pem;
 use postgres_backend::AuthType;
 use remote_storage::{RemotePath, RemoteStorageConfig};
-use reqwest::{Certificate, Url};
+use reqwest::Url;
 use storage_broker::Uri;
 use utils::id::{NodeId, TimelineId};
 use utils::logging::{LogFormat, SecretString};
@@ -67,8 +68,8 @@ pub struct PageServerConf {
    /// Period to reload certificate and private key from files.
    /// Default: 60s.
    pub ssl_cert_reload_period: Duration,
-    /// Trusted root CA certificates to use in https APIs.
-    pub ssl_ca_certs: Vec<Certificate>,
+    /// Trusted root CA certificates to use in https APIs in PEM format.
+    pub ssl_ca_certs: Vec<Pem>,

    /// Current availability zone. Used for traffic metrics.
    pub availability_zone: Option<String>,
@@ -118,13 +119,13 @@ pub struct PageServerConf {
    /// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
    pub concurrent_tenant_warmup: ConfigurableSemaphore,

-    /// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
+    /// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
    pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
-    /// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
+    /// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
    /// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
    /// See the comment in `eviction_task` for details.
    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
+    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,

    // How often to collect metrics and send them to the metrics endpoint.
@@ -497,7 +498,10 @@ impl PageServerConf {
            ssl_ca_certs: match ssl_ca_file {
                Some(ssl_ca_file) => {
                    let buf = std::fs::read(ssl_ca_file)?;
-                    Certificate::from_pem_bundle(&buf)?
+                    pem::parse_many(&buf)?
+                        .into_iter()
+                        .filter(|pem| pem.tag() == "CERTIFICATE")
+                        .collect()
                }
                None => Vec::new(),
            },
@@ -588,10 +592,10 @@ impl ConfigurableSemaphore {
    /// Initializse using a non-zero amount of permits.
    ///
    /// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
-    /// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
+    /// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will
    /// behave like [`futures::future::pending`], just waiting until new permits are added.
    ///
-    /// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
+    /// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
    pub fn new(initial_permits: NonZeroUsize) -> Self {
        ConfigurableSemaphore {
            initial_permits,
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{LogicalSizeCalculationCause, TenantShard};

 mod disk_cache;
 mod metrics;
@@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker(
    }
 }

-async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) {
    const CAUSE: LogicalSizeCalculationCause =
        LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -175,9 +175,9 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// [`Tenant::remote_size`]
+    /// [`TenantShard::remote_size`]
    ///
-    /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
+    /// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size
    const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
        MetricsKey {
            tenant_id,
@@ -199,9 +199,9 @@ impl MetricsKey {
        .absolute_values()
    }

-    /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
+    /// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
    ///
-    /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
+    /// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
    /// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
    const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
        MetricsKey {
@@ -254,16 +254,18 @@ pub(super) async fn collect_all_metrics(

 async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
 where
-    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
+    S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::TenantShard>)>,
 {
    let mut current_metrics: Vec<NewRawMetric> = Vec::new();

    let mut tenants = std::pin::pin!(tenants);

    while let Some((tenant_id, tenant)) = tenants.next().await {
-        let tenant_resident_size = 0;
+        let mut tenant_resident_size = 0;

-        for timeline in tenant.list_timelines() {
+        let timelines = tenant.list_timelines();
+        let timelines_len = timelines.len();
+        for timeline in timelines {
            let timeline_id = timeline.timeline_id;

            match TimelineSnapshot::collect(&timeline, ctx) {
@@ -286,6 +288,12 @@ where
                }
            }

+            tenant_resident_size += timeline.resident_physical_size();
+        }
+
+        if timelines_len == 0 {
+            // Force set it to 1 byte to avoid not being reported -- all timelines are offloaded.
+            tenant_resident_size = 1;
        }

        let snap = TenantSnapshot::collect(&tenant, tenant_resident_size);
@@ -307,7 +315,7 @@ impl TenantSnapshot {
    ///
    /// `resident_size` is calculated of the timelines we had access to for other metrics, so we
    /// cannot just list timelines here.
-    fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
+    fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
        TenantSnapshot {
            resident_size,
            remote_size: t.remote_size(),
--- a/pageserver/src/context.rs
+++ b/pageserver/src/context.rs
@@ -91,12 +91,12 @@

 use std::{sync::Arc, time::Duration};

-
+use once_cell::sync::Lazy;
 use tracing::warn;
 use utils::{id::TimelineId, shard::TenantShardId};

 use crate::{
-    metrics::TimelineMetrics,
+    metrics::{StorageIoSizeMetrics, TimelineMetrics},
    task_mgr::TaskKind,
    tenant::Timeline,
 };
@@ -122,35 +122,38 @@ pub struct RequestContext {
 #[derive(Clone)]
 pub(crate) enum Scope {
    Global {
-        
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTenant {
-       
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    SecondaryTimeline {
-       
+        io_size_metrics: crate::metrics::StorageIoSizeMetrics,
    },
    Timeline {
-       // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
+        // We wrap the `Arc<TimelineMetrics>`s inside another Arc to avoid child
        // context creation contending for the ref counters of the Arc<TimelineMetrics>,
        // which are shared among all tasks that operate on the timeline, especially
        // concurrent page_service connections.
        #[allow(clippy::redundant_allocation)]
-        #[allow(dead_code)]
-        arc_arc: Arc<Arc<TimelineMetrics>>,    },
+        arc_arc: Arc<Arc<TimelineMetrics>>,
+    },
    #[cfg(test)]
    UnitTest {
-       
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
    DebugTools {
-        
+        io_size_metrics: &'static crate::metrics::StorageIoSizeMetrics,
    },
 }

+static GLOBAL_IO_SIZE_METRICS: Lazy<crate::metrics::StorageIoSizeMetrics> =
+    Lazy::new(|| crate::metrics::StorageIoSizeMetrics::new("*", "*", "*"));

 impl Scope {
    pub(crate) fn new_global() -> Self {
        Scope::Global {
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
    /// NB: this allocates, so, use only at relatively long-lived roots, e.g., at start
@@ -170,13 +173,18 @@ impl Scope {
        }
    }
    pub(crate) fn new_secondary_timeline(
-        _tenant_shard_id: &TenantShardId,
-        _timeline_id: &TimelineId,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
    ) -> Self {
        // TODO(https://github.com/neondatabase/neon/issues/11156): secondary timelines have no infrastructure for metrics lifecycle.

+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = tenant_shard_id.shard_slug().to_string();
+        let timeline_id = timeline_id.to_string();

-        Scope::SecondaryTimeline {  }
+        let io_size_metrics =
+            crate::metrics::StorageIoSizeMetrics::new(&tenant_id, &shard_id, &timeline_id);
+        Scope::SecondaryTimeline { io_size_metrics }
    }
    pub(crate) fn new_secondary_tenant(_tenant_shard_id: &TenantShardId) -> Self {
        // Before propagating metrics via RequestContext, the labels were inferred from file path.
@@ -189,19 +197,19 @@ impl Scope {
        // like we do for attached timelines. (We don't have attached-tenant-scoped usage of VirtualFile
        // at this point, so, we were able to completely side-step tenant-scoped stuff there).
        Scope::SecondaryTenant {
-           
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
    #[cfg(test)]
    pub(crate) fn new_unit_test() -> Self {
        Scope::UnitTest {
-          
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }

    pub(crate) fn new_debug_tools() -> Self {
        Scope::DebugTools {
-          
+            io_size_metrics: &GLOBAL_IO_SIZE_METRICS,
        }
    }
 }
@@ -515,18 +523,58 @@ impl RequestContext {
        self.access_stats_behavior
    }

+    pub(crate) fn page_content_kind(&self) -> PageContentKind {
+        self.page_content_kind
+    }

    pub(crate) fn read_path_debug(&self) -> bool {
        self.read_path_debug
    }

+    pub(crate) fn io_size_metrics(&self) -> &StorageIoSizeMetrics {
+        match &self.scope {
+            Scope::Global { io_size_metrics } => {
+                let is_unit_test = cfg!(test);
+                let is_regress_test_build = cfg!(feature = "testing");
+                if is_unit_test || is_regress_test_build {
+                    panic!("all VirtualFile instances are timeline-scoped");
+                } else {
+                    use once_cell::sync::Lazy;
+                    use std::sync::Mutex;
+                    use std::time::Duration;
+                    use utils::rate_limit::RateLimit;
+                    static LIMIT: Lazy<Mutex<RateLimit>> =
+                        Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(1))));
+                    let mut guard = LIMIT.lock().unwrap();
+                    guard.call2(|rate_limit_stats| {
+                        warn!(
+                            %rate_limit_stats,
+                            backtrace=%std::backtrace::Backtrace::force_capture(),
+                            "all VirtualFile instances are timeline-scoped",
+                        );
+                    });
+
+                    io_size_metrics
+                }
+            }
+            Scope::Timeline { arc_arc } => &arc_arc.storage_io_size,
+            Scope::SecondaryTimeline { io_size_metrics } => io_size_metrics,
+            Scope::SecondaryTenant { io_size_metrics } => io_size_metrics,
+            #[cfg(test)]
+            Scope::UnitTest { io_size_metrics } => io_size_metrics,
+            Scope::DebugTools { io_size_metrics } => io_size_metrics,
+        }
+    }
+
    pub(crate) fn ondemand_download_wait_observe(&self, duration: Duration) {
        if duration == Duration::ZERO {
            return;
        }

        match &self.scope {
-            Scope::Timeline { arc_arc: _ } => {},
+            Scope::Timeline { arc_arc } => arc_arc
+                .wait_ondemand_download_time
+                .observe(self.task_kind, duration),
            _ => {
                use once_cell::sync::Lazy;
                use std::sync::Mutex;
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -8,6 +8,7 @@ use pageserver_api::upcall_api::{
    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest,
    ValidateRequestTenant, ValidateResponse,
 };
+use reqwest::Certificate;
 use serde::Serialize;
 use serde::de::DeserializeOwned;
 use tokio_util::sync::CancellationToken;
@@ -76,8 +77,8 @@ impl StorageControllerUpcallClient {
            client = client.default_headers(headers);
        }

-        for ssl_ca_cert in &conf.ssl_ca_certs {
-            client = client.add_root_certificate(ssl_ca_cert.clone());
+        for cert in &conf.ssl_ca_certs {
+            client = client.add_root_certificate(Certificate::from_der(cert.contents())?);
        }

        Ok(Some(Self {
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -27,6 +27,7 @@ use self::list_writer::{DeletionOp, ListWriter, RecoverOp};
 use self::validator::Validator;
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::StorageControllerUpcallApi;
+use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_timeline_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
@@ -162,6 +163,11 @@ struct TenantDeletionList {
    generation: Generation,
 }

+impl TenantDeletionList {
+    pub(crate) fn len(&self) -> usize {
+        self.timelines.values().map(|v| v.len()).sum()
+    }
+}

 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
@@ -461,6 +467,9 @@ impl DeletionQueueClient {
        // they may be historical.
        assert!(!current_generation.is_none());

+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(layers.len() as u64);
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
@@ -544,6 +553,9 @@ impl DeletionQueueClient {
        &self,
        objects: Vec<RemotePath>,
    ) -> Result<(), DeletionQueueError> {
+        metrics::DELETION_QUEUE
+            .keys_submitted
+            .inc_by(objects.len() as u64);
        self.executor_tx
            .send(DeleterMessage::Delete(objects))
            .await
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -14,6 +14,7 @@ use tracing::{info, warn};
 use utils::{backoff, pausable_failpoint};

 use super::{DeletionQueueError, FlushOp};
+use crate::metrics;

 const AUTOFLUSH_INTERVAL: Duration = Duration::from_secs(10);

@@ -59,6 +60,10 @@ impl Deleter {
                fail::fail_point!("deletion-queue-before-execute", |_| {
                    info!("Skipping execution, failpoint set");

+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["failpoint"])
+                        .inc();
                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                });

@@ -85,6 +90,9 @@ impl Deleter {
                Ok(()) => {
                    // Note: we assume that the remote storage layer returns Ok(()) if some
                    // or all of the deleted objects were already gone.
+                    metrics::DELETION_QUEUE
+                        .keys_executed
+                        .inc_by(self.accumulator.len() as u64);
                    info!(
                        "Executed deletion batch {}..{}",
                        self.accumulator
@@ -101,6 +109,10 @@ impl Deleter {
                        return Err(DeletionQueueError::ShuttingDown);
                    }
                    warn!("DeleteObjects request failed: {e:#}, will continue trying");
+                    metrics::DELETION_QUEUE
+                        .remote_errors
+                        .with_label_values(&["execute"])
+                        .inc();
                }
            };
        }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -25,6 +25,7 @@ use utils::id::TimelineId;
 use super::{DeletionHeader, DeletionList, FlushOp, ValidatorQueueMessage};
 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
+use crate::metrics;
 use crate::tenant::remote_timeline_client::{LayerFileMetadata, remote_layer_path};
 use crate::tenant::storage_layer::LayerName;
 use crate::virtual_file::{MaybeFatalIo, on_fatal_io_error};
@@ -151,7 +152,7 @@ impl ListWriter {
                }
            }
            Err(e) => {
-                
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
                warn!(
                    sequence = self.pending.sequence,
                    "Failed to write deletion list, will retry later ({e:#})"
@@ -179,6 +180,7 @@ impl ListWriter {
                        // This should never happen unless we make a mistake with our serialization.
                        // Ignoring a deletion header is not consequential for correctnes because all deletions
                        // are ultimately allowed to fail: worst case we leak some objects for the scrubber to clean up.
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        Ok(None)
                    }
                }
@@ -247,6 +249,7 @@ impl ListWriter {
                    .as_str()
            } else {
                warn!("Unexpected key in deletion queue: {basename}");
+                metrics::DELETION_QUEUE.unexpected_errors.inc();
                continue;
            };

@@ -254,6 +257,7 @@ impl ListWriter {
                Ok(s) => s,
                Err(e) => {
                    warn!("Malformed key '{basename}': {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -282,6 +286,7 @@ impl ListWriter {
                    // Drop the list on the floor: any objects it referenced will be left behind
                    // for scrubbing to clean up.  This should never happen unless we have a serialization bug.
                    warn!(sequence = s, "Failed to deserialize deletion list: {e}");
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                    continue;
                }
            };
@@ -324,6 +329,9 @@ impl ListWriter {

            // We will drop out of recovery if this fails: it indicates that we are shutting down
            // or the backend has panicked
+            metrics::DELETION_QUEUE
+                .keys_submitted
+                .inc_by(deletion_list.len() as u64);
            self.tx
                .send(ValidatorQueueMessage::Delete(deletion_list))
                .await?;
@@ -345,6 +353,7 @@ impl ListWriter {
                "Failed to create deletion list directory {}, deletions will not be executed ({e})",
                self.conf.deletion_prefix(),
            );
+            metrics::DELETION_QUEUE.unexpected_errors.inc();
            return;
        }

@@ -413,6 +422,7 @@ impl ListWriter {
                            tracing::error!(
                                "Failed to enqueue deletions, leaking objects.  This is a bug."
                            );
+                            metrics::DELETION_QUEUE.unexpected_errors.inc();
                        }
                    }
                }
@@ -440,6 +450,7 @@ impl ListWriter {
                        tracing::error!(
                            "Deletion queue recovery called more than once.  This is a bug."
                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        // Non-fatal: although this is a bug, since we did recovery at least once we may proceed.
                        continue;
                    }
@@ -451,6 +462,7 @@ impl ListWriter {
                        info!(
                            "Deletion queue recover aborted, deletion queue will not proceed ({e})"
                        );
+                        metrics::DELETION_QUEUE.unexpected_errors.inc();
                        return;
                    } else {
                        self.recovered = true;
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -26,6 +26,7 @@ use super::deleter::DeleterMessage;
 use super::{DeletionHeader, DeletionList, DeletionQueueError, FlushOp, VisibleLsnUpdates};
 use crate::config::PageServerConf;
 use crate::controller_upcall_client::{RetryForeverError, StorageControllerUpcallApi};
+use crate::metrics;
 use crate::virtual_file::MaybeFatalIo;

 // After this length of time, do any validation work that is pending,
@@ -185,6 +186,7 @@ where
                    "Dropped remote consistent LSN updates for tenant {tenant_id} in stale generation {:?}",
                    tenant_lsn_state.generation
                );
+                metrics::DELETION_QUEUE.dropped_lsn_updates.inc();
            }
        }

@@ -219,8 +221,11 @@ where

                if !this_list_valid {
                    info!("Dropping stale deletions for tenant {tenant_id} in generation {:?}, objects may be leaked", tenant.generation);
+                    metrics::DELETION_QUEUE.keys_dropped.inc_by(tenant.len() as u64);
                    mutated = true;
-                } 
+                } else {
+                    metrics::DELETION_QUEUE.keys_validated.inc_by(tenant.len() as u64);
+                }
                this_list_valid
            });
            list.validated = true;
@@ -232,7 +237,7 @@ where
                    // Highly unexpected.  Could happen if e.g. disk full.
                    // If we didn't save the trimmed list, it is _not_ valid to execute.
                    warn!("Failed to save modified deletion list {list}: {e:#}");
-                    
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();

                    // Rather than have a complex retry process, just drop it and leak the objects,
                    // scrubber will clean up eventually.
@@ -271,7 +276,7 @@ where
                // The save() function logs a warning on error.
                if let Err(e) = header.save(self.conf).await {
                    warn!("Failed to write deletion queue header: {e:#}");
-                    
+                    metrics::DELETION_QUEUE.unexpected_errors.inc();
                }
            }
        }
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -56,6 +56,7 @@ use utils::completion;
 use utils::id::TimelineId;

 use crate::config::PageServerConf;
+use crate::metrics::disk_usage_based_eviction::METRICS;
 use crate::task_mgr::{self, BACKGROUND_RUNTIME};
 use crate::tenant::mgr::TenantManager;
 use crate::tenant::remote_timeline_client::LayerFileMetadata;
@@ -387,7 +388,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        }
    };

-
+    METRICS.layers_collected.inc_by(candidates.len() as u64);

    tracing::info!(
        elapsed_ms = collection_time.as_millis(),
@@ -427,7 +428,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let (evicted_amount, usage_planned) =
        select_victims(&candidates, usage_pre).into_amount_and_planned();

- 
+    METRICS.layers_selected.inc_by(evicted_amount as u64);

    // phase2: evict layers

@@ -456,6 +457,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            if let Some(next) = next {
                match next {
                    Ok(Ok(file_size)) => {
+                        METRICS.layers_evicted.inc();
                        usage_assumed.add_available_bytes(file_size);
                    }
                    Ok(Err((
@@ -786,6 +788,7 @@ async fn collect_eviction_candidates(
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
+    const LOG_DURATION_THRESHOLD: std::time::Duration = std::time::Duration::from_secs(10);

    // get a snapshot of the list of tenants
    let tenants = tenant_manager
@@ -819,7 +822,7 @@ async fn collect_eviction_candidates(
            continue;
        }

-        
+        let started_at = std::time::Instant::now();

        // collect layers from all timelines in this tenant
        //
@@ -914,11 +917,25 @@ async fn collect_eviction_candidates(
                    (partition, candidate)
                });

+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);

        candidates.extend(tenant_candidates);

-       
+        let elapsed = started_at.elapsed();
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());

+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    // Note: the same tenant ID might be hit twice, if it transitions from attached to
@@ -945,7 +962,7 @@ async fn collect_eviction_candidates(
            layer_info.resident_layers.len()
        );

-       
+        let started_at = std::time::Instant::now();

        layer_info
            .resident_layers
@@ -967,13 +984,28 @@ async fn collect_eviction_candidates(
                        candidate,
                    )
                });
+
+        METRICS
+            .tenant_layer_count
+            .observe(tenant_candidates.len() as f64);
        candidates.extend(tenant_candidates);

        tokio::task::yield_now().await;

-    
+        let elapsed = started_at.elapsed();

-        
+        METRICS
+            .tenant_collection_time
+            .observe(elapsed.as_secs_f64());
+
+        if elapsed > LOG_DURATION_THRESHOLD {
+            tracing::info!(
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
+                elapsed_ms = elapsed.as_millis(),
+                "collection took longer than threshold"
+            );
+        }
    }

    debug_assert!(
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler(
        &ShardParameters::default(),
    );

-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

@@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler(
        &ShardParameters::default(),
    );

-    crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
+    crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -24,6 +24,7 @@ use wal_decoder::models::InterpretedWalRecord;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::Timeline;
 use crate::walingest::{WalIngest, WalIngestErrorKind};
@@ -323,6 +324,7 @@ async fn import_wal(
                walingest
                    .ingest_record(interpreted, &mut modification, ctx)
                    .await?;
+                WAL_INGEST.records_committed.inc();

                modification.commit(ctx).await?;
                last_lsn = lsn;
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -49,7 +49,7 @@ use tracing::{info, info_span};
 /// backwards-compatible changes to the metadata format.
 pub const STORAGE_FORMAT_VERSION: u16 = 3;

-pub const DEFAULT_PG_VERSION: u32 = 16;
+pub const DEFAULT_PG_VERSION: u32 = 17;

 // Magic constants used to identify different kinds of files
 pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -77,6 +77,7 @@ use anyhow::Context;
 use once_cell::sync::OnceCell;

 use crate::context::RequestContext;
+use crate::metrics::{PageCacheSizeMetrics, page_cache_eviction_metrics};
 use crate::virtual_file::{IoBufferMut, IoPageSlice};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -194,7 +195,7 @@ impl SlotInner {
 }

 pub struct PageCache {
-    immutable_page_maps: [std::sync::RwLock<HashMap<(FileId, u32), usize>>; 16],
+    immutable_page_map: std::sync::RwLock<HashMap<(FileId, u32), usize>>,

    /// The actual buffers with their metadata.
    slots: Box<[Slot]>,
@@ -204,103 +205,8 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
-}

-impl PageCache {
-    /// Helper function to determine the shard index based on the low 4 bits of the u32 in the key tuple.
-    fn shard_index(_file_id: &FileId, blkno: u32) -> usize {
-        (blkno & 0xF) as usize
-    }
-
-    /// Search for a page in the cache using the given search key.
-    ///
-    /// Returns the slot index, if any.
-    ///
-    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
-    /// get recycled for an unrelated page immediately after this function
-    /// returns. The caller is responsible for re-checking that the slot still
-    /// contains the page with the same key before using it.
-    ///
-    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
-        match cache_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let map = self.immutable_page_maps[shard_idx].read().unwrap();
-                Some(*map.get(&(*file_id, *blkno))?)
-            }
-        }
-    }
-
-    ///
-    /// Remove mapping for given key.
-    ///
-    fn remove_mapping(&self, old_key: &CacheKey) {
-        match old_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
-                map.remove(&(*file_id, *blkno))
-                    .expect("could not find old key in mapping");
-            }
-        }
-    }
-
-    ///
-    /// Insert mapping for given key.
-    ///
-    /// If a mapping already existed for the given key, returns the slot index
-    /// of the existing mapping and leaves it untouched.
-    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
-        match new_key {
-            CacheKey::ImmutableFilePage { file_id, blkno } => {
-                let shard_idx = Self::shard_index(file_id, *blkno);
-                let mut map = self.immutable_page_maps[shard_idx].write().unwrap();
-                match map.entry((*file_id, *blkno)) {
-                    Entry::Occupied(entry) => Some(*entry.get()),
-                    Entry::Vacant(entry) => {
-                        entry.insert(slot_idx);
-                        None
-                    }
-                }
-            }
-        }
-    }
-
-    /// Initialize a new page cache
-    ///
-    /// This should be called only once at page server startup.
-    fn new(num_pages: usize) -> Self {
-        assert!(num_pages > 0, "page cache size must be > 0");
-
-        // We could use Vec::leak here, but that potentially also leaks
-        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
-        // this is avoided.
-        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
-
-        let slots = page_buffer
-            .chunks_exact_mut(PAGE_SZ)
-            .map(|chunk| {
-                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
-                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
-
-                Slot {
-                    inner: tokio::sync::RwLock::new(SlotInner {
-                        key: None,
-                        buf,
-                        permit: std::sync::Mutex::new(Weak::new()),
-                    }),
-                    usage_count: AtomicU8::new(0),
-                }
-            })
-            .collect();
-
-        Self {
-            immutable_page_maps: Default::default(),
-            slots,
-            next_evict_slot: AtomicUsize::new(0),
-            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-        }
-    }
+    size_metrics: &'static PageCacheSizeMetrics,
 }

 struct PinnedSlotsPermit {
@@ -508,17 +414,32 @@ impl PageCache {
    async fn lock_for_read(
        &self,
        cache_key: &CacheKey,
-        _ctx: &RequestContext,
+        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
        let mut permit = Some(self.try_get_pinned_slot_permit().await?);

+        let (read_access, hit) = match cache_key {
+            CacheKey::ImmutableFilePage { .. } => (
+                &crate::metrics::PAGE_CACHE
+                    .for_ctx(ctx)
+                    .read_accesses_immutable,
+                &crate::metrics::PAGE_CACHE.for_ctx(ctx).read_hits_immutable,
+            ),
+        };
+        read_access.inc();
+
+        let mut is_first_iteration = true;
        loop {
            // First check if the key already exists in the cache.
            if let Some(read_guard) = self.try_lock_for_read(cache_key, &mut permit).await {
                debug_assert!(permit.is_none());
+                if is_first_iteration {
+                    hit.inc();
+                }
                return Ok(ReadBufResult::Found(read_guard));
            }
            debug_assert!(permit.is_some());
+            is_first_iteration = false;

            // Not found. Find a victim buffer
            let (slot_idx, mut inner) = self
@@ -563,6 +484,63 @@ impl PageCache {
        }
    }

+    //
+    // Section 3: Mapping functions
+    //
+
+    /// Search for a page in the cache using the given search key.
+    ///
+    /// Returns the slot index, if any.
+    ///
+    /// NOTE: We don't hold any lock on the mapping on return, so the slot might
+    /// get recycled for an unrelated page immediately after this function
+    /// returns.  The caller is responsible for re-checking that the slot still
+    /// contains the page with the same key before using it.
+    ///
+    fn search_mapping(&self, cache_key: &CacheKey) -> Option<usize> {
+        match cache_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let map = self.immutable_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
+        }
+    }
+
+    ///
+    /// Remove mapping for given key.
+    ///
+    fn remove_mapping(&self, old_key: &CacheKey) {
+        match old_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let mut map = self.immutable_page_map.write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
+            }
+        }
+    }
+
+    ///
+    /// Insert mapping for given key.
+    ///
+    /// If a mapping already existed for the given key, returns the slot index
+    /// of the existing mapping and leaves it untouched.
+    fn try_insert_mapping(&self, new_key: &CacheKey, slot_idx: usize) -> Option<usize> {
+        match new_key {
+            CacheKey::ImmutableFilePage { file_id, blkno } => {
+                let mut map = self.immutable_page_map.write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
+                        None
+                    }
+                }
+            }
+        }
+    }
+
    //
    // Section 4: Misc internal helpers
    //
@@ -617,7 +595,11 @@ impl PageCache {
                            // Note that just yielding to tokio during iteration without such
                            // priority boosting is likely counter-productive. We'd just give more opportunities
                            // for B to bump usage count, further starving A.
-                            
+                            page_cache_eviction_metrics::observe(
+                                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                                    iters: iters.try_into().unwrap(),
+                                },
+                            );
                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
@@ -627,12 +609,84 @@ impl PageCache {
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
                    inner.key = None;
-                    
-                } 
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                } else {
+                    page_cache_eviction_metrics::observe(
+                        page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                            iters: iters.try_into().unwrap(),
+                        },
+                    );
+                }
                return Ok((slot_idx, inner));
            }
        }
    }

+    /// Initialize a new page cache
+    ///
+    /// This should be called only once at page server startup.
+    fn new(num_pages: usize) -> Self {
+        assert!(num_pages > 0, "page cache size must be > 0");
+
+        // We could use Vec::leak here, but that potentially also leaks
+        // uninitialized reserved capacity. With into_boxed_slice and Box::leak
+        // this is avoided.
+        let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak();
+
+        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
+        size_metrics.max_bytes.set_page_sz(num_pages);
+        size_metrics.current_bytes_immutable.set_page_sz(0);
+
+        let slots = page_buffer
+            .chunks_exact_mut(PAGE_SZ)
+            .map(|chunk| {
+                // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned.
+                let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) };
+
+                Slot {
+                    inner: tokio::sync::RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        permit: std::sync::Mutex::new(Weak::new()),
+                    }),
+                    usage_count: AtomicU8::new(0),
+                }
+            })
+            .collect();
+
+        Self {
+            immutable_page_map: Default::default(),
+            slots,
+            next_evict_slot: AtomicUsize::new(0),
+            size_metrics,
+            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
+        }
+    }
 }

+trait PageSzBytesMetric {
+    fn set_page_sz(&self, count: usize);
+    fn add_page_sz(&self, count: usize);
+    fn sub_page_sz(&self, count: usize);
+}
+
+#[inline(always)]
+fn count_times_page_sz(count: usize) -> u64 {
+    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
+}
+
+impl PageSzBytesMetric for metrics::UIntGauge {
+    fn set_page_sz(&self, count: usize) {
+        self.set(count_times_page_sz(count));
+    }
+    fn add_page_sz(&self, count: usize) {
+        self.add(count_times_page_sz(count));
+    }
+    fn sub_page_sz(&self, count: usize) {
+        self.sub(count_times_page_sz(count));
+    }
+}
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -59,7 +59,8 @@ use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
 };
 use crate::metrics::{
-    self, GetPageBatchBreakReason, SmgrOpTimer, TimelineMetrics,
+    self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
+    SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::Version;
 use crate::span::{
@@ -75,7 +76,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
 use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
 use crate::{basebackup, timed_after_cancellation};

-/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
+/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::TenantShard`] which
 /// is not yet in state [`TenantState::Active`].
 ///
 /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
@@ -274,6 +275,9 @@ async fn page_service_conn_main(
    cancel: CancellationToken,
    gate_guard: GateGuard,
 ) -> ConnectionHandlerResult {
+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["page_service"])
+        .guard();

    socket
        .set_nodelay(true)
@@ -637,6 +641,7 @@ impl std::fmt::Display for BatchedPageStreamError {

 struct BatchedGetPageRequest {
    req: PagestreamGetPageRequest,
+    timer: SmgrOpTimer,
    effective_request_lsn: Lsn,
    ctx: RequestContext,
 }
@@ -644,6 +649,7 @@ struct BatchedGetPageRequest {
 #[cfg(feature = "testing")]
 struct BatchedTestRequest {
    req: models::PagestreamTestRequest,
+    timer: SmgrOpTimer,
 }

 /// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum,
@@ -653,13 +659,13 @@ struct BatchedTestRequest {
 enum BatchedFeMessage {
    Exists {
        span: Span,
-       
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamExistsRequest,
    },
    Nblocks {
        span: Span,
-     
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamNblocksRequest,
    },
@@ -671,13 +677,13 @@ enum BatchedFeMessage {
    },
    DbSize {
        span: Span,
-  
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamDbSizeRequest,
    },
    GetSlruSegment {
        span: Span,
-   
+        timer: SmgrOpTimer,
        shard: timeline::handle::WeakHandle<TenantManagerTypes>,
        req: models::PagestreamGetSlruSegmentRequest,
    },
@@ -698,7 +704,27 @@ impl BatchedFeMessage {
        self.into()
    }

-    fn observe_execution_start(&mut self, _at: Instant) {
+    fn observe_execution_start(&mut self, at: Instant) {
+        match self {
+            BatchedFeMessage::Exists { timer, .. }
+            | BatchedFeMessage::Nblocks { timer, .. }
+            | BatchedFeMessage::DbSize { timer, .. }
+            | BatchedFeMessage::GetSlruSegment { timer, .. } => {
+                timer.observe_execution_start(at);
+            }
+            BatchedFeMessage::GetPage { pages, .. } => {
+                for page in pages {
+                    page.timer.observe_execution_start(at);
+                }
+            }
+            #[cfg(feature = "testing")]
+            BatchedFeMessage::Test { requests, .. } => {
+                for req in requests {
+                    req.timer.observe_execution_start(at);
+                }
+            }
+            BatchedFeMessage::RespondError { .. } => {}
+        }
    }

    fn should_break_batch(
@@ -938,7 +964,7 @@ impl PageServerHandler {
                    .await?;
                debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id();
                let span = tracing::info_span!(parent: &parent_span, "handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelExists,
                    received_at,
@@ -946,7 +972,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Exists {
                    span,
-                
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -956,7 +982,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetRelSize,
                    received_at,
@@ -964,7 +990,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::Nblocks {
                    span,
-                   
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -974,7 +1000,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetDbSize,
                    received_at,
@@ -982,7 +1008,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::DbSize {
                    span,
-              
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -992,7 +1018,7 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.hdr.request_lsn, shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetSlruSegment,
                    received_at,
@@ -1000,7 +1026,7 @@ impl PageServerHandler {
                .await?;
                BatchedFeMessage::GetSlruSegment {
                    span,
-               
+                    timer,
                    shard: shard.downgrade(),
                    req,
                }
@@ -1099,7 +1125,7 @@ impl PageServerHandler {
                // request handler log messages contain the request-specific fields.
                let span = mkspan!(shard.tenant_shard_id.shard_slug());

-                record_op_start_and_throttle(
+                let timer = record_op_start_and_throttle(
                    &shard,
                    metrics::SmgrQueryType::GetPageAtLsn,
                    received_at,
@@ -1132,6 +1158,7 @@ impl PageServerHandler {
                    shard: shard.downgrade(),
                    pages: smallvec::smallvec![BatchedGetPageRequest {
                        req,
+                        timer,
                        effective_request_lsn,
                        ctx,
                    }],
@@ -1147,12 +1174,13 @@ impl PageServerHandler {
                    .get(tenant_id, timeline_id, ShardSelector::Zero)
                    .await?;
                let span = tracing::info_span!(parent: &parent_span, "handle_test_request", shard_id = %shard.tenant_shard_id.shard_slug());
-                record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
+                let timer =
+                    record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at)
                        .await?;
                BatchedFeMessage::Test {
                    span,
                    shard: shard.downgrade(),
-                    requests: vec![BatchedTestRequest { req,  }],
+                    requests: vec![BatchedTestRequest { req, timer }],
                }
            }
        };
@@ -1253,7 +1281,7 @@ impl PageServerHandler {

        // Dispatch the batch to the appropriate request handler.
        let log_slow_name = batch.as_static_str();
-        let (handler_results, span) = {
+        let (mut handler_results, span) = {
            // TODO: we unfortunately have to pin the future on the heap, since GetPage futures are huge and
            // won't fit on the stack.
            let mut boxpinned =
@@ -1283,31 +1311,31 @@ impl PageServerHandler {
        // call, which (all unmeasured) adds syscall overhead but reduces time to first byte
        // and avoids building up a "giant" contiguous userspace buffer to hold the entire response.
        // TODO: vectored socket IO would be great, but pgb_writer doesn't support that.
-        // let flush_timers = {
-        //     let flushing_start_time = Instant::now();
-        //     let mut flush_timers = Vec::with_capacity(handler_results.len());
-        //     for handler_result in &mut handler_results {
-        //         let flush_timer = match handler_result {
-        //             Ok((_, timer)) => Some(
-        //                 timer
-        //                     .observe_execution_end(flushing_start_time)
-        //                     .expect("we are the first caller"),
-        //             ),
-        //             Err(_) => {
-        //                 // TODO: measure errors
-        //                 None
-        //             }
-        //         };
-        //         flush_timers.push(flush_timer);
-        //     }
-        //     assert_eq!(flush_timers.len(), handler_results.len());
-        //     flush_timers
-        // };
+        let flush_timers = {
+            let flushing_start_time = Instant::now();
+            let mut flush_timers = Vec::with_capacity(handler_results.len());
+            for handler_result in &mut handler_results {
+                let flush_timer = match handler_result {
+                    Ok((_, timer)) => Some(
+                        timer
+                            .observe_execution_end(flushing_start_time)
+                            .expect("we are the first caller"),
+                    ),
+                    Err(_) => {
+                        // TODO: measure errors
+                        None
+                    }
+                };
+                flush_timers.push(flush_timer);
+            }
+            assert_eq!(flush_timers.len(), handler_results.len());
+            flush_timers
+        };

        // Map handler result to protocol behavior.
        // Some handler errors cause exit from pagestream protocol.
        // Other handler errors are sent back as an error message and we stay in pagestream protocol.
-        for handler_result in handler_results.into_iter() {
+        for (handler_result, flushing_timer) in handler_results.into_iter().zip(flush_timers) {
            let response_msg = match handler_result {
                Err(e) => match &e.err {
                    PageStreamError::Shutdown => {
@@ -1339,7 +1367,7 @@ impl PageServerHandler {
                        })
                    }
                },
-                Ok((response_msg, )) => response_msg,
+                Ok((response_msg, _op_timer_already_observed)) => response_msg,
            };

            //
@@ -1353,17 +1381,17 @@ impl PageServerHandler {
            failpoint_support::sleep_millis_async!("before-pagestream-msg-flush", cancel);

            // what we want to do
-           
+            let socket_fd = pgb_writer.socket_fd;
            let flush_fut = pgb_writer.flush();
            // metric for how long flushing takes
-            // let flush_fut = match flushing_timer {
-            //     Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
-            //         Instant::now(),
-            //         flush_fut,
-            //         socket_fd,
-            //     )),
-            //     None => futures::future::Either::Right(flush_fut),
-            // };
+            let flush_fut = match flushing_timer {
+                Some(flushing_timer) => futures::future::Either::Left(flushing_timer.measure(
+                    Instant::now(),
+                    flush_fut,
+                    socket_fd,
+                )),
+                None => futures::future::Either::Right(flush_fut),
+            };
            // do it while respecting cancellation
            let _: () = async move {
                tokio::select! {
@@ -1393,7 +1421,7 @@ impl PageServerHandler {
        ctx: &RequestContext,
    ) -> Result<
        (
-            Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>>,
+            Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>>,
            Span,
        ),
        QueryError,
@@ -1409,7 +1437,7 @@ impl PageServerHandler {
        Ok(match batch {
            BatchedFeMessage::Exists {
                span,
-               
+                timer,
                shard,
                req,
            } => {
@@ -1420,7 +1448,7 @@ impl PageServerHandler {
                        self.handle_get_rel_exists_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1428,7 +1456,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::Nblocks {
                span,
-           
+                timer,
                shard,
                req,
            } => {
@@ -1439,7 +1467,7 @@ impl PageServerHandler {
                        self.handle_get_nblocks_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1475,6 +1503,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::DbSize {
                span,
+                timer,
                shard,
                req,
            } => {
@@ -1485,7 +1514,7 @@ impl PageServerHandler {
                        self.handle_db_size_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -1493,6 +1522,7 @@ impl PageServerHandler {
            }
            BatchedFeMessage::GetSlruSegment {
                span,
+                timer,
                shard,
                req,
            } => {
@@ -1503,7 +1533,7 @@ impl PageServerHandler {
                        self.handle_get_slru_segment_request(&shard, &req, &ctx)
                            .instrument(span.clone())
                            .await
-                            .map(|msg| (msg, ))
+                            .map(|msg| (msg, timer))
                            .map_err(|err| BatchedPageStreamError { err, req: req.hdr }),
                    ],
                    span,
@@ -2149,11 +2179,15 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: smallvec::SmallVec<[BatchedGetPageRequest; 1]>,
        io_concurrency: IoConcurrency,
-        _batch_break_reason: GetPageBatchBreakReason,
+        batch_break_reason: GetPageBatchBreakReason,
        ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage, ), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        debug_assert_current_span_has_tenant_and_timeline_id();

+        timeline
+            .query_metrics
+            .observe_getpage_batch_start(requests.len(), batch_break_reason);
+
        // If a page trace is running, submit an event for this request.
        if let Some(page_trace) = timeline.page_trace.load().as_ref() {
            let time = SystemTime::now();
@@ -2253,7 +2287,7 @@ impl PageServerHandler {
                                req: req.req,
                                page,
                            }),
-                            
+                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2298,7 +2332,7 @@ impl PageServerHandler {
        timeline: &Timeline,
        requests: Vec<BatchedTestRequest>,
        _ctx: &RequestContext,
-    ) -> Vec<Result<(PagestreamBeMessage,), BatchedPageStreamError>> {
+    ) -> Vec<Result<(PagestreamBeMessage, SmgrOpTimer), BatchedPageStreamError>> {
        // real requests would do something with the timeline
        let mut results = Vec::with_capacity(requests.len());
        for _req in requests.iter() {
@@ -2324,6 +2358,7 @@ impl PageServerHandler {
                            PagestreamBeMessage::Test(models::PagestreamTestResponse {
                                req: req.req.clone(),
                            }),
+                            req.timer,
                        )
                    })
                    .map_err(|e| BatchedPageStreamError {
@@ -2878,7 +2913,12 @@ where
                    .record("timeline_id", field::display(timeline_id));

                self.check_permission(Some(tenant_id))?;
-                
+                let command_kind = match protocol_version {
+                    PagestreamProtocolVersion::V2 => ComputeCommandKind::PageStreamV2,
+                    PagestreamProtocolVersion::V3 => ComputeCommandKind::PageStreamV3,
+                };
+                COMPUTE_COMMANDS_COUNTERS.for_command(command_kind).inc();
+
                self.handle_pagerequests(pgb, tenant_id, timeline_id, protocol_version, ctx)
                    .await?;
            }
@@ -2895,7 +2935,10 @@ where

                self.check_permission(Some(tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Basebackup)
+                    .inc();
+                let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording();
                let res = async {
                    self.handle_basebackup_request(
                        pgb,
@@ -2913,7 +2956,7 @@ where
                    Result::<(), QueryError>::Ok(())
                }
                .await;
-
+                metric_recording.observe(&res);
                res?;
            }
            // same as basebackup, but result includes relational data as well
@@ -2929,7 +2972,9 @@ where

                self.check_permission(Some(tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::Fullbackup)
+                    .inc();

                // Check that the timeline exists
                self.handle_basebackup_request(
@@ -2963,7 +3008,9 @@ where

                self.check_permission(Some(tenant_shard_id.tenant_id))?;

-                
+                COMPUTE_COMMANDS_COUNTERS
+                    .for_command(ComputeCommandKind::LeaseLsn)
+                    .inc();

                match self
                    .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx)
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -36,13 +36,15 @@ use tracing::{debug, info, info_span, trace, warn};
 use utils::bin_ser::{BeSer, DeserializeError};
 use utils::lsn::Lsn;
 use utils::pausable_failpoint;
-use wal_decoder::serialized_batch::SerializedValueBatch ;
+use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

 use super::tenant::{PageReconstructError, Timeline};
 use crate::aux_file;
 use crate::context::{PerfInstrumentFutureExt, RequestContext};
 use crate::keyspace::{KeySpace, KeySpaceAccum};
-
+use crate::metrics::{
+    RELSIZE_CACHE_ENTRIES, RELSIZE_CACHE_HITS, RELSIZE_CACHE_MISSES, RELSIZE_CACHE_MISSES_OLD,
+};
 use crate::span::{
    debug_assert_current_span_has_tenant_and_timeline_id,
    debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id,
@@ -1030,16 +1032,19 @@ impl Timeline {
            )
            .await?;
        let mut result = HashMap::new();
-
+        let mut sz = 0;
        for (_, v) in kv {
            let v = v?;
            let v = aux_file::decode_file_value_bytes(&v)
                .context("value decode")
                .map_err(PageReconstructError::Other)?;
            for (fname, content) in v {
+                sz += fname.len();
+                sz += content.len();
                result.insert(fname, content);
            }
        }
+        self.aux_file_size_estimator.on_initial(sz);
        Ok(result)
    }

@@ -1310,12 +1315,12 @@ impl Timeline {
        let rel_size_cache = self.rel_size_cache.read().unwrap();
        if let Some((cached_lsn, nblocks)) = rel_size_cache.map.get(tag) {
            if lsn >= *cached_lsn {
-                
+                RELSIZE_CACHE_HITS.inc();
                return Some(*nblocks);
            }
-            
+            RELSIZE_CACHE_MISSES_OLD.inc();
        }
-       
+        RELSIZE_CACHE_MISSES.inc();
        None
    }

@@ -1340,21 +1345,25 @@ impl Timeline {
            }
            hash_map::Entry::Vacant(entry) => {
                entry.insert((lsn, nblocks));
-               
+                RELSIZE_CACHE_ENTRIES.inc();
            }
        }
    }

    /// Store cached relation size
-    pub fn set_cached_rel_size(&self, _tag: RelTag, _lsn: Lsn, _nblocks: BlockNumber) {
-       
-        
+    pub fn set_cached_rel_size(&self, tag: RelTag, lsn: Lsn, nblocks: BlockNumber) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.insert(tag, (lsn, nblocks)).is_none() {
+            RELSIZE_CACHE_ENTRIES.inc();
+        }
    }

    /// Remove cached relation size
-    pub fn remove_cached_rel_size(&self, _tag: &RelTag) {
-        
-        
+    pub fn remove_cached_rel_size(&self, tag: &RelTag) {
+        let mut rel_size_cache = self.rel_size_cache.write().unwrap();
+        if rel_size_cache.map.remove(tag).is_some() {
+            RELSIZE_CACHE_ENTRIES.dec();
+        }
    }
 }

@@ -1429,7 +1438,25 @@ impl DatadirModification<'_> {
            .is_some_and(|b| b.has_data())
    }

-    
+    /// Returns statistics about the currently pending modifications.
+    pub(crate) fn stats(&self) -> DatadirModificationStats {
+        let mut stats = DatadirModificationStats::default();
+        for (_, _, value) in self.pending_metadata_pages.values().flatten() {
+            match value {
+                Value::Image(_) => stats.metadata_images += 1,
+                Value::WalRecord(r) if r.will_init() => stats.metadata_images += 1,
+                Value::WalRecord(_) => stats.metadata_deltas += 1,
+            }
+        }
+        for valuemeta in self.pending_data_batch.iter().flat_map(|b| &b.metadata) {
+            match valuemeta {
+                ValueMeta::Serialized(s) if s.will_init => stats.data_images += 1,
+                ValueMeta::Serialized(_) => stats.data_deltas += 1,
+                ValueMeta::Observed(_) => {}
+            }
+        }
+        stats
+    }

    /// Set the current lsn
    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> Result<(), WalIngestError> {
@@ -2304,15 +2331,20 @@ impl DatadirModification<'_> {
        }
        let mut new_files = other_files;
        match (modifying_file, content.is_empty()) {
-            (Some(_old_content), false) => {
-                
+            (Some(old_content), false) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_update(old_content.len(), content.len());
                new_files.push((path, content));
            }
-            (Some(_old_content), true) => {
-               
+            (Some(old_content), true) => {
+                self.tline
+                    .aux_file_size_estimator
+                    .on_remove(old_content.len());
                // not adding the file key to the final `new_files` vec.
            }
            (None, false) => {
+                self.tline.aux_file_size_estimator.on_add(content.len());
                new_files.push((path, content));
            }
            // Compute may request delete of old version of pgstat AUX file if new one exceeds size limit.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -83,6 +83,11 @@ use crate::context::RequestContextBuilder;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::l0_flush::L0FlushGlobalState;
+use crate::metrics::{
+    BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, CONCURRENT_INITDBS,
+    INITDB_RUN_TIME, INITDB_SEMAPHORE_ACQUISITION_TIME, TENANT, TENANT_STATE_METRIC,
+    TENANT_SYNTHETIC_SIZE_METRIC, remove_tenant_metrics,
+};
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationMode;
 use crate::tenant::gc_result::GcResult;
@@ -153,7 +158,7 @@ pub struct TenantSharedResources {
    pub l0_flush_global_state: L0FlushGlobalState,
 }

-/// A [`Tenant`] is really an _attached_ tenant.  The configuration
+/// A [`TenantShard`] is really an _attached_ tenant.  The configuration
 /// for an attached tenant is a subset of the [`LocationConf`], represented
 /// in this struct.
 #[derive(Clone)]
@@ -240,7 +245,7 @@ pub(crate) enum SpawnMode {
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
-pub struct Tenant {
+pub struct TenantShard {
    // Global pageserver config parameters
    pub conf: &'static PageServerConf,

@@ -262,7 +267,7 @@ pub struct Tenant {
    shard_identity: ShardIdentity,

    /// The remote storage generation, used to protect S3 objects from split-brain.
-    /// Does not change over the lifetime of the [`Tenant`] object.
+    /// Does not change over the lifetime of the [`TenantShard`] object.
    ///
    /// This duplicates the generation stored in LocationConf, but that structure is mutable:
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
@@ -304,7 +309,7 @@ pub struct Tenant {
    // Access to global deletion queue for when this tenant wants to schedule a deletion
    deletion_queue_client: DeletionQueueClient,

-    /// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
+    /// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
    cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
    cached_synthetic_tenant_size: Arc<AtomicU64>,

@@ -332,12 +337,12 @@ pub struct Tenant {
    // Timelines' cancellation token.
    pub(crate) cancel: CancellationToken,

-    // Users of the Tenant such as the page service must take this Gate to avoid
-    // trying to use a Tenant which is shutting down.
+    // Users of the TenantShard such as the page service must take this Gate to avoid
+    // trying to use a TenantShard which is shutting down.
    pub(crate) gate: Gate,

    /// Throttle applied at the top of [`Timeline::get`].
-    /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
+    /// All [`TenantShard::timelines`] of a given [`TenantShard`] instance share the same [`throttle::Throttle`] instance.
    pub(crate) pagestream_throttle: Arc<throttle::Throttle>,

    pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
@@ -357,7 +362,7 @@ pub struct Tenant {

    l0_flush_global_state: L0FlushGlobalState,
 }
-impl std::fmt::Debug for Tenant {
+impl std::fmt::Debug for TenantShard {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
    }
@@ -836,7 +841,7 @@ impl Debug for SetStoppingError {
    }
 }

-/// Arguments to [`Tenant::create_timeline`].
+/// Arguments to [`TenantShard::create_timeline`].
 ///
 /// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
 /// is `None`, the result of the timeline create call is not deterministic.
@@ -871,7 +876,7 @@ pub(crate) struct CreateTimelineParamsImportPgdata {
    pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

-/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in  [`Tenant::start_creating_timeline`] in  [`Tenant::start_creating_timeline`].
+/// What is used to determine idempotency of a [`TenantShard::create_timeline`] call in  [`TenantShard::start_creating_timeline`] in  [`TenantShard::start_creating_timeline`].
 ///
 /// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
 ///
@@ -909,7 +914,7 @@ pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
    idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
 }

-/// What is returned by [`Tenant::start_creating_timeline`].
+/// What is returned by [`TenantShard::start_creating_timeline`].
 #[must_use]
 enum StartCreatingTimelineResult {
    CreateGuard(TimelineCreateGuard),
@@ -938,13 +943,13 @@ struct TimelineInitAndSyncNeedsSpawnImportPgdata {
    guard: TimelineCreateGuard,
 }

-/// What is returned by [`Tenant::create_timeline`].
+/// What is returned by [`TenantShard::create_timeline`].
 enum CreateTimelineResult {
    Created(Arc<Timeline>),
    Idempotent(Arc<Timeline>),
-    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
+    /// IMPORTANT: This [`Arc<Timeline>`] object is not in [`TenantShard::timelines`] when
    /// we return this result, nor will this concrete object ever be added there.
-    /// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
+    /// Cf method comment on [`TenantShard::create_timeline_import_pgdata`].
    ImportSpawned(Arc<Timeline>),
 }

@@ -1077,7 +1082,7 @@ pub(crate) enum LoadConfigError {
    NotFound(Utf8PathBuf),
 }

-impl Tenant {
+impl TenantShard {
    /// Yet another helper for timeline initialization.
    ///
    /// - Initializes the Timeline struct and inserts it into the tenant's hash map
@@ -1298,7 +1303,7 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Arc<Tenant>, GlobalShutDown> {
+    ) -> Result<Arc<TenantShard>, GlobalShutDown> {
        let wal_redo_manager =
            WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;

@@ -1312,7 +1317,7 @@ impl Tenant {
        let attach_mode = attached_conf.location.attach_mode;
        let generation = attached_conf.location.generation;

-        let tenant = Arc::new(Tenant::new(
+        let tenant = Arc::new(TenantShard::new(
            TenantState::Attaching,
            conf,
            attached_conf,
@@ -1329,7 +1334,7 @@ impl Tenant {
        let attach_gate_guard = tenant
            .gate
            .enter()
-            .expect("We just created the Tenant: nothing else can have shut it down yet");
+            .expect("We just created the TenantShard: nothing else can have shut it down yet");

        // Do all the hard work in the background
        let tenant_clone = Arc::clone(&tenant);
@@ -1353,11 +1358,11 @@ impl Tenant {
                let starting_up = init_order.is_some();
                scopeguard::defer! {
                    if starting_up {
-                       
+                        TENANT.startup_complete.inc();
                    }
                }

-                fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
+                fn make_broken_or_stopping(t: &TenantShard, err: anyhow::Error) {
                    t.state.send_modify(|state| match state {
                        // TODO: the old code alluded to DeleteTenantFlow sometimes setting
                        // TenantState::Stopping before we get here, but this may be outdated.
@@ -1456,7 +1461,7 @@ impl Tenant {

                let preload = match &mode {
                    SpawnMode::Eager | SpawnMode::Lazy => {
-                      
+                        let _preload_timer = TENANT.preload.start_timer();
                        let res = tenant_clone
                            .preload(&remote_storage, task_mgr::shutdown_token())
                            .await;
@@ -1478,7 +1483,7 @@ impl Tenant {
                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
                let attach_start = std::time::Instant::now();
                let attached = {
-                
+                    let _attach_timer = Some(TENANT.attach.start_timer());
                    tenant_clone.attach(preload, &ctx).await
                };
                let attach_duration = attach_start.elapsed();
@@ -1622,7 +1627,7 @@ impl Tenant {
    /// No background tasks are started as part of this routine.
    ///
    async fn attach(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -1952,7 +1957,7 @@ impl Tenant {
    }

    async fn load_timelines_metadata(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        timeline_ids: HashSet<TimelineId>,
        remote_storage: &GenericRemoteStorage,
        heatmap: Option<(HeatMapTenant, std::time::Instant)>,
@@ -2023,7 +2028,7 @@ impl Tenant {
    }

    fn load_timeline_metadata(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        timeline_id: TimelineId,
        remote_storage: GenericRemoteStorage,
        previous_heatmap: Option<PreviousHeatmap>,
@@ -2424,14 +2429,14 @@ impl Tenant {
    /// This is used by tests & import-from-basebackup.
    ///
    /// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
-    /// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
+    /// a state that will fail [`TenantShard::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
    ///
    /// The caller is responsible for getting the timeline into a state that will be accepted
-    /// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
+    /// by [`TenantShard::load_remote_timeline`] / [`TenantShard::attach`].
    /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
-    /// to the [`Tenant::timelines`].
+    /// to the [`TenantShard::timelines`].
    ///
-    /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
+    /// Tests should use `TenantShard::create_test_timeline` to set up the minimum required metadata keys.
    pub(crate) async fn create_empty_timeline(
        self: &Arc<Self>,
        new_timeline_id: TimelineId,
@@ -2579,7 +2584,7 @@ impl Tenant {
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
    #[allow(clippy::too_many_arguments)]
    pub(crate) async fn create_timeline(
-        self: &Arc<Tenant>,
+        self: &Arc<TenantShard>,
        params: CreateTimelineParams,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
@@ -2746,13 +2751,13 @@ impl Tenant {
        Ok(activated_timeline)
    }

-    /// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
+    /// The returned [`Arc<Timeline>`] is NOT in the [`TenantShard::timelines`] map until the import
    /// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
-    /// [`Tenant::timelines`] map when the import completes.
+    /// [`TenantShard::timelines`] map when the import completes.
    /// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
    /// for the response.
    async fn create_timeline_import_pgdata(
-        self: &Arc<Tenant>,
+        self: &Arc<Self>,
        params: CreateTimelineParamsImportPgdata,
        activate: ActivateTimelineArgs,
        ctx: &RequestContext,
@@ -2849,7 +2854,7 @@ impl Tenant {

    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
    async fn create_timeline_import_pgdata_task(
-        self: Arc<Tenant>,
+        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
@@ -2877,7 +2882,7 @@ impl Tenant {
    }

    async fn create_timeline_import_pgdata_task_impl(
-        self: Arc<Tenant>,
+        self: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        index_part: import_pgdata::index_part_format::Root,
        activate: ActivateTimelineArgs,
@@ -2894,10 +2899,10 @@ impl Tenant {
        // Reload timeline from remote.
        // This proves that the remote state is attachable, and it reuses the code.
        //
-        // TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
+        // TODO: think about whether this is safe to do with concurrent TenantShard::shutdown.
        // timeline_create_guard hols the tenant gate open, so, shutdown cannot _complete_ until we exit.
-        // But our activate() call might launch new background tasks after Tenant::shutdown
-        // already went past shutting down the Tenant::timelines, which this timeline here is no part of.
+        // But our activate() call might launch new background tasks after TenantShard::shutdown
+        // already went past shutting down the TenantShard::timelines, which this timeline here is no part of.
        // I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
        // down while bootstrapping/branching + activating), but, the race condition is much more likely
        // to manifest because of the long runtime of this import task.
@@ -2912,7 +2917,7 @@ impl Tenant {
        // };
        let timeline_id = timeline.timeline_id;

-        // load from object storage like Tenant::attach does
+        // load from object storage like TenantShard::attach does
        let resources = self.build_timeline_resources(timeline_id);
        let index_part = resources
            .remote_client
@@ -3180,7 +3185,7 @@ impl Tenant {
        self.compaction_circuit_breaker
            .lock()
            .unwrap()
-            .success();
+            .success(&CIRCUIT_BREAKERS_UNBROKEN);

        match has_pending {
            true => Ok(CompactionOutcome::Pending),
@@ -3201,13 +3206,13 @@ impl Tenant {
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
-                    .fail( err);
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
            }
            CompactionError::Other(err) => {
                self.compaction_circuit_breaker
                    .lock()
                    .unwrap()
-                    .fail( err);
+                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
            }
            CompactionError::AlreadyRunning(_) => {}
        }
@@ -3387,7 +3392,7 @@ impl Tenant {
                    "activation attempt finished"
                );

-              
+                TENANT.activation.observe(elapsed.as_secs_f64());
            });
        }
    }
@@ -3512,6 +3517,7 @@ impl Tenant {
        // Wait for any in-flight operations to complete
        self.gate.close().await;

+        remove_tenant_metrics(&self.tenant_shard_id);

        Ok(())
    }
@@ -3844,13 +3850,33 @@ impl Tenant {
    }

    pub(crate) fn get_sizes(&self) -> TopTenantShardItem {
-         TopTenantShardItem {
+        let mut result = TopTenantShardItem {
            id: self.tenant_shard_id,
            resident_size: 0,
            physical_size: 0,
            max_logical_size: 0,
            max_logical_size_per_shard: 0,
+        };
+
+        for timeline in self.timelines.lock().unwrap().values() {
+            result.resident_size += timeline.metrics.resident_physical_size_gauge.get();
+
+            result.physical_size += timeline
+                .remote_client
+                .metrics
+                .remote_physical_size_gauge
+                .get();
+            result.max_logical_size = std::cmp::max(
+                result.max_logical_size,
+                timeline.metrics.current_logical_size_gauge.get(),
+            );
        }
+
+        result.max_logical_size_per_shard = result
+            .max_logical_size
+            .div_ceil(self.tenant_shard_id.shard_count.count() as u64);
+
+        result
    }
 }

@@ -3912,7 +3938,7 @@ enum ActivateTimelineArgs {
    No,
 }

-impl Tenant {
+impl TenantShard {
    pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
        self.tenant_conf.load().tenant_conf.clone()
    }
@@ -4070,7 +4096,7 @@ impl Tenant {
        update: F,
    ) -> anyhow::Result<pageserver_api::models::TenantConfig> {
        // Use read-copy-update in order to avoid overwriting the location config
-        // state if this races with [`Tenant::set_new_location_config`]. Note that
+        // state if this races with [`TenantShard::set_new_location_config`]. Note that
        // this race is not possible if both request types come from the storage
        // controller (as they should!) because an exclusive op lock is required
        // on the storage controller side.
@@ -4193,7 +4219,7 @@ impl Tenant {
        Ok((timeline, timeline_ctx))
    }

-    /// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
+    /// [`TenantShard::shutdown`] must be called before dropping the returned [`TenantShard`] object
    /// to ensure proper cleanup of background tasks and metrics.
    //
    // Allow too_many_arguments because a constructor's argument list naturally grows with the
@@ -4209,7 +4235,7 @@ impl Tenant {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        l0_flush_global_state: L0FlushGlobalState,
-    ) -> Tenant {
+    ) -> TenantShard {
        debug_assert!(
            !attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
        );
@@ -4217,19 +4243,59 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
-            
-            loop {
+            // reflect tenant state in metrics:
+            // - global per tenant state: TENANT_STATE_METRIC
+            // - "set" of broken tenants: BROKEN_TENANTS_SET
+            //
+            // set of broken tenants should not have zero counts so that it remains accessible for
+            // alerting.

+            let tid = tenant_shard_id.to_string();
+            let shard_id = tenant_shard_id.shard_slug().to_string();
+            let set_key = &[tid.as_str(), shard_id.as_str()][..];
+
+            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
+                ([state.into()], matches!(state, TenantState::Broken { .. }))
+            }
+
+            let mut tuple = inspect_state(&rx.borrow_and_update());
+
+            let is_broken = tuple.1;
+            let mut counted_broken = if is_broken {
+                // add the id to the set right away, there should not be any updates on the channel
+                // after before tenant is removed, if ever
+                BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                true
+            } else {
+                false
+            };
+
+            loop {
+                let labels = &tuple.0;
+                let current = TENANT_STATE_METRIC.with_label_values(labels);
+                current.inc();

                if rx.changed().await.is_err() {
-                   
+                    // tenant has been dropped
+                    current.dec();
+                    drop(BROKEN_TENANTS_SET.remove_label_values(set_key));
                    break;
                }

+                current.dec();
+                tuple = inspect_state(&rx.borrow_and_update());
+
+                let is_broken = tuple.1;
+                if is_broken && !counted_broken {
+                    counted_broken = true;
+                    // insert the tenant_id (back) into the set while avoiding needless counter
+                    // access
+                    BROKEN_TENANTS_SET.with_label_values(set_key).set(1);
+                }
            }
        });

-        Tenant {
+        TenantShard {
            tenant_shard_id,
            shard_identity,
            generation: attached_conf.location.generation,
@@ -4264,7 +4330,7 @@ impl Tenant {
            cancel: CancellationToken::default(),
            gate: Gate::default(),
            pagestream_throttle: Arc::new(throttle::Throttle::new(
-                Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
+                TenantShard::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
            )),
            pagestream_throttle_metrics: Arc::new(
                crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
@@ -4400,11 +4466,11 @@ impl Tenant {

        // Perform GC for each timeline.
        //
-        // Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the
+        // Note that we don't hold the `TenantShard::gc_cs` lock here because we don't want to delay the
        // branch creation task, which requires the GC lock. A GC iteration can run concurrently
        // with branch creation.
        //
-        // See comments in [`Tenant::branch_timeline`] for more information about why branch
+        // See comments in [`TenantShard::branch_timeline`] for more information about why branch
        // creation task can run concurrently with timeline's GC iteration.
        for timeline in gc_timelines {
            if cancel.is_cancelled() {
@@ -4434,7 +4500,7 @@ impl Tenant {

    /// Refreshes the Timeline::gc_info for all timelines, returning the
    /// vector of timelines which have [`Timeline::get_last_record_lsn`] past
-    /// [`Tenant::get_gc_horizon`].
+    /// [`TenantShard::get_gc_horizon`].
    ///
    /// This is usually executed as part of periodic gc, but can now be triggered more often.
    pub(crate) async fn refresh_gc_info(
@@ -4600,6 +4666,10 @@ impl Tenant {
                let now = SystemTime::now();
                target.leases.retain(|_, lease| !lease.is_expired(&now));

+                timeline
+                    .metrics
+                    .valid_lsn_lease_count_gauge
+                    .set(target.leases.len() as u64);

                // Look up parent's PITR cutoff to update the child's knowledge of whether it is within parent's PITR
                if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
@@ -4609,6 +4679,22 @@ impl Tenant {
                    }
                }

+                // Update metrics that depend on GC state
+                timeline
+                    .metrics
+                    .archival_size
+                    .set(if target.within_ancestor_pitr {
+                        timeline.metrics.current_logical_size_gauge.get()
+                    } else {
+                        0
+                    });
+                timeline.metrics.pitr_history_size.set(
+                    timeline
+                        .get_last_record_lsn()
+                        .checked_sub(target.cutoffs.time)
+                        .unwrap_or(Lsn(0))
+                        .0,
+                );

                // Apply the cutoffs we found to the Timeline's GcInfo.  Why might we _not_ have cutoffs for a timeline?
                // - this timeline was created while we were finding cutoffs
@@ -5358,6 +5444,10 @@ impl Tenant {
        // Only shard zero should be calculating synthetic sizes
        debug_assert!(self.shard_identity.is_shard_zero());

+        TENANT_SYNTHETIC_SIZE_METRIC
+            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
+            .unwrap()
+            .set(size);
    }

    pub fn cached_synthetic_size(&self) -> u64 {
@@ -5409,7 +5499,7 @@ impl Tenant {
            }
        }

-        // The flushes we did above were just writes, but the Tenant might have had
+        // The flushes we did above were just writes, but the TenantShard might have had
        // pending deletions as well from recent compaction/gc: we want to flush those
        // as well.  This requires flushing the global delete queue.  This is cheap
        // because it's typically a no-op.
@@ -5427,25 +5517,34 @@ impl Tenant {

    /// How much local storage would this tenant like to have?  It can cope with
    /// less than this (via eviction and on-demand downloads), but this function enables
-    /// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
+    /// the TenantShard to advertise how much storage it would prefer to have to provide fast I/O
    /// by keeping important things on local disk.
    ///
    /// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
    /// than they report here, due to layer eviction.  Tenants with many active branches may
    /// actually use more than they report here.
    pub(crate) fn local_storage_wanted(&self) -> u64 {
-        1000
+        let timelines = self.timelines.lock().unwrap();
+
+        // Heuristic: we use the max() of the timelines' visible sizes, rather than the sum.  This
+        // reflects the observation that on tenants with multiple large branches, typically only one
+        // of them is used actively enough to occupy space on disk.
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .max()
+            .unwrap_or(0)
    }

    /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
    /// manifest in `Self::remote_tenant_manifest`.
    ///
    /// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
-    /// changing any `Tenant` state that's included in the manifest, consider making the manifest
+    /// changing any `TenantShard` state that's included in the manifest, consider making the manifest
    /// the authoritative source of data with an API that automatically uploads on changes. Revisit
    /// this when the manifest is more widely used and we have a better idea of the data model.
    pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
-        // Multiple tasks may call this function concurrently after mutating the Tenant runtime
+        // Multiple tasks may call this function concurrently after mutating the TenantShard runtime
        // state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
        // to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
        // simple coalescing mechanism.
@@ -5518,11 +5617,16 @@ async fn run_initdb(
    );

    let _permit = {
-       
+        let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer();
        INIT_DB_SEMAPHORE.acquire().await
    };

+    CONCURRENT_INITDBS.inc();
+    scopeguard::defer! {
+        CONCURRENT_INITDBS.dec();
+    }

+    let _timer = INITDB_RUN_TIME.start_timer();
    let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs {
        superuser: &conf.superuser,
        locale: &conf.locale,
@@ -5708,7 +5812,7 @@ pub(crate) mod harness {
            info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
        }

-        pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
+        pub(crate) async fn load(&self) -> (Arc<TenantShard>, RequestContext) {
            let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
                .with_scope_unit_test();
            (
@@ -5723,10 +5827,10 @@ pub(crate) mod harness {
        pub(crate) async fn do_try_load(
            &self,
            ctx: &RequestContext,
-        ) -> anyhow::Result<Arc<Tenant>> {
+        ) -> anyhow::Result<Arc<TenantShard>> {
            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));

-            let tenant = Arc::new(Tenant::new(
+            let tenant = Arc::new(TenantShard::new(
                TenantState::Attaching,
                self.conf,
                AttachedTenantConf::try_from(LocationConf::attached_single(
@@ -5942,7 +6046,7 @@ mod tests {
    #[cfg(feature = "testing")]
    #[allow(clippy::too_many_arguments)]
    async fn randomize_timeline(
-        tenant: &Arc<Tenant>,
+        tenant: &Arc<TenantShard>,
        new_timeline_id: TimelineId,
        pg_version: u32,
        spec: TestTimelineSpecification,
@@ -6832,7 +6936,7 @@ mod tests {
    }

    async fn bulk_insert_compact_gc(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        lsn: Lsn,
@@ -6844,7 +6948,7 @@ mod tests {
    }

    async fn bulk_insert_maybe_compact_gc(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &Arc<Timeline>,
        ctx: &RequestContext,
        mut lsn: Lsn,
@@ -7754,7 +7858,7 @@ mod tests {
            let (tline, _ctx) = tenant
                .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
                .await?;
-            // Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
+            // Leave the timeline ID in [`TenantShard::timelines_creating`] to exclude attempting to create it again
            let raw_tline = tline.raw_timeline().unwrap();
            raw_tline
                .shutdown(super::timeline::ShutdownMode::Hard)
--- a/pageserver/src/tenant/blob_io.rs
+++ b/pageserver/src/tenant/blob_io.rs
@@ -446,6 +446,34 @@ impl<const BUFFERED: bool> BlobWriter<BUFFERED> {
        };
        (srcbuf, res.map(|_| (offset, compression_info)))
    }
+
+    /// Writes a raw blob containing both header and data, returning its offset.
+    pub(crate) async fn write_blob_raw<Buf: IoBuf + Send>(
+        &mut self,
+        raw_with_header: FullSlice<Buf>,
+        ctx: &RequestContext,
+    ) -> (FullSlice<Buf>, Result<u64, Error>) {
+        // Verify the header, to ensure we don't write invalid/corrupt data.
+        let header = match Header::decode(&raw_with_header) {
+            Ok(header) => header,
+            Err(err) => return (raw_with_header, Err(err)),
+        };
+        if raw_with_header.len() != header.total_len() {
+            let header_total_len = header.total_len();
+            let raw_len = raw_with_header.len();
+            return (
+                raw_with_header,
+                Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidData,
+                    format!("header length mismatch: {header_total_len} != {raw_len}"),
+                )),
+            );
+        }
+
+        let offset = self.offset;
+        let (raw_with_header, result) = self.write_all(raw_with_header, ctx).await;
+        (raw_with_header, result.map(|_| offset))
+    }
 }

 impl BlobWriter<true> {
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -564,8 +564,9 @@ mod tests {
            Lsn(0),
            Lsn(0),
            Lsn(0),
-            // Any version will do here, so use the default
-            crate::DEFAULT_PG_VERSION,
+            // Updating this version to 17 will cause the test to fail at the
+            // next assert_eq!().
+            16,
        );
        let expected_bytes = vec![
            /* TimelineMetadataHeader */
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -44,7 +44,7 @@ use crate::controller_upcall_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
-use crate::metrics::TENANT_MANAGER as METRICS;
+use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::config::{
    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
@@ -52,7 +52,9 @@ use crate::tenant::config::{
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::storage_layer::inmemory_layer;
 use crate::tenant::timeline::ShutdownMode;
-use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
+use crate::tenant::{
+    AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState,
+};
 use crate::virtual_file::MaybeFatalIo;
 use crate::{InitializationOrder, TEMP_FILE_SUFFIX};

@@ -67,7 +69,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
 /// having a properly acquired generation (Secondary doesn't need a generation)
 #[derive(Clone)]
 pub(crate) enum TenantSlot {
-    Attached(Arc<Tenant>),
+    Attached(Arc<TenantShard>),
    Secondary(Arc<SecondaryTenant>),
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
@@ -86,7 +88,7 @@ impl std::fmt::Debug for TenantSlot {

 impl TenantSlot {
    /// Return the `Tenant` in this slot if attached, else None
-    fn get_attached(&self) -> Option<&Arc<Tenant>> {
+    fn get_attached(&self) -> Option<&Arc<TenantShard>> {
        match self {
            Self::Attached(t) => Some(t),
            Self::Secondary(_) => None,
@@ -164,7 +166,7 @@ impl TenantStartupMode {
 /// Result type for looking up a TenantId to a specific shard
 pub(crate) enum ShardResolveResult {
    NotFound,
-    Found(Arc<Tenant>),
+    Found(Arc<TenantShard>),
    // Wait for this barrrier, then query again
    InProgress(utils::completion::Barrier),
 }
@@ -173,7 +175,7 @@ impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
    /// None is returned.
-    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<TenantShard>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
@@ -410,7 +412,7 @@ fn load_tenant_config(
        return None;
    }

-    Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
+    Some(TenantShard::load_tenant_config(conf, &tenant_shard_id))
 }

 /// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -519,7 +521,7 @@ pub async fn init_tenant_mgr(
        tenant_configs.len(),
        conf.concurrent_tenant_warmup.initial_permits()
    );
-
+    TENANT.startup_scheduled.inc_by(tenant_configs.len() as u64);

    // Accumulate futures for writing tenant configs, so that we can execute in parallel
    let mut config_write_futs = Vec::new();
@@ -606,7 +608,8 @@ pub async fn init_tenant_mgr(
        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
        config_write_futs.push(async move {
-            let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
+            let r =
+                TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
            (tenant_shard_id, location_conf, r)
        });
    }
@@ -694,7 +697,7 @@ fn tenant_spawn(
    init_order: Option<InitializationOrder>,
    mode: SpawnMode,
    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, GlobalShutDown> {
+) -> Result<Arc<TenantShard>, GlobalShutDown> {
    // All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
    // path, and contains a configuration file.  Assertions that do synchronous I/O are limited to debug mode
    // to avoid impacting prod runtime performance.
@@ -706,7 +709,7 @@ fn tenant_spawn(
            .unwrap()
    );

-    Tenant::spawn(
+    TenantShard::spawn(
        conf,
        tenant_shard_id,
        resources,
@@ -883,12 +886,12 @@ impl TenantManager {
    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
    /// undergoing a state change (i.e. slot is InProgress).
    ///
-    /// The return Tenant is not guaranteed to be active: check its status after obtaing it, or
-    /// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
+    /// The return TenantShard is not guaranteed to be active: check its status after obtaing it, or
+    /// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it.
    pub(crate) fn get_attached_tenant_shard(
        &self,
        tenant_shard_id: TenantShardId,
-    ) -> Result<Arc<Tenant>, GetTenantError> {
+    ) -> Result<Arc<TenantShard>, GetTenantError> {
        let locked = self.tenants.read().unwrap();

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
@@ -937,12 +940,12 @@ impl TenantManager {
        flush: Option<Duration>,
        mut spawn_mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
+    ) -> Result<Option<Arc<TenantShard>>, UpsertLocationError> {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

        enum FastPathModified {
-            Attached(Arc<Tenant>),
+            Attached(Arc<TenantShard>),
            Secondary(Arc<SecondaryTenant>),
        }

@@ -999,9 +1002,13 @@ impl TenantManager {
        // phase of writing config and/or waiting for flush, before returning.
        match fast_path_taken {
            Some(FastPathModified::Attached(tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                TenantShard::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id,
+                    &new_location_config,
+                )
+                .await
+                .fatal_err("write tenant shard config");

                // Transition to AttachedStale means we may well hold a valid generation
                // still, and have been requested to go stale as part of a migration.  If
@@ -1030,9 +1037,13 @@ impl TenantManager {
                return Ok(Some(tenant));
            }
            Some(FastPathModified::Secondary(_secondary_tenant)) => {
-                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-                    .await
-                    .fatal_err("write tenant shard config");
+                TenantShard::persist_tenant_config(
+                    self.conf,
+                    &tenant_shard_id,
+                    &new_location_config,
+                )
+                .await
+                .fatal_err("write tenant shard config");

                return Ok(None);
            }
@@ -1122,7 +1133,7 @@ impl TenantManager {
        // Before activating either secondary or attached mode, persist the
        // configuration, so that on restart we will re-attach (or re-start
        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+        TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
            .await
            .fatal_err("write tenant shard config");

@@ -1262,7 +1273,7 @@ impl TenantManager {

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+        let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?;

        if drop_cache {
            tracing::info!("Dropping local file cache");
@@ -1297,7 +1308,7 @@ impl TenantManager {
        Ok(())
    }

-    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
+    pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<TenantShard>> {
        let locked = self.tenants.read().unwrap();
        match &*locked {
            TenantsMap::Initializing => Vec::new(),
@@ -1446,7 +1457,7 @@ impl TenantManager {
    #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
    pub(crate) async fn shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
@@ -1476,7 +1487,7 @@ impl TenantManager {

    pub(crate) async fn do_shard_split(
        &self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        new_shard_count: ShardCount,
        new_stripe_size: Option<ShardStripeSize>,
        ctx: &RequestContext,
@@ -1703,7 +1714,7 @@ impl TenantManager {
    /// For each resident layer in the parent shard, we will hard link it into all of the child shards.
    async fn shard_split_hardlink(
        &self,
-        parent_shard: &Tenant,
+        parent_shard: &TenantShard,
        child_shards: Vec<TenantShardId>,
    ) -> anyhow::Result<()> {
        debug_assert_current_span_has_tenant_id();
@@ -1988,7 +1999,7 @@ impl TenantManager {
            }

            let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-            let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)
+            let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)
                .map_err(|e| Error::DetachReparent(e.into()))?;

            let shard_identity = config.shard;
@@ -2177,7 +2188,9 @@ impl TenantManager {
                        // we would use if not doing any eviction.
                        progress.bytes_total
                    } else {
-                        42
+                        // In the absence of heatmap info, assume that the secondary location simply
+                        // needs as much space as it is currently using.
+                        secondary.resident_size_metric.get()
                    }
                }
            }
@@ -2528,7 +2541,7 @@ impl SlotGuard {
                Ok(())
            }
            None => {
-              
+                METRICS.unexpected_errors.inc();
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during tenant upsert, this is a bug."
@@ -2538,7 +2551,7 @@ impl SlotGuard {
                ))
            }
            Some(slot) => {
-               
+                METRICS.unexpected_errors.inc();
                error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during upsert, this is a bug.  Contents: {:?}", slot);
                Err(TenantSlotUpsertError::InternalError(
                    "Unexpected contents of TenantSlot".into(),
@@ -2619,7 +2632,7 @@ impl Drop for SlotGuard {
        match m.entry(self.tenant_shard_id) {
            Entry::Occupied(mut entry) => {
                if !matches!(entry.get(), TenantSlot::InProgress(_)) {
-                    
+                    METRICS.unexpected_errors.inc();
                    error!(tenant_shard_id=%self.tenant_shard_id, "Unexpected contents of TenantSlot during drop, this is a bug.  Contents: {:?}", entry.get());
                }

@@ -2634,7 +2647,7 @@ impl Drop for SlotGuard {
                }
            }
            Entry::Vacant(_) => {
-                
+                METRICS.unexpected_errors.inc();
                error!(
                    tenant_shard_id = %self.tenant_shard_id,
                    "Missing InProgress marker during SlotGuard drop, this is a bug."
@@ -2694,7 +2707,7 @@ fn tenant_map_acquire_slot_impl(
    mode: TenantSlotAcquireMode,
 ) -> Result<SlotGuard, TenantSlotError> {
    use TenantSlotAcquireMode::*;
-  
+    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug());
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -133,7 +133,7 @@
 //! - Initiate upload queue with that [`IndexPart`].
 //! - Reschedule all lost operations by comparing the local filesystem state
 //!   and remote state as per [`IndexPart`]. This is done in
-//!   [`Tenant::timeline_init_and_sync`].
+//!   [`TenantShard::timeline_init_and_sync`].
 //!
 //! Note that if we crash during file deletion between the index update
 //! that removes the file from the list of files, and deleting the remote file,
@@ -171,7 +171,7 @@
 //! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
 //! not created and the uploads are skipped.
 //!
-//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
+//! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync
 //! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map

 pub(crate) mod download;
@@ -223,8 +223,9 @@ use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::deletion_queue::{DeletionQueueClient, DeletionQueueError};
 use crate::metrics::{
-    MeasureRemoteOp, 
-    RemoteOpFileKind, RemoteOpKind, 
+    MeasureRemoteOp, REMOTE_ONDEMAND_DOWNLOADED_BYTES, REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
+    RemoteOpFileKind, RemoteOpKind, RemoteTimelineClientMetrics,
+    RemoteTimelineClientMetricsCallTrackSize,
 };
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind, shutdown_token};
 use crate::tenant::metadata::TimelineMetadata;
@@ -356,6 +357,8 @@ pub(crate) struct RemoteTimelineClient {

    upload_queue: Mutex<UploadQueue>,

+    pub(crate) metrics: Arc<RemoteTimelineClientMetrics>,
+
    storage_impl: GenericRemoteStorage,

    deletion_queue_client: DeletionQueueClient,
@@ -402,6 +405,10 @@ impl RemoteTimelineClient {
            storage_impl: remote_storage,
            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &tenant_shard_id,
+                &timeline_id,
+            )),
            config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(location_conf)),
            cancel: CancellationToken::new(),
        }
@@ -590,13 +597,21 @@ impl RemoteTimelineClient {
            .map_err(|_| UploadQueueNotReadyError)
    }

-    fn update_remote_physical_size_gauge(&self, _current_remote_index_part: Option<&IndexPart>) {
-
-        
+    fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) {
+        let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part {
+            current_remote_index_part
+                .layer_metadata
+                .values()
+                .map(|ilmd| ilmd.file_size)
+                .sum()
+        } else {
+            0
+        };
+        self.metrics.remote_physical_size_gauge.set(size);
    }

    pub fn get_remote_physical_size(&self) -> u64 {
-  0
+        self.metrics.remote_physical_size_gauge.get()
    }

    //
@@ -611,6 +626,13 @@ impl RemoteTimelineClient {
        &self,
        cancel: &CancellationToken,
    ) -> Result<MaybeDeletedIndexPart, DownloadError> {
+        let _unfinished_gauge_guard = self.metrics.call_begin(
+            &RemoteOpFileKind::Index,
+            &RemoteOpKind::Download,
+            crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                reason: "no need for a downloads gauge",
+            },
+        );

        let (index_part, index_generation, index_last_modified) = download::download_index_part(
            &self.storage_impl,
@@ -623,7 +645,7 @@ impl RemoteTimelineClient {
            Option::<TaskKind>::None,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
-        
+            Arc::clone(&self.metrics),
        )
        .await?;

@@ -698,7 +720,13 @@ impl RemoteTimelineClient {
        ctx: &RequestContext,
    ) -> Result<u64, DownloadError> {
        let downloaded_size = {
-        
+            let _unfinished_gauge_guard = self.metrics.call_begin(
+                &RemoteOpFileKind::Layer,
+                &RemoteOpKind::Download,
+                crate::metrics::RemoteTimelineClientMetricsCallTrackSize::DontTrackSize {
+                    reason: "no need for a downloads gauge",
+                },
+            );
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
@@ -715,11 +743,13 @@ impl RemoteTimelineClient {
                Some(ctx.task_kind()),
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
-          
+                Arc::clone(&self.metrics),
            )
            .await?
        };

+        REMOTE_ONDEMAND_DOWNLOADED_LAYERS.inc();
+        REMOTE_ONDEMAND_DOWNLOADED_BYTES.inc_by(downloaded_size);

        Ok(downloaded_size)
    }
@@ -997,6 +1027,7 @@ impl RemoteTimelineClient {
        let op = UploadOp::UploadMetadata {
            uploaded: Box::new(index_part.clone()),
        };
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
        upload_queue.latest_files_changes_since_metadata_upload_scheduled = 0;

@@ -1234,6 +1265,7 @@ impl RemoteTimelineClient {
        );

        let op = UploadOp::UploadLayer(layer, metadata, None);
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -1410,6 +1442,7 @@ impl RemoteTimelineClient {
        let op = UploadOp::Delete(Delete {
            layers: with_metadata,
        });
+        self.metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
    }

@@ -2147,7 +2180,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
-                      
+                        Arc::clone(&self.metrics),
                    )
                    .await
                }
@@ -2164,7 +2197,7 @@ impl RemoteTimelineClient {
                        Some(TaskKind::RemoteUploadTask),
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
-                    
+                        Arc::clone(&self.metrics),
                    )
                    .await;
                    if res.is_ok() {
@@ -2310,7 +2343,10 @@ impl RemoteTimelineClient {
                    upload_queue.clean.1 = Some(task.task_id);

                    let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn();
-                   
+                    self.metrics
+                        .projected_remote_consistent_lsn_gauge
+                        .set(lsn.0);
+
                    if self.generation.is_none() {
                        // Legacy mode: skip validating generation
                        upload_queue.visible_remote_consistent_lsn.store(lsn);
@@ -2351,6 +2387,64 @@ impl RemoteTimelineClient {
                .await;
        }

+        self.metric_end(&task.op);
+        for coalesced_op in &task.coalesced_ops {
+            self.metric_end(coalesced_op);
+        }
+    }
+
+    fn metric_impl(
+        &self,
+        op: &UploadOp,
+    ) -> Option<(
+        RemoteOpFileKind,
+        RemoteOpKind,
+        RemoteTimelineClientMetricsCallTrackSize,
+    )> {
+        use RemoteTimelineClientMetricsCallTrackSize::DontTrackSize;
+        let res = match op {
+            UploadOp::UploadLayer(_, m, _) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Upload,
+                RemoteTimelineClientMetricsCallTrackSize::Bytes(m.file_size),
+            ),
+            UploadOp::UploadMetadata { .. } => (
+                RemoteOpFileKind::Index,
+                RemoteOpKind::Upload,
+                DontTrackSize {
+                    reason: "metadata uploads are tiny",
+                },
+            ),
+            UploadOp::Delete(_delete) => (
+                RemoteOpFileKind::Layer,
+                RemoteOpKind::Delete,
+                DontTrackSize {
+                    reason: "should we track deletes? positive or negative sign?",
+                },
+            ),
+            UploadOp::Barrier(..) | UploadOp::Shutdown => {
+                // we do not account these
+                return None;
+            }
+        };
+        Some(res)
+    }
+
+    fn metric_begin(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        let guard = self.metrics.call_begin(&file_kind, &op_kind, track_bytes);
+        guard.will_decrement_manually(); // in metric_end(), see right below
+    }
+
+    fn metric_end(&self, op: &UploadOp) {
+        let (file_kind, op_kind, track_bytes) = match self.metric_impl(op) {
+            Some(x) => x,
+            None => return,
+        };
+        self.metrics.call_end(&file_kind, &op_kind, track_bytes);
    }

    /// Close the upload queue for new operations and cancel queued operations.
@@ -2430,6 +2524,7 @@ impl RemoteTimelineClient {

                // Tear down queued ops
                for op in qi.queued_operations.into_iter() {
+                    self.metric_end(&op);
                    // Dropping UploadOp::Barrier() here will make wait_completion() return with an Err()
                    // which is exactly what we want to happen.
                    drop(op);
@@ -2648,7 +2743,7 @@ mod tests {
    use crate::tenant::config::AttachmentMode;
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::layer::local_layer_path;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
        format!("contents for {name}").into()
@@ -2701,7 +2796,7 @@ mod tests {

    struct TestSetup {
        harness: TenantHarness,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline: Arc<Timeline>,
        tenant_ctx: RequestContext,
    }
@@ -2739,6 +2834,10 @@ mod tests {
                storage_impl: self.harness.remote_storage.clone(),
                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
+                metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                    &self.harness.tenant_shard_id,
+                    &TIMELINE_ID,
+                )),
                config: std::sync::RwLock::new(RemoteTimelineClientConfig::from(&location_conf)),
                cancel: CancellationToken::new(),
            })
@@ -2965,7 +3064,99 @@ mod tests {
        );
    }

-    
+    #[tokio::test]
+    async fn bytes_unfinished_gauge_for_layer_file_uploads() {
+        // Setup
+
+        let TestSetup {
+            harness,
+            tenant: _tenant,
+            timeline,
+            ..
+        } = TestSetup::new("metrics").await.unwrap();
+        let client = &timeline.remote_client;
+
+        let layer_file_name_1: LayerName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
+        let local_path = local_layer_path(
+            harness.conf,
+            &timeline.tenant_shard_id,
+            &timeline.timeline_id,
+            &layer_file_name_1,
+            &harness.generation,
+        );
+        let content_1 = dummy_contents("foo");
+        std::fs::write(&local_path, &content_1).unwrap();
+
+        let layer_file_1 = Layer::for_resident(
+            harness.conf,
+            &timeline,
+            local_path,
+            layer_file_name_1.clone(),
+            LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
+        );
+
+        #[derive(Debug, PartialEq, Clone, Copy)]
+        struct BytesStartedFinished {
+            started: Option<usize>,
+            finished: Option<usize>,
+        }
+        impl std::ops::Add for BytesStartedFinished {
+            type Output = Self;
+            fn add(self, rhs: Self) -> Self::Output {
+                Self {
+                    started: self.started.map(|v| v + rhs.started.unwrap_or(0)),
+                    finished: self.finished.map(|v| v + rhs.finished.unwrap_or(0)),
+                }
+            }
+        }
+        let get_bytes_started_stopped = || {
+            let started = client
+                .metrics
+                .get_bytes_started_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            let stopped = client
+                .metrics
+                .get_bytes_finished_counter_value(&RemoteOpFileKind::Layer, &RemoteOpKind::Upload)
+                .map(|v| v.try_into().unwrap());
+            BytesStartedFinished {
+                started,
+                finished: stopped,
+            }
+        };
+
+        // Test
+        tracing::info!("now doing actual test");
+
+        let actual_a = get_bytes_started_stopped();
+
+        client
+            .schedule_layer_file_upload(layer_file_1.clone())
+            .unwrap();
+
+        let actual_b = get_bytes_started_stopped();
+
+        client.wait_completion().await.unwrap();
+
+        let actual_c = get_bytes_started_stopped();
+
+        // Validate
+
+        let expected_b = actual_a
+            + BytesStartedFinished {
+                started: Some(content_1.len()),
+                // assert that the _finished metric is created eagerly so that subtractions work on first sample
+                finished: Some(0),
+            };
+        assert_eq!(actual_b, expected_b);
+
+        let expected_c = actual_a
+            + BytesStartedFinished {
+                started: Some(content_1.len()),
+                finished: Some(content_1.len()),
+            };
+        assert_eq!(actual_c, expected_c);
+    }
+
    async fn inject_index_part(test_state: &TestSetup, generation: Generation) -> IndexPart {
        // An empty IndexPart, just sufficient to ensure deserialization will succeed
        let example_index_part = IndexPart::example();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -452,7 +452,7 @@ async fn do_download_index_part(
 /// generation (normal case when migrating/restarting).  Only if both of these return 404 do we fall back
 /// to listing objects.
 ///
-/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
+/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]`
 /// * `what`: for logging, what object are we downloading
 /// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
 /// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -6,6 +6,7 @@ mod scheduler;
 use std::sync::Arc;
 use std::time::SystemTime;

+use metrics::UIntGauge;
 use pageserver_api::models;
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use remote_storage::GenericRemoteStorage;
@@ -25,6 +26,7 @@ use super::span::debug_assert_current_span_has_tenant_id;
 use super::storage_layer::LayerName;
 use crate::context::RequestContext;
 use crate::disk_usage_eviction_task::DiskUsageEvictionInfo;
+use crate::metrics::{SECONDARY_HEATMAP_TOTAL_SIZE, SECONDARY_RESIDENT_PHYSICAL_SIZE};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};

 enum DownloadCommand {
@@ -107,7 +109,12 @@ pub(crate) struct SecondaryTenant {

    // Public state indicating overall progress of downloads relative to the last heatmap seen
    pub(crate) progress: std::sync::Mutex<models::SecondaryProgress>,
-   
+
+    // Sum of layer sizes on local disk
+    pub(super) resident_size_metric: UIntGauge,
+
+    // Sum of layer sizes in the most recently downloaded heatmap
+    pub(super) heatmap_total_size_metric: UIntGauge,
 }

 impl SecondaryTenant {
@@ -117,8 +124,16 @@ impl SecondaryTenant {
        tenant_conf: pageserver_api::models::TenantConfig,
        config: &SecondaryLocationConfig,
    ) -> Arc<Self> {
-    
-    
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let resident_size_metric = SECONDARY_RESIDENT_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
+        let heatmap_total_size_metric = SECONDARY_HEATMAP_TOTAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id])
+            .unwrap();
+
        Arc::new(Self {
            tenant_shard_id,
            // todo: shall we make this a descendent of the
@@ -135,10 +150,14 @@ impl SecondaryTenant {

            progress: std::sync::Mutex::default(),

+            resident_size_metric,
+            heatmap_total_size_metric,
        })
    }

-    
+    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
+        self.tenant_shard_id
+    }

    pub(crate) async fn shutdown(&self) {
        self.cancel.cancel();
@@ -150,10 +169,15 @@ impl SecondaryTenant {

        // Metrics are subtracted from and/or removed eagerly.
        // Deletions are done in the background via [`BackgroundPurges::spawn`].
+        let tenant_id = self.tenant_shard_id.tenant_id.to_string();
+        let shard_id = format!("{}", self.tenant_shard_id.shard_slug());
+        let _ = SECONDARY_RESIDENT_PHYSICAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+        let _ = SECONDARY_HEATMAP_TOTAL_SIZE.remove_label_values(&[&tenant_id, &shard_id]);
+
        self.detail
            .lock()
            .unwrap()
-            .drain_timelines(&self.tenant_shard_id);
+            .drain_timelines(&self.tenant_shard_id, &self.resident_size_metric);
    }

    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
@@ -231,7 +255,7 @@ impl SecondaryTenant {
            // of the cache.
            let mut detail = this.detail.lock().unwrap();
            if let Some(removed) =
-                detail.evict_layer(name, &timeline_id, now)
+                detail.evict_layer(name, &timeline_id, now, &this.resident_size_metric)
            {
                // We might race with removal of the same layer during downloads, so finding the layer we
                // were trying to remove is optional.  Only issue the disk I/O to remove it if we found it.
@@ -245,9 +269,10 @@ impl SecondaryTenant {
    /// Exhaustive check that incrementally updated metrics match the actual state.
    #[cfg(feature = "testing")]
    fn validate_metrics(&self) {
-        
+        let detail = self.detail.lock().unwrap();
+        let resident_size = detail.total_resident_size();

-        
+        assert_eq!(resident_size, self.resident_size_metric.get());
    }

    #[cfg(not(feature = "testing"))]
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -4,9 +4,11 @@ use std::str::FromStr;
 use std::sync::Arc;
 use std::time::{Duration, Instant, SystemTime};

+use crate::metrics::{STORAGE_IO_SIZE, StorageIoSizeOperation};
 use camino::Utf8PathBuf;
 use chrono::format::{DelayedFormat, StrftimeItems};
 use futures::Future;
+use metrics::UIntGauge;
 use pageserver_api::models::SecondaryProgress;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::{DownloadError, DownloadKind, DownloadOpts, Etag, GenericRemoteStorage};
@@ -31,6 +33,7 @@ use crate::context::RequestContext;
 use crate::disk_usage_eviction_task::{
    DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer, finite_f32,
 };
+use crate::metrics::SECONDARY_MODE;
 use crate::tenant::config::SecondaryLocationConfig;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::ephemeral_file::is_ephemeral_file;
@@ -117,6 +120,9 @@ impl OnDiskState {
            .fatal_err("Deleting secondary layer")
    }

+    pub(crate) fn file_size(&self) -> u64 {
+        self.metadata.file_size
+    }
 }

 pub(super) struct SecondaryDetailTimeline {
@@ -169,9 +175,13 @@ impl SecondaryDetailTimeline {
    pub(super) fn remove_layer(
        &mut self,
        name: &LayerName,
+        resident_metric: &UIntGauge,
    ) -> Option<OnDiskState> {
-        self.on_disk_layers.remove(name)
-        
+        let removed = self.on_disk_layers.remove(name);
+        if let Some(removed) = &removed {
+            resident_metric.sub(removed.file_size());
+        }
+        removed
    }

    /// `local_path`
@@ -181,6 +191,7 @@ impl SecondaryDetailTimeline {
        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        touched: &HeatMapLayer,
+        resident_metric: &UIntGauge,
        local_path: F,
    ) where
        F: FnOnce() -> Utf8PathBuf,
@@ -200,6 +211,7 @@ impl SecondaryDetailTimeline {
                    touched.access_time,
                    local_path(),
                ));
+                resident_metric.add(touched.metadata.file_size);
            }
        }
    }
@@ -255,16 +267,28 @@ impl SecondaryDetail {
        }
    }

+    #[cfg(feature = "testing")]
+    pub(crate) fn total_resident_size(&self) -> u64 {
+        self.timelines
+            .values()
+            .map(|tl| {
+                tl.on_disk_layers
+                    .values()
+                    .map(|v| v.metadata.file_size)
+                    .sum::<u64>()
+            })
+            .sum::<u64>()
+    }

    pub(super) fn evict_layer(
        &mut self,
        name: LayerName,
        timeline_id: &TimelineId,
        now: SystemTime,
-
+        resident_metric: &UIntGauge,
    ) -> Option<OnDiskState> {
        let timeline = self.timelines.get_mut(timeline_id)?;
-        let removed = timeline.remove_layer(&name);
+        let removed = timeline.remove_layer(&name, resident_metric);
        if removed.is_some() {
            timeline.evicted_at.insert(name, now);
        }
@@ -273,21 +297,52 @@ impl SecondaryDetail {

    pub(super) fn remove_timeline(
        &mut self,
-        _tenant_shard_id: &TenantShardId,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
+        resident_metric: &UIntGauge,
    ) {
-        self.timelines.remove(timeline_id);
-        
+        let removed = self.timelines.remove(timeline_id);
+        if let Some(removed) = removed {
+            Self::clear_timeline_metrics(tenant_shard_id, timeline_id, removed, resident_metric);
+        }
    }

    pub(super) fn drain_timelines(
        &mut self,
-        _tenant_shard_id: &TenantShardId,
-
+        tenant_shard_id: &TenantShardId,
+        resident_metric: &UIntGauge,
    ) {
-        
+        for (timeline_id, removed) in self.timelines.drain() {
+            Self::clear_timeline_metrics(tenant_shard_id, &timeline_id, removed, resident_metric);
+        }
    }

+    fn clear_timeline_metrics(
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+        detail: SecondaryDetailTimeline,
+        resident_metric: &UIntGauge,
+    ) {
+        resident_metric.sub(
+            detail
+                .on_disk_layers
+                .values()
+                .map(|l| l.metadata.file_size)
+                .sum(),
+        );
+
+        let shard_id = format!("{}", tenant_shard_id.shard_slug());
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let timeline_id = timeline_id.to_string();
+        for op in StorageIoSizeOperation::VARIANTS {
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[
+                op,
+                tenant_id.as_str(),
+                shard_id.as_str(),
+                timeline_id.as_str(),
+            ]);
+        }
+    }

    /// Additionally returns the total number of layers, used for more stable relative access time
    /// based eviction.
@@ -742,6 +797,7 @@ impl<'a> TenantDownloader<'a> {
                        tenant_shard_id,
                        last_heatmap,
                        timeline,
+                        &self.secondary_state.resident_size_metric,
                        ctx,
                    )
                    .await;
@@ -864,7 +920,11 @@ impl<'a> TenantDownloader<'a> {
            bytes_downloaded: 0,
        };

-       
+        // Also expose heatmap bytes_total as a metric
+        self.secondary_state
+            .heatmap_total_size_metric
+            .set(heatmap_stats.bytes);
+
        // Accumulate list of things to delete while holding the detail lock, for execution after dropping the lock
        let mut delete_layers = Vec::new();
        let mut delete_timelines = Vec::new();
@@ -931,6 +991,7 @@ impl<'a> TenantDownloader<'a> {
                detail.remove_timeline(
                    self.secondary_state.get_tenant_shard_id(),
                    delete_timeline,
+                    &self.secondary_state.resident_size_metric,
                );
            }
        }
@@ -949,7 +1010,7 @@ impl<'a> TenantDownloader<'a> {
            let Some(timeline_state) = detail.timelines.get_mut(&timeline_id) else {
                continue;
            };
-            timeline_state.remove_layer(&layer_name);
+            timeline_state.remove_layer(&layer_name, &self.secondary_state.resident_size_metric);
        }

        for timeline_id in delete_timelines {
@@ -1016,7 +1077,7 @@ impl<'a> TenantDownloader<'a> {
        .await
        .ok_or_else(|| UpdateError::Cancelled)
        .and_then(|x| x)
-        .inspect(|_|{} )
+        .inspect(|_| SECONDARY_MODE.download_heatmap.inc())
    }

    /// Download heatmap layers that are not present on local disk, or update their
@@ -1191,6 +1252,7 @@ impl<'a> TenantDownloader<'a> {
                    tenant_shard_id,
                    &timeline_id,
                    &t,
+                    &self.secondary_state.resident_size_metric,
                    || {
                        local_layer_path(
                            self.conf,
@@ -1302,6 +1364,7 @@ impl<'a> TenantDownloader<'a> {
            progress.layers_downloaded += 1;
        }

+        SECONDARY_MODE.download_layer.inc();

        Ok(Some(layer))
    }
@@ -1313,6 +1376,7 @@ async fn init_timeline_state(
    tenant_shard_id: &TenantShardId,
    last_heatmap: Option<&HeatMapTimeline>,
    heatmap: &HeatMapTimeline,
+    resident_metric: &UIntGauge,
    ctx: &RequestContext,
 ) -> SecondaryDetailTimeline {
    let ctx = ctx.with_scope_secondary_timeline(tenant_shard_id, &heatmap.timeline_id);
@@ -1416,6 +1480,7 @@ async fn init_timeline_state(
                                tenant_shard_id,
                                &heatmap.timeline_id,
                                remote_meta,
+                                resident_metric,
                                || file_path,
                            );
                        }
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -20,7 +20,8 @@ use super::scheduler::{
 };
 use super::{CommandRequest, SecondaryTenantError, UploadCommand};
 use crate::TEMP_FILE_SUFFIX;
-use crate::tenant::Tenant;
+use crate::metrics::SECONDARY_MODE;
+use crate::tenant::TenantShard;
 use crate::tenant::config::AttachmentMode;
 use crate::tenant::mgr::{GetTenantError, TenantManager};
 use crate::tenant::remote_timeline_client::remote_heatmap_path;
@@ -73,7 +74,7 @@ impl RunningJob for WriteInProgress {
 }

 struct UploadPending {
-    tenant: Arc<Tenant>,
+    tenant: Arc<TenantShard>,
    last_upload: Option<LastUploadState>,
    target_time: Option<Instant>,
    period: Option<Duration>,
@@ -105,7 +106,7 @@ impl scheduler::Completion for WriteComplete {
 struct UploaderTenantState {
    // This Weak only exists to enable culling idle instances of this type
    // when the Tenant has been deallocated.
-    tenant: Weak<Tenant>,
+    tenant: Weak<TenantShard>,

    /// Digest of the serialized heatmap that we last successfully uploaded
    last_upload_state: Option<LastUploadState>,
@@ -220,10 +221,14 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

-            
+            let started_at = Instant::now();
            let uploaded = match upload_tenant_heatmap(remote_storage, &tenant, last_upload.clone()).await {
                Ok(UploadHeatmapOutcome::Uploaded(uploaded)) => {
-                    
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap.inc();
                    Some(uploaded)
                }
                Ok(UploadHeatmapOutcome::NoChange | UploadHeatmapOutcome::Skipped) => last_upload,
@@ -232,8 +237,11 @@ impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
                        "Failed to upload heatmap for tenant {}: {e:#}",
                        tenant.get_tenant_shard_id(),
                    );
-                   
-
+                    let duration = Instant::now().duration_since(started_at);
+                    SECONDARY_MODE
+                        .upload_heatmap_duration
+                        .observe(duration.as_secs_f64());
+                    SECONDARY_MODE.upload_heatmap_errors.inc();
                    last_upload
                }
                Err(UploadHeatmapError::Cancelled) => {
@@ -349,7 +357,7 @@ struct LastUploadState {
 /// of the object we would have uploaded.
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
-    tenant: &Arc<Tenant>,
+    tenant: &Arc<TenantShard>,
    last_upload: Option<LastUploadState>,
 ) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
    debug_assert_current_span_has_tenant_id();
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -360,7 +360,7 @@ where

    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
    ///
-    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
+    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::TenantShard`] or [`crate::tenant::secondary::SecondaryTenant`]
    ///
    /// This function resets the pending list: it is assumed that the caller may change their mind about
    /// which tenants need work between calls to schedule_iteration.
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -12,7 +12,7 @@ use tracing::*;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;

-use super::{GcError, LogicalSizeCalculationCause, Tenant};
+use super::{GcError, LogicalSizeCalculationCause, TenantShard};
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::tenant::{MaybeOffloaded, Timeline};
@@ -156,7 +156,7 @@ pub struct TimelineInputs {
 ///   initdb_lsn  branchpoints*  next_pitr_cutoff  latest
 /// ```
 pub(super) async fn gather_inputs(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    limit: &Arc<Semaphore>,
    max_retention_period: Option<u64>,
    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -1620,7 +1620,7 @@ pub(crate) mod test {
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    /// Construct an index for a fictional delta layer and and then
    /// traverse in order to plan vectored reads for a query. Finally,
@@ -2209,7 +2209,7 @@ pub(crate) mod test {
    }

    pub(crate) async fn produce_delta_layer(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        tline: &Arc<Timeline>,
        mut deltas: Vec<(Key, Lsn, Value)>,
        ctx: &RequestContext,
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -559,11 +559,12 @@ impl ImageLayerInner {
            let view = BufView::new_slice(&blobs_buf.buf);

            for meta in blobs_buf.blobs.iter() {
-                let img_buf = meta.read(&view).await?;
-
+                // Just read the raw header+data and pass it through to the target layer, without
+                // decoding and recompressing it.
+                let raw = meta.raw_with_header(&view);
                key_count += 1;
                writer
-                    .put_image(meta.meta.key, img_buf.into_bytes(), ctx)
+                    .put_image_raw(meta.meta.key, raw.into_bytes(), ctx)
                    .await
                    .context(format!("Storing key {}", meta.meta.key))?;
            }
@@ -853,6 +854,41 @@ impl ImageLayerWriterInner {
        Ok(())
    }

+    ///
+    /// Write the next image to the file, as a raw blob header and data.
+    ///
+    /// The page versions must be appended in blknum order.
+    ///
+    async fn put_image_raw(
+        &mut self,
+        key: Key,
+        raw_with_header: Bytes,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        ensure!(self.key_range.contains(&key));
+
+        // NB: we don't update the (un)compressed metrics, since we can't determine them without
+        // decompressing the image. This seems okay.
+        self.num_keys += 1;
+
+        let (_, res) = self
+            .blob_writer
+            .write_blob_raw(raw_with_header.slice_len(), ctx)
+            .await;
+        let offset = res?;
+
+        let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE];
+        key.write_to_byte_slice(&mut keybuf);
+        self.tree.append(&keybuf, offset)?;
+
+        #[cfg(feature = "testing")]
+        {
+            self.last_written_key = key;
+        }
+
+        Ok(())
+    }
+
    ///
    /// Finish writing the image layer.
    ///
@@ -882,6 +918,20 @@ impl ImageLayerWriterInner {
    ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> {
        let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32;

+        // Calculate compression ratio
+        let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES.inc_by(self.uncompressed_bytes);
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CONSIDERED
+            .inc_by(self.uncompressed_bytes_eligible);
+        crate::metrics::COMPRESSION_IMAGE_INPUT_BYTES_CHOSEN.inc_by(self.uncompressed_bytes_chosen);
+
+        // NB: filter() may pass through raw pages from a different layer, without looking at
+        // whether these are compressed or not. We don't track metrics for these, so avoid
+        // increasing `COMPRESSION_IMAGE_OUTPUT_BYTES` in this case too.
+        if self.uncompressed_bytes > 0 {
+            crate::metrics::COMPRESSION_IMAGE_OUTPUT_BYTES.inc_by(compressed_size);
+        };
+
        let mut file = self.blob_writer.into_inner();

        // Write out the index
@@ -1026,6 +1076,25 @@ impl ImageLayerWriter {
        self.inner.as_mut().unwrap().put_image(key, img, ctx).await
    }

+    ///
+    /// Write the next value to the file, as a raw header and data. This allows passing through a
+    /// raw, potentially compressed image from a different layer file without recompressing it.
+    ///
+    /// The page versions must be appended in blknum order.
+    ///
+    pub async fn put_image_raw(
+        &mut self,
+        key: Key,
+        raw_with_header: Bytes,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.inner
+            .as_mut()
+            .unwrap()
+            .put_image_raw(key, raw_with_header, ctx)
+            .await
+    }
+
    /// Estimated size of the image layer.
    pub(crate) fn estimated_size(&self) -> u64 {
        let inner = self.inner.as_ref().unwrap();
@@ -1159,7 +1228,7 @@ mod test {
    use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
    use crate::tenant::storage_layer::{Layer, ResidentLayer};
    use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
-    use crate::tenant::{Tenant, Timeline};
+    use crate::tenant::{TenantShard, Timeline};

    #[tokio::test]
    async fn image_layer_rewrite() {
@@ -1341,7 +1410,7 @@ mod test {
    }

    async fn produce_image_layer(
-        tenant: &Tenant,
+        tenant: &TenantShard,
        tline: &Arc<Timeline>,
        mut images: Vec<(Key, Bytes)>,
        lsn: Lsn,
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -32,6 +32,7 @@ use crate::config::PageServerConf;
 use crate::context::{PageContentKind, RequestContext, RequestContextBuilder};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
+use crate::metrics::TIMELINE_EPHEMERAL_BYTES;
 use crate::tenant::ephemeral_file::EphemeralFile;
 use crate::tenant::storage_layer::{OnDiskValue, OnDiskValueIo};
 use crate::tenant::timeline::GetVectoredError;
@@ -306,7 +307,11 @@ impl GlobalResourceUnits {
            }
        };

-       
+        // This is a sloppy update: concurrent updates to the counter will race, and the exact
+        // value of the metric might not be the exact latest value of GLOBAL_RESOURCES::dirty_bytes.
+        // That's okay: as long as the metric contains some recent value, it doesn't have to always
+        // be literally the last update.
+        TIMELINE_EPHEMERAL_BYTES.set(new_global_dirty_bytes);

        self.dirty_bytes = size;

--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -231,7 +231,9 @@ impl Layer {

        debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());

-        
+        timeline
+            .metrics
+            .resident_physical_size_add(metadata.file_size);

        ResidentLayer { downloaded, owner }
    }
@@ -524,6 +526,12 @@ impl Layer {
                }
            }

+            // Update the timeline's visible bytes count
+            if let Some(tl) = self.0.timeline.upgrade() {
+                tl.metrics
+                    .visible_physical_size_gauge
+                    .add(self.0.desc.file_size)
+            }
        }
    }

@@ -532,10 +540,23 @@ impl Layer {
        use LayerVisibilityHint::*;
        match (old_visibility, visibility) {
            (Visible, Covered) => {
-                
+                // Subtract this layer's contribution to the visible size metric
+                if let Some(tl) = self.0.timeline.upgrade() {
+                    debug_assert!(
+                        tl.metrics.visible_physical_size_gauge.get() >= self.0.desc.file_size
+                    );
+                    tl.metrics
+                        .visible_physical_size_gauge
+                        .sub(self.0.desc.file_size)
+                }
            }
            (Covered, Visible) => {
-                
+                // Add this layer's contribution to the visible size metric
+                if let Some(tl) = self.0.timeline.upgrade() {
+                    tl.metrics
+                        .visible_physical_size_gauge
+                        .add(self.0.desc.file_size)
+                }
            }
            (Covered, Covered) | (Visible, Visible) => {
                // no change
@@ -588,6 +609,7 @@ impl ResidentOrWantedEvicted {
            ResidentOrWantedEvicted::Resident(strong) => Some((strong.clone(), false)),
            ResidentOrWantedEvicted::WantedEvicted(weak, _) => match weak.upgrade() {
                Some(strong) => {
+                    LAYER_IMPL_METRICS.inc_raced_wanted_evicted_accesses();

                    *self = ResidentOrWantedEvicted::Resident(strong.clone());

@@ -719,8 +741,17 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
-        
-        let timeline: Option<Arc<Timeline>> = self.timeline.upgrade();
+        // if there was a pending eviction, mark it cancelled here to balance metrics
+        if let Some((ResidentOrWantedEvicted::WantedEvicted(..), _)) = self.inner.take_and_deinit()
+        {
+            // eviction has already been started
+            LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
+
+            // eviction request is intentionally not honored as no one is present to wait for it
+            // and we could be delaying shutdown for nothing.
+        }
+
+        let timeline = self.timeline.upgrade();

        if let Some(timeline) = timeline.as_ref() {
            // Only need to decrement metrics if the timeline still exists: otherwise
@@ -728,6 +759,13 @@ impl Drop for LayerInner {
            timeline.metrics.dec_layer(&self.desc);

            if matches!(self.access_stats.visibility(), LayerVisibilityHint::Visible) {
+                debug_assert!(
+                    timeline.metrics.visible_physical_size_gauge.get() >= self.desc.file_size
+                );
+                timeline
+                    .metrics
+                    .visible_physical_size_gauge
+                    .sub(self.desc.file_size);
            }
        }

@@ -739,6 +777,7 @@ impl Drop for LayerInner {

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().layer_name();
+        let file_size = self.layer_desc().file_size;
        let meta = self.metadata();
        let status = self.status.take();

@@ -747,13 +786,20 @@ impl Drop for LayerInner {

            // carry this until we are finished for [`Layer::wait_drop`] support
            let _status = status;
+
            let Some(timeline) = timeline else {
                // no need to nag that timeline is gone: under normal situation on
                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
                return;
            };

-           match std::fs::remove_file(path) {
+            let Ok(_guard) = timeline.gate.enter() else {
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
+                return;
+            };
+
+            let removed = match std::fs::remove_file(path) {
                Ok(()) => true,
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                    // until we no longer do detaches by removing all local files before removing the
@@ -764,16 +810,34 @@ impl Drop for LayerInner {
                    // layers.
                    false
                }
-                Err(_e) => {
+                Err(e) => {
+                    tracing::error!("failed to remove wanted deleted layer: {e}");
+                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                    false
                }
            };

-            
-            let _a=timeline
+            if removed {
+                timeline.metrics.resident_physical_size_sub(file_size);
+            }
+            let res = timeline
                .remote_client
                .schedule_deletion_of_unlinked(vec![(file_name, meta)]);

+            if let Err(e) = res {
+                // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                // demonstrating this deadlock (without spawn_blocking): stop will drop
+                // queued items, which will have ResidentLayer's, and those drops would try
+                // to re-entrantly lock the RemoteTimelineClient inner state.
+                if !timeline.is_active() {
+                    tracing::info!("scheduling deletion on drop failed: {e:#}");
+                } else {
+                    tracing::warn!("scheduling deletion on drop failed: {e:#}");
+                }
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
+            } else {
+                LAYER_IMPL_METRICS.inc_completed_deletes();
+            }
        });
    }
 }
@@ -804,6 +868,12 @@ impl LayerInner {
        // This object acts as a RAII guard on these metrics: increment on construction
        timeline.metrics.inc_layer(&desc);

+        // New layers are visible by default. This metric is later updated on drop or in set_visibility
+        timeline
+            .metrics
+            .visible_physical_size_gauge
+            .add(desc.file_size);
+
        LayerInner {
            conf,
            path: local_path,
@@ -824,9 +894,13 @@ impl LayerInner {
    }

    fn delete_on_drop(&self) {
-                    let _a=self.wanted_deleted
+        let res =
+            self.wanted_deleted
                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);

+        if res.is_ok() {
+            LAYER_IMPL_METRICS.inc_started_deletes();
+        }
    }

    /// Cancellation safe, however dropping the future and calling this method again might result
@@ -864,6 +938,12 @@ impl LayerInner {
            // drop the DownloadedLayer outside of the holding the guard
            drop(strong);

+            // idea here is that only one evicter should ever get to witness a strong reference,
+            // which means whenever get_or_maybe_download upgrades a weak, it must mark up a
+            // cancelled eviction and signal us, like it currently does.
+            //
+            // a second concurrent evict_and_wait will not see a strong reference.
+            LAYER_IMPL_METRICS.inc_started_evictions();
        }

        let changed = rx.changed();
@@ -903,13 +983,15 @@ impl LayerInner {
            // get_or_init_detached can:
            // - be fast (mutex lock) OR uncontested semaphore permit acquire
            // - be slow (wait for semaphore permit or closing)
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
            let locked = self
                .inner
                .get_or_init_detached_measured(Some(&mut wait_for_download_recorder))
                .await
                .map(|mut guard| guard.get_and_upgrade().ok_or(guard));

-
+            scopeguard::ScopeGuard::into_inner(init_cancelled);

            match locked {
                // this path could had been a RwLock::read
@@ -922,7 +1004,8 @@ impl LayerInner {
                    // note that we also have dropped the Guard; this is fine, because we just made
                    // a state change and are holding a strong reference to be returned.
                    self.status.as_ref().unwrap().send_replace(Status::Resident);
-        
+                    LAYER_IMPL_METRICS
+                        .inc_eviction_cancelled(EvictionCancelled::UpgradedBackOnAccess);

                    return Ok(strong);
                }
@@ -949,7 +1032,8 @@ impl LayerInner {
            .upgrade()
            .ok_or(DownloadError::TimelineShutdown)?;

-        
+        // count cancellations, which currently remain largely unexpected
+        let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());

        // check if we really need to be downloaded: this can happen if a read access won the
        // semaphore before eviction.
@@ -961,6 +1045,7 @@ impl LayerInner {
            .await
            .map_err(DownloadError::PreStatFailed);

+        scopeguard::ScopeGuard::into_inner(init_cancelled);

        let needs_download = needs_download?;

@@ -971,7 +1056,7 @@ impl LayerInner {
            self.failpoint(failpoints::FailpointKind::AfterDeterminingLayerNeedsNoDownload)
                .await?;

-            
+            LAYER_IMPL_METRICS.inc_init_needed_no_download();

            return Ok(self.initialize_after_layer_is_on_disk(permit));
        };
@@ -1012,13 +1097,13 @@ impl LayerInner {
        async move {
            tracing::info!(%reason, "downloading on-demand");

-            
+            let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
            let res = self
                .download_init_and_wait(timeline, permit, ctx.attached_child())
                .maybe_perf_instrument(&ctx, |crnt_perf_span| crnt_perf_span.clone())
                .await?;

-      
+            scopeguard::ScopeGuard::into_inner(init_cancelled);
            Ok(res)
        }
        .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
@@ -1036,6 +1121,7 @@ impl LayerInner {
                    "unexpectedly on-demand downloading for task kind {:?}",
                    ctx.task_kind()
                );
+                crate::metrics::UNEXPECTED_ONDEMAND_DOWNLOADS.inc();

                let really_error =
                    matches!(b, Error) && !self.conf.ondemand_download_behavior_treat_error_as_warn;
@@ -1087,7 +1173,20 @@ impl LayerInner {

                let res = this.download_and_init(timeline, permit, &ctx).await;

-                let _a =tx.send(res);
+                if let Err(res) = tx.send(res) {
+                    match res {
+                        Ok(_res) => {
+                            tracing::debug!("layer initialized, but caller has been cancelled");
+                            LAYER_IMPL_METRICS.inc_init_completed_without_requester();
+                        }
+                        Err(e) => {
+                            tracing::info!(
+                                "layer file download failed, and caller has been cancelled: {e:?}"
+                            );
+                            LAYER_IMPL_METRICS.inc_download_failed_without_requester();
+                        }
+                    }
+                }
            }
            .in_current_span(),
        );
@@ -1139,9 +1238,21 @@ impl LayerInner {
                    }
                };
                tracing::info!(size=%self.desc.file_size, %latency_millis, "on-demand download successful");
-    
+                timeline
+                    .metrics
+                    .resident_physical_size_add(self.desc.file_size);
                self.consecutive_failures.store(0, Ordering::Relaxed);

+                let since_last_eviction = self
+                    .last_evicted_at
+                    .lock()
+                    .unwrap()
+                    .take()
+                    .map(|ts| ts.elapsed());
+                if let Some(since_last_eviction) = since_last_eviction {
+                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                }
+
                self.access_stats.record_residence_event();

                Ok(self.initialize_after_layer_is_on_disk(permit))
@@ -1296,8 +1407,12 @@ impl LayerInner {

            tracing::debug!("eviction started");

-           let _a = self.wait_for_turn_and_evict(only_version).await;
-            
+            let res = self.wait_for_turn_and_evict(only_version).await;
+            // metrics: ignore the Ok branch, it is not done yet
+            if let Err(e) = res {
+                tracing::debug!(res=?Err::<(), _>(&e), "eviction completed");
+                LAYER_IMPL_METRICS.inc_eviction_cancelled(e);
+            }
        };

        Self::spawn(start_evicting.instrument(span));
@@ -1417,13 +1532,21 @@ impl LayerInner {
        Self::spawn_blocking(move || {
            let _span = span.entered();

-            let res = self.evict_blocking( &gate, &permit);
+            let res = self.evict_blocking(&timeline, &gate, &permit);

            let waiters = self.inner.initializer_count();

-            
+            if waiters > 0 {
+                LAYER_IMPL_METRICS.inc_evicted_with_waiters();
+            }
+
            let completed_in = spawned_at.elapsed();
-           
+            LAYER_IMPL_METRICS.record_time_to_evict(completed_in);
+
+            match res {
+                Ok(()) => LAYER_IMPL_METRICS.inc_completed_evictions(),
+                Err(e) => LAYER_IMPL_METRICS.inc_eviction_cancelled(e),
+            }

            tracing::debug!(?res, elapsed_ms=%completed_in.as_millis(), %waiters, "eviction completed");
        });
@@ -1434,6 +1557,7 @@ impl LayerInner {
    /// This is blocking only to do just one spawn_blocking hop compared to multiple via tokio::fs.
    fn evict_blocking(
        &self,
+        timeline: &Timeline,
        _gate: &gate::GateGuard,
        _permit: &heavier_once_cell::InitPermit,
    ) -> Result<(), EvictionCancelled> {
@@ -1446,7 +1570,17 @@ impl LayerInner {
                    Ok(elapsed) => {
                        let accessed_and_visible = self.access_stats.accessed()
                            && self.access_stats.visibility() == LayerVisibilityHint::Visible;
-                        
+                        if accessed_and_visible {
+                            // Only layers used for reads contribute to our "low residence" metric that is used
+                            // to detect thrashing.  Layers promoted for other reasons (e.g. compaction) are allowed
+                            // to be rapidly evicted without contributing to this metric.
+                            timeline
+                                .metrics
+                                .evictions_with_low_residence_duration
+                                .read()
+                                .unwrap()
+                                .observe(elapsed);
+                        }

                        tracing::info!(
                            residence_millis = elapsed.as_millis(),
@@ -1458,6 +1592,10 @@ impl LayerInner {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
+                timeline.metrics.evictions.inc();
+                timeline
+                    .metrics
+                    .resident_physical_size_sub(self.desc.file_size);
            }
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
                tracing::error!(
@@ -1674,7 +1812,13 @@ impl DownloadedLayer {
            match res {
                Ok(layer) => Ok(layer),
                Err(err) => {
-                    
+                    LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                    // We log this message once over the lifetime of `Self`
+                    // => Ok and good to log backtrace and path here.
+                    tracing::error!(
+                        "layer load failed, assuming permanent failure: {}: {err:?}",
+                        owner.path
+                    );
                    Err(err)
                }
            }
@@ -1882,6 +2026,218 @@ impl From<ResidentLayer> for Layer {
    }
 }

+use metrics::IntCounter;
+
+pub(crate) struct LayerImplMetrics {
+    started_evictions: IntCounter,
+    completed_evictions: IntCounter,
+    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
+
+    started_deletes: IntCounter,
+    completed_deletes: IntCounter,
+    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
+
+    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
+    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
+    redownload_after: metrics::Histogram,
+    time_to_evict: metrics::Histogram,
+}
+
+impl Default for LayerImplMetrics {
+    fn default() -> Self {
+        use enum_map::Enum;
+
+        // reminder: these will be pageserver_layer_* with "_total" suffix
+
+        let started_evictions = metrics::register_int_counter!(
+            "pageserver_layer_started_evictions",
+            "Evictions started in the Layer implementation"
+        )
+        .unwrap();
+        let completed_evictions = metrics::register_int_counter!(
+            "pageserver_layer_completed_evictions",
+            "Evictions completed in the Layer implementation"
+        )
+        .unwrap();
+
+        let cancelled_evictions = metrics::register_int_counter_vec!(
+            "pageserver_layer_cancelled_evictions_count",
+            "Different reasons for evictions to have been cancelled or failed",
+            &["reason"]
+        )
+        .unwrap();
+
+        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = EvictionCancelled::from_usize(i);
+            let s = reason.as_str();
+            cancelled_evictions.with_label_values(&[s])
+        }));
+
+        let started_deletes = metrics::register_int_counter!(
+            "pageserver_layer_started_deletes",
+            "Deletions on drop pending in the Layer implementation"
+        )
+        .unwrap();
+        let completed_deletes = metrics::register_int_counter!(
+            "pageserver_layer_completed_deletes",
+            "Deletions on drop completed in the Layer implementation"
+        )
+        .unwrap();
+
+        let failed_deletes = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_deletes_count",
+            "Different reasons for deletions on drop to have failed",
+            &["reason"]
+        )
+        .unwrap();
+
+        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = DeleteFailed::from_usize(i);
+            let s = reason.as_str();
+            failed_deletes.with_label_values(&[s])
+        }));
+
+        let rare_counters = metrics::register_int_counter_vec!(
+            "pageserver_layer_assumed_rare_count",
+            "Times unexpected or assumed rare event happened",
+            &["event"]
+        )
+        .unwrap();
+
+        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let event = RareEvent::from_usize(i);
+            let s = event.as_str();
+            rare_counters.with_label_values(&[s])
+        }));
+
+        let inits_cancelled = metrics::register_int_counter!(
+            "pageserver_layer_inits_cancelled_count",
+            "Times Layer initialization was cancelled",
+        )
+        .unwrap();
+
+        let redownload_after = {
+            let minute = 60.0;
+            let hour = 60.0 * minute;
+            metrics::register_histogram!(
+                "pageserver_layer_redownloaded_after",
+                "Time between evicting and re-downloading.",
+                vec![
+                    10.0,
+                    30.0,
+                    minute,
+                    5.0 * minute,
+                    15.0 * minute,
+                    30.0 * minute,
+                    hour,
+                    12.0 * hour,
+                ]
+            )
+            .unwrap()
+        };
+
+        let time_to_evict = metrics::register_histogram!(
+            "pageserver_layer_eviction_held_permit_seconds",
+            "Time eviction held the permit.",
+            vec![0.001, 0.010, 0.100, 0.500, 1.000, 5.000]
+        )
+        .unwrap();
+
+        Self {
+            started_evictions,
+            completed_evictions,
+            cancelled_evictions,
+
+            started_deletes,
+            completed_deletes,
+            failed_deletes,
+
+            rare_counters,
+            inits_cancelled,
+            redownload_after,
+            time_to_evict,
+        }
+    }
+}
+
+impl LayerImplMetrics {
+    fn inc_started_evictions(&self) {
+        self.started_evictions.inc();
+    }
+    fn inc_completed_evictions(&self) {
+        self.completed_evictions.inc();
+    }
+    fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
+        self.cancelled_evictions[reason].inc()
+    }
+
+    fn inc_started_deletes(&self) {
+        self.started_deletes.inc();
+    }
+    fn inc_completed_deletes(&self) {
+        self.completed_deletes.inc();
+    }
+    fn inc_deletes_failed(&self, reason: DeleteFailed) {
+        self.failed_deletes[reason].inc();
+    }
+
+    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
+    /// attempt regardless of failure to delete local file.
+    fn inc_delete_removes_failed(&self) {
+        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
+    }
+
+    /// Expected rare just as cancellations are rare, but we could have cancellations separate from
+    /// the single caller which can start the download, so use this counter to separte them.
+    fn inc_init_completed_without_requester(&self) {
+        self.rare_counters[RareEvent::InitCompletedWithoutRequester].inc();
+    }
+
+    /// Expected rare because cancellations are unexpected, and failures are unexpected
+    fn inc_download_failed_without_requester(&self) {
+        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
+    }
+
+    /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
+    ///
+    /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
+    /// Option.
+    fn inc_raced_wanted_evicted_accesses(&self) {
+        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
+    }
+
+    /// These are only expected for [`Self::inc_init_cancelled`] amount when
+    /// running with remote storage.
+    fn inc_init_needed_no_download(&self) {
+        self.rare_counters[RareEvent::InitWithoutDownload].inc();
+    }
+
+    /// Expected rare because all layer files should be readable and good
+    fn inc_permanent_loading_failures(&self) {
+        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
+    }
+
+    fn inc_init_cancelled(&self) {
+        self.inits_cancelled.inc()
+    }
+
+    fn record_redownloaded_after(&self, duration: std::time::Duration) {
+        self.redownload_after.observe(duration.as_secs_f64())
+    }
+
+    /// This would be bad if it ever happened, or mean extreme disk pressure. We should probably
+    /// instead cancel eviction if we would have read waiters. We cannot however separate reads
+    /// from other evictions, so this could have noise as well.
+    fn inc_evicted_with_waiters(&self) {
+        self.rare_counters[RareEvent::EvictedWithWaiters].inc();
+    }
+
+    /// Recorded at least initially as the permit is now acquired in async context before
+    /// spawn_blocking action.
+    fn record_time_to_evict(&self, duration: std::time::Duration) {
+        self.time_to_evict.observe(duration.as_secs_f64())
+    }
+}

 #[derive(Debug, Clone, Copy, enum_map::Enum)]
 enum EvictionCancelled {
@@ -1898,6 +2254,21 @@ enum EvictionCancelled {
    UnexpectedEvictedState,
 }

+impl EvictionCancelled {
+    fn as_str(&self) -> &'static str {
+        match self {
+            EvictionCancelled::LayerGone => "layer_gone",
+            EvictionCancelled::TimelineGone => "timeline_gone",
+            EvictionCancelled::VersionCheckFailed => "version_check_fail",
+            EvictionCancelled::FileNotFound => "file_not_found",
+            EvictionCancelled::RemoveFailed => "remove_failed",
+            EvictionCancelled::AlreadyReinitialized => "already_reinitialized",
+            EvictionCancelled::LostToDownload => "lost_to_download",
+            EvictionCancelled::UpgradedBackOnAccess => "upgraded_back_on_access",
+            EvictionCancelled::UnexpectedEvictedState => "unexpected_evicted_state",
+        }
+    }
+}

 #[derive(enum_map::Enum)]
 enum DeleteFailed {
@@ -1905,6 +2276,15 @@ enum DeleteFailed {
    DeleteSchedulingFailed,
 }

+impl DeleteFailed {
+    fn as_str(&self) -> &'static str {
+        match self {
+            DeleteFailed::TimelineGone => "timeline_gone",
+            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+        }
+    }
+}
+
 #[derive(enum_map::Enum)]
 enum RareEvent {
    RemoveOnDropFailed,
@@ -1916,3 +2296,21 @@ enum RareEvent {
    EvictedWithWaiters,
 }

+impl RareEvent {
+    fn as_str(&self) -> &'static str {
+        use RareEvent::*;
+
+        match self {
+            RemoveOnDropFailed => "remove_on_drop_failed",
+            InitCompletedWithoutRequester => "init_completed_without",
+            DownloadFailedWithoutRequester => "download_failed_without",
+            UpgradedWantedEvicted => "raced_wanted_evicted",
+            InitWithoutDownload => "init_needed_no_download",
+            PermanentLoadingFailure => "permanent_loading_failure",
+            EvictedWithWaiters => "evicted_with_waiters",
+        }
+    }
+}
+
+pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+    once_cell::sync::Lazy::new(LayerImplMetrics::default);
--- a/pageserver/src/tenant/storage_layer/layer/tests.rs
+++ b/pageserver/src/tenant/storage_layer/layer/tests.rs
@@ -238,7 +238,7 @@ async fn smoke_test() {
        rtc.get_remote_physical_size(),
        dummy_layer.metadata().file_size
    );
-   
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test demonstrates a previous hang when a eviction and deletion were requested at the same
@@ -311,6 +311,11 @@ async fn evict_and_wait_on_wanted_deleted() {

    SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

+    assert_eq!(1, LAYER_IMPL_METRICS.started_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_deletes.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());
+    assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 /// This test ensures we are able to read the layer while the layer eviction has been
@@ -361,7 +366,7 @@ fn read_wins_pending_eviction() {
        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
            .await
            .expect_err("should had been a timeout since we are holding the layer resident");
-       
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

        let (completion, barrier) = utils::completion::channel();
        let (arrival, arrived_at_barrier) = utils::completion::channel();
@@ -393,7 +398,18 @@ fn read_wins_pending_eviction() {

        // works as intended: evictions lose to "downloads"
        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-        
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );

        drop(completion);

@@ -401,9 +417,26 @@ fn read_wins_pending_eviction() {
        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads0(&handle, 1)
            .await;

-        
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());

-        
+        // now we finally can observe the original eviction failing
+        // it would had been possible to observe it earlier, but here it is guaranteed to have
+        // happened.
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::AlreadyReinitialized].get()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
    });
 }

@@ -466,7 +499,7 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
        tokio::time::timeout(ADVANCE, &mut evict_and_wait)
            .await
            .expect_err("should had been a timeout since we are holding the layer resident");
-        
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

        let (completion1, barrier) = utils::completion::channel();
        let mut completion1 = Some(completion1);
@@ -501,9 +534,20 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        // works as intended: evictions lose to "downloads"
        assert!(matches!(e, EvictionError::Downloaded), "{e:?}");
-       
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());

-       
+        // this is not wrong: the eviction is technically still "on the way" as it's still queued
+        // because of a failpoint
+        assert_eq!(
+            0,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.completed_evictions.get());

        // configure another failpoint for the second eviction -- evictions are per initialization,
        // so now that we've reinitialized the inner, we get to run two of them at the same time.
@@ -523,10 +567,13 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {

        arrived_at_barrier.wait().await;

-       
+        assert_eq!(2, LAYER_IMPL_METRICS.started_evictions.get());

-        let mut release_earlier_eviction = |_expected_reason| {
-            
+        let mut release_earlier_eviction = |expected_reason| {
+            assert_eq!(
+                0,
+                LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+            );

            drop(completion1.take().unwrap());

@@ -539,7 +586,10 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
                )
                .await;

-                
+                assert_eq!(
+                    1,
+                    LAYER_IMPL_METRICS.cancelled_evictions[expected_reason].get(),
+                );
            }
        };

@@ -562,7 +612,19 @@ fn multiple_pending_evictions_scenario(name: &'static str, in_order: bool) {
            .expect("eviction goes through now that spawn_blocking is unclogged")
            .expect("eviction should succeed, because version matches");

-       
+        assert_eq!(1, LAYER_IMPL_METRICS.completed_evictions.get());
+
+        // ensure the cancelled are unchanged
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS
+                .cancelled_evictions
+                .values()
+                .map(|ctr| ctr.get())
+                .sum::<u64>()
+        );
+
+        assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
    });
 }

@@ -652,7 +714,8 @@ async fn cancelled_get_or_maybe_download_does_not_cancel_eviction() {
        .unwrap_err();
    assert!(matches!(e, DownloadError::DownloadRequired), "{e:?}");

-    
+    // failpoint is not counted as cancellation either
+    assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }

 #[tokio::test(start_paused = true)]
@@ -829,7 +892,8 @@ async fn eviction_cancellation_on_drop() {
                .expect_err("should had been a timeout since we are holding the layer resident");
        }

-        
+        // 1 == we only evict one of the layers
+        assert_eq!(1, LAYER_IMPL_METRICS.started_evictions.get());

        drop(resident);

@@ -838,7 +902,10 @@ async fn eviction_cancellation_on_drop() {

        SpawnBlockingPoolHelper::consume_and_release_all_of_spawn_blocking_threads(&handle).await;

-        
+        assert_eq!(
+            1,
+            LAYER_IMPL_METRICS.cancelled_evictions[EvictionCancelled::LayerGone].get()
+        );
    }
 }

--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -10,7 +10,7 @@ use std::time::{Duration, Instant};
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_COMPACTION_PERIOD;
 use rand::Rng;
-
+use scopeguard::defer;
 use tokio::sync::{Semaphore, SemaphorePermit};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
@@ -19,11 +19,12 @@ use utils::completion::Barrier;
 use utils::pausable_failpoint;

 use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::compaction::CompactionOutcome;
-use crate::tenant::{Tenant, TenantState};
+use crate::tenant::{TenantShard, TenantState};

 /// Semaphore limiting concurrent background tasks (across all tenants).
 ///
@@ -84,15 +85,17 @@ pub(crate) enum BackgroundLoopKind {
    SecondaryDownload,
 }

-pub struct BackgroundLoopSemaphorePermit {
+pub struct BackgroundLoopSemaphorePermit<'a> {
    _permit: SemaphorePermit<'static>,
+    _recorder: BackgroundLoopSemaphoreMetricsRecorder<'a>,
 }

 /// Acquires a semaphore permit, to limit concurrent background jobs.
 pub(crate) async fn acquire_concurrency_permit(
    loop_kind: BackgroundLoopKind,
    _ctx: &RequestContext,
-) -> BackgroundLoopSemaphorePermit {
+) -> BackgroundLoopSemaphorePermit<'static> {
+    let mut recorder = metrics::BACKGROUND_LOOP_SEMAPHORE.record(loop_kind);

    if loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation {
        pausable_failpoint!("initial-size-calculation-permit-pause");
@@ -105,14 +108,16 @@ pub(crate) async fn acquire_concurrency_permit(
    };
    let permit = semaphore.acquire().await.expect("should never close");

+    recorder.acquired();

    BackgroundLoopSemaphorePermit {
        _permit: permit,
+        _recorder: recorder,
    }
 }

 /// Start per tenant background loops: compaction, GC, and ingest housekeeping.
-pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
+pub fn start_background_loops(tenant: &Arc<TenantShard>, can_start: Option<&Barrier>) {
    let tenant_shard_id = tenant.tenant_shard_id;

    task_mgr::spawn(
@@ -130,7 +135,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                compaction_loop(tenant, cancel)
                    // If you rename this span, change the RUST_LOG env variable in test_runner/performance/test_branch_creation.py
                    .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
@@ -155,6 +161,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                gc_loop(tenant, cancel)
                    .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -178,7 +186,8 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
                    _ = cancel.cancelled() => return Ok(()),
                    _ = Barrier::maybe_wait(can_start) => {}
                };
-
+                TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
+                defer!(TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc());
                tenant_housekeeping_loop(tenant, cancel)
                    .instrument(info_span!("tenant_housekeeping_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                    .await;
@@ -189,7 +198,7 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
 }

 /// Compaction task's main loop.
-async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn compaction_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
    const BASE_BACKOFF_SECS: f64 = 1.0;
    const MAX_BACKOFF_SECS: f64 = 300.0;
    const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10);
@@ -339,7 +348,7 @@ pub(crate) fn log_compaction_error(
 }

 /// GC task's main loop.
-async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn gc_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
    const MAX_BACKOFF_SECS: f64 = 300.0;
    let mut error_run = 0; // consecutive errors

@@ -423,7 +432,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
 }

 /// Tenant housekeeping's main loop.
-async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
+async fn tenant_housekeeping_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
    let mut last_throttle_flag_reset_at = Instant::now();
    loop {
        if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
@@ -474,7 +483,7 @@ async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken

 /// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
 async fn wait_for_active_tenant(
-    tenant: &Arc<Tenant>,
+    tenant: &Arc<TenantShard>,
    cancel: &CancellationToken,
 ) -> ControlFlow<()> {
    if tenant.current_state() == TenantState::Active {
@@ -583,5 +592,8 @@ pub(crate) fn warn_when_period_overrun(
            ?task,
            "task iteration took longer than the configured period"
        );
+        metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
+            .with_label_values(&[task.into(), &format!("{}", period.as_secs())])
+            .inc();
    }
 }
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -45,8 +45,8 @@ pub struct Stats {
 }

 pub enum ThrottleResult {
-    NotThrottled {  },
-    Throttled {  },
+    NotThrottled { end: Instant },
+    Throttled { end: Instant },
 }

 impl Throttle {
@@ -114,7 +114,7 @@ impl Throttle {
        let inner = self.inner.load_full(); // clones the `Inner` Arc

        if !inner.enabled {
-            return ThrottleResult::NotThrottled { };
+            return ThrottleResult::NotThrottled { end: start };
        }

        self.count_accounted_start.fetch_add(1, Ordering::Relaxed);
@@ -127,9 +127,9 @@ impl Throttle {
            let wait_time = end - start;
            self.sum_throttled_usecs
                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
-            ThrottleResult::Throttled {  }
+            ThrottleResult::Throttled { end }
        } else {
-            ThrottleResult::NotThrottled { }
+            ThrottleResult::NotThrottled { end: start }
        }
    }
 }
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -96,6 +96,7 @@ use super::{
    AttachedTenantConf, GcError, HeatMapTimeline, MaybeOffloaded,
    debug_assert_current_span_has_tenant_and_timeline_id,
 };
+use crate::aux_file::AuxFileSizeEstimator;
 use crate::config::PageServerConf;
 use crate::context::{
    DownloadBehavior, PerfInstrumentFutureExt, RequestContext, RequestContextBuilder,
@@ -103,7 +104,10 @@ use crate::context::{
 use crate::disk_usage_eviction_task::{DiskUsageEvictionInfo, EvictionCandidate, finite_f32};
 use crate::keyspace::{KeyPartitioning, KeySpace};
 use crate::l0_flush::{self, L0FlushGlobalState};
-use crate::metrics::TimelineMetrics;
+use crate::metrics::{
+    DELTAS_PER_READ_GLOBAL, LAYERS_PER_READ_AMORTIZED_GLOBAL, LAYERS_PER_READ_BATCH_GLOBAL,
+    LAYERS_PER_READ_GLOBAL, ScanLatencyOngoingRecording, TimelineMetrics,
+};
 use crate::page_service::TenantManagerTypes;
 use crate::pgdatadir_mapping::{
    CalculateLogicalSizeError, CollectKeySpaceError, DirectoryKind, LsnForTimestamp,
@@ -408,9 +412,11 @@ pub struct Timeline {
    /// Timeline deletion will acquire both compaction and gc locks in whatever order.
    gc_lock: tokio::sync::Mutex<()>,

-    /// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
+    /// Cloned from [`super::TenantShard::pagestream_throttle`] on construction.
    pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,

+    /// Size estimator for aux file v2
+    pub(crate) aux_file_size_estimator: AuxFileSizeEstimator,

    /// Some test cases directly place keys into the timeline without actually modifying the directory
    /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that
@@ -1204,20 +1210,20 @@ impl Timeline {
            ctx.task_kind(),
        );

-        // let start = crate::metrics::GET_VECTORED_LATENCY
-        //     .for_task_kind(ctx.task_kind())
-        //     .map(|metric| (metric, Instant::now()));
+        let start = crate::metrics::GET_VECTORED_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(|metric| (metric, Instant::now()));

-         self
+        let res = self
            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
-            .await
+            .await;

-        // if let Some((metric, start)) = start {
-        //     let elapsed = start.elapsed();
-        //     metric.observe(elapsed.as_secs_f64());
-        // }
+        if let Some((metric, start)) = start {
+            let elapsed = start.elapsed();
+            metric.observe(elapsed.as_secs_f64());
+        }

-        
+        res
    }

    /// Scan the keyspace and return all existing key-values in the keyspace. This currently uses vectored
@@ -1256,21 +1262,21 @@ impl Timeline {
            }
        }

-        // let start = crate::metrics::SCAN_LATENCY
-        //     .for_task_kind(ctx.task_kind())
-        //     .map(ScanLatencyOngoingRecording::start_recording);
+        let start = crate::metrics::SCAN_LATENCY
+            .for_task_kind(ctx.task_kind())
+            .map(ScanLatencyOngoingRecording::start_recording);

        let query = VersionedKeySpaceQuery::uniform(keyspace, lsn);

-         self
+        let vectored_res = self
            .get_vectored_impl(query, &mut ValuesReconstructState::new(io_concurrency), ctx)
-            .await
+            .await;

-        // if let Some(recording) = start {
-        //     recording.observe();
-        // }
+        if let Some(recording) = start {
+            recording.observe();
+        }

-       
+        vectored_res
    }

    pub(super) async fn get_vectored_impl(
@@ -1279,6 +1285,10 @@ impl Timeline {
        reconstruct_state: &mut ValuesReconstructState,
        ctx: &RequestContext,
    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if query.is_empty() {
+            return Ok(BTreeMap::default());
+        }
+
        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
            Some(ReadPath::new(
                query.total_keyspace(),
@@ -1379,7 +1389,7 @@ impl Timeline {
                            return (key, Err(err));
                        }
                    };
-                   
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);

                    // The walredo module expects the records to be descending in terms of Lsn.
                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
@@ -1417,21 +1427,42 @@ impl Timeline {
        // when they're missing. Instead they are omitted from the resulting btree
        // (this is a requirement, not a bug). Skip updating the metric in these cases
        // to avoid infinite results.
-        if !results.is_empty() && layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
-            let total_keyspace = query.total_keyspace();
-            let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");
+        if !results.is_empty() {
+            if layers_visited >= Self::LAYERS_VISITED_WARN_THRESHOLD {
+                let total_keyspace = query.total_keyspace();
+                let max_request_lsn = query.high_watermark_lsn().expect("Validated previously");

-            static LOG_PACER: Lazy<Mutex<RateLimit>> =
-                Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
-            LOG_PACER.lock().unwrap().call(|| {
-                let num_keys = total_keyspace.total_raw_size();
-                let num_pages = results.len();
-                tracing::info!(
-                  shard_id = %self.tenant_shard_id.shard_slug(),
-                  lsn = %max_request_lsn,
-                  "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
-                );
-            });
+                static LOG_PACER: Lazy<Mutex<RateLimit>> =
+                    Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(60))));
+                LOG_PACER.lock().unwrap().call(|| {
+                    let num_keys = total_keyspace.total_raw_size();
+                    let num_pages = results.len();
+                    tracing::info!(
+                      shard_id = %self.tenant_shard_id.shard_slug(),
+                      lsn = %max_request_lsn,
+                      "Vectored read for {total_keyspace} visited {layers_visited} layers. Returned {num_pages}/{num_keys} pages.",
+                    );
+                });
+            }
+
+            // Records the number of layers visited in a few different ways:
+            //
+            // * LAYERS_PER_READ: all layers count towards every read in the batch, because each
+            //   layer directly affects its observed latency.
+            //
+            // * LAYERS_PER_READ_BATCH: all layers count towards each batch, to get the per-batch
+            //   layer visits and access cost.
+            //
+            // * LAYERS_PER_READ_AMORTIZED: the average layer count per read, to get the amortized
+            //   read amplification after batching.
+            let layers_visited = layers_visited as f64;
+            let avg_layers_visited = layers_visited / results.len() as f64;
+            LAYERS_PER_READ_BATCH_GLOBAL.observe(layers_visited);
+            for _ in &results {
+                self.metrics.layers_per_read.observe(layers_visited);
+                LAYERS_PER_READ_GLOBAL.observe(layers_visited);
+                LAYERS_PER_READ_AMORTIZED_GLOBAL.observe(avg_layers_visited);
+            }
        }

        Ok(results)
@@ -1483,6 +1514,9 @@ impl Timeline {
        guard.layer_size_sum()
    }

+    pub(crate) fn resident_physical_size(&self) -> u64 {
+        self.metrics.resident_physical_size_get()
+    }

    pub(crate) fn get_directory_metrics(&self) -> [u64; DirectoryKind::KINDS_NUM] {
        array::from_fn(|idx| self.directory_metrics[idx].load(AtomicOrdering::Relaxed))
@@ -1545,6 +1579,8 @@ impl Timeline {
            WaitLsnTimeout::Default => self.conf.wait_lsn_timeout,
        };

+        let timer = crate::metrics::WAIT_LSN_TIME.start_timer();
+        let start_finish_counterpair_guard = self.metrics.wait_lsn_start_finish_counterpair.guard();

        let wait_for_timeout = self.last_record_lsn.wait_for_timeout(lsn, timeout);
        let wait_for_timeout = std::pin::pin!(wait_for_timeout);
@@ -1561,8 +1597,11 @@ impl Timeline {
                 ready,
                 is_slow,
                 elapsed_total,
-                 elapsed_since_last_callback: _,
+                 elapsed_since_last_callback,
             }| {
+                self.metrics
+                    .wait_lsn_in_progress_micros
+                    .inc_by(u64::try_from(elapsed_since_last_callback.as_micros()).unwrap());
                if !is_slow {
                    return;
                }
@@ -1592,6 +1631,8 @@ impl Timeline {
        let res = wait_for_timeout.await;
        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
        drop(logging_permit);
+        drop(start_finish_counterpair_guard);
+        drop(timer);
        match res {
            Ok(()) => Ok(()),
            Err(e) => {
@@ -2028,7 +2069,7 @@ impl Timeline {

    pub(crate) fn activate(
        self: &Arc<Self>,
-        parent: Arc<crate::tenant::Tenant>,
+        parent: Arc<crate::tenant::TenantShard>,
        broker_client: BrokerClientChannel,
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
@@ -2681,6 +2722,15 @@ impl Timeline {
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
    }

+    fn get_evictions_low_residence_duration_metric_threshold(
+        tenant_conf: &pageserver_api::models::TenantConfig,
+        default_tenant_conf: &pageserver_api::config::TenantConfigToml,
+    ) -> Duration {
+        tenant_conf
+            .evictions_low_residence_duration_metric_threshold
+            .unwrap_or(default_tenant_conf.evictions_low_residence_duration_metric_threshold)
+    }
+
    fn get_image_layer_creation_check_threshold(&self) -> u8 {
        let tenant_conf = self.tenant_conf.load();
        tenant_conf
@@ -2756,8 +2806,28 @@ impl Timeline {

        // The threshold is embedded in the metric. So, we need to update it.
        {
+            let new_threshold = Self::get_evictions_low_residence_duration_metric_threshold(
+                &new_conf.tenant_conf,
+                &self.conf.default_tenant_conf,
+            );
+
+            let tenant_id_str = self.tenant_shard_id.tenant_id.to_string();
+            let shard_id_str = format!("{}", self.tenant_shard_id.shard_slug());
+
+            let timeline_id_str = self.timeline_id.to_string();
+
            self.remote_client.update_config(&new_conf.location);

+            self.metrics
+                .evictions_with_low_residence_duration
+                .write()
+                .unwrap()
+                .change_threshold(
+                    &tenant_id_str,
+                    &shard_id_str,
+                    &timeline_id_str,
+                    new_threshold,
+                );
        }
    }

@@ -2791,6 +2861,13 @@ impl Timeline {
        let (layer_flush_start_tx, _) = tokio::sync::watch::channel((0, disk_consistent_lsn));
        let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(())));

+        let evictions_low_residence_duration_metric_threshold = {
+            let loaded_tenant_conf = tenant_conf.load();
+            Self::get_evictions_low_residence_duration_metric_threshold(
+                &loaded_tenant_conf.tenant_conf,
+                &conf.default_tenant_conf,
+            )
+        };

        if let Some(ancestor) = &ancestor {
            let mut ancestor_gc_info = ancestor.gc_info.write().unwrap();
@@ -2803,7 +2880,12 @@ impl Timeline {
            let metrics = Arc::new(TimelineMetrics::new(
                &tenant_shard_id,
                &timeline_id,
+                crate::metrics::EvictionsWithLowResidenceDurationBuilder::new(
+                    "mtime",
+                    evictions_low_residence_duration_metric_threshold,
+                ),
            ));
+            let aux_file_metrics = metrics.aux_file_size_gauge.clone();

            let mut result = Timeline {
                conf,
@@ -2911,6 +2993,8 @@ impl Timeline {

                pagestream_throttle: resources.pagestream_throttle,

+                aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics),
+
                #[cfg(test)]
                extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())),

@@ -2936,6 +3020,10 @@ impl Timeline {
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;

+            result
+                .metrics
+                .last_record_lsn_gauge
+                .set(disk_consistent_lsn.0 as i64);
            result
        })
    }
@@ -3091,6 +3179,8 @@ impl Timeline {

        let mut guard = self.layers.write().await;

+        let timer = self.metrics.load_layer_map_histo.start_timer();
+
        // Scan timeline directory and create ImageLayerName and DeltaFilename
        // structs representing all files on disk
        let timeline_path = self
@@ -3239,7 +3329,7 @@ impl Timeline {
        //     (1) and (4)
        // TODO: this is basically a no-op now, should we remove it?
        self.remote_client.schedule_barrier()?;
-        // Tenant::create_timeline will wait for these uploads to happen before returning, or
+        // TenantShard::create_timeline will wait for these uploads to happen before returning, or
        // on retry.

        // Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
@@ -3251,6 +3341,7 @@ impl Timeline {
            num_layers, disk_consistent_lsn, total_physical_size
        );

+        timer.stop_and_record();
        Ok(())
    }

@@ -3318,7 +3409,7 @@ impl Timeline {

        if let CurrentLogicalSize::Approximate(_) = &current_size {
            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
-                let _= self
+                let first = self
                    .current_logical_size
                    .did_return_approximate_to_walreceiver
                    .compare_exchange(
@@ -3326,8 +3417,11 @@ impl Timeline {
                        true,
                        AtomicOrdering::Relaxed,
                        AtomicOrdering::Relaxed,
-                    ).is_ok();
-                
+                    )
+                    .is_ok();
+                if first {
+                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
+                }
            }
        }

@@ -3399,7 +3493,7 @@ impl Timeline {
            self.current_logical_size.initialized.add_permits(1);
        }

-        let try_once = |_attempt: usize| {
+        let try_once = |attempt: usize| {
            let background_ctx = &background_ctx;
            let self_ref = &self;
            let skip_concurrency_limiter = &skip_concurrency_limiter;
@@ -3410,7 +3504,7 @@ impl Timeline {
                );

                use crate::metrics::initial_logical_size::StartCircumstances;
-                let (_maybe_permit, _circumstances) = tokio::select! {
+                let (_maybe_permit, circumstances) = tokio::select! {
                    permit = wait_for_permit => {
                        (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit)
                    }
@@ -3427,6 +3521,12 @@ impl Timeline {
                    }
                };

+                let metrics_guard = if attempt == 1 {
+                    crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
+                } else {
+                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
+                };
+
                let io_concurrency = IoConcurrency::spawn_from_conf(
                    self_ref.conf,
                    self_ref
@@ -3453,7 +3553,7 @@ impl Timeline {

                // TODO: add aux file size to logical size

-                Ok(calculated_size)
+                Ok((calculated_size, metrics_guard))
            }
        };

@@ -3490,14 +3590,27 @@ impl Timeline {
            }
        };

-        let calculated_size = match retrying.await {
+        let (calculated_size, metrics_guard) = match retrying.await {
            ControlFlow::Continue(calculated_size) => calculated_size,
            ControlFlow::Break(()) => return,
        };

+        // we cannot query current_logical_size.current_size() to know the current
+        // *negative* value, only truncated to u64.
+        let added = self
+            .current_logical_size
+            .size_added_after_initial
+            .load(AtomicOrdering::Relaxed);
+
+        let sum = calculated_size.saturating_add_signed(added);
+
+        // set the gauge value before it can be set in `update_current_logical_size`.
+        self.metrics.current_logical_size_gauge.set(sum);
+
        self.current_logical_size
            .initial_logical_size
-            .set((calculated_size,))
+            .set((calculated_size, metrics_guard.calculation_result_saved()))
+            .ok()
            .expect("only this task sets it");
    }

@@ -3562,7 +3675,7 @@ impl Timeline {
    async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
-        _cause: LogicalSizeCalculationCause,
+        cause: LogicalSizeCalculationCause,
        _guard: &GateGuard,
        ctx: &RequestContext,
    ) -> Result<u64, CalculateLogicalSizeError> {
@@ -3581,13 +3694,20 @@ impl Timeline {
        if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
            return Ok(size);
        }
-        
-       
+        let storage_time_metrics = match cause {
+            LogicalSizeCalculationCause::Initial
+            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
+            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
+            LogicalSizeCalculationCause::EvictionTaskImitation => {
+                &self.metrics.imitate_logical_size_histo
+            }
+        };
+        let timer = storage_time_metrics.start_timer();
        let logical_size = self
            .get_current_logical_size_non_incremental(up_to_lsn, ctx)
            .await?;
        debug!("calculated logical size: {logical_size}");
-      
+        timer.stop_and_record();
        Ok(logical_size)
    }

@@ -3596,6 +3716,21 @@ impl Timeline {
        let logical_size = &self.current_logical_size;
        logical_size.increment_size(delta);

+        // Also set the value in the prometheus gauge. Note that
+        // there is a race condition here: if this is is called by two
+        // threads concurrently, the prometheus gauge might be set to
+        // one value while current_logical_size is set to the
+        // other.
+        match logical_size.current_size() {
+            CurrentLogicalSize::Exact(ref new_current_size) => self
+                .metrics
+                .current_logical_size_gauge
+                .set(new_current_size.into()),
+            CurrentLogicalSize::Approximate(_) => {
+                // don't update the gauge yet, this allows us not to update the gauge back and
+                // forth between the initial size calculation task.
+            }
+        }
    }

    pub(crate) fn update_directory_entries_count(&self, kind: DirectoryKind, count: MetricsUpdate) {
@@ -3633,8 +3768,26 @@ impl Timeline {
            }
        };

-        
-        
+        // TODO: remove this, there's no place in the code that updates this aux metrics.
+        let aux_metric =
+            self.directory_metrics[DirectoryKind::AuxFiles.offset()].load(AtomicOrdering::Relaxed);
+
+        let sum_of_entries = self
+            .directory_metrics
+            .iter()
+            .map(|v| v.load(AtomicOrdering::Relaxed))
+            .sum();
+        // Set a high general threshold and a lower threshold for the auxiliary files,
+        // as we can have large numbers of relations in the db directory.
+        const SUM_THRESHOLD: u64 = 5000;
+        const AUX_THRESHOLD: u64 = 1000;
+        if sum_of_entries >= SUM_THRESHOLD || aux_metric >= AUX_THRESHOLD {
+            self.metrics
+                .directory_entries_count_gauge
+                .set(sum_of_entries);
+        } else if let Some(metric) = Lazy::get(&self.metrics.directory_entries_count_gauge) {
+            metric.set(sum_of_entries);
+        }
    }

    async fn find_layer(
@@ -4354,6 +4507,8 @@ impl Timeline {

    pub(crate) fn finish_write(&self, new_lsn: Lsn) {
        assert!(new_lsn.is_aligned());
+
+        self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64);
        self.last_record_lsn.advance(new_lsn);
    }

@@ -4473,10 +4628,17 @@ impl Timeline {
                            "stalling layer flushes for compaction backpressure at {l0_count} \
                            L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
                        );
-                        
+                        let stall_timer = self
+                            .metrics
+                            .flush_delay_histo
+                            .start_timer()
+                            .record_on_drop();
                        tokio::select! {
-                            _result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
-                               
+                            result = watch_l0.wait_for(|l0| *l0 < stall_threshold) => {
+                                if let Ok(l0) = result.as_deref() {
+                                    let delay = stall_timer.elapsed().as_secs_f64();
+                                    info!("resuming layer flushes at {l0} L0 layers after {delay:.3}s");
+                                }
                            },
                            _ = self.cancel.cancelled() => {},
                        }
@@ -4485,7 +4647,7 @@ impl Timeline {
                }

                // Flush the layer.
-                let flush_timer = Instant::now();
+                let flush_timer = self.metrics.flush_time_histo.start_timer();
                match self.flush_frozen_layer(layer, ctx).await {
                    Ok(layer_lsn) => flushed_to_lsn = max(flushed_to_lsn, layer_lsn),
                    Err(FlushLayerError::Cancelled) => {
@@ -4501,7 +4663,7 @@ impl Timeline {
                        break err.map(|_| ());
                    }
                }
-                let flush_duration = flush_timer.elapsed();
+                let flush_duration = flush_timer.stop_and_record();

                // Notify the tenant compaction loop if L0 compaction is needed.
                let l0_count = *watch_l0.borrow();
@@ -4519,7 +4681,11 @@ impl Timeline {
                            "delaying layer flush by {delay:.3}s for compaction backpressure at \
                            {l0_count} L0 layers ({frozen_count} frozen layers with {frozen_size} bytes)"
                        );
-                        
+                        let _delay_timer = self
+                            .metrics
+                            .flush_delay_histo
+                            .start_timer()
+                            .record_on_drop();
                        tokio::select! {
                            _ = tokio::time::sleep(flush_duration) => {},
                            _ = watch_l0.wait_for(|l0| *l0 < delay_threshold) => {},
@@ -4761,6 +4927,9 @@ impl Timeline {
            "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"
        );

+        self.metrics
+            .disk_consistent_lsn_gauge
+            .set(new_value.0 as i64);
        new_value != old_value
    }

@@ -5246,7 +5415,7 @@ impl Timeline {
        last_status: LastImageLayerCreationStatus,
        yield_for_l0: bool,
    ) -> Result<(Vec<ResidentLayer>, LastImageLayerCreationStatus), CreateImageLayersError> {
-        
+        let timer = self.metrics.create_images_time_histo.start_timer();

        if partitioning.parts.is_empty() {
            warn!("no partitions to create image layers for");
@@ -5279,8 +5448,8 @@ impl Timeline {

        let mut all_generated = true;

-        
-        let total_partitions = partitioning.parts.len();
+        let mut partition_processed = 0;
+        let mut total_partitions = partitioning.parts.len();
        let mut last_partition_processed = None;
        let mut partition_parts = partitioning.parts.clone();

@@ -5305,7 +5474,7 @@ impl Timeline {
                        break; // with found=false
                    }
                    partition_parts = partition_parts.split_off(i + 1); // Remove the first i + 1 elements
-                
+                    total_partitions = partition_parts.len();
                    // Update the start key to the partition start.
                    start = partition_parts[0].start().unwrap();
                    found = true;
@@ -5322,6 +5491,7 @@ impl Timeline {
            if self.cancel.is_cancelled() {
                return Err(CreateImageLayersError::Cancelled);
            }
+            partition_processed += 1;
            let img_range = start..partition.ranges.last().unwrap().end;
            let compact_metadata = partition.overlaps(&Key::metadata_key_range());
            if compact_metadata {
@@ -5484,16 +5654,28 @@ impl Timeline {
            .open_mut()?
            .track_new_image_layers(&image_layers, &self.metrics);
        drop_wlock(guard);
-        
+        let duration = timer.stop_and_record();

        // Creating image layers may have caused some previously visible layers to be covered
        if !image_layers.is_empty() {
            self.update_layer_visibility().await?;
        }

-   
+        let total_layer_size = image_layers
+            .iter()
+            .map(|l| l.metadata().file_size)
+            .sum::<u64>();

-        
+        if !image_layers.is_empty() {
+            info!(
+                "created {} image layers ({} bytes) in {}s, processed {} out of {} partitions",
+                image_layers.len(),
+                total_layer_size,
+                duration.as_secs_f64(),
+                partition_processed,
+                total_partitions
+            );
+        }

        Ok((
            image_layers,
@@ -5576,7 +5758,7 @@ impl Timeline {
    /// from our ancestor to be branches of this timeline.
    pub(crate) async fn prepare_to_detach_from_ancestor(
        self: &Arc<Timeline>,
-        tenant: &crate::tenant::Tenant,
+        tenant: &crate::tenant::TenantShard,
        options: detach_ancestor::Options,
        behavior: DetachBehavior,
        ctx: &RequestContext,
@@ -5595,7 +5777,7 @@ impl Timeline {
    /// resetting the tenant.
    pub(crate) async fn detach_from_ancestor_and_reparent(
        self: &Arc<Timeline>,
-        tenant: &crate::tenant::Tenant,
+        tenant: &crate::tenant::TenantShard,
        prepared: detach_ancestor::PreparedTimelineDetach,
        ancestor_timeline_id: TimelineId,
        ancestor_lsn: Lsn,
@@ -5619,7 +5801,7 @@ impl Timeline {
    /// The tenant must've been reset if ancestry was modified previously (in tenant manager).
    pub(crate) async fn complete_detaching_timeline_ancestor(
        self: &Arc<Timeline>,
-        tenant: &crate::tenant::Tenant,
+        tenant: &crate::tenant::TenantShard,
        attempt: detach_ancestor::Attempt,
        ctx: &RequestContext,
    ) -> Result<(), detach_ancestor::Error> {
@@ -6028,7 +6210,11 @@ impl Timeline {
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) -> Result<GcCutoffs, PageReconstructError> {
-        
+        let _timer = self
+            .metrics
+            .find_gc_cutoffs_histo
+            .start_timer()
+            .record_on_drop();

        pausable_failpoint!("Timeline::find_gc_cutoffs-pausable");

@@ -6095,7 +6281,7 @@ impl Timeline {
            guard = self.gc_lock.lock() => guard,
            _ = self.cancel.cancelled() => return Ok(GcResult::default()),
        };
-        
+        let timer = self.metrics.garbage_collect_histo.start_timer();

        fail_point!("before-timeline-gc");

@@ -6152,7 +6338,9 @@ impl Timeline {
        // It is an easy way to unset it when standby disappears without adding
        // more conf options.
        self.standby_horizon.store(Lsn::INVALID);
-       
+        self.metrics
+            .standby_horizon_gauge
+            .set(Lsn::INVALID.0 as i64);

        let res = self
            .gc_timeline(
@@ -6167,7 +6355,8 @@ impl Timeline {
            )
            .await?;

-       
+        // only record successes
+        timer.stop_and_record();

        Ok(res)
    }
@@ -6674,14 +6863,14 @@ impl Timeline {
    /// Persistently blocks gc for `Manual` reason.
    ///
    /// Returns true if no such block existed before, false otherwise.
-    pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
+    pub(crate) async fn block_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<bool> {
        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
        tenant.gc_block.insert(self, GcBlockingReason::Manual).await
    }

    /// Persistently unblocks gc for `Manual` reason.
-    pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
+    pub(crate) async fn unblock_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<()> {
        use crate::tenant::remote_timeline_client::index::GcBlockingReason;
        assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
        tenant.gc_block.remove(self, GcBlockingReason::Manual).await
@@ -6699,8 +6888,8 @@ impl Timeline {

    /// Force create an image layer and place it into the layer map.
    ///
-    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
+    /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
    /// placed into the layer map in one run AND be validated.
    #[cfg(test)]
    pub(super) async fn force_create_image_layer(
@@ -6756,8 +6945,8 @@ impl Timeline {

    /// Force create a delta layer and place it into the layer map.
    ///
-    /// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
-    /// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
+    /// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
+    /// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
    /// placed into the layer map in one run AND be validated.
    #[cfg(test)]
    pub(super) async fn force_create_delta_layer(
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -77,7 +77,7 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5;
 /// shard split, which gets expensive for large tenants.
 const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3;

-#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
+#[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize)]
 pub struct GcCompactionJobId(pub usize);

 impl std::fmt::Display for GcCompactionJobId {
@@ -105,6 +105,50 @@ pub enum GcCompactionQueueItem {
    Notify(GcCompactionJobId, Option<Lsn>),
 }

+/// Statistics for gc-compaction meta jobs, which contains several sub compaction jobs.
+#[derive(Debug, Clone, Serialize, Default)]
+pub struct GcCompactionMetaStatistics {
+    /// The total number of sub compaction jobs.
+    pub total_sub_compaction_jobs: usize,
+    /// The total number of sub compaction jobs that failed.
+    pub failed_sub_compaction_jobs: usize,
+    /// The total number of sub compaction jobs that succeeded.
+    pub succeeded_sub_compaction_jobs: usize,
+    /// The layer size before compaction.
+    pub before_compaction_layer_size: u64,
+    /// The layer size after compaction.
+    pub after_compaction_layer_size: u64,
+    /// The start time of the meta job.
+    pub start_time: Option<chrono::DateTime<chrono::Utc>>,
+    /// The end time of the meta job.
+    pub end_time: Option<chrono::DateTime<chrono::Utc>>,
+    /// The duration of the meta job.
+    pub duration_secs: f64,
+    /// The id of the meta job.
+    pub meta_job_id: GcCompactionJobId,
+    /// The LSN below which the layers are compacted, used to compute the statistics.
+    pub below_lsn: Lsn,
+    /// The retention ratio of the meta job (after_compaction_layer_size / before_compaction_layer_size)
+    pub retention_ratio: f64,
+}
+
+impl GcCompactionMetaStatistics {
+    fn finalize(&mut self) {
+        let end_time = chrono::Utc::now();
+        if let Some(start_time) = self.start_time {
+            if end_time > start_time {
+                let delta = end_time - start_time;
+                if let Ok(std_dur) = delta.to_std() {
+                    self.duration_secs = std_dur.as_secs_f64();
+                }
+            }
+        }
+        self.retention_ratio = self.after_compaction_layer_size as f64
+            / (self.before_compaction_layer_size as f64 + 1.0);
+        self.end_time = Some(end_time);
+    }
+}
+
 impl GcCompactionQueueItem {
    pub fn into_compact_info_resp(
        self,
@@ -142,6 +186,7 @@ struct GcCompactionQueueInner {
    queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
    guards: HashMap<GcCompactionJobId, GcCompactionGuardItems>,
    last_id: GcCompactionJobId,
+    meta_statistics: Option<GcCompactionMetaStatistics>,
 }

 impl GcCompactionQueueInner {
@@ -173,6 +218,7 @@ impl GcCompactionQueue {
                queued: VecDeque::new(),
                guards: HashMap::new(),
                last_id: GcCompactionJobId(0),
+                meta_statistics: None,
            }),
            consumer_lock: tokio::sync::Mutex::new(()),
        }
@@ -357,6 +403,23 @@ impl GcCompactionQueue {
        Ok(())
    }

+    async fn collect_layer_below_lsn(
+        &self,
+        timeline: &Arc<Timeline>,
+        lsn: Lsn,
+    ) -> Result<u64, CompactionError> {
+        let guard = timeline.layers.read().await;
+        let layer_map = guard.layer_map()?;
+        let layers = layer_map.iter_historic_layers().collect_vec();
+        let mut size = 0;
+        for layer in layers {
+            if layer.lsn_range.start <= lsn {
+                size += layer.file_size();
+            }
+        }
+        Ok(size)
+    }
+
    /// Notify the caller the job has finished and unblock GC.
    fn notify_and_unblock(&self, id: GcCompactionJobId) {
        info!("compaction job id={} finished", id);
@@ -366,6 +429,16 @@ impl GcCompactionQueue {
                let _ = tx.send(());
            }
        }
+        if let Some(ref meta_statistics) = guard.meta_statistics {
+            if meta_statistics.meta_job_id == id {
+                if let Ok(stats) = serde_json::to_string(&meta_statistics) {
+                    info!(
+                        "gc-compaction meta statistics for job id = {}: {}",
+                        id, stats
+                    );
+                }
+            }
+        }
    }

    fn clear_running_job(&self) {
@@ -405,7 +478,11 @@ impl GcCompactionQueue {
            let mut pending_tasks = Vec::new();
            // gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate.
            // And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN.
-            let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max();
+            let expected_l2_lsn = jobs
+                .iter()
+                .map(|job| job.compact_lsn_range.end)
+                .max()
+                .unwrap();
            for job in jobs {
                // Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
                // until we do further refactors to allow directly call `compact_with_gc`.
@@ -430,9 +507,13 @@ impl GcCompactionQueue {
            if !auto {
                pending_tasks.push(GcCompactionQueueItem::Notify(id, None));
            } else {
-                pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn));
+                pending_tasks.push(GcCompactionQueueItem::Notify(id, Some(expected_l2_lsn)));
            }

+            let layer_size = self
+                .collect_layer_below_lsn(timeline, expected_l2_lsn)
+                .await?;
+
            {
                let mut guard = self.inner.lock().unwrap();
                let mut tasks = Vec::new();
@@ -444,7 +525,16 @@ impl GcCompactionQueue {
                for item in tasks {
                    guard.queued.push_front(item);
                }
+                guard.meta_statistics = Some(GcCompactionMetaStatistics {
+                    meta_job_id: id,
+                    start_time: Some(chrono::Utc::now()),
+                    before_compaction_layer_size: layer_size,
+                    below_lsn: expected_l2_lsn,
+                    total_sub_compaction_jobs: jobs_len,
+                    ..Default::default()
+                });
            }
+
            info!(
                "scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs",
                jobs_len
@@ -573,6 +663,10 @@ impl GcCompactionQueue {
                    Err(err) => {
                        warn!(%err, "failed to run gc-compaction subcompaction job");
                        self.clear_running_job();
+                        let mut guard = self.inner.lock().unwrap();
+                        if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                            meta_statistics.failed_sub_compaction_jobs += 1;
+                        }
                        return Err(err);
                    }
                };
@@ -582,8 +676,34 @@ impl GcCompactionQueue {
                    // we need to clean things up before returning from the function.
                    yield_for_l0 = true;
                }
+                {
+                    let mut guard = self.inner.lock().unwrap();
+                    if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                        meta_statistics.succeeded_sub_compaction_jobs += 1;
+                    }
+                }
            }
            GcCompactionQueueItem::Notify(id, l2_lsn) => {
+                let below_lsn = {
+                    let mut guard = self.inner.lock().unwrap();
+                    if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                        meta_statistics.below_lsn
+                    } else {
+                        Lsn::INVALID
+                    }
+                };
+                let layer_size = if below_lsn != Lsn::INVALID {
+                    self.collect_layer_below_lsn(timeline, below_lsn).await?
+                } else {
+                    0
+                };
+                {
+                    let mut guard = self.inner.lock().unwrap();
+                    if let Some(ref mut meta_statistics) = guard.meta_statistics {
+                        meta_statistics.after_compaction_layer_size = layer_size;
+                        meta_statistics.finalize();
+                    }
+                }
                self.notify_and_unblock(id);
                if let Some(l2_lsn) = l2_lsn {
                    let current_l2_lsn = timeline
@@ -1133,15 +1253,16 @@ impl Timeline {

        // 1. L0 Compact
        let l0_outcome = {
-            
-            self
+            let timer = self.metrics.compact_time_histo.start_timer();
+            let l0_outcome = self
                .compact_level0(
                    target_file_size,
                    options.flags.contains(CompactFlags::ForceL0Compaction),
                    ctx,
                )
-                .await?
-        
+                .await?;
+            timer.stop_and_record();
+            l0_outcome
        };

        if options.flags.contains(CompactFlags::OnlyL0Compaction) {
@@ -4040,7 +4161,7 @@ impl TimelineAdaptor {
        key_range: &Range<Key>,
        ctx: &RequestContext,
    ) -> Result<(), CreateImageLayersError> {
-       
+        let timer = self.timeline.metrics.create_images_time_histo.start_timer();

        let image_layer_writer = ImageLayerWriter::new(
            self.timeline.conf,
@@ -4086,7 +4207,7 @@ impl TimelineAdaptor {
            self.new_images.push(image_layer);
        }

-        
+        timer.stop_and_record();

        Ok(())
    }
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -18,8 +18,8 @@ use crate::tenant::remote_timeline_client::{
    PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
 };
 use crate::tenant::{
-    CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError,
-    Timeline, TimelineOrOffloaded,
+    CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, TenantManifestError,
+    TenantShard, Timeline, TimelineOrOffloaded,
 };
 use crate::virtual_file::MaybeFatalIo;

@@ -113,7 +113,7 @@ pub(super) async fn delete_local_timeline_directory(
 /// It is important that this gets called when DeletionGuard is being held.
 /// For more context see comments in [`make_timeline_delete_guard`]
 async fn remove_maybe_offloaded_timeline_from_tenant(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline: &TimelineOrOffloaded,
    _: &DeletionGuard, // using it as a witness
 ) -> anyhow::Result<()> {
@@ -192,7 +192,7 @@ impl DeleteTimelineFlow {
    // error out if some of the shutdown tasks have already been completed!
    #[instrument(skip_all)]
    pub async fn run(
-        tenant: &Arc<Tenant>,
+        tenant: &Arc<TenantShard>,
        timeline_id: TimelineId,
    ) -> Result<(), DeleteTimelineError> {
        super::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -288,7 +288,7 @@ impl DeleteTimelineFlow {
    /// Shortcut to create Timeline in stopping state and spawn deletion task.
    #[instrument(skip_all, fields(%timeline_id))]
    pub(crate) async fn resume_deletion(
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline_id: TimelineId,
        local_metadata: &TimelineMetadata,
        remote_client: RemoteTimelineClient,
@@ -338,7 +338,7 @@ impl DeleteTimelineFlow {
    fn schedule_background(
        guard: DeletionGuard,
        conf: &'static PageServerConf,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        timeline: TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) {
@@ -381,7 +381,7 @@ impl DeleteTimelineFlow {
    async fn background(
        mut guard: DeletionGuard,
        conf: &PageServerConf,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        timeline: &TimelineOrOffloaded,
        remote_client: Arc<RemoteTimelineClient>,
    ) -> Result<(), DeleteTimelineError> {
@@ -435,7 +435,7 @@ pub(super) enum TimelineDeleteGuardKind {
 }

 pub(super) fn make_timeline_delete_guard(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline_id: TimelineId,
    guard_kind: TimelineDeleteGuardKind,
 ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {
--- a/pageserver/src/tenant/timeline/detach_ancestor.rs
+++ b/pageserver/src/tenant/timeline/detach_ancestor.rs
@@ -23,7 +23,7 @@ use super::layer_manager::LayerManager;
 use super::{FlushLayerError, Timeline};
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::TaskKind;
-use crate::tenant::Tenant;
+use crate::tenant::TenantShard;
 use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
 use crate::tenant::storage_layer::layer::local_layer_path;
 use crate::tenant::storage_layer::{
@@ -265,7 +265,7 @@ async fn generate_tombstone_image_layer(
 /// See [`Timeline::prepare_to_detach_from_ancestor`]
 pub(super) async fn prepare(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    behavior: DetachBehavior,
    options: Options,
    ctx: &RequestContext,
@@ -590,7 +590,7 @@ pub(super) async fn prepare(

 async fn start_new_attempt(
    detached: &Timeline,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
 ) -> Result<Attempt, Error> {
@@ -611,7 +611,7 @@ async fn start_new_attempt(

 async fn continue_with_blocked_gc(
    detached: &Timeline,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
 ) -> Result<Attempt, Error> {
@@ -622,7 +622,7 @@ async fn continue_with_blocked_gc(

 fn obtain_exclusive_attempt(
    detached: &Timeline,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
 ) -> Result<Attempt, Error> {
@@ -655,7 +655,7 @@ fn obtain_exclusive_attempt(

 fn reparented_direct_children(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
 ) -> Result<HashSet<TimelineId>, Error> {
    let mut all_direct_children = tenant
        .timelines
@@ -950,7 +950,7 @@ impl DetachingAndReparenting {
 /// See [`Timeline::detach_from_ancestor_and_reparent`].
 pub(super) async fn detach_and_reparent(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    prepared: PreparedTimelineDetach,
    ancestor_timeline_id: TimelineId,
    ancestor_lsn: Lsn,
@@ -1184,7 +1184,7 @@ pub(super) async fn detach_and_reparent(

 pub(super) async fn complete(
    detached: &Arc<Timeline>,
-    tenant: &Tenant,
+    tenant: &TenantShard,
    mut attempt: Attempt,
    _ctx: &RequestContext,
 ) -> Result<(), Error> {
@@ -1258,7 +1258,7 @@ where
 }

 fn check_no_archived_children_of_ancestor(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    detached: &Arc<Timeline>,
    ancestor: &Arc<Timeline>,
    ancestor_lsn: Lsn,
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -33,7 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
 use crate::tenant::storage_layer::LayerVisibilityHint;
 use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
 use crate::tenant::timeline::EvictionError;
-use crate::tenant::{LogicalSizeCalculationCause, Tenant};
+use crate::tenant::{LogicalSizeCalculationCause, TenantShard};

 #[derive(Default)]
 pub struct EvictionTaskTimelineState {
@@ -48,7 +48,7 @@ pub struct EvictionTaskTenantState {
 impl Timeline {
    pub(super) fn launch_eviction_task(
        self: &Arc<Self>,
-        parent: Arc<Tenant>,
+        parent: Arc<TenantShard>,
        background_tasks_can_start: Option<&completion::Barrier>,
    ) {
        let self_clone = Arc::clone(self);
@@ -75,7 +75,7 @@ impl Timeline {
    }

    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
-    async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
+    async fn eviction_task(self: Arc<Self>, tenant: Arc<TenantShard>) {
        // acquire the gate guard only once within a useful span
        let Ok(guard) = self.gate.enter() else {
            return;
@@ -118,7 +118,7 @@ impl Timeline {
    #[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
    async fn eviction_iteration(
        self: &Arc<Self>,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        policy: &EvictionPolicy,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -126,7 +126,7 @@ impl Timeline {
    ) -> ControlFlow<(), Instant> {
        debug!("eviction iteration: {policy:?}");
        let start = Instant::now();
-        let (period, _) = match policy {
+        let (period, threshold) = match policy {
            EvictionPolicy::NoEviction => {
                // check again in 10 seconds; XXX config watch mechanism
                return ControlFlow::Continue(Instant::now() + Duration::from_secs(10));
@@ -159,13 +159,23 @@ impl Timeline {
            period,
            BackgroundLoopKind::Eviction,
        );
+        // FIXME: if we were to mix policies on a pageserver, we would have no way to sense this. I
+        // don't think that is a relevant fear however, and regardless the imitation should be the
+        // most costly part.
+        crate::metrics::EVICTION_ITERATION_DURATION
+            .get_metric_with_label_values(&[
+                &format!("{}", period.as_secs()),
+                &format!("{}", threshold.as_secs()),
+            ])
+            .unwrap()
+            .observe(elapsed.as_secs_f64());

        ControlFlow::Continue(start + period)
    }

    async fn eviction_iteration_threshold(
        self: &Arc<Self>,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -299,7 +309,7 @@ impl Timeline {
    /// disk usage based eviction task.
    async fn imitiate_only(
        self: &Arc<Self>,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
@@ -315,7 +325,7 @@ impl Timeline {
        &self,
        cancel: &CancellationToken,
        ctx: &RequestContext,
-    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit> {
+    ) -> ControlFlow<(), BackgroundLoopSemaphorePermit<'static>> {
        let acquire_permit =
            crate::tenant::tasks::acquire_concurrency_permit(BackgroundLoopKind::Eviction, ctx);

@@ -353,11 +363,11 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_layer_accesses(
        &self,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        p: &EvictionPolicyLayerAccessThreshold,
        cancel: &CancellationToken,
        gate: &GateGuard,
-        permit: BackgroundLoopSemaphorePermit,
+        permit: BackgroundLoopSemaphorePermit<'static>,
        ctx: &RequestContext,
    ) -> ControlFlow<()> {
        if !self.tenant_shard_id.is_shard_zero() {
@@ -489,7 +499,7 @@ impl Timeline {
    #[instrument(skip_all)]
    async fn imitate_synthetic_size_calculation_worker(
        &self,
-        tenant: &Tenant,
+        tenant: &TenantShard,
        cancel: &CancellationToken,
        ctx: &RequestContext,
    ) {
--- a/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
+++ b/pageserver/src/tenant/timeline/heatmap_layers_downloader.rs
@@ -59,7 +59,11 @@ impl HeatmapLayersDownloader {
                    return;
                };

-                
+                tracing::info!(
+                    resident_size=%timeline.resident_physical_size(),
+                    heatmap_layers=%heatmap.all_layers().count(),
+                    "Starting heatmap layers download"
+                );

                let stream = futures::stream::iter(heatmap.all_layers().cloned().filter_map(
                    |layer| {
@@ -89,7 +93,7 @@ impl HeatmapLayersDownloader {
                tokio::select! {
                    _ = stream.collect::<()>() => {
                        tracing::info!(
-                            
+                            resident_size=%timeline.resident_physical_size(),
                            "Heatmap layers download completed"
                        );
                    },
--- a/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
+++ b/pageserver/src/tenant/timeline/import_pgdata/upcall_api.rs
@@ -1,6 +1,6 @@
 //! FIXME: most of this is copy-paste from mgmt_api.rs ; dedupe into a `reqwest_utils::Client` crate.
 use pageserver_client::mgmt_api::{Error, ResponseErrorMessageExt};
-use reqwest::Method;
+use reqwest::{Certificate, Method};
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
 use tracing::error;
@@ -34,7 +34,7 @@ impl Client {
        };
        let mut http_client = reqwest::Client::builder();
        for cert in &conf.ssl_ca_certs {
-            http_client = http_client.add_root_certificate(cert.clone());
+            http_client = http_client.add_root_certificate(Certificate::from_der(cert.contents())?);
        }
        let http_client = http_client.build()?;

--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -290,7 +290,7 @@ impl OpenLayerManager {
        lsn: Lsn,
        last_freeze_at: &AtomicLsn,
        write_lock: &mut tokio::sync::MutexGuard<'_, Option<TimelineWriterState>>,
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) -> bool {
        let Lsn(last_record_lsn) = lsn;
        let end_lsn = Lsn(last_record_lsn + 1);
@@ -299,6 +299,10 @@ impl OpenLayerManager {
            let open_layer_rc = Arc::clone(open_layer);
            open_layer.freeze(end_lsn).await;

+            // Increment the frozen layer metrics. This is decremented in `finish_flush_l0_layer()`.
+            // TODO: It would be nicer to do this via `InMemoryLayer::drop()`, but it requires a
+            // reference to the timeline metrics. Other methods use a metrics borrow as well.
+            metrics.inc_frozen_layer(open_layer);

            // The layer is no longer open, update the layer map to reflect this.
            // We will replace it with on-disk historics below.
@@ -330,12 +334,16 @@ impl OpenLayerManager {
    pub(crate) fn track_new_image_layers(
        &mut self,
        image_layers: &[ResidentLayer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for layer in image_layers {
            Self::insert_historic_layer(layer.as_ref().clone(), &mut updates, &mut self.layer_fmgr);

+            // record these here instead of Layer::finish_creating because otherwise partial
+            // failure with create_image_layers would balloon up the physical size gauge. downside
+            // is that all layers need to be created before metrics are updated.
+            metrics.record_new_file_metrics(layer.layer_desc().file_size);
        }
        updates.flush();
    }
@@ -345,13 +353,14 @@ impl OpenLayerManager {
        &mut self,
        delta_layer: Option<&ResidentLayer>,
        frozen_layer_for_check: &Arc<InMemoryLayer>,
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let inmem = self
            .layer_map
            .frozen_layers
            .pop_front()
            .expect("there must be a inmem layer to flush");
+        metrics.dec_frozen_layer(&inmem);

        // Only one task may call this function at a time (for this
        // timeline). If two tasks tried to flush the same frozen
@@ -361,6 +370,7 @@ impl OpenLayerManager {
        if let Some(l) = delta_layer {
            let mut updates = self.layer_map.batch_update();
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
            updates.flush();
        }
    }
@@ -370,11 +380,12 @@ impl OpenLayerManager {
        &mut self,
        compact_from: &[Layer],
        compact_to: &[ResidentLayer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for l in compact_to {
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        for l in compact_from {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
@@ -427,7 +438,7 @@ impl OpenLayerManager {
        rewrite_layers: &[(Layer, ResidentLayer)],
        drop_layers: &[Layer],
        add_layers: &[ResidentLayer],
-        _metrics: &TimelineMetrics,
+        metrics: &TimelineMetrics,
    ) {
        let mut updates = self.layer_map.batch_update();
        for (old_layer, new_layer) in rewrite_layers {
@@ -458,12 +469,14 @@ impl OpenLayerManager {
                &mut self.layer_fmgr,
            );

+            metrics.record_new_file_metrics(new_layer.layer_desc().file_size);
        }
        for l in drop_layers {
            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
        for l in add_layers {
            Self::insert_historic_layer(l.as_ref().clone(), &mut updates, &mut self.layer_fmgr);
+            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        updates.flush();
    }
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -23,6 +23,7 @@ pub(super) struct LogicalSize {
    /// the initial size at a different LSN.
    pub initial_logical_size: OnceCell<(
        u64,
+        crate::metrics::initial_logical_size::FinishedCalculationGuard,
    )>,

    /// Cancellation for the best-effort logical size calculation.
@@ -129,7 +130,11 @@ impl CurrentLogicalSize {
 impl LogicalSize {
    pub(super) fn empty_initial() -> Self {
        Self {
-            initial_logical_size: OnceCell::with_value((0,)),
+            initial_logical_size: OnceCell::with_value((0, {
+                crate::metrics::initial_logical_size::START_CALCULATION
+                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .calculation_result_saved()
+            })),
            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
@@ -154,7 +159,7 @@ impl LogicalSize {
        //                  ^^^ keep this type explicit so that the casts in this function break if
        //                  we change the type.
        match self.initial_logical_size.get() {
-            Some((initial_size, )) => {
+            Some((initial_size, _)) => {
                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
                    .unwrap()))
@@ -176,7 +181,7 @@ impl LogicalSize {
    /// available for re-use. This doesn't contain the incremental part.
    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, )| *s),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
            _ => None,
        }
    }
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -8,7 +8,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
 use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard};
 use crate::tenant::{
-    DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
+    DeleteTimelineError, OffloadedTimeline, TenantManifestError, TenantShard, TimelineOrOffloaded,
 };

 #[derive(thiserror::Error, Debug)]
@@ -33,7 +33,7 @@ impl From<TenantManifestError> for OffloadError {
 }

 pub(crate) async fn offload_timeline(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline: &Arc<Timeline>,
 ) -> Result<(), OffloadError> {
    debug_assert_current_span_has_tenant_and_timeline_id();
@@ -123,7 +123,7 @@ pub(crate) async fn offload_timeline(
 ///
 /// Returns the strong count of the timeline `Arc`
 fn remove_timeline_from_tenant(
-    tenant: &Tenant,
+    tenant: &TenantShard,
    timeline: &Timeline,
    _: &DeletionGuard, // using it as a witness
 ) -> usize {
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -15,17 +15,19 @@ use super::Timeline;
 use crate::context::RequestContext;
 use crate::import_datadir;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded};
+use crate::tenant::{
+    CreateTimelineError, CreateTimelineIdempotency, TenantShard, TimelineOrOffloaded,
+};

 /// A timeline with some of its files on disk, being initialized.
 /// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
 /// its local files are removed.  If we crash while this class exists, then the timeline's local
-/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
+/// state is cleaned up during [`TenantShard::clean_up_timelines`], because the timeline's content isn't in remote storage.
 ///
 /// The caller is responsible for proper timeline data filling before the final init.
 #[must_use]
 pub struct UninitializedTimeline<'t> {
-    pub(crate) owning_tenant: &'t Tenant,
+    pub(crate) owning_tenant: &'t TenantShard,
    timeline_id: TimelineId,
    raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
    /// Whether we spawned the inner Timeline's tasks such that we must later shut it down
@@ -35,7 +37,7 @@ pub struct UninitializedTimeline<'t> {

 impl<'t> UninitializedTimeline<'t> {
    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
+        owning_tenant: &'t TenantShard,
        timeline_id: TimelineId,
        raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
    ) -> Self {
@@ -156,7 +158,7 @@ impl<'t> UninitializedTimeline<'t> {
    /// Prepares timeline data by loading it from the basebackup archive.
    pub(crate) async fn import_basebackup_from_tar(
        mut self,
-        tenant: Arc<Tenant>,
+        tenant: Arc<TenantShard>,
        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
        base_lsn: Lsn,
        broker_client: storage_broker::BrokerClientChannel,
@@ -227,17 +229,17 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
        }
    }
-    // Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other
+    // Having cleaned up, we can release this TimelineId in `[TenantShard::timelines_creating]` to allow other
    // timeline creation attempts under this TimelineId to proceed
    drop(create_guard);
 }

 /// A guard for timeline creations in process: as long as this object exists, the timeline ID
-/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
+/// is kept in `[TenantShard::timelines_creating]` to exclude concurrent attempts to create the same timeline.
 #[must_use]
 pub(crate) struct TimelineCreateGuard {
    pub(crate) _tenant_gate_guard: GateGuard,
-    pub(crate) owning_tenant: Arc<Tenant>,
+    pub(crate) owning_tenant: Arc<TenantShard>,
    pub(crate) timeline_id: TimelineId,
    pub(crate) timeline_path: Utf8PathBuf,
    pub(crate) idempotency: CreateTimelineIdempotency,
@@ -263,7 +265,7 @@ pub(crate) enum TimelineExclusionError {

 impl TimelineCreateGuard {
    pub(crate) fn new(
-        owning_tenant: &Arc<Tenant>,
+        owning_tenant: &Arc<TenantShard>,
        timeline_id: TimelineId,
        timeline_path: Utf8PathBuf,
        idempotency: CreateTimelineIdempotency,
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -39,6 +39,10 @@ use utils::postgres_client::{
 use super::walreceiver_connection::{WalConnectionStatus, WalReceiverError};
 use super::{TaskEvent, TaskHandle, TaskStateUpdate, WalReceiverConf};
 use crate::context::{DownloadBehavior, RequestContext};
+use crate::metrics::{
+    WALRECEIVER_ACTIVE_MANAGERS, WALRECEIVER_BROKER_UPDATES, WALRECEIVER_CANDIDATES_ADDED,
+    WALRECEIVER_CANDIDATES_REMOVED, WALRECEIVER_SWITCHES,
+};
 use crate::task_mgr::TaskKind;
 use crate::tenant::{Timeline, debug_assert_current_span_has_tenant_and_timeline_id};

@@ -72,6 +76,11 @@ pub(super) async fn connection_manager_loop_step(
        }
    }

+    WALRECEIVER_ACTIVE_MANAGERS.inc();
+    scopeguard::defer! {
+        WALRECEIVER_ACTIVE_MANAGERS.dec();
+    }
+
    let id = TenantTimelineId {
        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
        timeline_id: connection_manager_state.timeline.timeline_id,
@@ -517,6 +526,9 @@ impl ConnectionManagerState {

    /// Shuts down the current connection (if any) and immediately starts another one with the given connection string.
    async fn change_connection(&mut self, new_sk: NewWalConnectionCandidate, ctx: &RequestContext) {
+        WALRECEIVER_SWITCHES
+            .with_label_values(&[new_sk.reason.name()])
+            .inc();

        self.drop_old_connection(true).await;

@@ -719,6 +731,8 @@ impl ConnectionManagerState {
            }
        };

+        WALRECEIVER_BROKER_UPDATES.inc();
+
        trace!(
            "safekeeper info update: standby_horizon(cutoff)={}",
            timeline_update.standby_horizon
@@ -728,6 +742,10 @@ impl ConnectionManagerState {
            self.timeline
                .standby_horizon
                .store(Lsn(timeline_update.standby_horizon));
+            self.timeline
+                .metrics
+                .standby_horizon_gauge
+                .set(timeline_update.standby_horizon as i64);
        }

        let new_safekeeper_id = NodeId(timeline_update.safekeeper_id);
@@ -745,6 +763,7 @@ impl ConnectionManagerState {
                %new_safekeeper_id,
                "New SK node was added",
            );
+            WALRECEIVER_CANDIDATES_ADDED.inc();
        }
    }

@@ -1032,6 +1051,7 @@ impl ConnectionManagerState {
                    "Safekeeper node {node_id} did not send events for over {lagging_wal_timeout:?}, not retrying the connections"
                );
                self.wal_connection_retries.remove(&node_id);
+                WALRECEIVER_CANDIDATES_REMOVED.inc();
            }
        }
    }
@@ -1058,7 +1078,6 @@ struct NewWalConnectionCandidate {
    safekeeper_id: NodeId,
    wal_source_connconf: PgConnectionConfig,
    availability_zone: Option<String>,
-    #[allow(dead_code)]
    reason: ReconnectReason,
 }

@@ -1087,6 +1106,18 @@ enum ReconnectReason {
    },
 }

+impl ReconnectReason {
+    fn name(&self) -> &str {
+        match self {
+            ReconnectReason::NoExistingConnection => "NoExistingConnection",
+            ReconnectReason::LaggingWal { .. } => "LaggingWal",
+            ReconnectReason::SwitchAvailabilityZone => "SwitchAvailabilityZone",
+            ReconnectReason::NoWalTimeout { .. } => "NoWalTimeout",
+            ReconnectReason::NoKeepAlives { .. } => "NoKeepAlives",
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use pageserver_api::config::defaults::DEFAULT_WAL_RECEIVER_PROTOCOL;
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -36,6 +36,7 @@ use wal_decoder::wire_format::FromWireFormat;

 use super::TaskStateUpdate;
 use crate::context::RequestContext;
+use crate::metrics::{LIVE_CONNECTIONS, WAL_INGEST, WALRECEIVER_STARTED_CONNECTIONS};
 use crate::pgdatadir_mapping::DatadirModification;
 use crate::task_mgr::{TaskKind, WALRECEIVER_RUNTIME};
 use crate::tenant::{
@@ -136,7 +137,7 @@ pub(super) async fn handle_walreceiver_connection(
        GateError::GateClosed => WalReceiverError::ClosedGate,
    })?;

-  
+    WALRECEIVER_STARTED_CONNECTIONS.inc();

    // Connect to the database in replication mode.
    info!("connecting to {wal_source_connconf:?}");
@@ -222,6 +223,10 @@ pub(super) async fn handle_walreceiver_connection(
        .instrument(tracing::info_span!("poller")),
    );

+    let _guard = LIVE_CONNECTIONS
+        .with_label_values(&["wal_receiver"])
+        .guard();
+
    let identify = identify_system(&replication_client).await?;
    info!("{identify:?}");

@@ -339,7 +344,7 @@ pub(super) async fn handle_walreceiver_connection(

        let status_update = match replication_message {
            ReplicationMessage::RawInterpretedWalRecords(raw) => {
-               
+                WAL_INGEST.bytes_received.inc_by(raw.data().len() as u64);

                let mut uncommitted_records = 0;

@@ -412,13 +417,21 @@ pub(super) async fn handle_walreceiver_connection(
                    ctx: &RequestContext,
                    uncommitted: &mut u64,
                ) -> anyhow::Result<()> {
-                  
+                    let stats = modification.stats();
                    modification.commit(ctx).await?;
-                  
+                    WAL_INGEST.records_committed.inc_by(*uncommitted);
+                    WAL_INGEST.inc_values_committed(&stats);
                    *uncommitted = 0;
                    Ok(())
                }

+                if !records.is_empty() {
+                    timeline
+                        .metrics
+                        .wal_records_received
+                        .inc_by(records.len() as u64);
+                }
+
                for interpreted in records {
                    if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes)
                        && uncommitted_records > 0
@@ -428,6 +441,9 @@ pub(super) async fn handle_walreceiver_connection(

                    let local_next_record_lsn = interpreted.next_record_lsn;

+                    if interpreted.is_observed() {
+                        WAL_INGEST.records_observed.inc();
+                    }

                    walingest
                        .ingest_record(interpreted, &mut modification, &ctx)
@@ -499,9 +515,12 @@ pub(super) async fn handle_walreceiver_connection(
                    filtered: &mut u64,
                    ctx: &RequestContext,
                ) -> anyhow::Result<()> {
-                   
+                    let stats = modification.stats();
                    modification.commit(ctx).await?;
-                   
+                    WAL_INGEST
+                        .records_committed
+                        .inc_by(*uncommitted - *filtered);
+                    WAL_INGEST.inc_values_committed(&stats);
                    *uncommitted = 0;
                    *filtered = 0;
                    Ok(())
@@ -515,7 +534,7 @@ pub(super) async fn handle_walreceiver_connection(

                trace!("received XLogData between {startlsn} and {endlsn}");

-            
+                WAL_INGEST.bytes_received.inc_by(data.len() as u64);
                waldecoder.feed_bytes(data);

                {
@@ -557,6 +576,7 @@ pub(super) async fn handle_walreceiver_connection(
                        }

                        // Ingest the records without immediately committing them.
+                        timeline.metrics.wal_records_received.inc();
                        let ingested = walingest
                            .ingest_record(interpreted, &mut modification, &ctx)
                            .await
@@ -572,7 +592,7 @@ pub(super) async fn handle_walreceiver_connection(
                            })?;
                        if !ingested {
                            tracing::debug!("ingest: filtered out record @ LSN {next_record_lsn}");
-                          
+                            WAL_INGEST.records_filtered.inc();
                            filtered_records += 1;
                        }

--- a/pageserver/src/tenant/vectored_blob_io.rs
+++ b/pageserver/src/tenant/vectored_blob_io.rs
@@ -151,7 +151,6 @@ impl VectoredBlob {
    }

    /// Returns the raw blob including header.
-    #[allow(unused)]
    pub(crate) fn raw_with_header<'a>(&self, buf: &BufView<'a>) -> BufView<'a> {
        buf.view(self.header_start..self.end)
    }
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -10,7 +10,7 @@ use pageserver_api::models::PageserverUtilization;
 use utils::serde_percent::Percent;

 use crate::config::PageServerConf;
-
+use crate::metrics::NODE_UTILIZATION_SCORE;
 use crate::tenant::mgr::TenantManager;

 pub(crate) fn regenerate(
@@ -53,7 +53,7 @@ pub(crate) fn regenerate(
    // Express a static value for how many shards we may schedule on one node
    const MAX_SHARDS: u32 = 5000;

-    let doc = PageserverUtilization {
+    let mut doc = PageserverUtilization {
        disk_usage_bytes: used,
        free_space_bytes: free,
        disk_wanted_bytes,
@@ -63,7 +63,10 @@ pub(crate) fn regenerate(
        utilization_score: None,
        captured_at: utils::serde_system_time::SystemTime(captured_at),
    };
-    
+
+    // Initialize `PageserverUtilization::utilization_score`
+    let score = doc.cached_score();
+    NODE_UTILIZATION_SCORE.set(score);

    Ok(doc)
 }
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Alex Chi Z	3357edc442	drain instead of mark offline Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-23 16:03:55 -04:00
Alex Chi Z	6d26b2dd4d	test(storcon): ensure essential scheduling mode attaches tenant to secondary Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-23 15:57:37 -04:00
devin-ai-integration[bot]	8e09ecf2ab	Fix KeyError in physical replication benchmark test (#11675 ) # Fix KeyError in physical replication benchmark test This PR fixes the failing physical replication benchmark test that was encountering a KeyError: 'endpoints'. The issue was in accessing `project["project"]["endpoints"][0]["id"]` when it should be `project["endpoints"][0]["id"]`, consistent with how endpoints are accessed elsewhere in the codebase. Fixed the issue in both test functions: - test_ro_replica_lag - test_replication_start_stop Link to Devin run: https://app.devin.ai/sessions/be3fe9a9ee5942e4b12e74a7055f541b Requested by: Peter Bendel Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com> Co-authored-by: peterbendel@neon.tech <peterbendel@neon.tech>	2025-04-23 14:51:08 +00:00
Mikhail Kot	c3534cea39	Rename object_storage->endpoint_storage (#11678 ) 1. Rename service to avoid ambiguity as discussed in Slack 2. Ignore endpoint_id in read paths as requested in https://github.com/neondatabase/cloud/issues/26346#issuecomment-2806758224	2025-04-23 14:03:19 +00:00
Folke Behrens	21d3d60cef	proxy/pglb: Add in-process connection support (#11677 ) Define a `Connection` and a `Stream` type that resemble simple QUIC connections and (multiplexed) streams.	2025-04-23 12:18:30 +00:00
Tristan Partin	b00db536bb	Add CPU architecture to the remote extensions object key (#11590 ) ARM computes are incoming and we need to account for that in remote extensions. Previously, we just blindly assumed that all computes were x86_64. Note that we use the Go architecture naming convention instead of the Rust one directly to do our best and be consistent across the stack. Part-of: https://github.com/neondatabase/cloud/issues/23148 Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-04-22 22:47:22 +00:00
Arpad Müller	149cbd1e0a	Support single and two safekeeper scenarios (#11483 ) In tests and when one safekeeper is down in small regions, we need to contend with one or two safekeepers. Before, we gave an error in `safekeepers_for_new_timeline`. Now we just silently allow the timeline to be created on one or two safekeepers. Part of #9011	2025-04-22 21:27:01 +00:00
Alexander Lakhin	7b949daf13	fix(test): allow reconcile errors in test_storage_controller_heartbeats (#11665 ) ## Problem test_storage_controller_heartbeats is flaky because of unallowed reconciler errors (#11625) ## Summary of changes Allow reconcile errors as in other tests in test_storage_controller.py.	2025-04-22 18:13:16 +00:00
Konstantin Knizhnik	132b6154bb	Unlogged build debug compare local v2 (#11554 ) ## Problem Init fork is used in DEBUG_COMPARE_LOCAL to determine unlogged relation or unlogged build. But it is created only after the relation is initialized and so can be swapped out, producing `Page is evicted with zero LSN` error. ## Summary of changes Create init fork together with main fork for unlogged relations in DEBUG_COMPARE_LOCAL mode. --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-04-22 18:07:45 +00:00
Alex Chi Z.	ad3519ebcb	fix(pageserver): report synthetic size = 1 if all tls offloaded (#11648 ) ## Problem A quick workaround for https://github.com/neondatabase/neon/issues/11631 ## Summary of changes Report synthetic size == 1 if all timelines are offloaded. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-22 14:28:22 +00:00
Dmitrii Kovalkov	6173c0f44c	safekeeper: add enable_tls_wal_service_api (#11520 ) ## Problem Safekeeper doesn't use TLS in wal service - Closes: https://github.com/neondatabase/cloud/issues/27302 ## Summary of changes - Add `enable_tls_wal_service_api` option to safekeeper's cmd arguments - Propagate `tls_server_config` to `wal_service` if the option is enabled - Create `BACKGROUND_RUNTIME` for small background tasks and offload SSL certificate reloader to it. No integration tests for now because support from compute side is required: https://github.com/neondatabase/cloud/issues/25823	2025-04-22 13:19:03 +00:00
a-masterov	fd916abf25	Remove NOTICE messages, which can make the pg_repack regression test fail. (#11659 ) ## Problem The pg_repack test can be flaky due to unpredictable `NOTICE` messages about waiting for some processes. E.g., ``` INFO: repacking table "public.issue3_2" +NOTICE: Waiting for 1 transactions to finish. First PID: 427 ``` ## Summary of changes The `client_min_messages` set to `warning` for the regression tests.	2025-04-22 11:43:45 +00:00
Alexander Bayandin	cd2e1fbc7c	CI(benchmarks): upload perf results for passed tests (#11649 ) ## Problem We run benchmarks in batches (five parallel jobs on different runners). If any test in a batch fails, we won’t upload any results for that batch, even for the tests that passed. ## Summary of changes - Move the results upload to a separate step in the run-python-test-set action, and execute this step even if tests fail.	2025-04-22 09:41:28 +00:00
Tristan Partin	5df4a747e6	Update pgbouncer in compute images to 1.24.1 (#11651 ) Fixes CVE-2025-2291. Link: https://www.postgresql.org/about/news/pgbouncer-1241-released-fixes-cve-2025-2291-3059/ Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-04-21 17:49:17 +00:00
Vlad Lazar	cbf442292b	pageserver: handle empty get vectored queries (#11652 ) ## Problem If all batched requests are excluded from the query by `Timeine::get_rel_page_at_lsn_batched` (e.g. because they are past the end of the relation), the read path would panic since it doesn't expect empty queries. This is a change in behaviour that was introduced with the scattered query implementation. ## Summary of Changes Handle empty queries explicitly.	2025-04-21 17:45:16 +00:00
Heikki Linnakangas	4d0c1e8b78	refactor: Extract some code in pagebench getpage command to function (#11563 ) This makes it easier to add a different client implementation alongside the current one. I started working on a new gRPC-based protocol to replace the libpq protocol, which will introduce a new function like `client_libpq`, but for the new protocol. It's a little more readable with less indentation anyway.	2025-04-19 08:38:03 +00:00
JC Grünhage	3158442a59	fix(ci): set token for fast-forward failure comments and allow merging with state unstable (#11647 ) ## Problem https://github.com/neondatabase/neon/actions/runs/14538136318/job/40790985693?pr=11645 failed, even though the relevant parts of the CI had passed and auto-merge determined the PR is ready to merge. After that, commenting failed. ## Summary of changes - set GH_TOKEN for commenting after fast-forward failure - allow merging with mergeable_state unstable	2025-04-18 17:49:34 +00:00
JC Grünhage	f006879fb7	fix(ci): make regex to find rc branches less strict (#11646 ) ## Problem https://github.com/neondatabase/neon/actions/runs/14537161022/job/40787763965 failed to find the correct RC PR run, preventing artifact re-use. This broke in https://github.com/neondatabase/neon/pull/11547. There's a hotfix release containing this in https://github.com/neondatabase/neon/pull/11645. ## Summary of changes Make the regex for finding the RC PR run less strict, it was needlessly precise.	2025-04-18 16:39:18 +00:00
Dmitrii Kovalkov	a0d844dfed	pageserver + safekeeper: pass ssl ca certs to broker client (#11635 ) ## Problem Pageservers and safakeepers do not pass CA certificates to broker client, so the client do not trust locally issued certificates. - Part of https://github.com/neondatabase/cloud/issues/27492 ## Summary of changes - Change `ssl_ca_certs` type in PS/SK's config to `Pem` which may be converted to both `reqwest` and `tonic` certificates. - Pass CA certificates to storage broker client in PS and SK	2025-04-18 06:27:23 +00:00
Alex Chi Z.	5073e46df4	feat(pageserver): use rfc3339 time and print ratio in gc-compact stats (#11638 ) ## Problem follow-up on https://github.com/neondatabase/neon/pull/11601 ## Summary of changes - serialize the start/end time using rfc3339 time string - compute the size ratio of the compaction --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-18 05:28:01 +00:00
Alexander Bayandin	182bd95a4e	CI(regress-tests): run tests on `large-metal` (#11634 ) ## Problem Regression tests are more flaky on virtualised (`qemu-x64-*`) runners See https://neondb.slack.com/archives/C069Z2199DL/p1744891865307769 Ref https://github.com/neondatabase/neon/issues/11627 ## Summary of changes - Switch `regress-tests` to metal-only large runners to mitigate flaky behaviour	2025-04-18 01:25:38 +00:00
Anastasia Lubennikova	ce7795a67d	compute: use project_id, endpoint_id as tag (#11556 ) for compute audit logs part of https://github.com/neondatabase/cloud/issues/21955	2025-04-17 23:32:38 +00:00
Suhas Thalanki	134d01c771	remove pg_anon.patch (#11636 ) This PR removes `pg_anon.patch` as the `anon` v1 extension has been removed and the patch is not being used anywhere	2025-04-17 22:08:16 +00:00
Arpad Müller	c1e4befd56	Additional fixes and improvements to storcon safekeeper timelines (#11477 ) This delivers some additional fixes and improvements to storcon managed safekeeper timelines: * use `i32::MAX` for the generation number of timeline deletion * start the generation for new timelines at 1 instead of 0: this ensures that the other components actually are generation enabled * fix database operations we use for metrics * use join in list_pending_ops to prevent the classical ORM issue where one does many db queries * use enums in `test_storcon_create_delete_sk_down`. we are adding a second parameter, and having two bool parameters is weird. * extend `test_storcon_create_delete_sk_down` with a test of whole tenant deletion. this hasn't been tested before. * remove some redundant logging contexts * Don't require mutable access to the service lock for scheduling pending ops in memory. In order to pull this off, create reconcilers eagerly. The advantage is that we don't need mutable access to the service lock that way any more. Part of #9011 --------- Co-authored-by: Arseny Sher <sher-ars@yandex.ru>	2025-04-17 20:25:30 +00:00
a-masterov	6c2e5c044c	random operations test (#10986 ) ## Problem We need to test the stability of Neon. ## Summary of changes The test runs random operations on a Neon project. It performs via the Public API calls the following operations: `create a branch`, `delete a branch`, `add a read-only endpoint`, `delete a read-only endpoint`, `restore a branch to a random position in the past`. All the branches and endpoints are loaded with `pgbench`. --------- Co-authored-by: Peter Bendel <peterbendel@neon.tech> Co-authored-by: Alexander Bayandin <alexander@neon.tech>	2025-04-17 19:59:35 +00:00
Alex Chi Z.	748539b222	fix(pageserver): lower L0 compaction threshold (#11617 ) ## Problem We saw OOMs due to L0 compaction happening simultaneously for all shards of the same tenant right after the shard split. ## Summary of changes Lower the threshold so that we compact fewer files. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-17 19:51:28 +00:00
Alex Chi Z.	ad0c5fdae7	fix(test): allow stale generation warnings in storcon (#11624 ) ## Problem https://github.com/neondatabase/neon/pull/11531 did not fully fix the problem because the warning is part of the storcon instead of pageserver. ## Summary of changes Allow stale generation error in storcon. --------- Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-17 16:12:24 +00:00
Christian Schwarz	2b041964b3	cover direct IO + concurrent IO in unit, regression & perf tests (#11585 ) This mirrors the production config. Thread that discusses the merits of this: - https://neondb.slack.com/archives/C033RQ5SPDH/p1744742010740569 # Refs - context https://neondb.slack.com/archives/C04BLQ4LW7K/p1744724844844589?thread_ts=1744705831.014169&cid=C04BLQ4LW7K - prep for https://github.com/neondatabase/neon/pull/11558 which adds new io mode `direct-rw` # Impact on CI turnaround time Spot-checking impact on CI timings - Baseline: [some recent main commit](https://github.com/neondatabase/neon/actions/runs/14471549758/job/40587837475) - Comparison: [this commit](https://github.com/neondatabase/neon/actions/runs/14471945087/job/40589613274) in this PR here Impact on CI turnaround time - Regression tests: - x64: very minor, sometimes better; likely in the noise - arm64: substantial 30min => 40min - Benchmarks (x86 only I think): very minor; noise seems higher than regress tests --------- Signed-off-by: Alex Chi Z <chi@neon.tech> Co-authored-by: Alex Chi Z. <4198311+skyzh@users.noreply.github.com> Co-authored-by: Peter Bendel <peterbendel@neon.tech> Co-authored-by: Alex Chi Z <chi@neon.tech>	2025-04-17 15:53:10 +00:00
John Spray	d4c059a884	tests: use endpoint http wrapper to get auth (#11628 ) ## Problem `test_compute_startup_simple` and `test_compute_ondemand_slru_startup` are failing. This test implicitly asserts that the metrics.json endpoint succeeds and returns all expected metrics, but doesn't make it easy to see what went wrong if it doesn't (e.g. in this failure https://neon-github-public-dev.s3.amazonaws.com/reports/main/14513210240/index.html#suites/13d8e764c394daadbad415a08454c04e/b0f92a86b2ed309f/) In this case, it was failing because of a missing auth token, because it was using `requests` directly instead of using the endpoint http client type. ## Summary of changes - Use endpoint http wrapper to get raise_for_status & auth token	2025-04-17 15:03:23 +00:00
Folke Behrens	2c56c46d48	compute: Set max log level for local proxy sql_over_http mod to WARN (#11629 ) neondatabase/cloud#27738	2025-04-17 14:38:19 +00:00
Tristan Partin	d1728a6bcd	Remove old compatibility hack for remote extensions (#11620 ) Control plane has long since been updated to send the right value. Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-04-17 14:08:42 +00:00
John Spray	0a27973584	pageserver: rename `Tenant` to `TenantShard` (#11589 ) ## Problem `Tenant` isn't really a whole tenant: it's just one shard of a tenant. ## Summary of changes - Automated rename of Tenant to TenantShard - Followup commit to change references in comments	2025-04-17 13:29:16 +00:00
Alexander Bayandin	07c2411f6b	tests: remove mentions of ALLOW_*_COMPATIBILITY_BREAKAGE (#11618 ) ## Problem There are mentions of `ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` and `ALLOW_FORWARD_COMPATIBILITY_BREAKAGE`, but in reality, this mechanism doesn't work, so let's remove it to avoid confusion. The idea behind it was to allow some breaking changes by adding a special label to a PR that would `xfail` the test. However, in practice, this means we would need to carry this label through all subsequent PRs until the release (and artifact regeneration). This approach isn't really viable, as it increases the risk of missing a compatibility break in another PR. ## Summary of changes - Remove mentions and handling of `ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` / `ALLOW_FORWARD_COMPATIBILITY_BREAKAGE`	2025-04-17 10:03:21 +00:00
Alexander Bayandin	5819938c93	CI(pg-clients): fix workflow permissions (#11623 ) ## Problem `pg-clients` can't start: ``` The workflow is not valid. .github/workflows/pg-clients.yml (Line: 44, Col: 3): Error calling workflow 'neondatabase/neon/.github/workflows/build-build-tools-image.yml@aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3'. The nested job 'check-image' is requesting 'packages: read', but is only allowed 'packages: none'. .github/workflows/pg-clients.yml (Line: 44, Col: 3): Error calling workflow 'neondatabase/neon/.github/workflows/build-build-tools-image.yml@aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3'. The nested job 'build-image' is requesting 'packages: write', but is only allowed 'packages: none'. ``` ## Summary of changes - Grant required `packages: write` permissions to the workflow	2025-04-17 08:54:23 +00:00
Konstantin Knizhnik	b7548de814	Disable autovacuum and increase limit for WS approximation (#11583 ) ## Problem Test lfc working set approximation becomes flaky after recent changes in prefetch. May be it is caused by updating HLL in `lfc_write`, may be by some other reasons. ## Summary of changes 1. Disable autovacuum in this test (as possible source of extra page accesses). 2. Increase upper boundary for WS approximation from 12 to 20. --------- Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>	2025-04-17 05:07:45 +00:00
Tristan Partin	9794f386f4	Make Postgres 17 the default version (#11619 ) This is mostly a documentation update, but a few updates with regard to neon_local, pageserver, and tests. 17 is our default for users in production, so dropping references to 16 makes sense. Signed-off-by: Tristan Partin <tristan@neon.tech> Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-04-16 23:23:37 +00:00
Tristan Partin	79083de61c	Remove forward compatibility hacks related to compute_ctl auth (#11621 ) These various hacks were needed for the forward compatibility tests. Enough time has passed since the merge that these are no longer needed. Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-04-16 23:14:24 +00:00
Folke Behrens	ec9079f483	Allow unwrap() in tests when clippy::unwrap_used is denied (#11616 ) ## Problem The proxy denies using `unwrap()`s in regular code, but we want to use it in test code and so have to allow it for each test block. ## Summary of changes Set `allow-unwrap-in-tests = true` in clippy.toml and remove all exceptions.	2025-04-16 20:05:21 +00:00
Ivan Efremov	b9b25e13a0	feat(proxy): Return prefixed errors to testodrome (#11561 ) Testodrome measures uptime based on the failed requests and errors. In case of testodrome request we send back error based on the service. This will help us distinguish error types in testodrome and rely on the uptime SLI.	2025-04-16 19:03:23 +00:00
Alex Chi Z.	cf2e695f49	feat(pageserver): gc-compaction meta statistics (#11601 ) ## Problem We currently only have gc-compaction statistics for each single sub-compaction job. ## Summary of changes Add meta statistics across all sub-compaction jobs scheduled. Signed-off-by: Alex Chi Z <chi@neon.tech>	2025-04-16 18:51:48 +00:00
Conrad Ludgate	fc233794f6	fix(proxy): make sure that sql-over-http is TLS aware (#11612 ) I noticed that while auth-broker -> local-proxy is TLS aware, and TCP proxy -> postgres is TLS aware, HTTP proxy -> postgres is not 😅	2025-04-16 18:37:17 +00:00
Tristan Partin	c002236145	Remove compute_ctl authorization bypass if testing feature was enable (#11596 ) We want to exercise the authorization middleware in our regression tests. Signed-off-by: Tristan Partin <tristan@neon.tech>	2025-04-16 17:54:51 +00:00
Erik Grinaker	4af0b9b387	pageserver: don't recompress images in `ImageLayerInner::filter()` (#11592 ) ## Problem During shard ancestor compaction, we currently recompress all page images as we move them into a new layer file. This is expensive and unnecessary. Resolves #11562. Requires #11607. ## Summary of changes Pass through compressed page images in `ImageLayerInner::filter()`.	2025-04-16 17:10:15 +00:00