Compare commits

..

25 Commits

Author SHA1 Message Date
Vlad Lazar
db95540975 pageserver: handle empty get vectored queries (#11652)
## Problem

If all batched requests are excluded from the query by
`Timeline::get_rel_page_at_lsn_batched` (e.g. because they are past the
end of the relation), the read path would panic since it doesn't expect
empty queries. This is a change in behaviour that was introduced with
the scattered query implementation.

## Summary of Changes

Handle empty queries explicitly.
2025-04-21 15:38:44 -04:00
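A minimal sketch of the empty-query handling described in the entry above; `PageRequest`, `PageResult`, and `execute_vectored_get` are invented stand-ins, not the real pageserver types, and the point is only the early return instead of handing an empty query to the vectored read path:

```rust
// Hypothetical sketch only: these types and helpers are invented for illustration.
struct PageRequest {
    block_number: u32,
    relation_size_blocks: u32,
}

struct PageResult {
    block_number: u32,
}

fn execute_vectored_get(requests: Vec<PageRequest>) -> Vec<PageResult> {
    // Stand-in for the scattered/vectored read; it does not expect empty queries.
    assert!(!requests.is_empty(), "empty vectored query");
    requests
        .into_iter()
        .map(|r| PageResult { block_number: r.block_number })
        .collect()
}

fn get_rel_page_at_lsn_batched(requests: Vec<PageRequest>) -> Vec<PageResult> {
    // Exclude requests that fall past the end of the relation.
    let in_bounds: Vec<PageRequest> = requests
        .into_iter()
        .filter(|r| r.block_number < r.relation_size_blocks)
        .collect();

    // The fix: handle the all-excluded case explicitly instead of passing an
    // empty query to the read path.
    if in_bounds.is_empty() {
        return Vec::new();
    }
    execute_vectored_get(in_bounds)
}

fn main() {
    // Every request is past the end of the relation; this shape of input used
    // to reach the read path and panic, now it simply yields an empty result.
    let results = get_rel_page_at_lsn_batched(vec![PageRequest {
        block_number: 10,
        relation_size_blocks: 5,
    }]);
    assert!(results.is_empty());
}
```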
JC Grünhage
90033fe693 fix(ci): set token for fast-forward failure comments and allow merging with state unstable (#11647)
## Problem

https://github.com/neondatabase/neon/actions/runs/14538136318/job/40790985693?pr=11645
failed, even though the relevant parts of CI had passed and auto-merge had
determined the PR was ready to merge. After that, commenting on the failure
also failed.

## Summary of changes
- set GH_TOKEN for commenting after fast-forward failure
- allow merging with mergeable_state unstable
2025-04-21 15:38:44 -04:00
JC Grünhage
cb9d439cc1 fix(ci): make regex to find rc branches less strict (#11646)
## Problem

https://github.com/neondatabase/neon/actions/runs/14537161022/job/40787763965
failed to find the correct RC PR run, preventing artifact re-use. This
broke in https://github.com/neondatabase/neon/pull/11547.

There's a hotfix release containing this in
https://github.com/neondatabase/neon/pull/11645.

## Summary of changes
Make the regex for finding the RC PR run less strict; it was needlessly
precise.
2025-04-21 15:38:44 -04:00
Alex Chi Z.
5073e46df4 feat(pageserver): use rfc3339 time and print ratio in gc-compact stats (#11638)
## Problem

follow-up on https://github.com/neondatabase/neon/pull/11601

## Summary of changes

- serialize the start/end time as an RFC 3339 string
- compute the size ratio of the compaction (a small sketch of both follows
this entry)

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-04-18 05:28:01 +00:00
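An illustrative sketch of the two changes above, assuming chrono's `serde` feature (which serializes `DateTime<Utc>` as an RFC 3339 string by default) plus serde/serde_json; the field names are invented, not the actual gc-compaction stats struct:

```rust
use chrono::{DateTime, Utc};
use serde::Serialize;

// Invented field names for illustration only.
#[derive(Serialize)]
struct GcCompactionStats {
    start_time: DateTime<Utc>,
    end_time: DateTime<Utc>,
    input_bytes: u64,
    output_bytes: u64,
    // output / input; values well below 1.0 mean the compaction shrank the data.
    size_ratio: f64,
}

fn make_stats(start: DateTime<Utc>, end: DateTime<Utc>, input: u64, output: u64) -> GcCompactionStats {
    GcCompactionStats {
        start_time: start,
        end_time: end,
        input_bytes: input,
        output_bytes: output,
        size_ratio: if input == 0 { 0.0 } else { output as f64 / input as f64 },
    }
}

fn main() {
    let stats = make_stats(Utc::now(), Utc::now(), 4096, 1024);
    // Prints something like:
    // {"start_time":"2025-04-18T05:28:01.123456789Z",...,"size_ratio":0.25}
    println!("{}", serde_json::to_string(&stats).unwrap());
}
```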
Alexander Bayandin
182bd95a4e CI(regress-tests): run tests on large-metal (#11634)
## Problem

Regression tests are more flaky on virtualised (`qemu-x64-*`) runners

See https://neondb.slack.com/archives/C069Z2199DL/p1744891865307769
Ref https://github.com/neondatabase/neon/issues/11627

## Summary of changes
- Switch `regress-tests` to metal-only large runners to mitigate flaky
behaviour
2025-04-18 01:25:38 +00:00
Anastasia Lubennikova
ce7795a67d compute: use project_id, endpoint_id as tag (#11556)
for compute audit logs

part of https://github.com/neondatabase/cloud/issues/21955
2025-04-17 23:32:38 +00:00
Suhas Thalanki
134d01c771 remove pg_anon.patch (#11636)
This PR removes `pg_anon.patch`, as the `anon` v1 extension has been
removed and the patch is no longer used anywhere.
2025-04-17 22:08:16 +00:00
Arpad Müller
c1e4befd56 Additional fixes and improvements to storcon safekeeper timelines (#11477)
This delivers some additional fixes and improvements to storcon managed
safekeeper timelines:

* use `i32::MAX` as the generation number for timeline deletion
* start the generation for new timelines at 1 instead of 0: this ensures
that the other components are actually generation-enabled (a small sketch
of this numbering convention follows this entry)
* fix the database operations we use for metrics
* use a join in list_pending_ops to avoid the classic ORM issue of issuing
many DB queries where one would do
* use enums in `test_storcon_create_delete_sk_down`. We are adding a
second parameter, and having two bool parameters is weird.
* extend `test_storcon_create_delete_sk_down` with a test of whole-tenant
deletion. This hasn't been tested before.
* remove some redundant logging contexts
* Don't require mutable access to the service lock for scheduling pending
ops in memory. To pull this off, create reconcilers eagerly, so mutable
access to the service lock is no longer needed.

Part of #9011

---------

Co-authored-by: Arseny Sher <sher-ars@yandex.ru>
2025-04-17 20:25:30 +00:00
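A small sketch of the generation-numbering convention from the first two bullets above; the constant and function names are invented for illustration, not taken from the storcon code:

```rust
// Invented names; only the numbering convention itself comes from the change.
const INITIAL_SK_GENERATION: i32 = 1; // new timelines start at 1, not 0
const DELETED_SK_GENERATION: i32 = i32::MAX; // reserved for timeline deletion

fn is_deleted(generation: i32) -> bool {
    generation == DELETED_SK_GENERATION
}

fn main() {
    // Generation 0 is never handed out, so it can be read as
    // "not generation-enabled" by the other components.
    assert!(INITIAL_SK_GENERATION > 0);
    assert!(!is_deleted(INITIAL_SK_GENERATION));
    assert!(is_deleted(DELETED_SK_GENERATION));
}
```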
a-masterov
6c2e5c044c random operations test (#10986)
## Problem
We need to test the stability of Neon.

## Summary of changes
The test runs random operations on a Neon project. Via the Public API it
performs the following operations: `create a branch`, `delete a
branch`, `add a read-only endpoint`, `delete a read-only endpoint`, and
`restore a branch to a random position in the past`. All the branches
and endpoints are loaded with `pgbench`. (A sketch of the seeded driver
loop follows this entry.)

---------

Co-authored-by: Peter Bendel <peterbendel@neon.tech>
Co-authored-by: Alexander Bayandin <alexander@neon.tech>
2025-04-17 19:59:35 +00:00
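A sketch of the shape of such a seeded driver loop, written in Rust rather than the Python test harness and with invented names; only the operation list and the `random_seed` (default 0) / `num_operations` (default 250) inputs come from the change itself:

```rust
use rand::prelude::*;
use rand::rngs::StdRng;

// Invented enum mirroring the operations listed in the commit message.
#[derive(Debug, Clone, Copy)]
enum Operation {
    CreateBranch,
    DeleteBranch,
    AddReadOnlyEndpoint,
    DeleteReadOnlyEndpoint,
    RestoreBranchToRandomPointInPast,
}

fn main() {
    // Defaults mirror the workflow inputs: random_seed = 0, num_operations = 250.
    let seed: u64 = 0;
    let num_operations = 250;
    // A fixed seed makes a failing run reproducible.
    let mut rng = StdRng::seed_from_u64(seed);
    let ops = [
        Operation::CreateBranch,
        Operation::DeleteBranch,
        Operation::AddReadOnlyEndpoint,
        Operation::DeleteReadOnlyEndpoint,
        Operation::RestoreBranchToRandomPointInPast,
    ];
    for _ in 0..num_operations {
        let op = ops.choose(&mut rng).expect("non-empty operation list");
        // The real test would issue the corresponding Public API call here
        // and run pgbench against the affected branches/endpoints.
        println!("would run {op:?}");
    }
}
```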
Alex Chi Z.
748539b222 fix(pageserver): lower L0 compaction threshold (#11617)
## Problem

We saw OOMs due to L0 compaction happening simultaneously for all shards
of the same tenant right after the shard split.

## Summary of changes

Lower the threshold so that we compact fewer files.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-04-17 19:51:28 +00:00
Alex Chi Z.
ad0c5fdae7 fix(test): allow stale generation warnings in storcon (#11624)
## Problem

https://github.com/neondatabase/neon/pull/11531 did not fully fix the
problem because the warning is emitted by the storcon rather than the
pageserver.

## Summary of changes

Allow stale generation error in storcon.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-04-17 16:12:24 +00:00
Christian Schwarz
2b041964b3 cover direct IO + concurrent IO in unit, regression & perf tests (#11585)
This mirrors the production config.

Thread that discusses the merits of this:
- https://neondb.slack.com/archives/C033RQ5SPDH/p1744742010740569

# Refs
- context
https://neondb.slack.com/archives/C04BLQ4LW7K/p1744724844844589?thread_ts=1744705831.014169&cid=C04BLQ4LW7K
- prep for https://github.com/neondatabase/neon/pull/11558, which adds the
new IO mode `direct-rw`

# Impact on CI turnaround time

Spot-checking impact on CI timings

- Baseline: [some recent main
commit](https://github.com/neondatabase/neon/actions/runs/14471549758/job/40587837475)
- Comparison: [this
commit](https://github.com/neondatabase/neon/actions/runs/14471945087/job/40589613274)
in this PR here

Impact on CI turnaround time

- Regression tests:
  - x64: very minor, sometimes better; likely in the noise
  - arm64: substantial: 30 min => 40 min
- Benchmarks (x86 only, I think): very minor; the noise seems higher than in
the regress tests

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
Co-authored-by: Alex Chi Z. <4198311+skyzh@users.noreply.github.com>
Co-authored-by: Peter Bendel <peterbendel@neon.tech>
Co-authored-by: Alex Chi Z <chi@neon.tech>
2025-04-17 15:53:10 +00:00
John Spray
d4c059a884 tests: use endpoint http wrapper to get auth (#11628)
## Problem

`test_compute_startup_simple` and `test_compute_ondemand_slru_startup`
are failing.

These tests implicitly assert that the metrics.json endpoint succeeds and
returns all expected metrics, but they don't make it easy to see what went
wrong when it doesn't (e.g. in this failure
https://neon-github-public-dev.s3.amazonaws.com/reports/main/14513210240/index.html#suites/13d8e764c394daadbad415a08454c04e/b0f92a86b2ed309f/)

In this case, the tests were failing because of a missing auth token: they
were using `requests` directly instead of the endpoint http client type.

## Summary of changes

- Use the endpoint http wrapper to get `raise_for_status` and the auth token
2025-04-17 15:03:23 +00:00
Folke Behrens
2c56c46d48 compute: Set max log level for local proxy sql_over_http mod to WARN (#11629)
neondatabase/cloud#27738
2025-04-17 14:38:19 +00:00
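A sketch of how such a per-module directive string behaves, under the assumption that local_proxy builds its logger from tracing_subscriber's `EnvFilter` (requires the `env-filter` feature); only the `RUST_LOG` value itself comes from the change:

```rust
use tracing_subscriber::EnvFilter;

fn main() {
    // Same directive string as the RUST_LOG value in the diff below: default
    // max level `info`, but the sql_over_http module is capped at `warn`.
    let filter = EnvFilter::new("info,proxy::serverless::sql_over_http=warn");
    tracing_subscriber::fmt().with_env_filter(filter).init();

    tracing::info!("logged: the default max level is info");
    tracing::info!(target: "proxy::serverless::sql_over_http", "suppressed by the per-module cap");
    tracing::warn!(target: "proxy::serverless::sql_over_http", "logged: warn and above still pass");
}
```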
Tristan Partin
d1728a6bcd Remove old compatibility hack for remote extensions (#11620)
Control plane has long since been updated to send the right value.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-04-17 14:08:42 +00:00
John Spray
0a27973584 pageserver: rename Tenant to TenantShard (#11589)
## Problem

`Tenant` isn't really a whole tenant: it's just one shard of a tenant.

## Summary of changes

- Automated rename of Tenant to TenantShard
- Followup commit to change references in comments
2025-04-17 13:29:16 +00:00
Alexander Bayandin
07c2411f6b tests: remove mentions of ALLOW_*_COMPATIBILITY_BREAKAGE (#11618)
## Problem

There are mentions of `ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` and
`ALLOW_FORWARD_COMPATIBILITY_BREAKAGE`, but in reality, this mechanism
doesn't work, so let's remove it to avoid confusion.

The idea behind it was to allow some breaking changes by adding a
special label to a PR that would `xfail` the test. However, in practice,
this means we would need to carry this label through all subsequent PRs
until the release (and artifact regeneration). This approach isn't
really viable, as it increases the risk of missing a compatibility break
in another PR.

## Summary of changes
- Remove mentions and handling of
`ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE` /
`ALLOW_FORWARD_COMPATIBILITY_BREAKAGE`
2025-04-17 10:03:21 +00:00
Alexander Bayandin
5819938c93 CI(pg-clients): fix workflow permissions (#11623)
## Problem

`pg-clients` can't start:

```
The workflow is not valid.
.github/workflows/pg-clients.yml (Line: 44, Col: 3): Error calling workflow 'neondatabase/neon/.github/workflows/build-build-tools-image.yml@aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3'. The nested job 'check-image' is requesting 'packages: read', but is only allowed 'packages: none'.
.github/workflows/pg-clients.yml (Line: 44, Col: 3): Error calling workflow 'neondatabase/neon/.github/workflows/build-build-tools-image.yml@aa19f10e7e958fbe0e0641f2e8c5952ce3be44b3'. The nested job 'build-image' is requesting 'packages: write', but is only allowed 'packages: none'.
```

## Summary of changes
- Grant required `packages: write` permissions to the workflow
2025-04-17 08:54:23 +00:00
Konstantin Knizhnik
b7548de814 Disable autovacuum and increase limit for WS approximation (#11583)
## Problem

The LFC working set approximation test became flaky after recent changes
to prefetch.
It may be caused by updating the HLL in `lfc_write`, or by some other
reason.

## Summary of changes

1. Disable autovacuum in this test (as a possible source of extra page
accesses).
2. Increase the upper boundary for the WS approximation from 12 to 20.

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
2025-04-17 05:07:45 +00:00
Tristan Partin
9794f386f4 Make Postgres 17 the default version (#11619)
This is mostly a documentation update, with a few changes to neon_local,
the pageserver, and tests.

17 is our default for users in production, so dropping references to 16
makes sense.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-04-16 23:23:37 +00:00
Tristan Partin
79083de61c Remove forward compatibility hacks related to compute_ctl auth (#11621)
These various hacks were needed for the forward compatibility tests.
Enough time has passed since the merge that these are no longer needed.

Signed-off-by: Tristan Partin <tristan@neon.tech>
2025-04-16 23:14:24 +00:00
Folke Behrens
ec9079f483 Allow unwrap() in tests when clippy::unwrap_used is denied (#11616)
## Problem

The proxy denies the use of `unwrap()` in regular code, but we want to use
it in test code and so have had to allow it for each test block.

## Summary of changes

Set `allow-unwrap-in-tests = true` in clippy.toml and remove all
exceptions.
2025-04-16 20:05:21 +00:00
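A minimal sketch of the effect, not taken from the proxy crate itself: with `allow-unwrap-in-tests = true` in clippy.toml, test code no longer needs a per-block `#[allow(clippy::unwrap_used)]` even when the lint is denied crate-wide:

```rust
// A crate-root sketch (lib.rs), not the proxy's actual code.
#![deny(clippy::unwrap_used)]

pub fn parse_port(s: &str) -> Option<u16> {
    s.parse().ok()
}

#[cfg(test)]
mod tests {
    use super::parse_port;

    #[test]
    fn parses_port() {
        // No `#[allow(clippy::unwrap_used)]` needed: with
        // `allow-unwrap-in-tests = true` in clippy.toml, clippy skips the
        // lint in test code.
        assert_eq!(parse_port("5432").unwrap(), 5432);
    }
}
```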
Ivan Efremov
b9b25e13a0 feat(proxy): Return prefixed errors to testodrome (#11561)
Testodrome measures uptime based on failed requests and errors. For
Testodrome requests we now send back an error prefixed according to the
service. This will help us distinguish error types in Testodrome and rely
on the uptime SLI.
2025-04-16 19:03:23 +00:00
Alex Chi Z.
cf2e695f49 feat(pageserver): gc-compaction meta statistics (#11601)
## Problem

We currently only have gc-compaction statistics for each individual
sub-compaction job.

## Summary of changes

Add meta statistics across all sub-compaction jobs scheduled.

Signed-off-by: Alex Chi Z <chi@neon.tech>
2025-04-16 18:51:48 +00:00
Conrad Ludgate
fc233794f6 fix(proxy): make sure that sql-over-http is TLS aware (#11612)
I noticed that while auth-broker -> local-proxy is TLS aware, and TCP
proxy -> postgres is TLS aware, HTTP proxy -> postgres is not 😅
2025-04-16 18:37:17 +00:00
102 changed files with 1554 additions and 832 deletions

View File

@@ -113,8 +113,6 @@ runs:
TEST_OUTPUT: /tmp/test_output
BUILD_TYPE: ${{ inputs.build_type }}
COMPATIBILITY_SNAPSHOT_DIR: /tmp/compatibility_snapshot_pg${{ inputs.pg_version }}
ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'backward compatibility breakage')
ALLOW_FORWARD_COMPATIBILITY_BREAKAGE: contains(github.event.pull_request.labels.*.name, 'forward compatibility breakage')
RERUN_FAILED: ${{ inputs.rerun_failed }}
PG_VERSION: ${{ inputs.pg_version }}
SANITIZERS: ${{ inputs.sanitizers }}

View File

@@ -272,10 +272,13 @@ jobs:
# run pageserver tests with different settings
for get_vectored_concurrent_io in sequential sidecar-task; do
for io_engine in std-fs tokio-epoll-uring ; do
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
${cov_prefix} \
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
for io_mode in buffered direct direct-rw ; do
NEON_PAGESERVER_UNIT_TEST_GET_VECTORED_CONCURRENT_IO=$get_vectored_concurrent_io \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOENGINE=$io_engine \
NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IOMODE=$io_mode \
${cov_prefix} \
cargo nextest run $CARGO_FLAGS $CARGO_FEATURES -E 'package(pageserver)'
done
done
done
@@ -346,7 +349,7 @@ jobs:
contents: read
statuses: write
needs: [ build-neon ]
runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large')) }}
runs-on: ${{ fromJSON(format('["self-hosted", "{0}"]', inputs.arch == 'arm64' && 'large-arm64' || 'large-metal')) }}
container:
image: ${{ inputs.build-tools-image }}
credentials:
@@ -392,6 +395,7 @@ jobs:
BUILD_TAG: ${{ inputs.build-tag }}
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
USE_LFC: ${{ matrix.lfc_state == 'with-lfc' && 'true' || 'false' }}
# Temporary disable this step until we figure out why it's so flaky

View File

@@ -165,5 +165,5 @@ jobs:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
CURRENT_SHA: ${{ github.sha }}
run: |
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release(-(proxy|compute))?/[0-9]{4}-[0-9]{2}-[0-9]{2}$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))')
RELEASE_PR_RUN_ID=$(gh api "/repos/${GITHUB_REPOSITORY}/actions/runs?head_sha=$CURRENT_SHA" | jq '[.workflow_runs[] | select(.name == "Build and Test") | select(.head_branch | test("^rc/release.*$"; "s"))] | first | .id // ("Failed to find Build and Test run from RC PR!" | halt_error(1))')
echo "release-pr-run-id=$RELEASE_PR_RUN_ID" | tee -a $GITHUB_OUTPUT

View File

@@ -323,6 +323,8 @@ jobs:
PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
PAGESERVER_GET_VECTORED_CONCURRENT_IO: sidecar-task
PAGESERVER_VIRTUAL_FILE_IO_MODE: direct
SYNC_BETWEEN_TESTS: true
# XXX: no coverage data handling here, since benchmarks are run on release builds,
# while coverage is currently collected for the debug ones

View File

@@ -27,15 +27,17 @@ jobs:
- name: Fast forwarding
uses: sequoia-pgp/fast-forward@ea7628bedcb0b0b96e94383ada458d812fca4979
# See https://docs.github.com/en/graphql/reference/enums#mergestatestatus
if: ${{ github.event.pull_request.mergeable_state == 'clean' }}
if: ${{ contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
with:
merge: true
comment: on-error
github_token: ${{ secrets.CI_ACCESS_TOKEN }}
- name: Comment if mergeable_state is not clean
if: ${{ github.event.pull_request.mergeable_state != 'clean' }}
if: ${{ !contains(fromJSON('["clean", "unstable"]'), github.event.pull_request.mergeable_state) }}
env:
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
gh pr comment ${{ github.event.pull_request.number }} \
--repo "${GITHUB_REPOSITORY}" \
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\`."
--body "Not trying to forward pull-request, because \`mergeable_state\` is \`${{ github.event.pull_request.mergeable_state }}\`, not \`clean\` or \`unstable\`."

View File

@@ -30,7 +30,7 @@ permissions:
statuses: write # require for posting a status update
env:
DEFAULT_PG_VERSION: 16
DEFAULT_PG_VERSION: 17
PLATFORM: neon-captest-new
AWS_DEFAULT_REGION: eu-central-1
@@ -42,6 +42,8 @@ jobs:
github-event-name: ${{ github.event_name }}
build-build-tools-image:
permissions:
packages: write
needs: [ check-permissions ]
uses: ./.github/workflows/build-build-tools-image.yml
secrets: inherit

.github/workflows/random-ops-test.yml (vendored, new file, +93 lines)
View File

@@ -0,0 +1,93 @@
name: Random Operations Test
on:
schedule:
# * is a special character in YAML so you have to quote this string
# ┌───────────── minute (0 - 59)
# │ ┌───────────── hour (0 - 23)
# │ │ ┌───────────── day of the month (1 - 31)
# │ │ │ ┌───────────── month (1 - 12 or JAN-DEC)
# │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT)
- cron: '23 */2 * * *' # runs every 2 hours
workflow_dispatch:
inputs:
random_seed:
type: number
description: 'The random seed'
required: false
default: 0
num_operations:
type: number
description: "The number of operations to test"
default: 250
defaults:
run:
shell: bash -euxo pipefail {0}
permissions: {}
env:
DEFAULT_PG_VERSION: 16
PLATFORM: neon-captest-new
AWS_DEFAULT_REGION: eu-central-1
jobs:
run-random-rests:
env:
POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
runs-on: small
permissions:
id-token: write
statuses: write
strategy:
fail-fast: false
matrix:
pg-version: [16, 17]
container:
image: ghcr.io/neondatabase/build-tools:pinned-bookworm
credentials:
username: ${{ github.actor }}
password: ${{ secrets.GITHUB_TOKEN }}
options: --init
steps:
- name: Harden the runner (Audit all outbound calls)
uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
with:
egress-policy: audit
- uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
- name: Download Neon artifact
uses: ./.github/actions/download
with:
name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
path: /tmp/neon/
prefix: latest
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
- name: Run tests
uses: ./.github/actions/run-python-test-set
with:
build_type: remote
test_selection: random_ops
run_in_parallel: false
extra_params: -m remote_cluster
pg_version: ${{ matrix.pg-version }}
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
RANDOM_SEED: ${{ inputs.random_seed }}
NUM_OPERATIONS: ${{ inputs.num_operations }}
- name: Create Allure report
if: ${{ !cancelled() }}
id: create-allure-report
uses: ./.github/actions/allure-report-generate
with:
store-test-results-into-db: true
aws-oicd-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
env:
REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}

Cargo.lock (generated, +1 line)
View File

@@ -4352,6 +4352,7 @@ dependencies = [
"humantime-serde",
"itertools 0.10.5",
"nix 0.27.1",
"once_cell",
"postgres_backend",
"postgres_ffi",
"rand 0.8.5",

View File

@@ -270,7 +270,7 @@ By default, this runs both debug and release modes, and all supported postgres v
testing locally, it is convenient to run just one set of permutations, like this:
```sh
DEFAULT_PG_VERSION=16 BUILD_TYPE=release ./scripts/pytest
DEFAULT_PG_VERSION=17 BUILD_TYPE=release ./scripts/pytest
```
## Flamegraphs

View File

@@ -12,3 +12,5 @@ disallowed-macros = [
# cannot disallow this, because clippy finds used from tokio macros
#"tokio::pin",
]
allow-unwrap-in-tests = true

View File

@@ -1,265 +0,0 @@
commit 00aa659afc9c7336ab81036edec3017168aabf40
Author: Heikki Linnakangas <heikki@neon.tech>
Date: Tue Nov 12 16:59:19 2024 +0200
Temporarily disable test that depends on timezone
diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out
index 23ef5fa..9e60deb 100644
--- a/ext-src/pg_anon-src/tests/expected/generalization.out
+++ b/ext-src/pg_anon-src/tests/expected/generalization.out
@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century');
["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST")
(1 row)
-SELECT anon.generalize_tstzrange('19041107','millennium');
- generalize_tstzrange
------------------------------------------------------------------
- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST")
-(1 row)
-
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
generalize_daterange
diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql
index b868344..b4fc977 100644
--- a/ext-src/pg_anon-src/tests/sql/generalization.sql
+++ b/ext-src/pg_anon-src/tests/sql/generalization.sql
@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month');
SELECT anon.generalize_tstzrange('19041107','year');
SELECT anon.generalize_tstzrange('19041107','decade');
SELECT anon.generalize_tstzrange('19041107','century');
-SELECT anon.generalize_tstzrange('19041107','millennium');
+-- temporarily disabled, see:
+-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485
+--SELECT anon.generalize_tstzrange('19041107','millennium');
-- generalize_daterange
SELECT anon.generalize_daterange('19041107');
commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f
Author: Alexey Masterov <alexeymasterov@neon.tech>
Date: Fri May 31 06:34:26 2024 +0000
These alternative expected files were added to consider the neon features
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
new file mode 100644
index 0000000..2539cfd
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_masked_role_1.out
@@ -0,0 +1,101 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE mallory_the_masked_user;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.anonymize_table('t1');
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ERROR: Only supersusers can start the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.start_dynamic_masking(boolean) line 18 at RAISE
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT * FROM mask.t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ SELECT * FROM public.t1;
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: Only supersusers can stop the dynamic masking engine.
+CONTEXT: PL/pgSQL function anon.stop_dynamic_masking() line 18 at RAISE
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE mallory_the_masked_user;
+SELECT COUNT(*)=1 FROM anon.pg_masking_rules;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;
diff --git a/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
new file mode 100644
index 0000000..8b090fe
--- /dev/null
+++ b/ext-src/pg_anon-src/tests/expected/permissions_owner_1.out
@@ -0,0 +1,104 @@
+BEGIN;
+CREATE EXTENSION anon CASCADE;
+NOTICE: installing required extension "pgcrypto"
+SELECT anon.init();
+ init
+------
+ t
+(1 row)
+
+CREATE ROLE oscar_the_owner;
+ALTER DATABASE :DBNAME OWNER TO oscar_the_owner;
+CREATE ROLE mallory_the_masked_user;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS 'MASKED';
+--
+-- We're checking the owner's permissions
+--
+-- see
+-- https://postgresql-anonymizer.readthedocs.io/en/latest/SECURITY/#permissions
+--
+SET ROLE oscar_the_owner;
+SELECT anon.pseudo_first_name(0) IS NOT NULL;
+ ?column?
+----------
+ t
+(1 row)
+
+-- SHOULD FAIL
+DO $$
+BEGIN
+ PERFORM anon.init();
+ EXCEPTION WHEN insufficient_privilege
+ THEN RAISE NOTICE 'insufficient_privilege';
+END$$;
+NOTICE: insufficient_privilege
+CREATE TABLE t1(i INT);
+ALTER TABLE t1 ADD COLUMN t TEXT;
+SECURITY LABEL FOR anon ON COLUMN t1.t
+IS 'MASKED WITH VALUE NULL';
+INSERT INTO t1 VALUES (1,'test');
+SELECT anon.anonymize_table('t1');
+ anonymize_table
+-----------------
+ t
+(1 row)
+
+SELECT * FROM t1;
+ i | t
+---+---
+ 1 |
+(1 row)
+
+UPDATE t1 SET t='test' WHERE i=1;
+-- SHOULD FAIL
+SAVEPOINT fail_start_engine;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+ROLLBACK TO fail_start_engine;
+RESET ROLE;
+SELECT anon.start_dynamic_masking();
+ start_dynamic_masking
+-----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+SELECT * FROM t1;
+ i | t
+---+------
+ 1 | test
+(1 row)
+
+--SELECT * FROM mask.t1;
+-- SHOULD FAIL
+SAVEPOINT fail_stop_engine;
+SELECT anon.stop_dynamic_masking();
+ERROR: permission denied for schema mask
+CONTEXT: SQL statement "DROP VIEW mask.t1;"
+PL/pgSQL function anon.mask_drop_view(oid) line 3 at EXECUTE
+SQL statement "SELECT anon.mask_drop_view(oid)
+ FROM pg_catalog.pg_class
+ WHERE relnamespace=quote_ident(pg_catalog.current_setting('anon.sourceschema'))::REGNAMESPACE
+ AND relkind IN ('r','p','f')"
+PL/pgSQL function anon.stop_dynamic_masking() line 22 at PERFORM
+ROLLBACK TO fail_stop_engine;
+RESET ROLE;
+SELECT anon.stop_dynamic_masking();
+NOTICE: The previous priviledges of 'mallory_the_masked_user' are not restored. You need to grant them manually.
+ stop_dynamic_masking
+----------------------
+ t
+(1 row)
+
+SET ROLE oscar_the_owner;
+-- SHOULD FAIL
+SAVEPOINT fail_seclabel_on_role;
+SECURITY LABEL FOR anon ON ROLE mallory_the_masked_user IS NULL;
+ERROR: permission denied
+DETAIL: The current user must have the CREATEROLE attribute.
+ROLLBACK TO fail_seclabel_on_role;
+ROLLBACK;

View File

@@ -22,7 +22,7 @@ commands:
- name: local_proxy
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -22,7 +22,7 @@ commands:
- name: local_proxy
user: postgres
sysvInitAction: respawn
shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
shell: 'RUST_LOG="info,proxy::serverless::sql_over_http=warn" /usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432'
- name: postgres-exporter
user: nobody
sysvInitAction: respawn

View File

@@ -57,24 +57,13 @@ use tracing::{error, info};
use url::Url;
use utils::failpoint_support;
// Compatibility hack: if the control plane specified any remote-ext-config
// use the default value for extension storage proxy gateway.
// Remove this once the control plane is updated to pass the gateway URL
fn parse_remote_ext_config(arg: &str) -> Result<String> {
if arg.starts_with("http") {
Ok(arg.trim_end_matches('/').to_string())
} else {
Ok("http://pg-ext-s3-gateway".to_string())
}
}
#[derive(Parser)]
#[command(rename_all = "kebab-case")]
struct Cli {
#[arg(short = 'b', long, default_value = "postgres", env = "POSTGRES_PATH")]
pub pgbin: String,
#[arg(short = 'r', long, value_parser = parse_remote_ext_config)]
#[arg(short = 'r', long)]
pub remote_ext_config: Option<String>,
/// The port to bind the external listening HTTP server to. Clients running
@@ -116,9 +105,7 @@ struct Cli {
#[arg(long)]
pub set_disk_quota_for_fs: Option<String>,
// TODO(tristan957): remove alias after compatibility tests are no longer
// an issue
#[arg(short = 'c', long, alias = "spec-path")]
#[arg(short = 'c', long)]
pub config: Option<OsString>,
#[arg(short = 'i', long, group = "compute-id")]

View File

@@ -11,9 +11,7 @@ use std::{env, fs};
use anyhow::{Context, Result};
use chrono::{DateTime, Utc};
use compute_api::privilege::Privilege;
use compute_api::responses::{
ActivityKind, ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus,
};
use compute_api::responses::{ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus};
use compute_api::spec::{
ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PgIdent,
};
@@ -134,10 +132,6 @@ pub struct ComputeState {
/// Timestamp of the last Postgres activity. It could be `None` if
/// compute wasn't used since start.
pub last_active: Option<DateTime<Utc>>,
/// Timestamp of the last client's activity. Unlike `last_active` it doesn't take into account
/// baclkground activity: autovacuum, LR,...
pub last_active_query: Option<DateTime<Utc>>,
pub last_activity_kind: Option<ActivityKind>,
pub error: Option<String>,
/// Compute spec. This can be received from the CLI or - more likely -
@@ -165,8 +159,6 @@ impl ComputeState {
start_time: Utc::now(),
status: ComputeStatus::Empty,
last_active: None,
last_active_query: None,
last_activity_kind: None,
error: None,
pspec: None,
startup_span: None,
@@ -649,7 +641,26 @@ impl ComputeNode {
let log_directory_path = Path::new(&self.params.pgdata).join("log");
let log_directory_path = log_directory_path.to_string_lossy().to_string();
configure_audit_rsyslog(log_directory_path.clone(), "hipaa", &remote_endpoint)?;
// Add project_id,endpoint_id tag to identify the logs.
//
// These ids are passed from cplane,
// for backwards compatibility (old computes that don't have them),
// we set them to None.
// TODO: Clean up this code when all computes have them.
let tag: Option<String> = match (
pspec.spec.project_id.as_deref(),
pspec.spec.endpoint_id.as_deref(),
) {
(Some(project_id), Some(endpoint_id)) => {
Some(format!("{project_id}/{endpoint_id}"))
}
(Some(project_id), None) => Some(format!("{project_id}/None")),
(None, Some(endpoint_id)) => Some(format!("None,{endpoint_id}")),
(None, None) => None,
};
configure_audit_rsyslog(log_directory_path.clone(), tag, &remote_endpoint)?;
// Launch a background task to clean up the audit logs
launch_pgaudit_gc(log_directory_path);
@@ -1696,22 +1707,13 @@ impl ComputeNode {
}
/// Update the `last_active` in the shared state, but ensure that it's a more recent one.
pub fn update_last_active(
&self,
last_active: Option<DateTime<Utc>>,
activity_kind: ActivityKind,
) {
pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
let mut state = self.state.lock().unwrap();
// NB: `Some(<DateTime>)` is always greater than `None`.
if last_active > state.last_active {
state.last_active = last_active;
debug!("set the last compute activity time to: {:?}", last_active);
}
if activity_kind == ActivityKind::Query && last_active > state.last_active_query {
state.last_active_query = last_active;
debug!("set the last user's activity time to: {:?}", last_active);
}
state.last_activity_kind = Some(activity_kind);
}
// Look for core dumps and collect backtraces.

View File

@@ -30,8 +30,6 @@ impl From<&ComputeState> for ComputeStatusResponse {
.map(|pspec| pspec.timeline_id.to_string()),
status: state.status,
last_active: state.last_active,
last_active_query: state.last_active_query,
last_activity_kind: state.last_activity_kind,
error: state.error.clone(),
}
}

View File

@@ -3,7 +3,7 @@ use std::thread;
use std::time::Duration;
use chrono::{DateTime, Utc};
use compute_api::responses::{ActivityKind, ComputeStatus};
use compute_api::responses::ComputeStatus;
use compute_api::spec::ComputeFeature;
use postgres::{Client, NoTls};
use tracing::{debug, error, info, warn};
@@ -91,7 +91,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
if detected_activity {
// Update the last active time and continue, we don't need to
// check backends state change.
compute.update_last_active(Some(Utc::now()), ActivityKind::Query);
compute.update_last_active(Some(Utc::now()));
continue;
}
}
@@ -109,7 +109,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
// This helps us to discover new sessions, that did nothing yet.
match get_backends_state_change(cli) {
Ok(last_active) => {
compute.update_last_active(last_active, ActivityKind::Query);
compute.update_last_active(last_active);
}
Err(e) => {
error!("could not get backends state change: {}", e);
@@ -125,10 +125,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_ws) => {
if num_ws > 0 {
compute.update_last_active(
Some(Utc::now()),
ActivityKind::LogicalReplication,
);
compute.update_last_active(Some(Utc::now()));
continue;
}
}
@@ -153,10 +150,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
Ok(row) => match row.try_get::<&str, i64>("count") {
Ok(num_subscribers) => {
if num_subscribers > 0 {
compute.update_last_active(
Some(Utc::now()),
ActivityKind::LogicalReplication,
);
compute.update_last_active(Some(Utc::now()));
continue;
}
}
@@ -181,8 +175,7 @@ fn watch_compute_activity(compute: &ComputeNode) {
Ok(r) => match r.try_get::<&str, i64>("count") {
Ok(num_workers) => {
if num_workers > 0 {
compute
.update_last_active(Some(Utc::now()), ActivityKind::Autovacuum);
compute.update_last_active(Some(Utc::now()));
continue;
}
}

View File

@@ -50,13 +50,13 @@ fn restart_rsyslog() -> Result<()> {
pub fn configure_audit_rsyslog(
log_directory: String,
tag: &str,
tag: Option<String>,
remote_endpoint: &str,
) -> Result<()> {
let config_content: String = format!(
include_str!("config_template/compute_audit_rsyslog_template.conf"),
log_directory = log_directory,
tag = tag,
tag = tag.unwrap_or("".to_string()),
remote_endpoint = remote_endpoint
);

View File

@@ -63,7 +63,7 @@ const DEFAULT_PAGESERVER_ID: NodeId = NodeId(1);
const DEFAULT_BRANCH_NAME: &str = "main";
project_git_version!(GIT_VERSION);
const DEFAULT_PG_VERSION: u32 = 16;
const DEFAULT_PG_VERSION: u32 = 17;
const DEFAULT_PAGESERVER_CONTROL_PLANE_API: &str = "http://127.0.0.1:1234/upcall/v1/";

View File

@@ -766,10 +766,6 @@ impl Endpoint {
}
};
// TODO(tristan957): Remove the write to spec.json after compatibility
// tests work themselves out
let spec_path = self.endpoint_path().join("spec.json");
std::fs::write(spec_path, serde_json::to_string_pretty(&config.spec)?)?;
let config_path = self.endpoint_path().join("config.json");
std::fs::write(config_path, serde_json::to_string_pretty(&config)?)?;
@@ -779,16 +775,6 @@ impl Endpoint {
.append(true)
.open(self.endpoint_path().join("compute.log"))?;
// TODO(tristan957): Remove when compatibility tests are no longer an
// issue
let old_compute_ctl = {
let mut cmd = Command::new(self.env.neon_distrib_dir.join("compute_ctl"));
let help_output = cmd.arg("--help").output()?;
let help_output = String::from_utf8_lossy(&help_output.stdout);
!help_output.contains("--config")
};
// Launch compute_ctl
let conn_str = self.connstr("cloud_admin", "postgres");
println!("Starting postgres node at '{}'", conn_str);
@@ -807,19 +793,8 @@ impl Endpoint {
])
.args(["--pgdata", self.pgdata().to_str().unwrap()])
.args(["--connstr", &conn_str])
// TODO(tristan957): Change this to --config when compatibility tests
// are no longer an issue
.args([
"--spec-path",
self.endpoint_path()
.join(if old_compute_ctl {
"spec.json"
} else {
"config.json"
})
.to_str()
.unwrap(),
])
.arg("--config")
.arg(self.endpoint_path().join("config.json").as_os_str())
.args([
"--pgbin",
self.env

View File

@@ -23,7 +23,7 @@ use crate::object_storage::{OBJECT_STORAGE_REMOTE_STORAGE_DIR, ObjectStorage};
use crate::pageserver::{PAGESERVER_REMOTE_STORAGE_DIR, PageServerNode};
use crate::safekeeper::SafekeeperNode;
pub const DEFAULT_PG_VERSION: u32 = 16;
pub const DEFAULT_PG_VERSION: u32 = 17;
//
// This data structures represents neon_local CLI config

View File

@@ -81,19 +81,9 @@ sed -i "s/TIMELINE_ID/${timeline_id}/" ${CONFIG_FILE}
cat ${CONFIG_FILE}
# TODO(tristan957): Remove these workarounds for backwards compatibility after
# the next compute release. That includes these next few lines and the
# --spec-path in the compute_ctl invocation.
if compute_ctl --help | grep --quiet -- '--config'; then
SPEC_PATH="$CONFIG_FILE"
else
jq '.spec' < "$CONFIG_FILE" > /tmp/spec.json
SPEC_PATH=/tmp/spec.json
fi
echo "Start compute node"
/usr/local/bin/compute_ctl --pgdata /var/db/postgres/compute \
-C "postgresql://cloud_admin@localhost:55433/postgres" \
-b /usr/local/bin/postgres \
--compute-id "compute-$RANDOM" \
--spec-path "$SPEC_PATH"
--config "$CONFIG_FILE"

View File

@@ -56,22 +56,9 @@ pub struct ComputeStatusResponse {
pub status: ComputeStatus,
#[serde(serialize_with = "rfc3339_serialize")]
pub last_active: Option<DateTime<Utc>>,
pub last_active_query: Option<DateTime<Utc>>,
pub last_activity_kind: Option<ActivityKind>,
pub error: Option<String>,
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ActivityKind {
// Client's query is executed
Query,
// Logical replication is active (subscription or publication)
LogicalReplication,
// Autovacuum is active
Autovacuum,
}
#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "snake_case")]
pub enum ComputeStatus {

View File

@@ -35,6 +35,7 @@ nix = {workspace = true, optional = true}
reqwest.workspace = true
rand.workspace = true
tracing-utils.workspace = true
once_cell.workspace = true
[dev-dependencies]
bincode.workspace = true

View File

@@ -682,10 +682,10 @@ pub mod tenant_conf_defaults {
pub const DEFAULT_COMPACTION_SHARD_ANCESTOR: bool = true;
// This value needs to be tuned to avoid OOM. We have 3/4*CPUs threads for L0 compaction, that's
// 3/4*16=9 on most of our pageservers. Compacting 20 layers requires about 1 GB memory (could
// be reduced later by optimizing L0 hole calculation to avoid loading all keys into memory). So
// with this config, we can get a maximum peak compaction usage of 9 GB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 20;
// 3/4*8=6 on most of our pageservers. Compacting 10 layers requires a maximum of
// DEFAULT_CHECKPOINT_DISTANCE*10 memory, that's 2560MB. So with this config, we can get a maximum peak
// compaction usage of 15360MB.
pub const DEFAULT_COMPACTION_UPPER_LIMIT: usize = 10;
// Enable L0 compaction pass and semaphore by default. L0 compaction must be responsive to avoid
// read amp.
pub const DEFAULT_COMPACTION_L0_FIRST: bool = true;
@@ -702,8 +702,11 @@ pub mod tenant_conf_defaults {
// Relevant: https://github.com/neondatabase/neon/issues/3394
pub const DEFAULT_GC_PERIOD: &str = "1 hr";
pub const DEFAULT_IMAGE_CREATION_THRESHOLD: usize = 3;
// If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// layer creation will end immediately. Set to 0 to disable.
// Currently, any value other than 0 will trigger image layer creation preemption immediately with L0 backpressure
// without looking at the exact number of L0 layers.
// It was expected to have the following behavior:
// > If there are more than threshold * compaction_threshold (that is 3 * 10 in the default config) L0 layers, image
// > layer creation will end immediately. Set to 0 to disable.
pub const DEFAULT_IMAGE_CREATION_PREEMPT_THRESHOLD: usize = 3;
pub const DEFAULT_PITR_INTERVAL: &str = "7 days";
pub const DEFAULT_WALRECEIVER_CONNECT_TIMEOUT: &str = "10 seconds";

View File

@@ -1817,8 +1817,34 @@ pub mod virtual_file {
}
impl IoMode {
pub const fn preferred() -> Self {
Self::Buffered
pub fn preferred() -> Self {
// The default behavior when running Rust unit tests without any further
// flags is to use the newest behavior if available on the platform (Direct).
// The CI uses the following environment variable to unit tests for all
// different modes.
// NB: the Python regression & perf tests have their own defaults management
// that writes pageserver.toml; they do not use this variable.
if cfg!(test) {
use once_cell::sync::Lazy;
static CACHED: Lazy<IoMode> = Lazy::new(|| {
utils::env::var_serde_json_string(
"NEON_PAGESERVER_UNIT_TEST_VIRTUAL_FILE_IO_MODE",
)
.unwrap_or({
#[cfg(target_os = "linux")]
{
IoMode::Direct
}
#[cfg(not(target_os = "linux"))]
{
IoMode::Buffered
}
})
});
*CACHED
} else {
IoMode::Buffered
}
}
}

View File

@@ -118,13 +118,13 @@ pub struct PageServerConf {
/// A lower value implicitly deprioritizes loading such tenants, vs. other work in the system.
pub concurrent_tenant_warmup: ConfigurableSemaphore,
/// Number of concurrent [`Tenant::gather_size_inputs`](crate::tenant::Tenant::gather_size_inputs) allowed.
/// Number of concurrent [`TenantShard::gather_size_inputs`](crate::tenant::TenantShard::gather_size_inputs) allowed.
pub concurrent_tenant_size_logical_size_queries: ConfigurableSemaphore,
/// Limit of concurrent [`Tenant::gather_size_inputs`] issued by module `eviction_task`.
/// Limit of concurrent [`TenantShard::gather_size_inputs`] issued by module `eviction_task`.
/// The number of permits is the same as `concurrent_tenant_size_logical_size_queries`.
/// See the comment in `eviction_task` for details.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
/// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
pub eviction_task_immitated_concurrent_logical_size_queries: ConfigurableSemaphore,
// How often to collect metrics and send them to the metrics endpoint.
@@ -588,10 +588,10 @@ impl ConfigurableSemaphore {
/// Initializse using a non-zero amount of permits.
///
/// Require a non-zero initial permits, because using permits == 0 is a crude way to disable a
/// feature such as [`Tenant::gather_size_inputs`]. Otherwise any semaphore using future will
/// feature such as [`TenantShard::gather_size_inputs`]. Otherwise any semaphore using future will
/// behave like [`futures::future::pending`], just waiting until new permits are added.
///
/// [`Tenant::gather_size_inputs`]: crate::tenant::Tenant::gather_size_inputs
/// [`TenantShard::gather_size_inputs`]: crate::tenant::TenantShard::gather_size_inputs
pub fn new(initial_permits: NonZeroUsize) -> Self {
ConfigurableSemaphore {
initial_permits,

View File

@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TaskKind};
use crate::tenant::mgr::TenantManager;
use crate::tenant::size::CalculateSyntheticSizeError;
use crate::tenant::tasks::BackgroundLoopKind;
use crate::tenant::{LogicalSizeCalculationCause, Tenant};
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
mod disk_cache;
mod metrics;
@@ -428,7 +428,7 @@ async fn calculate_synthetic_size_worker(
}
}
async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
async fn calculate_and_log(tenant: &TenantShard, cancel: &CancellationToken, ctx: &RequestContext) {
const CAUSE: LogicalSizeCalculationCause =
LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;

View File

@@ -175,9 +175,9 @@ impl MetricsKey {
.absolute_values()
}
/// [`Tenant::remote_size`]
/// [`TenantShard::remote_size`]
///
/// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size
/// [`TenantShard::remote_size`]: crate::tenant::TenantShard::remote_size
const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
tenant_id,
@@ -199,9 +199,9 @@ impl MetricsKey {
.absolute_values()
}
/// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
/// [`TenantShard::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`].
///
/// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size
/// [`TenantShard::cached_synthetic_size`]: crate::tenant::TenantShard::cached_synthetic_size
/// [`calculate_synthetic_size_worker`]: super::calculate_synthetic_size_worker
const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory {
MetricsKey {
@@ -254,7 +254,7 @@ pub(super) async fn collect_all_metrics(
async fn collect<S>(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec<NewRawMetric>
where
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::Tenant>)>,
S: futures::stream::Stream<Item = (TenantId, Arc<crate::tenant::TenantShard>)>,
{
let mut current_metrics: Vec<NewRawMetric> = Vec::new();
@@ -308,7 +308,7 @@ impl TenantSnapshot {
///
/// `resident_size` is calculated of the timelines we had access to for other metrics, so we
/// cannot just list timelines here.
fn collect(t: &Arc<crate::tenant::Tenant>, resident_size: u64) -> Self {
fn collect(t: &Arc<crate::tenant::TenantShard>, resident_size: u64) -> Self {
TenantSnapshot {
resident_size,
remote_size: t.remote_size(),

View File

@@ -1873,7 +1873,7 @@ async fn update_tenant_config_handler(
&ShardParameters::default(),
);
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;
@@ -1917,7 +1917,7 @@ async fn patch_tenant_config_handler(
&ShardParameters::default(),
);
crate::tenant::Tenant::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
crate::tenant::TenantShard::persist_tenant_config(state.conf, &tenant_shard_id, &location_conf)
.await
.map_err(|e| ApiError::InternalServerError(anyhow::anyhow!(e)))?;

View File

@@ -49,7 +49,7 @@ use tracing::{info, info_span};
/// backwards-compatible changes to the metadata format.
pub const STORAGE_FORMAT_VERSION: u16 = 3;
pub const DEFAULT_PG_VERSION: u32 = 16;
pub const DEFAULT_PG_VERSION: u32 = 17;
// Magic constants used to identify different kinds of files
pub const IMAGE_FILE_MAGIC: u16 = 0x5A60;

View File

@@ -1086,7 +1086,7 @@ pub(crate) static TIMELINE_EPHEMERAL_BYTES: Lazy<UIntGauge> = Lazy::new(|| {
.expect("Failed to register metric")
});
/// Metrics related to the lifecycle of a [`crate::tenant::Tenant`] object: things
/// Metrics related to the lifecycle of a [`crate::tenant::TenantShard`] object: things
/// like how long it took to load.
///
/// Note that these are process-global metrics, _not_ per-tenant metrics. Per-tenant

View File

@@ -76,7 +76,7 @@ use crate::tenant::timeline::{self, WaitLsnError};
use crate::tenant::{GetTimelineError, PageReconstructError, Timeline};
use crate::{basebackup, timed_after_cancellation};
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::Tenant`] which
/// How long we may wait for a [`crate::tenant::mgr::TenantSlot::InProgress`]` and/or a [`crate::tenant::TenantShard`] which
/// is not yet in state [`TenantState::Active`].
///
/// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].

View File

@@ -158,7 +158,7 @@ pub struct TenantSharedResources {
pub l0_flush_global_state: L0FlushGlobalState,
}
/// A [`Tenant`] is really an _attached_ tenant. The configuration
/// A [`TenantShard`] is really an _attached_ tenant. The configuration
/// for an attached tenant is a subset of the [`LocationConf`], represented
/// in this struct.
#[derive(Clone)]
@@ -245,7 +245,7 @@ pub(crate) enum SpawnMode {
///
/// Tenant consists of multiple timelines. Keep them in a hash table.
///
pub struct Tenant {
pub struct TenantShard {
// Global pageserver config parameters
pub conf: &'static PageServerConf,
@@ -267,7 +267,7 @@ pub struct Tenant {
shard_identity: ShardIdentity,
/// The remote storage generation, used to protect S3 objects from split-brain.
/// Does not change over the lifetime of the [`Tenant`] object.
/// Does not change over the lifetime of the [`TenantShard`] object.
///
/// This duplicates the generation stored in LocationConf, but that structure is mutable:
/// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
@@ -309,7 +309,7 @@ pub struct Tenant {
// Access to global deletion queue for when this tenant wants to schedule a deletion
deletion_queue_client: DeletionQueueClient,
/// Cached logical sizes updated updated on each [`Tenant::gather_size_inputs`].
/// Cached logical sizes updated updated on each [`TenantShard::gather_size_inputs`].
cached_logical_sizes: tokio::sync::Mutex<HashMap<(TimelineId, Lsn), u64>>,
cached_synthetic_tenant_size: Arc<AtomicU64>,
@@ -337,12 +337,12 @@ pub struct Tenant {
// Timelines' cancellation token.
pub(crate) cancel: CancellationToken,
// Users of the Tenant such as the page service must take this Gate to avoid
// trying to use a Tenant which is shutting down.
// Users of the TenantShard such as the page service must take this Gate to avoid
// trying to use a TenantShard which is shutting down.
pub(crate) gate: Gate,
/// Throttle applied at the top of [`Timeline::get`].
/// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance.
/// All [`TenantShard::timelines`] of a given [`TenantShard`] instance share the same [`throttle::Throttle`] instance.
pub(crate) pagestream_throttle: Arc<throttle::Throttle>,
pub(crate) pagestream_throttle_metrics: Arc<crate::metrics::tenant_throttling::Pagestream>,
@@ -362,7 +362,7 @@ pub struct Tenant {
l0_flush_global_state: L0FlushGlobalState,
}
impl std::fmt::Debug for Tenant {
impl std::fmt::Debug for TenantShard {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(f, "{} ({})", self.tenant_shard_id, self.current_state())
}
@@ -841,7 +841,7 @@ impl Debug for SetStoppingError {
}
}
/// Arguments to [`Tenant::create_timeline`].
/// Arguments to [`TenantShard::create_timeline`].
///
/// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`]
/// is `None`, the result of the timeline create call is not deterministic.
@@ -876,7 +876,7 @@ pub(crate) struct CreateTimelineParamsImportPgdata {
pub(crate) idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
}
/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`] in [`Tenant::start_creating_timeline`].
/// What is used to determine idempotency of a [`TenantShard::create_timeline`] call in [`TenantShard::start_creating_timeline`] in [`TenantShard::start_creating_timeline`].
///
/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`].
///
@@ -914,7 +914,7 @@ pub(crate) struct CreatingTimelineIdempotencyImportPgdata {
idempotency_key: import_pgdata::index_part_format::IdempotencyKey,
}
/// What is returned by [`Tenant::start_creating_timeline`].
/// What is returned by [`TenantShard::start_creating_timeline`].
#[must_use]
enum StartCreatingTimelineResult {
CreateGuard(TimelineCreateGuard),
@@ -943,13 +943,13 @@ struct TimelineInitAndSyncNeedsSpawnImportPgdata {
guard: TimelineCreateGuard,
}
/// What is returned by [`Tenant::create_timeline`].
/// What is returned by [`TenantShard::create_timeline`].
enum CreateTimelineResult {
Created(Arc<Timeline>),
Idempotent(Arc<Timeline>),
/// IMPORTANT: This [`Arc<Timeline>`] object is not in [`Tenant::timelines`] when
/// IMPORTANT: This [`Arc<Timeline>`] object is not in [`TenantShard::timelines`] when
/// we return this result, nor will this concrete object ever be added there.
/// Cf method comment on [`Tenant::create_timeline_import_pgdata`].
/// Cf method comment on [`TenantShard::create_timeline_import_pgdata`].
ImportSpawned(Arc<Timeline>),
}
@@ -1082,7 +1082,7 @@ pub(crate) enum LoadConfigError {
NotFound(Utf8PathBuf),
}
impl Tenant {
impl TenantShard {
/// Yet another helper for timeline initialization.
///
/// - Initializes the Timeline struct and inserts it into the tenant's hash map
@@ -1303,7 +1303,7 @@ impl Tenant {
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, GlobalShutDown> {
) -> Result<Arc<TenantShard>, GlobalShutDown> {
let wal_redo_manager =
WalRedoManager::new(PostgresRedoManager::new(conf, tenant_shard_id))?;
@@ -1317,7 +1317,7 @@ impl Tenant {
let attach_mode = attached_conf.location.attach_mode;
let generation = attached_conf.location.generation;
let tenant = Arc::new(Tenant::new(
let tenant = Arc::new(TenantShard::new(
TenantState::Attaching,
conf,
attached_conf,
@@ -1334,7 +1334,7 @@ impl Tenant {
let attach_gate_guard = tenant
.gate
.enter()
.expect("We just created the Tenant: nothing else can have shut it down yet");
.expect("We just created the TenantShard: nothing else can have shut it down yet");
// Do all the hard work in the background
let tenant_clone = Arc::clone(&tenant);
@@ -1362,7 +1362,7 @@ impl Tenant {
}
}
fn make_broken_or_stopping(t: &Tenant, err: anyhow::Error) {
fn make_broken_or_stopping(t: &TenantShard, err: anyhow::Error) {
t.state.send_modify(|state| match state {
// TODO: the old code alluded to DeleteTenantFlow sometimes setting
// TenantState::Stopping before we get here, but this may be outdated.
@@ -1627,7 +1627,7 @@ impl Tenant {
/// No background tasks are started as part of this routine.
///
async fn attach(
self: &Arc<Tenant>,
self: &Arc<TenantShard>,
preload: Option<TenantPreload>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
@@ -1957,7 +1957,7 @@ impl Tenant {
}
async fn load_timelines_metadata(
self: &Arc<Tenant>,
self: &Arc<TenantShard>,
timeline_ids: HashSet<TimelineId>,
remote_storage: &GenericRemoteStorage,
heatmap: Option<(HeatMapTenant, std::time::Instant)>,
@@ -2028,7 +2028,7 @@ impl Tenant {
}
fn load_timeline_metadata(
self: &Arc<Tenant>,
self: &Arc<TenantShard>,
timeline_id: TimelineId,
remote_storage: GenericRemoteStorage,
previous_heatmap: Option<PreviousHeatmap>,
@@ -2429,14 +2429,14 @@ impl Tenant {
/// This is used by tests & import-from-basebackup.
///
/// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in
/// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
/// a state that will fail [`TenantShard::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`.
///
/// The caller is responsible for getting the timeline into a state that will be accepted
/// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`].
/// by [`TenantShard::load_remote_timeline`] / [`TenantShard::attach`].
/// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline
/// to the [`Tenant::timelines`].
/// to the [`TenantShard::timelines`].
///
/// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys.
/// Tests should use `TenantShard::create_test_timeline` to set up the minimum required metadata keys.
pub(crate) async fn create_empty_timeline(
self: &Arc<Self>,
new_timeline_id: TimelineId,
@@ -2584,7 +2584,7 @@ impl Tenant {
/// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
#[allow(clippy::too_many_arguments)]
pub(crate) async fn create_timeline(
self: &Arc<Tenant>,
self: &Arc<TenantShard>,
params: CreateTimelineParams,
broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext,
@@ -2751,13 +2751,13 @@ impl Tenant {
Ok(activated_timeline)
}
/// The returned [`Arc<Timeline>`] is NOT in the [`Tenant::timelines`] map until the import
/// The returned [`Arc<Timeline>`] is NOT in the [`TenantShard::timelines`] map until the import
/// completes in the background. A DIFFERENT [`Arc<Timeline>`] will be inserted into the
/// [`Tenant::timelines`] map when the import completes.
/// [`TenantShard::timelines`] map when the import completes.
/// We only return an [`Arc<Timeline>`] here so the API handler can create a [`pageserver_api::models::TimelineInfo`]
/// for the response.
async fn create_timeline_import_pgdata(
self: &Arc<Tenant>,
self: &Arc<Self>,
params: CreateTimelineParamsImportPgdata,
activate: ActivateTimelineArgs,
ctx: &RequestContext,
@@ -2854,7 +2854,7 @@ impl Tenant {
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id))]
async fn create_timeline_import_pgdata_task(
self: Arc<Tenant>,
self: Arc<TenantShard>,
timeline: Arc<Timeline>,
index_part: import_pgdata::index_part_format::Root,
activate: ActivateTimelineArgs,
@@ -2882,7 +2882,7 @@ impl Tenant {
}
async fn create_timeline_import_pgdata_task_impl(
self: Arc<Tenant>,
self: Arc<TenantShard>,
timeline: Arc<Timeline>,
index_part: import_pgdata::index_part_format::Root,
activate: ActivateTimelineArgs,
@@ -2899,10 +2899,10 @@ impl Tenant {
// Reload timeline from remote.
// This proves that the remote state is attachable, and it reuses the code.
//
// TODO: think about whether this is safe to do with concurrent Tenant::shutdown.
// TODO: think about whether this is safe to do with concurrent TenantShard::shutdown.
// timeline_create_guard holds the tenant gate open, so shutdown cannot _complete_ until we exit.
// But our activate() call might launch new background tasks after Tenant::shutdown
// already went past shutting down the Tenant::timelines, which this timeline here is no part of.
// But our activate() call might launch new background tasks after TenantShard::shutdown
// already went past shutting down the TenantShard::timelines, which this timeline here is no part of.
// I think the same problem exists with the bootstrap & branch mgmt API tasks (tenant shutting
// down while bootstrapping/branching + activating), but, the race condition is much more likely
// to manifest because of the long runtime of this import task.
@@ -2917,7 +2917,7 @@ impl Tenant {
// };
let timeline_id = timeline.timeline_id;
// load from object storage like Tenant::attach does
// load from object storage like TenantShard::attach does
let resources = self.build_timeline_resources(timeline_id);
let index_part = resources
.remote_client
@@ -3938,7 +3938,7 @@ enum ActivateTimelineArgs {
No,
}
impl Tenant {
impl TenantShard {
pub fn tenant_specific_overrides(&self) -> pageserver_api::models::TenantConfig {
self.tenant_conf.load().tenant_conf.clone()
}
@@ -4096,7 +4096,7 @@ impl Tenant {
update: F,
) -> anyhow::Result<pageserver_api::models::TenantConfig> {
// Use read-copy-update in order to avoid overwriting the location config
// state if this races with [`Tenant::set_new_location_config`]. Note that
// state if this races with [`TenantShard::set_new_location_config`]. Note that
// this race is not possible if both request types come from the storage
// controller (as they should!) because an exclusive op lock is required
// on the storage controller side.
@@ -4219,7 +4219,7 @@ impl Tenant {
Ok((timeline, timeline_ctx))
}
/// [`Tenant::shutdown`] must be called before dropping the returned [`Tenant`] object
/// [`TenantShard::shutdown`] must be called before dropping the returned [`TenantShard`] object
/// to ensure proper cleanup of background tasks and metrics.
//
// Allow too_many_arguments because a constructor's argument list naturally grows with the
@@ -4235,7 +4235,7 @@ impl Tenant {
remote_storage: GenericRemoteStorage,
deletion_queue_client: DeletionQueueClient,
l0_flush_global_state: L0FlushGlobalState,
) -> Tenant {
) -> TenantShard {
debug_assert!(
!attached_conf.location.generation.is_none() || conf.control_plane_api.is_none()
);
@@ -4295,7 +4295,7 @@ impl Tenant {
}
});
Tenant {
TenantShard {
tenant_shard_id,
shard_identity,
generation: attached_conf.location.generation,
@@ -4330,7 +4330,7 @@ impl Tenant {
cancel: CancellationToken::default(),
gate: Gate::default(),
pagestream_throttle: Arc::new(throttle::Throttle::new(
Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
TenantShard::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf),
)),
pagestream_throttle_metrics: Arc::new(
crate::metrics::tenant_throttling::Pagestream::new(&tenant_shard_id),
@@ -4466,11 +4466,11 @@ impl Tenant {
// Perform GC for each timeline.
//
// Note that we don't hold the `Tenant::gc_cs` lock here because we don't want to delay the
// Note that we don't hold the `TenantShard::gc_cs` lock here because we don't want to delay the
// branch creation task, which requires the GC lock. A GC iteration can run concurrently
// with branch creation.
//
// See comments in [`Tenant::branch_timeline`] for more information about why branch
// See comments in [`TenantShard::branch_timeline`] for more information about why branch
// creation task can run concurrently with timeline's GC iteration.
for timeline in gc_timelines {
if cancel.is_cancelled() {
@@ -4500,7 +4500,7 @@ impl Tenant {
/// Refreshes the Timeline::gc_info for all timelines, returning the
/// vector of timelines which have [`Timeline::get_last_record_lsn`] past
/// [`Tenant::get_gc_horizon`].
/// [`TenantShard::get_gc_horizon`].
///
/// This is usually executed as part of periodic gc, but can now be triggered more often.
pub(crate) async fn refresh_gc_info(
@@ -5499,7 +5499,7 @@ impl Tenant {
}
}
// The flushes we did above were just writes, but the Tenant might have had
// The flushes we did above were just writes, but the TenantShard might have had
// pending deletions as well from recent compaction/gc: we want to flush those
// as well. This requires flushing the global delete queue. This is cheap
// because it's typically a no-op.
@@ -5517,7 +5517,7 @@ impl Tenant {
/// How much local storage would this tenant like to have? It can cope with
/// less than this (via eviction and on-demand downloads), but this function enables
/// the Tenant to advertise how much storage it would prefer to have to provide fast I/O
/// the TenantShard to advertise how much storage it would prefer to have to provide fast I/O
/// by keeping important things on local disk.
///
/// This is a heuristic, not a guarantee: tenants that are long-idle will actually use less
@@ -5540,11 +5540,11 @@ impl Tenant {
/// manifest in `Self::remote_tenant_manifest`.
///
/// TODO: instead of requiring callers to remember to call `maybe_upload_tenant_manifest` after
/// changing any `Tenant` state that's included in the manifest, consider making the manifest
/// changing any `TenantShard` state that's included in the manifest, consider making the manifest
/// the authoritative source of data with an API that automatically uploads on changes. Revisit
/// this when the manifest is more widely used and we have a better idea of the data model.
pub(crate) async fn maybe_upload_tenant_manifest(&self) -> Result<(), TenantManifestError> {
// Multiple tasks may call this function concurrently after mutating the Tenant runtime
// Multiple tasks may call this function concurrently after mutating the TenantShard runtime
// state, affecting the manifest generated by `build_tenant_manifest`. We use an async mutex
// to serialize these callers. `eq_ignoring_version` acts as a slightly inefficient but
// simple coalescing mechanism.
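The coalescing behaviour described in the comment above is easy to picture in isolation. Below is a minimal, self-contained sketch (not the pageserver's actual implementation): `Manifest`, `eq_ignoring_version`, and `upload_to_remote` are hypothetical stand-ins, but the shape — serialize callers behind an async mutex and skip the upload when nothing but the version changed — mirrors what the comment describes.

```rust
use tokio::sync::Mutex;

// Hypothetical stand-in for the tenant manifest.
#[derive(Clone, PartialEq)]
struct Manifest {
    version: u64,
    offloaded_timelines: Vec<String>,
}

impl Manifest {
    // Compare everything except the version counter.
    fn eq_ignoring_version(&self, other: &Manifest) -> bool {
        self.offloaded_timelines == other.offloaded_timelines
    }
}

struct ManifestUploader {
    // One async mutex serializes concurrent callers and guards the
    // last manifest we successfully uploaded.
    last_uploaded: Mutex<Option<Manifest>>,
}

impl ManifestUploader {
    async fn maybe_upload(&self, candidate: Manifest) -> anyhow::Result<()> {
        let mut last = self.last_uploaded.lock().await;
        // Coalesce: if the manifest is unchanged apart from its version,
        // skip the remote upload entirely.
        if matches!(&*last, Some(prev) if prev.eq_ignoring_version(&candidate)) {
            return Ok(());
        }
        upload_to_remote(&candidate).await?; // hypothetical remote call
        *last = Some(candidate);
        Ok(())
    }
}

async fn upload_to_remote(_manifest: &Manifest) -> anyhow::Result<()> {
    Ok(())
}
```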
@@ -5812,7 +5812,7 @@ pub(crate) mod harness {
info_span!("TenantHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
}
pub(crate) async fn load(&self) -> (Arc<Tenant>, RequestContext) {
pub(crate) async fn load(&self) -> (Arc<TenantShard>, RequestContext) {
let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error)
.with_scope_unit_test();
(
@@ -5827,10 +5827,10 @@ pub(crate) mod harness {
pub(crate) async fn do_try_load(
&self,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Tenant>> {
) -> anyhow::Result<Arc<TenantShard>> {
let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
let tenant = Arc::new(Tenant::new(
let tenant = Arc::new(TenantShard::new(
TenantState::Attaching,
self.conf,
AttachedTenantConf::try_from(LocationConf::attached_single(
@@ -6046,7 +6046,7 @@ mod tests {
#[cfg(feature = "testing")]
#[allow(clippy::too_many_arguments)]
async fn randomize_timeline(
tenant: &Arc<Tenant>,
tenant: &Arc<TenantShard>,
new_timeline_id: TimelineId,
pg_version: u32,
spec: TestTimelineSpecification,
@@ -6936,7 +6936,7 @@ mod tests {
}
async fn bulk_insert_compact_gc(
tenant: &Tenant,
tenant: &TenantShard,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
lsn: Lsn,
@@ -6948,7 +6948,7 @@ mod tests {
}
async fn bulk_insert_maybe_compact_gc(
tenant: &Tenant,
tenant: &TenantShard,
timeline: &Arc<Timeline>,
ctx: &RequestContext,
mut lsn: Lsn,
@@ -7858,7 +7858,7 @@ mod tests {
let (tline, _ctx) = tenant
.create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
.await?;
// Leave the timeline ID in [`Tenant::timelines_creating`] to exclude attempting to create it again
// Leave the timeline ID in [`TenantShard::timelines_creating`] to exclude attempting to create it again
let raw_tline = tline.raw_timeline().unwrap();
raw_tline
.shutdown(super::timeline::ShutdownMode::Hard)

View File

@@ -564,8 +564,9 @@ mod tests {
Lsn(0),
Lsn(0),
Lsn(0),
// Any version will do here, so use the default
crate::DEFAULT_PG_VERSION,
// Updating this version to 17 will cause the test to fail at the
// next assert_eq!().
16,
);
let expected_bytes = vec![
/* TimelineMetadataHeader */

View File

@@ -52,7 +52,9 @@ use crate::tenant::config::{
use crate::tenant::span::debug_assert_current_span_has_tenant_id;
use crate::tenant::storage_layer::inmemory_layer;
use crate::tenant::timeline::ShutdownMode;
use crate::tenant::{AttachedTenantConf, GcError, LoadConfigError, SpawnMode, Tenant, TenantState};
use crate::tenant::{
AttachedTenantConf, GcError, LoadConfigError, SpawnMode, TenantShard, TenantState,
};
use crate::virtual_file::MaybeFatalIo;
use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
@@ -67,7 +69,7 @@ use crate::{InitializationOrder, TEMP_FILE_SUFFIX};
/// having a properly acquired generation (Secondary doesn't need a generation)
#[derive(Clone)]
pub(crate) enum TenantSlot {
Attached(Arc<Tenant>),
Attached(Arc<TenantShard>),
Secondary(Arc<SecondaryTenant>),
/// In this state, other administrative operations acting on the TenantId should
/// block, or return a retry indicator equivalent to HTTP 503.
@@ -86,7 +88,7 @@ impl std::fmt::Debug for TenantSlot {
impl TenantSlot {
/// Return the `Tenant` in this slot if attached, else None
fn get_attached(&self) -> Option<&Arc<Tenant>> {
fn get_attached(&self) -> Option<&Arc<TenantShard>> {
match self {
Self::Attached(t) => Some(t),
Self::Secondary(_) => None,
@@ -164,7 +166,7 @@ impl TenantStartupMode {
/// Result type for looking up a TenantId to a specific shard
pub(crate) enum ShardResolveResult {
NotFound,
Found(Arc<Tenant>),
Found(Arc<TenantShard>),
// Wait for this barrier, then query again
InProgress(utils::completion::Barrier),
}
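To make the intent of `InProgress` concrete: a caller is expected to wait on the barrier and then resolve again. A rough, self-contained sketch of that retry loop is below; `Notify` stands in for the real completion barrier and `resolve_once` is a placeholder lookup, so this is illustrative rather than the actual call site.

```rust
use std::sync::Arc;
use tokio::sync::Notify;

// Stand-ins for the real types; only the control flow is the point here.
struct TenantShard;

enum ShardResolveResult {
    NotFound,
    Found(Arc<TenantShard>),
    // The real code uses utils::completion::Barrier; Notify is a stand-in.
    InProgress(Arc<Notify>),
}

// Placeholder lookup against the in-memory tenants map.
fn resolve_once() -> ShardResolveResult {
    ShardResolveResult::NotFound
}

// Wait out InProgress slots, then query again; stop on a definite answer.
async fn resolve_attached() -> Option<Arc<TenantShard>> {
    loop {
        match resolve_once() {
            ShardResolveResult::Found(tenant) => return Some(tenant),
            ShardResolveResult::NotFound => return None,
            ShardResolveResult::InProgress(barrier) => barrier.notified().await,
        }
    }
}
```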
@@ -173,7 +175,7 @@ impl TenantsMap {
/// Convenience function for typical usage, where we want to get a `Tenant` object, for
/// working with attached tenants. If the TenantId is in the map but in Secondary state,
/// None is returned.
pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<TenantShard>> {
match self {
TenantsMap::Initializing => None,
TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
@@ -410,7 +412,7 @@ fn load_tenant_config(
return None;
}
Some(Tenant::load_tenant_config(conf, &tenant_shard_id))
Some(TenantShard::load_tenant_config(conf, &tenant_shard_id))
}
/// Initial stage of load: walk the local tenants directory, clean up any temp files,
@@ -606,7 +608,8 @@ pub async fn init_tenant_mgr(
// Presence of a generation number implies attachment: attach the tenant
// if it wasn't already, and apply the generation number.
config_write_futs.push(async move {
let r = Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
let r =
TenantShard::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await;
(tenant_shard_id, location_conf, r)
});
}
@@ -694,7 +697,7 @@ fn tenant_spawn(
init_order: Option<InitializationOrder>,
mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Arc<Tenant>, GlobalShutDown> {
) -> Result<Arc<TenantShard>, GlobalShutDown> {
// All these conditions should have been satisfied by our caller: the tenant dir exists, is a well formed
// path, and contains a configuration file. Assertions that do synchronous I/O are limited to debug mode
// to avoid impacting prod runtime performance.
@@ -706,7 +709,7 @@ fn tenant_spawn(
.unwrap()
);
Tenant::spawn(
TenantShard::spawn(
conf,
tenant_shard_id,
resources,
@@ -883,12 +886,12 @@ impl TenantManager {
/// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or currently
/// undergoing a state change (i.e. slot is InProgress).
///
/// The returned Tenant is not guaranteed to be active: check its status after obtaining it, or
/// use [`Tenant::wait_to_become_active`] before using it if you will do I/O on it.
/// The returned TenantShard is not guaranteed to be active: check its status after obtaining it, or
/// use [`TenantShard::wait_to_become_active`] before using it if you will do I/O on it.
pub(crate) fn get_attached_tenant_shard(
&self,
tenant_shard_id: TenantShardId,
) -> Result<Arc<Tenant>, GetTenantError> {
) -> Result<Arc<TenantShard>, GetTenantError> {
let locked = self.tenants.read().unwrap();
let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;
@@ -937,12 +940,12 @@ impl TenantManager {
flush: Option<Duration>,
mut spawn_mode: SpawnMode,
ctx: &RequestContext,
) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
) -> Result<Option<Arc<TenantShard>>, UpsertLocationError> {
debug_assert_current_span_has_tenant_id();
info!("configuring tenant location to state {new_location_config:?}");
enum FastPathModified {
Attached(Arc<Tenant>),
Attached(Arc<TenantShard>),
Secondary(Arc<SecondaryTenant>),
}
@@ -999,9 +1002,13 @@ impl TenantManager {
// phase of writing config and/or waiting for flush, before returning.
match fast_path_taken {
Some(FastPathModified::Attached(tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.fatal_err("write tenant shard config");
TenantShard::persist_tenant_config(
self.conf,
&tenant_shard_id,
&new_location_config,
)
.await
.fatal_err("write tenant shard config");
// Transition to AttachedStale means we may well hold a valid generation
// still, and have been requested to go stale as part of a migration. If
@@ -1030,9 +1037,13 @@ impl TenantManager {
return Ok(Some(tenant));
}
Some(FastPathModified::Secondary(_secondary_tenant)) => {
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.fatal_err("write tenant shard config");
TenantShard::persist_tenant_config(
self.conf,
&tenant_shard_id,
&new_location_config,
)
.await
.fatal_err("write tenant shard config");
return Ok(None);
}
@@ -1122,7 +1133,7 @@ impl TenantManager {
// Before activating either secondary or attached mode, persist the
// configuration, so that on restart we will re-attach (or re-start
// secondary) on the tenant.
Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
TenantShard::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
.await
.fatal_err("write tenant shard config");
@@ -1262,7 +1273,7 @@ impl TenantManager {
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let timelines_path = self.conf.timelines_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)?;
if drop_cache {
tracing::info!("Dropping local file cache");
@@ -1297,7 +1308,7 @@ impl TenantManager {
Ok(())
}
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<Tenant>> {
pub(crate) fn get_attached_active_tenant_shards(&self) -> Vec<Arc<TenantShard>> {
let locked = self.tenants.read().unwrap();
match &*locked {
TenantsMap::Initializing => Vec::new(),
@@ -1446,7 +1457,7 @@ impl TenantManager {
#[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))]
pub(crate) async fn shard_split(
&self,
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
ctx: &RequestContext,
@@ -1476,7 +1487,7 @@ impl TenantManager {
pub(crate) async fn do_shard_split(
&self,
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
new_shard_count: ShardCount,
new_stripe_size: Option<ShardStripeSize>,
ctx: &RequestContext,
@@ -1703,7 +1714,7 @@ impl TenantManager {
/// For each resident layer in the parent shard, we will hard link it into all of the child shards.
async fn shard_split_hardlink(
&self,
parent_shard: &Tenant,
parent_shard: &TenantShard,
child_shards: Vec<TenantShardId>,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_id();
@@ -1988,7 +1999,7 @@ impl TenantManager {
}
let tenant_path = self.conf.tenant_path(&tenant_shard_id);
let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)
let config = TenantShard::load_tenant_config(self.conf, &tenant_shard_id)
.map_err(|e| Error::DetachReparent(e.into()))?;
let shard_identity = config.shard;

View File

@@ -133,7 +133,7 @@
//! - Initiate upload queue with that [`IndexPart`].
//! - Reschedule all lost operations by comparing the local filesystem state
//! and remote state as per [`IndexPart`]. This is done in
//! [`Tenant::timeline_init_and_sync`].
//! [`TenantShard::timeline_init_and_sync`].
//!
//! Note that if we crash during file deletion between the index update
//! that removes the file from the list of files, and deleting the remote file,
@@ -171,7 +171,7 @@
//! If no remote storage configuration is provided, the [`RemoteTimelineClient`] is
//! not created and the uploads are skipped.
//!
//! [`Tenant::timeline_init_and_sync`]: super::Tenant::timeline_init_and_sync
//! [`TenantShard::timeline_init_and_sync`]: super::TenantShard::timeline_init_and_sync
//! [`Timeline::load_layer_map`]: super::Timeline::load_layer_map
pub(crate) mod download;
@@ -2743,7 +2743,7 @@ mod tests {
use crate::tenant::config::AttachmentMode;
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::{Tenant, Timeline};
use crate::tenant::{TenantShard, Timeline};
pub(super) fn dummy_contents(name: &str) -> Vec<u8> {
format!("contents for {name}").into()
@@ -2796,7 +2796,7 @@ mod tests {
struct TestSetup {
harness: TenantHarness,
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
timeline: Arc<Timeline>,
tenant_ctx: RequestContext,
}

View File

@@ -452,7 +452,7 @@ async fn do_download_index_part(
/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back
/// to listing objects.
///
/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]`
/// * `my_generation`: the value of `[crate::tenant::TenantShard::generation]`
/// * `what`: for logging, what object are we downloading
/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation)
/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless
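Since this doc comment describes a lookup order rather than showing it, here is a small self-contained sketch of that order — current generation, then the previous one, then a listing. `do_download`, `list_generations`, and `DownloadError` are hypothetical placeholders, not the actual pageserver API, and the real `do_download` additionally retries transient errors indefinitely.

```rust
#[derive(Debug)]
enum DownloadError {
    NotFound,
}

// Hypothetical GET of the object at a specific generation.
async fn do_download(generation: u32) -> Result<Vec<u8>, DownloadError> {
    let _ = generation;
    Err(DownloadError::NotFound)
}

// Hypothetical listing that returns whatever generations exist remotely.
async fn list_generations() -> Result<Vec<u32>, DownloadError> {
    Ok(vec![])
}

// Sketch of the fallback order: my generation, then the previous one,
// and only then the (more expensive) listing.
async fn download_with_fallback(my_generation: u32) -> Result<Vec<u8>, DownloadError> {
    match do_download(my_generation).await {
        Err(DownloadError::NotFound) => {}
        other => return other,
    }
    if my_generation > 0 {
        match do_download(my_generation - 1).await {
            Err(DownloadError::NotFound) => {}
            other => return other,
        }
    }
    // Fall back to listing and taking the highest generation we can find.
    let mut generations = list_generations().await?;
    generations.sort_unstable();
    match generations.last() {
        Some(&g) => do_download(g).await,
        None => Err(DownloadError::NotFound),
    }
}
```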

View File

@@ -21,7 +21,7 @@ use super::scheduler::{
use super::{CommandRequest, SecondaryTenantError, UploadCommand};
use crate::TEMP_FILE_SUFFIX;
use crate::metrics::SECONDARY_MODE;
use crate::tenant::Tenant;
use crate::tenant::TenantShard;
use crate::tenant::config::AttachmentMode;
use crate::tenant::mgr::{GetTenantError, TenantManager};
use crate::tenant::remote_timeline_client::remote_heatmap_path;
@@ -74,7 +74,7 @@ impl RunningJob for WriteInProgress {
}
struct UploadPending {
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
last_upload: Option<LastUploadState>,
target_time: Option<Instant>,
period: Option<Duration>,
@@ -106,7 +106,7 @@ impl scheduler::Completion for WriteComplete {
struct UploaderTenantState {
// This Weak only exists to enable culling idle instances of this type
// when the Tenant has been deallocated.
tenant: Weak<Tenant>,
tenant: Weak<TenantShard>,
/// Digest of the serialized heatmap that we last successfully uploaded
last_upload_state: Option<LastUploadState>,
@@ -357,7 +357,7 @@ struct LastUploadState {
/// of the object we would have uploaded.
async fn upload_tenant_heatmap(
remote_storage: GenericRemoteStorage,
tenant: &Arc<Tenant>,
tenant: &Arc<TenantShard>,
last_upload: Option<LastUploadState>,
) -> Result<UploadHeatmapOutcome, UploadHeatmapError> {
debug_assert_current_span_has_tenant_id();

View File

@@ -360,7 +360,7 @@ where
/// Periodic execution phase: inspect all attached tenants and schedule any work they require.
///
/// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
/// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::TenantShard`] or [`crate::tenant::secondary::SecondaryTenant`]
///
/// This function resets the pending list: it is assumed that the caller may change their mind about
/// which tenants need work between calls to schedule_iteration.

View File

@@ -12,7 +12,7 @@ use tracing::*;
use utils::id::TimelineId;
use utils::lsn::Lsn;
use super::{GcError, LogicalSizeCalculationCause, Tenant};
use super::{GcError, LogicalSizeCalculationCause, TenantShard};
use crate::context::RequestContext;
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
use crate::tenant::{MaybeOffloaded, Timeline};
@@ -156,7 +156,7 @@ pub struct TimelineInputs {
/// initdb_lsn branchpoints* next_pitr_cutoff latest
/// ```
pub(super) async fn gather_inputs(
tenant: &Tenant,
tenant: &TenantShard,
limit: &Arc<Semaphore>,
max_retention_period: Option<u64>,
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,

View File

@@ -1620,7 +1620,7 @@ pub(crate) mod test {
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::{Tenant, Timeline};
use crate::tenant::{TenantShard, Timeline};
/// Construct an index for a fictional delta layer and then
/// traverse in order to plan vectored reads for a query. Finally,
@@ -2209,7 +2209,7 @@ pub(crate) mod test {
}
pub(crate) async fn produce_delta_layer(
tenant: &Tenant,
tenant: &TenantShard,
tline: &Arc<Timeline>,
mut deltas: Vec<(Key, Lsn, Value)>,
ctx: &RequestContext,

View File

@@ -1228,7 +1228,7 @@ mod test {
use crate::tenant::harness::{TIMELINE_ID, TenantHarness};
use crate::tenant::storage_layer::{Layer, ResidentLayer};
use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner;
use crate::tenant::{Tenant, Timeline};
use crate::tenant::{TenantShard, Timeline};
#[tokio::test]
async fn image_layer_rewrite() {
@@ -1410,7 +1410,7 @@ mod test {
}
async fn produce_image_layer(
tenant: &Tenant,
tenant: &TenantShard,
tline: &Arc<Timeline>,
mut images: Vec<(Key, Bytes)>,
lsn: Lsn,

View File

@@ -24,7 +24,7 @@ use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
use crate::tenant::throttle::Stats;
use crate::tenant::timeline::CompactionError;
use crate::tenant::timeline::compaction::CompactionOutcome;
use crate::tenant::{Tenant, TenantState};
use crate::tenant::{TenantShard, TenantState};
/// Semaphore limiting concurrent background tasks (across all tenants).
///
@@ -117,7 +117,7 @@ pub(crate) async fn acquire_concurrency_permit(
}
/// Start per tenant background loops: compaction, GC, and ingest housekeeping.
pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>) {
pub fn start_background_loops(tenant: &Arc<TenantShard>, can_start: Option<&Barrier>) {
let tenant_shard_id = tenant.tenant_shard_id;
task_mgr::spawn(
@@ -198,7 +198,7 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, can_start: Option<&Barrier>)
}
/// Compaction task's main loop.
async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn compaction_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
const BASE_BACKOFF_SECS: f64 = 1.0;
const MAX_BACKOFF_SECS: f64 = 300.0;
const RECHECK_CONFIG_INTERVAL: Duration = Duration::from_secs(10);
@@ -348,7 +348,7 @@ pub(crate) fn log_compaction_error(
}
/// GC task's main loop.
async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn gc_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
const MAX_BACKOFF_SECS: f64 = 300.0;
let mut error_run = 0; // consecutive errors
@@ -432,7 +432,7 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
}
/// Tenant housekeeping's main loop.
async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
async fn tenant_housekeeping_loop(tenant: Arc<TenantShard>, cancel: CancellationToken) {
let mut last_throttle_flag_reset_at = Instant::now();
loop {
if wait_for_active_tenant(&tenant, &cancel).await.is_break() {
@@ -483,7 +483,7 @@ async fn tenant_housekeeping_loop(tenant: Arc<Tenant>, cancel: CancellationToken
/// Waits until the tenant becomes active, or returns `ControlFlow::Break()` to shut down.
async fn wait_for_active_tenant(
tenant: &Arc<Tenant>,
tenant: &Arc<TenantShard>,
cancel: &CancellationToken,
) -> ControlFlow<()> {
if tenant.current_state() == TenantState::Active {

View File

@@ -412,7 +412,7 @@ pub struct Timeline {
/// Timeline deletion will acquire both compaction and gc locks in whatever order.
gc_lock: tokio::sync::Mutex<()>,
/// Cloned from [`super::Tenant::pagestream_throttle`] on construction.
/// Cloned from [`super::TenantShard::pagestream_throttle`] on construction.
pub(crate) pagestream_throttle: Arc<crate::tenant::throttle::Throttle>,
/// Size estimator for aux file v2
@@ -1285,6 +1285,10 @@ impl Timeline {
reconstruct_state: &mut ValuesReconstructState,
ctx: &RequestContext,
) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
if query.is_empty() {
return Ok(BTreeMap::default());
}
let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
Some(ReadPath::new(
query.total_keyspace(),
@@ -2065,7 +2069,7 @@ impl Timeline {
pub(crate) fn activate(
self: &Arc<Self>,
parent: Arc<crate::tenant::Tenant>,
parent: Arc<crate::tenant::TenantShard>,
broker_client: BrokerClientChannel,
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
@@ -3325,7 +3329,7 @@ impl Timeline {
// (1) and (4)
// TODO: this is basically a no-op now, should we remove it?
self.remote_client.schedule_barrier()?;
// Tenant::create_timeline will wait for these uploads to happen before returning, or
// TenantShard::create_timeline will wait for these uploads to happen before returning, or
// on retry.
// Now that we have the full layer map, we may calculate the visibility of layers within it (a global scan)
@@ -5754,7 +5758,7 @@ impl Timeline {
/// from our ancestor to be branches of this timeline.
pub(crate) async fn prepare_to_detach_from_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
tenant: &crate::tenant::TenantShard,
options: detach_ancestor::Options,
behavior: DetachBehavior,
ctx: &RequestContext,
@@ -5773,7 +5777,7 @@ impl Timeline {
/// resetting the tenant.
pub(crate) async fn detach_from_ancestor_and_reparent(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
tenant: &crate::tenant::TenantShard,
prepared: detach_ancestor::PreparedTimelineDetach,
ancestor_timeline_id: TimelineId,
ancestor_lsn: Lsn,
@@ -5797,7 +5801,7 @@ impl Timeline {
/// The tenant must've been reset if ancestry was modified previously (in tenant manager).
pub(crate) async fn complete_detaching_timeline_ancestor(
self: &Arc<Timeline>,
tenant: &crate::tenant::Tenant,
tenant: &crate::tenant::TenantShard,
attempt: detach_ancestor::Attempt,
ctx: &RequestContext,
) -> Result<(), detach_ancestor::Error> {
@@ -6859,14 +6863,14 @@ impl Timeline {
/// Persistently blocks gc for `Manual` reason.
///
/// Returns true if no such block existed before, false otherwise.
pub(crate) async fn block_gc(&self, tenant: &super::Tenant) -> anyhow::Result<bool> {
pub(crate) async fn block_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<bool> {
use crate::tenant::remote_timeline_client::index::GcBlockingReason;
assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
tenant.gc_block.insert(self, GcBlockingReason::Manual).await
}
/// Persistently unblocks gc for `Manual` reason.
pub(crate) async fn unblock_gc(&self, tenant: &super::Tenant) -> anyhow::Result<()> {
pub(crate) async fn unblock_gc(&self, tenant: &super::TenantShard) -> anyhow::Result<()> {
use crate::tenant::remote_timeline_client::index::GcBlockingReason;
assert_eq!(self.tenant_shard_id, tenant.tenant_shard_id);
tenant.gc_block.remove(self, GcBlockingReason::Manual).await
@@ -6884,8 +6888,8 @@ impl Timeline {
/// Force create an image layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
/// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
/// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
/// placed into the layer map in one run AND be validated.
#[cfg(test)]
pub(super) async fn force_create_image_layer(
@@ -6941,8 +6945,8 @@ impl Timeline {
/// Force create a delta layer and place it into the layer map.
///
/// DO NOT use this function directly. Use [`Tenant::branch_timeline_test_with_layers`]
/// or [`Tenant::create_test_timeline_with_layers`] to ensure all these layers are
/// DO NOT use this function directly. Use [`TenantShard::branch_timeline_test_with_layers`]
/// or [`TenantShard::create_test_timeline_with_layers`] to ensure all these layers are
/// placed into the layer map in one run AND be validated.
#[cfg(test)]
pub(super) async fn force_create_delta_layer(

View File

@@ -77,7 +77,7 @@ const COMPACTION_DELTA_THRESHOLD: usize = 5;
/// shard split, which gets expensive for large tenants.
const ANCESTOR_COMPACTION_REWRITE_THRESHOLD: f64 = 0.3;
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
#[derive(Default, Debug, Clone, Copy, Hash, PartialEq, Eq, Serialize)]
pub struct GcCompactionJobId(pub usize);
impl std::fmt::Display for GcCompactionJobId {
@@ -105,6 +105,50 @@ pub enum GcCompactionQueueItem {
Notify(GcCompactionJobId, Option<Lsn>),
}
/// Statistics for gc-compaction meta jobs, which contains several sub compaction jobs.
#[derive(Debug, Clone, Serialize, Default)]
pub struct GcCompactionMetaStatistics {
/// The total number of sub compaction jobs.
pub total_sub_compaction_jobs: usize,
/// The total number of sub compaction jobs that failed.
pub failed_sub_compaction_jobs: usize,
/// The total number of sub compaction jobs that succeeded.
pub succeeded_sub_compaction_jobs: usize,
/// The layer size before compaction.
pub before_compaction_layer_size: u64,
/// The layer size after compaction.
pub after_compaction_layer_size: u64,
/// The start time of the meta job.
pub start_time: Option<chrono::DateTime<chrono::Utc>>,
/// The end time of the meta job.
pub end_time: Option<chrono::DateTime<chrono::Utc>>,
/// The duration of the meta job.
pub duration_secs: f64,
/// The id of the meta job.
pub meta_job_id: GcCompactionJobId,
/// The LSN below which the layers are compacted, used to compute the statistics.
pub below_lsn: Lsn,
/// The retention ratio of the meta job (after_compaction_layer_size / before_compaction_layer_size)
pub retention_ratio: f64,
}
impl GcCompactionMetaStatistics {
fn finalize(&mut self) {
let end_time = chrono::Utc::now();
if let Some(start_time) = self.start_time {
if end_time > start_time {
let delta = end_time - start_time;
if let Ok(std_dur) = delta.to_std() {
self.duration_secs = std_dur.as_secs_f64();
}
}
}
self.retention_ratio = self.after_compaction_layer_size as f64
/ (self.before_compaction_layer_size as f64 + 1.0);
self.end_time = Some(end_time);
}
}
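As a quick sanity check on the ratio that `finalize()` computes (with made-up sizes): 4 GiB of layers remaining out of 10 GiB compacted gives a retention ratio of roughly 0.4; the `+ 1.0` in the denominator only guards against a zero "before" size.

```rust
fn main() {
    // Hypothetical sizes, purely to illustrate the computation above.
    let before_compaction_layer_size: u64 = 10 * 1024 * 1024 * 1024; // 10 GiB below the cutoff LSN
    let after_compaction_layer_size: u64 = 4 * 1024 * 1024 * 1024; // 4 GiB remaining afterwards
    let retention_ratio =
        after_compaction_layer_size as f64 / (before_compaction_layer_size as f64 + 1.0);
    assert!((retention_ratio - 0.4).abs() < 1e-6);
    println!("retention_ratio = {retention_ratio:.3}"); // ~0.400
}
```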
impl GcCompactionQueueItem {
pub fn into_compact_info_resp(
self,
@@ -142,6 +186,7 @@ struct GcCompactionQueueInner {
queued: VecDeque<(GcCompactionJobId, GcCompactionQueueItem)>,
guards: HashMap<GcCompactionJobId, GcCompactionGuardItems>,
last_id: GcCompactionJobId,
meta_statistics: Option<GcCompactionMetaStatistics>,
}
impl GcCompactionQueueInner {
@@ -173,6 +218,7 @@ impl GcCompactionQueue {
queued: VecDeque::new(),
guards: HashMap::new(),
last_id: GcCompactionJobId(0),
meta_statistics: None,
}),
consumer_lock: tokio::sync::Mutex::new(()),
}
@@ -357,6 +403,23 @@ impl GcCompactionQueue {
Ok(())
}
async fn collect_layer_below_lsn(
&self,
timeline: &Arc<Timeline>,
lsn: Lsn,
) -> Result<u64, CompactionError> {
let guard = timeline.layers.read().await;
let layer_map = guard.layer_map()?;
let layers = layer_map.iter_historic_layers().collect_vec();
let mut size = 0;
for layer in layers {
if layer.lsn_range.start <= lsn {
size += layer.file_size();
}
}
Ok(size)
}
/// Notify the caller the job has finished and unblock GC.
fn notify_and_unblock(&self, id: GcCompactionJobId) {
info!("compaction job id={} finished", id);
@@ -366,6 +429,16 @@ impl GcCompactionQueue {
let _ = tx.send(());
}
}
if let Some(ref meta_statistics) = guard.meta_statistics {
if meta_statistics.meta_job_id == id {
if let Ok(stats) = serde_json::to_string(&meta_statistics) {
info!(
"gc-compaction meta statistics for job id = {}: {}",
id, stats
);
}
}
}
}
fn clear_running_job(&self) {
@@ -405,7 +478,11 @@ impl GcCompactionQueue {
let mut pending_tasks = Vec::new();
// gc-compaction might pick more layers or fewer layers to compact. The L2 LSN does not need to be accurate.
// And therefore, we simply assume the maximum LSN of all jobs is the expected L2 LSN.
let expected_l2_lsn = jobs.iter().map(|job| job.compact_lsn_range.end).max();
let expected_l2_lsn = jobs
.iter()
.map(|job| job.compact_lsn_range.end)
.max()
.unwrap();
for job in jobs {
// Unfortunately we need to convert the `GcCompactJob` back to `CompactionOptions`
// until we do further refactors to allow directly call `compact_with_gc`.
@@ -430,9 +507,13 @@ impl GcCompactionQueue {
if !auto {
pending_tasks.push(GcCompactionQueueItem::Notify(id, None));
} else {
pending_tasks.push(GcCompactionQueueItem::Notify(id, expected_l2_lsn));
pending_tasks.push(GcCompactionQueueItem::Notify(id, Some(expected_l2_lsn)));
}
let layer_size = self
.collect_layer_below_lsn(timeline, expected_l2_lsn)
.await?;
{
let mut guard = self.inner.lock().unwrap();
let mut tasks = Vec::new();
@@ -444,7 +525,16 @@ impl GcCompactionQueue {
for item in tasks {
guard.queued.push_front(item);
}
guard.meta_statistics = Some(GcCompactionMetaStatistics {
meta_job_id: id,
start_time: Some(chrono::Utc::now()),
before_compaction_layer_size: layer_size,
below_lsn: expected_l2_lsn,
total_sub_compaction_jobs: jobs_len,
..Default::default()
});
}
info!(
"scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs",
jobs_len
@@ -573,6 +663,10 @@ impl GcCompactionQueue {
Err(err) => {
warn!(%err, "failed to run gc-compaction subcompaction job");
self.clear_running_job();
let mut guard = self.inner.lock().unwrap();
if let Some(ref mut meta_statistics) = guard.meta_statistics {
meta_statistics.failed_sub_compaction_jobs += 1;
}
return Err(err);
}
};
@@ -582,8 +676,34 @@ impl GcCompactionQueue {
// we need to clean things up before returning from the function.
yield_for_l0 = true;
}
{
let mut guard = self.inner.lock().unwrap();
if let Some(ref mut meta_statistics) = guard.meta_statistics {
meta_statistics.succeeded_sub_compaction_jobs += 1;
}
}
}
GcCompactionQueueItem::Notify(id, l2_lsn) => {
let below_lsn = {
let mut guard = self.inner.lock().unwrap();
if let Some(ref mut meta_statistics) = guard.meta_statistics {
meta_statistics.below_lsn
} else {
Lsn::INVALID
}
};
let layer_size = if below_lsn != Lsn::INVALID {
self.collect_layer_below_lsn(timeline, below_lsn).await?
} else {
0
};
{
let mut guard = self.inner.lock().unwrap();
if let Some(ref mut meta_statistics) = guard.meta_statistics {
meta_statistics.after_compaction_layer_size = layer_size;
meta_statistics.finalize();
}
}
self.notify_and_unblock(id);
if let Some(l2_lsn) = l2_lsn {
let current_l2_lsn = timeline

View File

@@ -18,8 +18,8 @@ use crate::tenant::remote_timeline_client::{
PersistIndexPartWithDeletedFlagError, RemoteTimelineClient,
};
use crate::tenant::{
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TenantManifestError,
Timeline, TimelineOrOffloaded,
CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, TenantManifestError,
TenantShard, Timeline, TimelineOrOffloaded,
};
use crate::virtual_file::MaybeFatalIo;
@@ -113,7 +113,7 @@ pub(super) async fn delete_local_timeline_directory(
/// It is important that this gets called when DeletionGuard is being held.
/// For more context see comments in [`make_timeline_delete_guard`]
async fn remove_maybe_offloaded_timeline_from_tenant(
tenant: &Tenant,
tenant: &TenantShard,
timeline: &TimelineOrOffloaded,
_: &DeletionGuard, // using it as a witness
) -> anyhow::Result<()> {
@@ -192,7 +192,7 @@ impl DeleteTimelineFlow {
// error out if some of the shutdown tasks have already been completed!
#[instrument(skip_all)]
pub async fn run(
tenant: &Arc<Tenant>,
tenant: &Arc<TenantShard>,
timeline_id: TimelineId,
) -> Result<(), DeleteTimelineError> {
super::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -288,7 +288,7 @@ impl DeleteTimelineFlow {
/// Shortcut to create Timeline in stopping state and spawn deletion task.
#[instrument(skip_all, fields(%timeline_id))]
pub(crate) async fn resume_deletion(
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
timeline_id: TimelineId,
local_metadata: &TimelineMetadata,
remote_client: RemoteTimelineClient,
@@ -338,7 +338,7 @@ impl DeleteTimelineFlow {
fn schedule_background(
guard: DeletionGuard,
conf: &'static PageServerConf,
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
timeline: TimelineOrOffloaded,
remote_client: Arc<RemoteTimelineClient>,
) {
@@ -381,7 +381,7 @@ impl DeleteTimelineFlow {
async fn background(
mut guard: DeletionGuard,
conf: &PageServerConf,
tenant: &Tenant,
tenant: &TenantShard,
timeline: &TimelineOrOffloaded,
remote_client: Arc<RemoteTimelineClient>,
) -> Result<(), DeleteTimelineError> {
@@ -435,7 +435,7 @@ pub(super) enum TimelineDeleteGuardKind {
}
pub(super) fn make_timeline_delete_guard(
tenant: &Tenant,
tenant: &TenantShard,
timeline_id: TimelineId,
guard_kind: TimelineDeleteGuardKind,
) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> {

View File

@@ -23,7 +23,7 @@ use super::layer_manager::LayerManager;
use super::{FlushLayerError, Timeline};
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::TaskKind;
use crate::tenant::Tenant;
use crate::tenant::TenantShard;
use crate::tenant::remote_timeline_client::index::GcBlockingReason::DetachAncestor;
use crate::tenant::storage_layer::layer::local_layer_path;
use crate::tenant::storage_layer::{
@@ -265,7 +265,7 @@ async fn generate_tombstone_image_layer(
/// See [`Timeline::prepare_to_detach_from_ancestor`]
pub(super) async fn prepare(
detached: &Arc<Timeline>,
tenant: &Tenant,
tenant: &TenantShard,
behavior: DetachBehavior,
options: Options,
ctx: &RequestContext,
@@ -590,7 +590,7 @@ pub(super) async fn prepare(
async fn start_new_attempt(
detached: &Timeline,
tenant: &Tenant,
tenant: &TenantShard,
ancestor_timeline_id: TimelineId,
ancestor_lsn: Lsn,
) -> Result<Attempt, Error> {
@@ -611,7 +611,7 @@ async fn start_new_attempt(
async fn continue_with_blocked_gc(
detached: &Timeline,
tenant: &Tenant,
tenant: &TenantShard,
ancestor_timeline_id: TimelineId,
ancestor_lsn: Lsn,
) -> Result<Attempt, Error> {
@@ -622,7 +622,7 @@ async fn continue_with_blocked_gc(
fn obtain_exclusive_attempt(
detached: &Timeline,
tenant: &Tenant,
tenant: &TenantShard,
ancestor_timeline_id: TimelineId,
ancestor_lsn: Lsn,
) -> Result<Attempt, Error> {
@@ -655,7 +655,7 @@ fn obtain_exclusive_attempt(
fn reparented_direct_children(
detached: &Arc<Timeline>,
tenant: &Tenant,
tenant: &TenantShard,
) -> Result<HashSet<TimelineId>, Error> {
let mut all_direct_children = tenant
.timelines
@@ -950,7 +950,7 @@ impl DetachingAndReparenting {
/// See [`Timeline::detach_from_ancestor_and_reparent`].
pub(super) async fn detach_and_reparent(
detached: &Arc<Timeline>,
tenant: &Tenant,
tenant: &TenantShard,
prepared: PreparedTimelineDetach,
ancestor_timeline_id: TimelineId,
ancestor_lsn: Lsn,
@@ -1184,7 +1184,7 @@ pub(super) async fn detach_and_reparent(
pub(super) async fn complete(
detached: &Arc<Timeline>,
tenant: &Tenant,
tenant: &TenantShard,
mut attempt: Attempt,
_ctx: &RequestContext,
) -> Result<(), Error> {
@@ -1258,7 +1258,7 @@ where
}
fn check_no_archived_children_of_ancestor(
tenant: &Tenant,
tenant: &TenantShard,
detached: &Arc<Timeline>,
ancestor: &Arc<Timeline>,
ancestor_lsn: Lsn,

View File

@@ -33,7 +33,7 @@ use crate::tenant::size::CalculateSyntheticSizeError;
use crate::tenant::storage_layer::LayerVisibilityHint;
use crate::tenant::tasks::{BackgroundLoopKind, BackgroundLoopSemaphorePermit, sleep_random};
use crate::tenant::timeline::EvictionError;
use crate::tenant::{LogicalSizeCalculationCause, Tenant};
use crate::tenant::{LogicalSizeCalculationCause, TenantShard};
#[derive(Default)]
pub struct EvictionTaskTimelineState {
@@ -48,7 +48,7 @@ pub struct EvictionTaskTenantState {
impl Timeline {
pub(super) fn launch_eviction_task(
self: &Arc<Self>,
parent: Arc<Tenant>,
parent: Arc<TenantShard>,
background_tasks_can_start: Option<&completion::Barrier>,
) {
let self_clone = Arc::clone(self);
@@ -75,7 +75,7 @@ impl Timeline {
}
#[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
async fn eviction_task(self: Arc<Self>, tenant: Arc<Tenant>) {
async fn eviction_task(self: Arc<Self>, tenant: Arc<TenantShard>) {
// acquire the gate guard only once within a useful span
let Ok(guard) = self.gate.enter() else {
return;
@@ -118,7 +118,7 @@ impl Timeline {
#[instrument(skip_all, fields(policy_kind = policy.discriminant_str()))]
async fn eviction_iteration(
self: &Arc<Self>,
tenant: &Tenant,
tenant: &TenantShard,
policy: &EvictionPolicy,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -175,7 +175,7 @@ impl Timeline {
async fn eviction_iteration_threshold(
self: &Arc<Self>,
tenant: &Tenant,
tenant: &TenantShard,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -309,7 +309,7 @@ impl Timeline {
/// disk usage based eviction task.
async fn imitiate_only(
self: &Arc<Self>,
tenant: &Tenant,
tenant: &TenantShard,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -363,7 +363,7 @@ impl Timeline {
#[instrument(skip_all)]
async fn imitate_layer_accesses(
&self,
tenant: &Tenant,
tenant: &TenantShard,
p: &EvictionPolicyLayerAccessThreshold,
cancel: &CancellationToken,
gate: &GateGuard,
@@ -499,7 +499,7 @@ impl Timeline {
#[instrument(skip_all)]
async fn imitate_synthetic_size_calculation_worker(
&self,
tenant: &Tenant,
tenant: &TenantShard,
cancel: &CancellationToken,
ctx: &RequestContext,
) {

View File

@@ -8,7 +8,7 @@ use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
use crate::tenant::timeline::delete::{TimelineDeleteGuardKind, make_timeline_delete_guard};
use crate::tenant::{
DeleteTimelineError, OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded,
DeleteTimelineError, OffloadedTimeline, TenantManifestError, TenantShard, TimelineOrOffloaded,
};
#[derive(thiserror::Error, Debug)]
@@ -33,7 +33,7 @@ impl From<TenantManifestError> for OffloadError {
}
pub(crate) async fn offload_timeline(
tenant: &Tenant,
tenant: &TenantShard,
timeline: &Arc<Timeline>,
) -> Result<(), OffloadError> {
debug_assert_current_span_has_tenant_and_timeline_id();
@@ -123,7 +123,7 @@ pub(crate) async fn offload_timeline(
///
/// Returns the strong count of the timeline `Arc`
fn remove_timeline_from_tenant(
tenant: &Tenant,
tenant: &TenantShard,
timeline: &Timeline,
_: &DeletionGuard, // using it as a witness
) -> usize {

View File

@@ -15,17 +15,19 @@ use super::Timeline;
use crate::context::RequestContext;
use crate::import_datadir;
use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::{CreateTimelineError, CreateTimelineIdempotency, Tenant, TimelineOrOffloaded};
use crate::tenant::{
CreateTimelineError, CreateTimelineIdempotency, TenantShard, TimelineOrOffloaded,
};
/// A timeline with some of its files on disk, being initialized.
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
/// its local files are removed. If we crash while this class exists, then the timeline's local
/// state is cleaned up during [`Tenant::clean_up_timelines`], because the timeline's content isn't in remote storage.
/// state is cleaned up during [`TenantShard::clean_up_timelines`], because the timeline's content isn't in remote storage.
///
/// The caller is responsible for proper timeline data filling before the final init.
#[must_use]
pub struct UninitializedTimeline<'t> {
pub(crate) owning_tenant: &'t Tenant,
pub(crate) owning_tenant: &'t TenantShard,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
/// Whether we spawned the inner Timeline's tasks such that we must later shut it down
@@ -35,7 +37,7 @@ pub struct UninitializedTimeline<'t> {
impl<'t> UninitializedTimeline<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
owning_tenant: &'t TenantShard,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineCreateGuard)>,
) -> Self {
@@ -156,7 +158,7 @@ impl<'t> UninitializedTimeline<'t> {
/// Prepares timeline data by loading it from the basebackup archive.
pub(crate) async fn import_basebackup_from_tar(
mut self,
tenant: Arc<Tenant>,
tenant: Arc<TenantShard>,
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
broker_client: storage_broker::BrokerClientChannel,
@@ -227,17 +229,17 @@ pub(crate) fn cleanup_timeline_directory(create_guard: TimelineCreateGuard) {
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
}
}
// Having cleaned up, we can release this TimelineId in `[Tenant::timelines_creating]` to allow other
// Having cleaned up, we can release this TimelineId in `[TenantShard::timelines_creating]` to allow other
// timeline creation attempts under this TimelineId to proceed
drop(create_guard);
}
/// A guard for timeline creations in process: as long as this object exists, the timeline ID
/// is kept in `[Tenant::timelines_creating]` to exclude concurrent attempts to create the same timeline.
/// is kept in `[TenantShard::timelines_creating]` to exclude concurrent attempts to create the same timeline.
#[must_use]
pub(crate) struct TimelineCreateGuard {
pub(crate) _tenant_gate_guard: GateGuard,
pub(crate) owning_tenant: Arc<Tenant>,
pub(crate) owning_tenant: Arc<TenantShard>,
pub(crate) timeline_id: TimelineId,
pub(crate) timeline_path: Utf8PathBuf,
pub(crate) idempotency: CreateTimelineIdempotency,
@@ -263,7 +265,7 @@ pub(crate) enum TimelineExclusionError {
impl TimelineCreateGuard {
pub(crate) fn new(
owning_tenant: &Arc<Tenant>,
owning_tenant: &Arc<TenantShard>,
timeline_id: TimelineId,
timeline_path: Utf8PathBuf,
idempotency: CreateTimelineIdempotency,

View File

@@ -1366,7 +1366,8 @@ pub(crate) type IoBuffer = AlignedBuffer<ConstAlign<{ get_io_buffer_alignment()
pub(crate) type IoPageSlice<'a> =
AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>;
static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8);
static IO_MODE: once_cell::sync::Lazy<AtomicU8> =
once_cell::sync::Lazy::new(|| AtomicU8::new(IoMode::preferred() as u8));
pub(crate) fn set_io_mode(mode: IoMode) {
IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed);

View File

@@ -776,7 +776,6 @@ impl From<&jose_jwk::Key> for KeyType {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::future::IntoFuture;
use std::net::SocketAddr;

View File

@@ -253,7 +253,6 @@ fn project_name_valid(name: &str) -> bool {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use ComputeUserInfoParseError::*;
use serde_json::json;

View File

@@ -258,7 +258,7 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
"unexpected startup packet, rejecting connection"
);
stream
.throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User)
.throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User, None)
.await?
}
}

View File

@@ -259,7 +259,6 @@ impl EndpointsCache {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;

View File

@@ -585,7 +585,6 @@ impl Cache for ProjectInfoCacheImpl {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;
use crate::scram::ServerSecret;

View File

@@ -222,7 +222,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
{
Ok(auth_result) => auth_result,
Err(e) => {
return stream.throw_error(e).await?;
return stream.throw_error(e, Some(ctx)).await?;
}
};
@@ -238,7 +238,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config.wake_compute_retry_config,
&config.connect_to_compute,
)
.or_else(|e| stream.throw_error(e))
.or_else(|e| stream.throw_error(e, Some(ctx)))
.await?;
let cancellation_handler_clone = Arc::clone(&cancellation_handler);

View File

@@ -63,7 +63,7 @@ struct RequestContextInner {
success: bool,
pub(crate) cold_start_info: ColdStartInfo,
pg_options: Option<StartupMessageParams>,
testodrome_query_id: Option<String>,
testodrome_query_id: Option<SmolStr>,
// extra
// This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -219,7 +219,7 @@ impl RequestContext {
for option in options_str.split_whitespace() {
if option.starts_with("neon_query_id:") {
if let Some(value) = option.strip_prefix("neon_query_id:") {
this.set_testodrome_id(value.to_string());
this.set_testodrome_id(value.into());
break;
}
}
@@ -272,7 +272,7 @@ impl RequestContext {
.set_user_agent(user_agent);
}
pub(crate) fn set_testodrome_id(&self, query_id: String) {
pub(crate) fn set_testodrome_id(&self, query_id: SmolStr) {
self.0
.try_lock()
.expect("should not deadlock")
@@ -378,7 +378,7 @@ impl RequestContext {
.accumulated()
}
pub(crate) fn get_testodrome_id(&self) -> Option<String> {
pub(crate) fn get_testodrome_id(&self) -> Option<SmolStr> {
self.0
.try_lock()
.expect("should not deadlock")
@@ -447,7 +447,7 @@ impl RequestContextInner {
self.user = Some(user);
}
fn set_testodrome_id(&mut self, query_id: String) {
fn set_testodrome_id(&mut self, query_id: SmolStr) {
self.testodrome_query_id = Some(query_id);
}

View File

@@ -416,7 +416,6 @@ async fn upload_parquet(
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::net::Ipv4Addr;
use std::num::NonZeroUsize;

View File

@@ -227,7 +227,6 @@ impl From<AccountId> for AccountIdInt {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::sync::OnceLock;

View File

@@ -1032,7 +1032,6 @@ impl<const F: usize> serde::ser::Serialize for ExtractedSpanFields<'_, F> {
}
#[cfg(test)]
#[allow(clippy::unwrap_used)]
mod tests {
use std::marker::PhantomData;
use std::sync::{Arc, Mutex, MutexGuard};

View File

@@ -400,7 +400,6 @@ impl NetworkEndianIpv6 {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use tokio::io::AsyncReadExt;

View File

@@ -262,7 +262,6 @@ impl CopyBuffer {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use tokio::io::AsyncWriteExt;

View File

@@ -196,7 +196,11 @@ pub(crate) async fn handshake<S: AsyncRead + AsyncWrite + Unpin>(
// OR we didn't provide it at all (for dev purposes).
if tls.is_some() {
return stream
.throw_error_str(ERR_INSECURE_CONNECTION, crate::error::ErrorKind::User)
.throw_error_str(
ERR_INSECURE_CONNECTION,
crate::error::ErrorKind::User,
None,
)
.await?;
}

View File

@@ -329,7 +329,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let user_info = match result {
Ok(user_info) => user_info,
Err(e) => stream.throw_error(e).await?,
Err(e) => stream.throw_error(e, Some(ctx)).await?,
};
let user = user_info.get_user().to_owned();
@@ -349,7 +349,10 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
let app = params.get("application_name");
let params_span = tracing::info_span!("", ?user, ?db, ?app);
return stream.throw_error(e).instrument(params_span).await?;
return stream
.throw_error(e, Some(ctx))
.instrument(params_span)
.await?;
}
};
@@ -374,7 +377,7 @@ pub(crate) async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
config.wake_compute_retry_config,
&config.connect_to_compute,
)
.or_else(|e| stream.throw_error(e))
.or_else(|e| stream.throw_error(e, Some(ctx)))
.await?;
let cancellation_handler_clone = Arc::clone(&cancellation_handler);

View File

@@ -1,5 +1,5 @@
//! A group of high-level tests for connection establishing logic and auth.
#![allow(clippy::unimplemented, clippy::unwrap_used)]
#![allow(clippy::unimplemented)]
mod mitm;

View File

@@ -83,7 +83,7 @@ impl From<LeakyBucketConfig> for utils::leaky_bucket::LeakyBucketConfig {
}
#[cfg(test)]
#[allow(clippy::float_cmp, clippy::unwrap_used)]
#[allow(clippy::float_cmp)]
mod tests {
use std::time::Duration;

View File

@@ -63,7 +63,6 @@ impl LimitAlgorithm for Aimd {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::time::Duration;

View File

@@ -259,7 +259,6 @@ impl<K: Hash + Eq, R: Rng, S: BuildHasher + Clone> BucketRateLimiter<K, R, S> {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::hash::BuildHasherDefault;
use std::time::Duration;

View File

@@ -51,7 +51,6 @@ impl<'a> ServerMessage<&'a str> {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;

View File

@@ -185,7 +185,6 @@ impl fmt::Debug for OwnedServerFirstMessage {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;

View File

@@ -57,7 +57,6 @@ fn sha256<'a>(parts: impl IntoIterator<Item = &'a [u8]>) -> [u8; 32] {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::threadpool::ThreadPool;
use super::{Exchange, ServerSecret};

View File

@@ -72,7 +72,6 @@ impl ServerSecret {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;

View File

@@ -561,8 +561,10 @@ impl ConnectMechanism for TokioMechanism {
.dbname(&self.conn_info.dbname)
.connect_timeout(compute_config.timeout);
let mk_tls =
crate::tls::postgres_rustls::MakeRustlsConnect::new(compute_config.tls.clone());
let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute);
let res = config.connect(postgres_client::NoTls).await;
let res = config.connect(mk_tls).await;
drop(pause);
let (client, connection) = permit.release_result(res)?;

View File

@@ -6,7 +6,7 @@ use std::task::{Poll, ready};
use futures::Future;
use futures::future::poll_fn;
use postgres_client::AsyncMessage;
use postgres_client::tls::NoTlsStream;
use postgres_client::tls::MakeTlsConnect;
use smallvec::SmallVec;
use tokio::net::TcpStream;
use tokio::time::Instant;
@@ -26,6 +26,9 @@ use super::conn_pool_lib::{
use crate::context::RequestContext;
use crate::control_plane::messages::MetricsAuxInfo;
use crate::metrics::Metrics;
use crate::tls::postgres_rustls::MakeRustlsConnect;
type TlsStream = <MakeRustlsConnect as MakeTlsConnect<TcpStream>>::Stream;
#[derive(Debug, Clone)]
pub(crate) struct ConnInfoWithAuth {
@@ -58,7 +61,7 @@ pub(crate) fn poll_client<C: ClientInnerExt>(
ctx: &RequestContext,
conn_info: ConnInfo,
client: C,
mut connection: postgres_client::Connection<TcpStream, NoTlsStream>,
mut connection: postgres_client::Connection<TcpStream, TlsStream>,
conn_id: uuid::Uuid,
aux: MetricsAuxInfo,
) -> Client<C> {
@@ -186,7 +189,6 @@ impl ClientDataRemote {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::sync::atomic::AtomicBool;

View File

@@ -256,7 +256,6 @@ fn pg_array_parse_inner(
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use serde_json::json;

View File

@@ -367,7 +367,6 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use ed25519_dalek::SigningKey;
use typed_json::json;

View File

@@ -434,17 +434,6 @@ async fn request_handler(
.map(Into::into),
);
let testodrome_id = request
.headers()
.get("X-Neon-Query-ID")
.and_then(|value| value.to_str().ok())
.map(|s| s.to_string());
if let Some(query_id) = testodrome_id {
info!(parent: &ctx.span(), "testodrome query ID: {query_id}");
ctx.set_testodrome_id(query_id);
}
let span = ctx.span();
info!(parent: &span, "performing websocket upgrade");
@@ -491,7 +480,7 @@ async fn request_handler(
if let Some(query_id) = testodrome_id {
info!(parent: &ctx.span(), "testodrome query ID: {query_id}");
ctx.set_testodrome_id(query_id);
ctx.set_testodrome_id(query_id.into());
}
sql_over_http::handle(config, ctx, request, backend, http_cancellation_token)

View File

@@ -1209,7 +1209,6 @@ impl Discard<'_> {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;

View File

@@ -157,7 +157,6 @@ pub(crate) async fn serve_websocket(
match res {
Err(e) => {
// todo: log and push to ctx the error kind
ctx.set_error_kind(e.get_error_kind());
Err(e.into())
}
@@ -178,7 +177,6 @@ pub(crate) async fn serve_websocket(
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::pin::pin;

View File

@@ -6,11 +6,13 @@ use bytes::BytesMut;
use pq_proto::framed::{ConnectionError, Framed};
use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError};
use rustls::ServerConfig;
use serde::{Deserialize, Serialize};
use thiserror::Error;
use tokio::io::{AsyncRead, AsyncWrite, ReadBuf};
use tokio_rustls::server::TlsStream;
use tracing::debug;
use crate::control_plane::messages::ColdStartInfo;
use crate::error::{ErrorKind, ReportableError, UserFacingError};
use crate::metrics::Metrics;
use crate::tls::TlsServerEndPoint;
@@ -100,6 +102,44 @@ impl ReportableError for ReportedError {
}
}
#[derive(Serialize, Deserialize, Debug)]
enum ErrorTag {
#[serde(rename = "proxy")]
Proxy,
#[serde(rename = "compute")]
Compute,
#[serde(rename = "client")]
Client,
#[serde(rename = "controlplane")]
ControlPlane,
#[serde(rename = "other")]
Other,
}
impl From<ErrorKind> for ErrorTag {
fn from(error_kind: ErrorKind) -> Self {
match error_kind {
ErrorKind::User => Self::Client,
ErrorKind::ClientDisconnect => Self::Client,
ErrorKind::RateLimit => Self::Proxy,
ErrorKind::ServiceRateLimit => Self::Proxy, // considering rate limit as proxy error for SLI
ErrorKind::Quota => Self::Proxy,
ErrorKind::Service => Self::Proxy,
ErrorKind::ControlPlane => Self::ControlPlane,
ErrorKind::Postgres => Self::Other,
ErrorKind::Compute => Self::Compute,
}
}
}
#[derive(Serialize, Deserialize, Debug)]
#[serde(rename_all = "snake_case")]
struct ProbeErrorData {
tag: ErrorTag,
msg: String,
cold_start_info: Option<ColdStartInfo>,
}
impl<S: AsyncWrite + Unpin> PqStream<S> {
/// Write the message into an internal buffer, but don't flush the underlying stream.
pub(crate) fn write_message_noflush(
@@ -125,26 +165,54 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
Ok(self)
}
/// Write the error message using [`Self::write_message`], then re-throw it.
/// Writes a message with the given error kind to the stream.
/// Used only for probe queries.
async fn write_format_message(
&mut self,
msg: &str,
error_kind: ErrorKind,
ctx: Option<&crate::context::RequestContext>,
) -> String {
let formatted_msg = match ctx {
Some(ctx) if ctx.get_testodrome_id().is_some() => {
serde_json::to_string(&ProbeErrorData {
tag: ErrorTag::from(error_kind),
msg: msg.to_string(),
cold_start_info: Some(ctx.cold_start_info()),
})
.unwrap_or_default()
}
_ => msg.to_string(),
};
// already error case, ignore client IO error
self.write_message(&BeMessage::ErrorResponse(&formatted_msg, None))
.await
.inspect_err(|e| debug!("write_message failed: {e}"))
.ok();
formatted_msg
}
/// Write the error message using [`Self::write_format_message`], then re-throw it.
/// Allowing string literals is safe under the assumption they do not contain any runtime info.
/// This method exists due to `&str` not implementing `Into<anyhow::Error>`.
/// If `ctx` is provided and has testodrome_id set, error messages will be prefixed according to error kind.
pub async fn throw_error_str<T>(
&mut self,
msg: &'static str,
error_kind: ErrorKind,
ctx: Option<&crate::context::RequestContext>,
) -> Result<T, ReportedError> {
// TODO: only log this for actually interesting errors
tracing::info!(
kind = error_kind.to_metric_label(),
msg,
"forwarding error to user"
);
self.write_format_message(msg, error_kind, ctx).await;
// already error case, ignore client IO error
self.write_message(&BeMessage::ErrorResponse(msg, None))
.await
.inspect_err(|e| debug!("write_message failed: {e}"))
.ok();
if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User {
tracing::info!(
kind = error_kind.to_metric_label(),
msg,
"forwarding error to user"
);
}
Err(ReportedError {
source: anyhow::anyhow!(msg),
@@ -152,26 +220,28 @@ impl<S: AsyncWrite + Unpin> PqStream<S> {
})
}
/// Write the error message using [`Self::write_message`], then re-throw it.
/// Write the error message using [`Self::write_format_message`], then re-throw it.
/// Trait [`UserFacingError`] acts as an allowlist for error types.
pub(crate) async fn throw_error<T, E>(&mut self, error: E) -> Result<T, ReportedError>
/// If `ctx` is provided and has testodrome_id set, error messages will be prefixed according to error kind.
pub(crate) async fn throw_error<T, E>(
&mut self,
error: E,
ctx: Option<&crate::context::RequestContext>,
) -> Result<T, ReportedError>
where
E: UserFacingError + Into<anyhow::Error>,
{
let error_kind = error.get_error_kind();
let msg = error.to_string_client();
tracing::info!(
kind=error_kind.to_metric_label(),
error=%error,
msg,
"forwarding error to user"
);
// already error case, ignore client IO error
self.write_message(&BeMessage::ErrorResponse(&msg, None))
.await
.inspect_err(|e| debug!("write_message failed: {e}"))
.ok();
self.write_format_message(&msg, error_kind, ctx).await;
if error_kind != ErrorKind::RateLimit && error_kind != ErrorKind::User {
tracing::info!(
kind=error_kind.to_metric_label(),
error=%error,
msg,
"forwarding error to user",
);
}
Err(ReportedError {
source: anyhow::anyhow!(error),

View File

@@ -50,7 +50,6 @@ impl std::fmt::Display for ApiUrl {
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use super::*;

View File

@@ -497,7 +497,6 @@ async fn upload_backup_events(
}
#[cfg(test)]
#[expect(clippy::unwrap_used)]
mod tests {
use std::fs;
use std::io::BufReader;

View File

@@ -126,6 +126,7 @@ pub(crate) enum DatabaseOperation {
InsertTimelineReconcile,
RemoveTimelineReconcile,
ListTimelineReconcile,
ListTimelineReconcileStartup,
}
#[must_use]
@@ -1521,23 +1522,41 @@ impl Persistence {
.await
}
/// Load pending operations from db.
pub(crate) async fn list_pending_ops(
/// Load pending operations from db, joined together with timeline data.
pub(crate) async fn list_pending_ops_with_timelines(
&self,
) -> DatabaseResult<Vec<TimelinePendingOpPersistence>> {
) -> DatabaseResult<Vec<(TimelinePendingOpPersistence, Option<TimelinePersistence>)>> {
use crate::schema::safekeeper_timeline_pending_ops::dsl;
use crate::schema::timelines;
let timeline_from_db = self
.with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| {
Box::pin(async move {
let from_db: Vec<TimelinePendingOpPersistence> =
dsl::safekeeper_timeline_pending_ops.load(conn).await?;
Ok(from_db)
})
})
.with_measured_conn(
DatabaseOperation::ListTimelineReconcileStartup,
move |conn| {
Box::pin(async move {
let from_db: Vec<(TimelinePendingOpPersistence, Option<TimelineFromDb>)> =
dsl::safekeeper_timeline_pending_ops
.left_join(
timelines::table.on(timelines::tenant_id
.eq(dsl::tenant_id)
.and(timelines::timeline_id.eq(dsl::timeline_id))),
)
.select((
TimelinePendingOpPersistence::as_select(),
Option::<TimelineFromDb>::as_select(),
))
.load(conn)
.await?;
Ok(from_db)
})
},
)
.await?;
Ok(timeline_from_db)
Ok(timeline_from_db
.into_iter()
.map(|(op, tl_opt)| (op, tl_opt.map(|tl_opt| tl_opt.into_persistence())))
.collect())
}
/// List pending operations for a given timeline (including tenant-global ones)
pub(crate) async fn list_pending_ops_for_timeline(
@@ -1580,7 +1599,7 @@ impl Persistence {
let tenant_id = &tenant_id;
let timeline_id = &timeline_id;
self.with_measured_conn(DatabaseOperation::ListTimelineReconcile, move |conn| {
self.with_measured_conn(DatabaseOperation::RemoveTimelineReconcile, move |conn| {
let timeline_id_str = timeline_id.map(|tid| tid.to_string()).unwrap_or_default();
Box::pin(async move {
diesel::delete(dsl::safekeeper_timeline_pending_ops)

View File

@@ -824,9 +824,13 @@ impl Service {
let mut locked = self.inner.write().unwrap();
locked.become_leader();
for (sk_id, _sk) in locked.safekeepers.clone().iter() {
locked.safekeeper_reconcilers.start_reconciler(*sk_id, self);
}
locked
.safekeeper_reconcilers
.schedule_request_vec(self, sk_schedule_requests);
.schedule_request_vec(sk_schedule_requests);
}
// TODO: if any tenant's intent now differs from its loaded generation_pageserver, we should clear that

View File

@@ -30,31 +30,35 @@ impl SafekeeperReconcilers {
reconcilers: HashMap::new(),
}
}
pub(crate) fn schedule_request_vec(
&mut self,
service: &Arc<Service>,
reqs: Vec<ScheduleRequest>,
) {
/// Adds a safekeeper-specific reconciler.
/// Can be called multiple times, but it needs to be called at least once
/// for every new safekeeper added.
pub(crate) fn start_reconciler(&mut self, node_id: NodeId, service: &Arc<Service>) {
self.reconcilers.entry(node_id).or_insert_with(|| {
SafekeeperReconciler::spawn(self.cancel.child_token(), service.clone())
});
}
/// Stop a safekeeper-specific reconciler.
/// Stops the reconciler, cancelling all ongoing tasks.
pub(crate) fn stop_reconciler(&mut self, node_id: NodeId) {
if let Some(handle) = self.reconcilers.remove(&node_id) {
handle.cancel.cancel();
}
}
pub(crate) fn schedule_request_vec(&self, reqs: Vec<ScheduleRequest>) {
tracing::info!(
"Scheduling {} pending safekeeper ops loaded from db",
reqs.len()
);
for req in reqs {
self.schedule_request(service, req);
self.schedule_request(req);
}
}
pub(crate) fn schedule_request(&mut self, service: &Arc<Service>, req: ScheduleRequest) {
pub(crate) fn schedule_request(&self, req: ScheduleRequest) {
let node_id = req.safekeeper.get_id();
let reconciler_handle = self.reconcilers.entry(node_id).or_insert_with(|| {
SafekeeperReconciler::spawn(self.cancel.child_token(), service.clone())
});
let reconciler_handle = self.reconcilers.get(&node_id).unwrap();
reconciler_handle.schedule_reconcile(req);
}
pub(crate) fn cancel_safekeeper(&mut self, node_id: NodeId) {
if let Some(handle) = self.reconcilers.remove(&node_id) {
handle.cancel.cancel();
}
}
/// Cancel ongoing reconciles for the given timeline
///
/// Specifying `None` here only removes reconciles for the tenant-global reconciliation,
@@ -78,9 +82,12 @@ pub(crate) async fn load_schedule_requests(
service: &Arc<Service>,
safekeepers: &HashMap<NodeId, Safekeeper>,
) -> anyhow::Result<Vec<ScheduleRequest>> {
let pending_ops = service.persistence.list_pending_ops().await?;
let mut res = Vec::with_capacity(pending_ops.len());
for op_persist in pending_ops {
let pending_ops_timelines = service
.persistence
.list_pending_ops_with_timelines()
.await?;
let mut res = Vec::with_capacity(pending_ops_timelines.len());
for (op_persist, timeline_persist) in pending_ops_timelines {
let node_id = NodeId(op_persist.sk_id as u64);
let Some(sk) = safekeepers.get(&node_id) else {
// This shouldn't happen, at least the safekeeper should exist as decommissioned.
@@ -102,16 +109,12 @@ pub(crate) async fn load_schedule_requests(
SafekeeperTimelineOpKind::Delete => Vec::new(),
SafekeeperTimelineOpKind::Exclude => Vec::new(),
SafekeeperTimelineOpKind::Pull => {
// TODO this code is super hacky, it doesn't take migrations into account
let Some(timeline_id) = timeline_id else {
if timeline_id.is_none() {
// We only do this extra check (outside of timeline_persist check) to give better error msgs
anyhow::bail!(
"timeline_id is empty for `pull` schedule request for {tenant_id}"
);
};
let timeline_persist = service
.persistence
.get_timeline(tenant_id, timeline_id)
.await?;
let Some(timeline_persist) = timeline_persist else {
// This shouldn't happen, the timeline should still exist
tracing::warn!(
@@ -163,6 +166,7 @@ pub(crate) struct ScheduleRequest {
pub(crate) kind: SafekeeperTimelineOpKind,
}
/// Handle to per safekeeper reconciler.
struct ReconcilerHandle {
tx: UnboundedSender<(ScheduleRequest, CancellationToken)>,
ongoing_tokens: Arc<ClashMap<(TenantId, Option<TimelineId>), CancellationToken>>,
@@ -170,7 +174,10 @@ struct ReconcilerHandle {
}
impl ReconcilerHandle {
/// Obtain a new token slot, cancelling any existing reconciliations for that timeline
/// Obtain a new token slot, cancelling any existing reconciliations for
/// that timeline. It is not useful to have >1 operation per <tenant_id,
/// timeline_id, safekeeper>, hence scheduling an op cancels the current one if it
/// exists.
fn new_token_slot(
&self,
tenant_id: TenantId,
@@ -305,15 +312,16 @@ impl SafekeeperReconciler {
SafekeeperTimelineOpKind::Delete => {
let tenant_id = req.tenant_id;
if let Some(timeline_id) = req.timeline_id {
let deleted = self.reconcile_inner(
req,
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|_resp| {
tracing::info!(%tenant_id, %timeline_id, "deleted timeline from {req_host}");
},
req_cancel,
)
.await;
let deleted = self
.reconcile_inner(
req,
async |client| client.delete_timeline(tenant_id, timeline_id).await,
|_resp| {
tracing::info!("deleted timeline from {req_host}");
},
req_cancel,
)
.await;
if deleted {
self.delete_timeline_from_db(tenant_id, timeline_id).await;
}
@@ -344,12 +352,13 @@ impl SafekeeperReconciler {
{
Ok(list) => {
if !list.is_empty() {
tracing::info!(%tenant_id, %timeline_id, "not deleting timeline from db as there is {} open reconciles", list.len());
// duplicate the timeline_id here because it might be None in the reconcile context
tracing::info!(%timeline_id, "not deleting timeline from db as there are {} open reconciles", list.len());
return;
}
}
Err(e) => {
tracing::warn!(%tenant_id, %timeline_id, "couldn't query pending ops: {e}");
tracing::warn!(%timeline_id, "couldn't query pending ops: {e}");
return;
}
}

View File

@@ -46,6 +46,7 @@ impl Service {
.map(SecretString::from);
let mut joinset = JoinSet::new();
// Prepare membership::Configuration from chosen safekeepers.
let safekeepers = {
let locked = self.inner.read().unwrap();
locked.safekeepers.clone()
@@ -205,7 +206,7 @@ impl Service {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
start_lsn: start_lsn.into(),
generation: 0,
generation: 1,
sk_set: sks_persistence.clone(),
new_sk_set: None,
cplane_notified_generation: 0,
@@ -254,7 +255,7 @@ impl Service {
self.persistence.insert_pending_op(pending_op).await?;
}
if !remaining.is_empty() {
let mut locked = self.inner.write().unwrap();
let locked = self.inner.read().unwrap();
for remaining_id in remaining {
let Some(sk) = locked.safekeepers.get(&remaining_id) else {
return Err(ApiError::InternalServerError(anyhow::anyhow!(
@@ -290,7 +291,7 @@ impl Service {
generation: timeline_persist.generation as u32,
kind: crate::persistence::SafekeeperTimelineOpKind::Pull,
};
locked.safekeeper_reconcilers.schedule_request(self, req);
locked.safekeeper_reconcilers.schedule_request(req);
}
}
@@ -357,7 +358,7 @@ impl Service {
let pending_op = TimelinePendingOpPersistence {
tenant_id: tenant_id.to_string(),
timeline_id: timeline_id.to_string(),
generation: tl.generation,
generation: i32::MAX,
op_kind: SafekeeperTimelineOpKind::Delete,
sk_id: *sk_id,
};
@@ -365,7 +366,7 @@ impl Service {
self.persistence.insert_pending_op(pending_op).await?;
}
{
let mut locked = self.inner.write().unwrap();
let locked = self.inner.read().unwrap();
for sk_id in all_sks {
let sk_id = NodeId(*sk_id as u64);
let Some(sk) = locked.safekeepers.get(&sk_id) else {
@@ -383,7 +384,7 @@ impl Service {
generation: tl.generation as u32,
kind: SafekeeperTimelineOpKind::Delete,
};
locked.safekeeper_reconcilers.schedule_request(self, req);
locked.safekeeper_reconcilers.schedule_request(req);
}
}
Ok(())
@@ -482,7 +483,7 @@ impl Service {
tenant_id,
timeline_id: None,
};
locked.safekeeper_reconcilers.schedule_request(self, req);
locked.safekeeper_reconcilers.schedule_request(req);
}
Ok(())
}
@@ -579,7 +580,7 @@ impl Service {
}
pub(crate) async fn upsert_safekeeper(
&self,
self: &Arc<Service>,
record: crate::persistence::SafekeeperUpsert,
) -> Result<(), ApiError> {
let node_id = NodeId(record.id as u64);
@@ -618,6 +619,9 @@ impl Service {
);
}
}
locked
.safekeeper_reconcilers
.start_reconciler(node_id, self);
locked.safekeepers = Arc::new(safekeepers);
metrics::METRICS_REGISTRY
.metrics_group
@@ -638,7 +642,7 @@ impl Service {
}
pub(crate) async fn set_safekeeper_scheduling_policy(
&self,
self: &Arc<Service>,
id: i64,
scheduling_policy: SkSchedulingPolicy,
) -> Result<(), DatabaseError> {
@@ -656,9 +660,13 @@ impl Service {
sk.set_scheduling_policy(scheduling_policy);
match scheduling_policy {
SkSchedulingPolicy::Active => (),
SkSchedulingPolicy::Active => {
locked
.safekeeper_reconcilers
.start_reconciler(node_id, self);
}
SkSchedulingPolicy::Decomissioned | SkSchedulingPolicy::Pause => {
locked.safekeeper_reconcilers.cancel_safekeeper(node_id);
locked.safekeeper_reconcilers.stop_reconciler(node_id);
}
}

View File

@@ -72,7 +72,7 @@ Inside that dir, a `bin/postgres` binary should be present.
`COMPATIBILITY_POSTGRES_DISTRIB_DIR`: The directory where the previous version of the postgres distribution can be found.
`DEFAULT_PG_VERSION`: The version of Postgres to use,
This is used to construct full path to the postgres binaries.
Format is 2-digit major version number, i.e. `DEFAULT_PG_VERSION=16`
Format is 2-digit major version number, i.e. `DEFAULT_PG_VERSION=17`
`TEST_OUTPUT`: Set the directory where test state and test output files
should go.
`RUST_LOG`: logging configuration to pass into Neon CLI

View File

@@ -22,19 +22,62 @@ def connection_parameters_to_env(params: dict[str, str]) -> dict[str, str]:
}
# Some API calls not yet implemented.
# You may want to copy not-yet-implemented methods from the PR https://github.com/neondatabase/neon/pull/11305
class NeonAPI:
def __init__(self, neon_api_key: str, neon_api_base_url: str):
self.__neon_api_key = neon_api_key
self.__neon_api_base_url = neon_api_base_url.strip("/")
self.retry_if_possible = False
self.attempts = 10
self.sleep_before_retry = 1
self.retries524 = 0
self.retries4xx = 0
def __request(self, method: str | bytes, endpoint: str, **kwargs: Any) -> requests.Response:
if "headers" not in kwargs:
kwargs["headers"] = {}
kwargs["headers"] = kwargs.get("headers", {})
kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text)
resp.raise_for_status()
for attempt in range(self.attempts):
retry = False
resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs)
if resp.status_code >= 400:
log.error(
"%s %s returned a %d: %s",
method,
endpoint,
resp.status_code,
resp.text if resp.status_code != 524 else "CloudFlare error page",
)
else:
log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text)
if not self.retry_if_possible:
resp.raise_for_status()
break
elif resp.status_code >= 400:
if resp.status_code == 422:
if resp.json()["message"] == "branch not ready yet":
retry = True
self.retries4xx += 1
elif resp.status_code == 423 and resp.json()["message"] in {
"endpoint is in some transitive state, could not suspend",
"project already has running conflicting operations, scheduling of new ones is prohibited",
}:
retry = True
self.retries4xx += 1
elif resp.status_code == 524:
log.info("The request timed out, trying to get operations")
retry = True
self.retries524 += 1
if retry:
log.info("Retrying, attempt %s/%s", attempt + 1, self.attempts)
time.sleep(self.sleep_before_retry)
continue
else:
resp.raise_for_status()
break
else:
raise RuntimeError("Max retry count reached")
return resp
@@ -101,6 +144,96 @@ class NeonAPI:
return cast("dict[str, Any]", resp.json())
def create_branch(
self,
project_id: str,
branch_name: str | None = None,
parent_id: str | None = None,
parent_lsn: str | None = None,
parent_timestamp: str | None = None,
protected: bool | None = None,
archived: bool | None = None,
init_source: str | None = None,
add_endpoint=True,
) -> dict[str, Any]:
data: dict[str, Any] = {}
if add_endpoint:
data["endpoints"] = [{"type": "read_write"}]
data["branch"] = {}
if parent_id:
data["branch"]["parent_id"] = parent_id
if branch_name:
data["branch"]["name"] = branch_name
if parent_lsn is not None:
data["branch"]["parent_lsn"] = parent_lsn
if parent_timestamp is not None:
data["branch"]["parent_timestamp"] = parent_timestamp
if protected is not None:
data["branch"]["protected"] = protected
if init_source is not None:
data["branch"]["init_source"] = init_source
if archived is not None:
data["branch"]["archived"] = archived
if not data["branch"]:
data.pop("branch")
resp = self.__request(
"POST",
f"/projects/{project_id}/branches",
headers={
"Accept": "application/json",
"Content-Type": "application/json",
},
json=data,
)
return cast("dict[str, Any]", resp.json())
def get_branch_details(self, project_id: str, branch_id: str) -> dict[str, Any]:
resp = self.__request(
"GET",
f"/projects/{project_id}/branches/{branch_id}",
headers={
"Accept": "application/json",
},
)
return cast("dict[str, Any]", resp.json())
def delete_branch(self, project_id: str, branch_id: str) -> dict[str, Any]:
resp = self.__request(
"DELETE",
f"/projects/{project_id}/branches/{branch_id}",
headers={
"Accept": "application/json",
},
)
return cast("dict[str, Any]", resp.json())
def restore_branch(
self,
project_id: str,
branch_id: str,
source_branch_id: str,
source_lsn: str | None,
source_timestamp: str | None,
preserve_under_name: str | None,
):
data = {"source_branch_id": source_branch_id}
if source_lsn:
data["source_lsn"] = source_lsn
if source_timestamp:
data["source_timestamp"] = source_timestamp
if preserve_under_name:
data["preserve_under_name"] = preserve_under_name
log.info("Data: %s", data)
resp = self.__request(
"POST",
f"/projects/{project_id}/branches/{branch_id}/restore",
headers={
"Accept": "application/json",
},
json=data,
)
return cast("dict[str, Any]", resp.json())
def start_endpoint(
self,
project_id: str,
@@ -176,6 +309,10 @@ class NeonAPI:
return cast("dict[str, Any]", resp.json())
def delete_endpoint(self, project_id: str, endpoint_id: str) -> dict[str, Any]:
resp = self.__request("DELETE", f"/projects/{project_id}/endpoints/{endpoint_id}")
return cast("dict[str,Any]", resp.json())
def get_connection_uri(
self,
project_id: str,

View File

@@ -3185,6 +3185,7 @@ class PgBin:
command: list[str],
env: Env | None = None,
cwd: str | Path | None = None,
stderr_pipe: Any | None = None,
) -> subprocess.Popen[Any]:
"""
Run one of the postgres binaries, not waiting for it to finish
@@ -3202,7 +3203,9 @@ class PgBin:
log.info(f"Running command '{' '.join(command)}'")
env = self._build_env(env)
self._log_env(env)
return subprocess.Popen(command, env=env, cwd=cwd, stdout=subprocess.PIPE, text=True)
return subprocess.Popen(
command, env=env, cwd=cwd, stdout=subprocess.PIPE, stderr=stderr_pipe, text=True
)
def run(
self,

View File

@@ -7,7 +7,7 @@ easier to see if you have compile errors without scrolling up.
You may also need to run `./scripts/pysync`.
Then run the tests
`DEFAULT_PG_VERSION=16 NEON_BIN=./target/release poetry run pytest test_runner/performance`
`DEFAULT_PG_VERSION=17 NEON_BIN=./target/release poetry run pytest test_runner/performance`
Some handy pytest flags for local development:
- `-x` tells pytest to stop on first error

View File

@@ -11,6 +11,6 @@ It supports mounting snapshots using overlayfs, which improves iteration time.
Here's a full command line.
```
RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=16 BUILD_TYPE=release \
RUST_BACKTRACE=1 NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 DEFAULT_PG_VERSION=17 BUILD_TYPE=release \
./scripts/pytest test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
```

View File

@@ -16,7 +16,7 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
"""
Usage:
DEFAULT_PG_VERSION=16 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
DEFAULT_PG_VERSION=17 BUILD_TYPE=debug NEON_ENV_BUILDER_USE_OVERLAYFS_FOR_SNAPSHOTS=1 INTERACTIVE=true \
./scripts/pytest --timeout 0 test_runner/performance/pageserver/interactive/test_many_small_tenants.py
"""

View File

@@ -3,7 +3,6 @@ from __future__ import annotations
from typing import TYPE_CHECKING
import pytest
import requests
from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
if TYPE_CHECKING:
@@ -68,9 +67,7 @@ def test_compute_startup_simple(
endpoint.safe_psql("select 1;")
# Get metrics
metrics = requests.get(
f"http://localhost:{endpoint.external_http_port}/metrics.json"
).json()
metrics = endpoint.http_client().metrics_json()
durations = {
"wait_for_spec_ms": f"{i}_wait_for_spec",
"sync_safekeepers_ms": f"{i}_sync_safekeepers",
@@ -155,9 +152,7 @@ def test_compute_ondemand_slru_startup(
assert sum == 1000000
# Get metrics
metrics = requests.get(
f"http://localhost:{endpoint.external_http_port}/metrics.json"
).json()
metrics = endpoint.http_client().metrics_json()
durations = {
"wait_for_spec_ms": f"{slru}_{i}_wait_for_spec",
"sync_safekeepers_ms": f"{slru}_{i}_sync_safekeepers",

View File

@@ -0,0 +1,93 @@
# Random Operations Test for Neon Stability
## Problem Statement
Neon needs robust stability testing to ensure reliability for users. The random operations test addresses this by continuously exercising the API with unpredictable sequences of operations, helping to identify edge cases and potential issues that deterministic tests might not catch.
### Key Components
#### 1. Class Structure
The test implements three main classes to model the Neon architecture:
- **NeonProject**: Represents a Neon project and manages the lifecycle of branches and endpoints
- **NeonBranch**: Represents a branch within a project, with methods for creating child branches, endpoints, and performing point-in-time restores
- **NeonEndpoint**: Represents an endpoint (connection point) for a branch, with methods for managing benchmarks
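The three classes above relate to each other roughly as in the sketch below. This is a simplified illustration rather than the actual implementation; the real classes appear in the test file further down and additionally track connection parameters, benchmarks, and reset state.
```python
# Simplified sketch of the class relationships; constructors and bookkeeping
# are reduced to the parent/child and ownership links only.
class NeonProject:
    def __init__(self):
        self.branches = {}       # branch_id -> NeonBranch
        self.leaf_branches = {}  # branches without children (deletion candidates)
        self.endpoints = {}      # endpoint_id -> NeonEndpoint


class NeonBranch:
    def __init__(self, project, branch_id, parent=None):
        self.project, self.id, self.parent = project, branch_id, parent
        self.children = {}   # branch_id -> NeonBranch
        self.endpoints = {}  # endpoint_id -> NeonEndpoint
        project.branches[branch_id] = self
        if parent is not None:
            parent.children[branch_id] = self


class NeonEndpoint:
    def __init__(self, branch, endpoint_id):
        self.branch, self.id = branch, endpoint_id
        branch.endpoints[endpoint_id] = self
        branch.project.endpoints[endpoint_id] = self
```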
#### 2. Operations Tested
The test randomly performs the following operations with weighted probabilities:
- **Creating branches**
- **Deleting branches**
- **Adding read-only endpoints**
- **Deleting read-only endpoints**
- **Restoring branches to random points in time**
#### 3. Load Generation
Each branch and endpoint is loaded with `pgbench` to simulate real database workloads during testing. This ensures that the operations are performed against branches with actual data and ongoing transactions.
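In rough terms, the load generation boils down to launching a long-running `pgbench` process with the connection environment of the target branch or endpoint, as sketched below. This is only an illustration; the actual logic lives in `NeonProject.start_benchmark` in the test file further down, and the flags shown here follow that code.
```python
import subprocess

def start_pgbench(connect_env: dict[str, str], read_only: bool = False, clients: int = 10):
    # connect_env carries PGHOST/PGUSER/PGDATABASE/PGPASSWORD/PGSSLMODE for the target
    cmd = ["pgbench", f"-c{clients}", "-T10800", "-Mprepared"]
    if read_only:
        # select-only workload, skip vacuum on read-only endpoints
        cmd.extend(["-S", "-n"])
    return subprocess.Popen(cmd, env=connect_env, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, text=True)
```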
#### 4. Error Handling
The test includes robust error handling for various scenarios:
- Branch limit exceeded
- Connection timeouts
- Control plane timeouts (HTTP 524 errors)
- Benchmark failures
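As an example of the first case above, the sketch below shows how a "branch limit exceeded" response can be treated as a benign skip rather than a test failure, mirroring the `create_branch` wrapper in the test file further down.
```python
from requests import HTTPError

def create_branch_or_skip(project):
    try:
        return project.neon_api.create_branch(project.id)
    except HTTPError as he:
        # 422 with BRANCHES_LIMIT_EXCEEDED means the project hit its branch quota;
        # the operation is skipped rather than failing the whole test run.
        if (he.response.status_code == 422
                and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED"):
            return None
        raise
```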
#### 5. CI Integration
The test is integrated into the CI pipeline via a GitHub workflow that runs daily, ensuring continuous validation of API stability.
## How It Works
1. The test creates a Neon project using the Public API
2. It initializes the main branch with pgbench data
3. It performs random operations according to the weighted probabilities
4. During each operation, it checks that all running benchmarks are still operational
5. The test cleans up by deleting the project at the end
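A condensed sketch of this loop, assuming the `do_action` helper and the weighted `ACTIONS` table defined in the test file further down:
```python
import random

def run_random_ops(project, actions, num_operations, seed):
    random.seed(seed)  # a fixed seed makes a failing run reproducible
    names = [name for name, _ in actions]
    weights = [weight for _, weight in actions]
    for _ in range(num_operations):
        do_action(project, random.choices(names, weights=weights)[0])
        # every started benchmark must still be running after each operation
        project.check_all_benchmarks()
```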
## Configuration
The test can be configured with:
- `RANDOM_SEED`: Set a specific random seed for reproducible test runs
- `NEON_API_KEY`: API key for authentication
- `NEON_API_BASE_URL`: Base URL for the API (defaults to staging environment)
- `NUM_OPERATIONS`: The number of operations to be performed
## Running the Test
The test is designed to run in the CI environment but can also be executed locally:
```bash
NEON_API_KEY=your_api_key ./scripts/pytest test_runner/random_ops/test_random_ops.py -m remote_cluster
```
To run with a specific random seed for reproducibility:
```bash
RANDOM_SEED=12345 NEON_API_KEY=your_api_key ./scripts/pytest test_runner/random_ops/test_random_ops.py -m remote_cluster
```
To run with the custom number of operations:
```bash
NUM_OPERATIONS=500 NEON_API_KEY=your_api_key ./scripts/pytest test_runner/random_ops/test_random_ops.py -m remote_cluster
```
## Benefits
This test provides several key benefits:
1. **Comprehensive API testing**: Exercises multiple API endpoints in combination
2. **Edge case discovery**: Random sequences may uncover issues not found in deterministic tests
3. **Stability validation**: Continuous execution helps ensure long-term API reliability
4. **Regression prevention**: Detects if new changes break existing API functionality
## Future Improvements
Potential enhancements to the test could include:
1. Adding more API operations, e.g. `reset_to_parent`, `snapshot`, etc.
2. Implementing more sophisticated load patterns
3. Adding metrics collection to measure API performance
4. Extending test duration for longer-term stability validation

View File

@@ -0,0 +1,463 @@
"""
Run the random API tests on the cloud instance of Neon
"""
from __future__ import annotations
import os
import random
import subprocess
import time
from datetime import UTC, datetime, timedelta
from typing import TYPE_CHECKING, Any
import pytest
from fixtures.log_helper import log
from requests import HTTPError
if TYPE_CHECKING:
from pathlib import Path
from fixtures.neon_api import NeonAPI
from fixtures.neon_fixtures import PgBin
from fixtures.pg_version import PgVersion
class NeonEndpoint:
"""
Neon Endpoint
Wraps the response of the endpoint creation API call
"""
def __init__(self, project: NeonProject, endpoint: dict[str, Any]):
self.project: NeonProject = project
self.id: str = endpoint["id"]
# The branch endpoint belongs to
self.branch: NeonBranch = project.branches[endpoint["branch_id"]]
self.type: str = endpoint["type"]
# add itself to the list of endpoints of the branch
self.branch.endpoints[self.id] = self
self.project.endpoints[self.id] = self
self.host: str = endpoint["host"]
self.benchmark: subprocess.Popen[Any] | None = None
# The connection environment is used when running benchmark
self.connect_env: dict[str, str] | None = None
if self.branch.connect_env:
self.connect_env = self.branch.connect_env.copy()
self.connect_env["PGHOST"] = self.host
def delete(self):
self.project.delete_endpoint(self.id)
def start_benchmark(self, clients=10):
return self.project.start_benchmark(self.id, clients=clients)
def check_benchmark(self):
self.project.check_benchmark(self.id)
def terminate_benchmark(self):
self.project.terminate_benchmark(self.id)
class NeonBranch:
"""
Neon Branch
Takes the response of the branch creation call of the Neon Public API as its first parameter
is_reset indicates whether the branch is a reset one, i.e. created as a result of the reset (restore) API call
"""
def __init__(self, project, branch: dict[str, Any], is_reset=False):
self.id: str = branch["branch"]["id"]
self.desc = branch
self.project: NeonProject = project
self.neon_api: NeonAPI = project.neon_api
self.project_id: str = branch["branch"]["project_id"]
self.parent: NeonBranch | None = (
self.project.branches[branch["branch"]["parent_id"]]
if "parent_id" in branch["branch"]
else None
)
if is_reset:
self.project.reset_branches.add(self.id)
elif self.parent:
self.project.leaf_branches[self.id] = self
if self.parent is not None and self.parent.id in self.project.leaf_branches:
self.project.leaf_branches.pop(self.parent.id)
self.project.branches[self.id] = self
self.children: dict[str, NeonBranch] = {}
if self.parent is not None:
self.parent.children[self.id] = self
self.endpoints: dict[str, NeonEndpoint] = {}
self.connection_parameters: dict[str, str] | None = (
branch["connection_uris"][0]["connection_parameters"]
if "connection_uris" in branch
else None
)
self.benchmark: subprocess.Popen[Any] | None = None
self.updated_at: datetime = datetime.fromisoformat(branch["branch"]["updated_at"])
self.connect_env: dict[str, str] | None = None
if self.connection_parameters:
self.connect_env = {
"PGHOST": self.connection_parameters["host"],
"PGUSER": self.connection_parameters["role"],
"PGDATABASE": self.connection_parameters["database"],
"PGPASSWORD": self.connection_parameters["password"],
"PGSSLMODE": "require",
}
def __str__(self):
"""
Returns the branch id together with all of its predecessors
(r) means the branch is a reset one
"""
return f"{self.id}{'(r)' if self.id in self.project.reset_branches else ''}, parent: {self.parent}"
def create_child_branch(self) -> NeonBranch | None:
return self.project.create_branch(self.id)
def create_ro_endpoint(self) -> NeonEndpoint:
return NeonEndpoint(
self.project,
self.neon_api.create_endpoint(self.project_id, self.id, "read_only", {})["endpoint"],
)
def delete(self) -> None:
self.project.delete_branch(self.id)
def start_benchmark(self, clients=10) -> subprocess.Popen[Any]:
return self.project.start_benchmark(self.id, clients=clients)
def check_benchmark(self) -> None:
self.project.check_benchmark(self.id)
def terminate_benchmark(self) -> None:
self.project.terminate_benchmark(self.id)
def restore_random_time(self) -> None:
"""
Performs PITR, i.e. calls the reset (restore) API on the same branch with a random time in the past
"""
min_time = self.updated_at + timedelta(seconds=1)
max_time = datetime.now(UTC) - timedelta(seconds=1)
target_time = (min_time + (max_time - min_time) * random.random()).replace(microsecond=0)
res = self.restore(
self.id,
source_timestamp=target_time.isoformat().replace("+00:00", "Z"),
preserve_under_name=self.project.gen_restore_name(),
)
if res is None:
return
self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"])
parent_id: str = res["branch"]["parent_id"]
# Creates an object for the parent branch
# After the reset operation a new parent branch is created
parent = NeonBranch(
self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True
)
self.project.branches[parent_id] = parent
self.parent = parent
parent.children[self.id] = self
self.project.wait()
def restore(
self,
source_branch_id: str,
source_lsn: str | None = None,
source_timestamp: str | None = None,
preserve_under_name: str | None = None,
) -> dict[str, Any] | None:
endpoints = [ep for ep in self.endpoints.values() if ep.type == "read_only"]
# Terminate all running benchmarks first to prevent spurious failures; pgbench errors during the restore are expected otherwise
for ep in endpoints:
ep.terminate_benchmark()
self.terminate_benchmark()
try:
res: dict[str, Any] = self.neon_api.restore_branch(
self.project_id,
self.id,
source_branch_id,
source_lsn,
source_timestamp,
preserve_under_name,
)
except HTTPError as he:
if (
he.response.status_code == 422
and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED"
):
log.info("Branch limit exceeded, skipping")
return None
else:
raise HTTPError(he) from he
self.project.wait()
self.start_benchmark()
for ep in endpoints:
ep.start_benchmark()
return res
class NeonProject:
"""
The project object
Calls the Public API to create a Neon Project
"""
def __init__(self, neon_api: NeonAPI, pg_bin: PgBin, pg_version: PgVersion):
self.neon_api = neon_api
self.pg_bin = pg_bin
proj = self.neon_api.create_project(
pg_version, f"Automatic random API test {os.getenv('GITHUB_RUN_ID')}"
)
self.id: str = proj["project"]["id"]
self.name: str = proj["project"]["name"]
self.connection_uri: str = proj["connection_uris"][0]["connection_uri"]
self.connection_parameters: dict[str, str] = proj["connection_uris"][0][
"connection_parameters"
]
self.pg_version: PgVersion = pg_version
# Leaf branches are the branches that do not have children
self.leaf_branches: dict[str, NeonBranch] = {}
self.branches: dict[str, NeonBranch] = {}
self.reset_branches: set[str] = set()
self.main_branch: NeonBranch = NeonBranch(self, proj)
self.main_branch.connection_parameters = self.connection_parameters
self.endpoints: dict[str, NeonEndpoint] = {}
for endpoint in proj["endpoints"]:
NeonEndpoint(self, endpoint)
self.neon_api.wait_for_operation_to_finish(self.id)
self.benchmarks: dict[str, subprocess.Popen[Any]] = {}
self.restore_num: int = 0
self.restart_pgbench_on_console_errors: bool = False
def delete(self):
self.neon_api.delete_project(self.id)
def create_branch(self, parent_id: str | None = None) -> NeonBranch | None:
self.wait()
try:
branch_def = self.neon_api.create_branch(self.id, parent_id=parent_id)
except HTTPError as he:
if (
he.response.status_code == 422
and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED"
):
log.info("Branch limit exceeded, skipping")
return None
else:
raise HTTPError(he) from he
new_branch = NeonBranch(self, branch_def)
self.wait()
return new_branch
def delete_branch(self, branch_id: str) -> None:
parent = self.branches[branch_id].parent
if not parent or branch_id == self.main_branch.id:
raise RuntimeError("Cannot delete the main branch")
if branch_id not in self.leaf_branches and branch_id not in self.reset_branches:
raise RuntimeError(f"The branch {branch_id} probably has child branches")
if branch_id not in self.branches:
raise RuntimeError(f"The branch with id {branch_id} is not found")
endpoints_to_delete = [
ep for ep in self.branches[branch_id].endpoints.values() if ep.type == "read_only"
]
for ep in endpoints_to_delete:
ep.delete()
if branch_id not in self.reset_branches:
self.terminate_benchmark(branch_id)
self.neon_api.delete_branch(self.id, branch_id)
if len(parent.children) == 1 and parent.id != self.main_branch.id:
self.leaf_branches[parent.id] = parent
parent.children.pop(branch_id)
if branch_id in self.leaf_branches:
self.leaf_branches.pop(branch_id)
else:
self.reset_branches.remove(branch_id)
self.branches.pop(branch_id)
self.wait()
if parent.id in self.reset_branches:
parent.delete()
def delete_endpoint(self, endpoint_id: str) -> None:
self.terminate_benchmark(endpoint_id)
self.neon_api.delete_endpoint(self.id, endpoint_id)
self.endpoints[endpoint_id].branch.endpoints.pop(endpoint_id)
self.endpoints.pop(endpoint_id)
self.wait()
def start_benchmark(self, target: str, clients: int = 10) -> subprocess.Popen[Any]:
if target in self.benchmarks:
raise RuntimeError(f"Benchmark was already started for {target}")
is_endpoint = target.startswith("ep")
read_only = is_endpoint and self.endpoints[target].type == "read_only"
cmd = ["pgbench", f"-c{clients}", "-T10800", "-Mprepared"]
if read_only:
cmd.extend(["-S", "-n"])
target_object = self.endpoints[target] if is_endpoint else self.branches[target]
if target_object.connect_env is None:
raise RuntimeError(f"The connection environment is not defined for {target}")
log.info(
"running pgbench on %s, cmd: %s, host: %s",
target,
cmd,
target_object.connect_env["PGHOST"],
)
pgbench = self.pg_bin.run_nonblocking(
cmd, env=target_object.connect_env, stderr_pipe=subprocess.PIPE
)
self.benchmarks[target] = pgbench
target_object.benchmark = pgbench
time.sleep(2)
return pgbench
def check_all_benchmarks(self) -> None:
for target in tuple(self.benchmarks.keys()):
self.check_benchmark(target)
def check_benchmark(self, target) -> None:
rc = self.benchmarks[target].poll()
if rc is not None:
_, err = self.benchmarks[target].communicate()
log.error("STDERR: %s", err)
# if the benchmark failed due to an unresponsive control plane,
# just restart it
if self.restart_pgbench_on_console_errors and (
"ERROR: Couldn't connect to compute node" in err
or "ERROR: Console request failed" in err
):
log.info("Restarting benchmark for %s", target)
self.benchmarks.pop(target)
self.start_benchmark(target)
return
raise RuntimeError(f"The benchmark for {target} ended with code {rc}")
def terminate_benchmark(self, target):
log.info("Terminating the benchmark %s", target)
target_endpoint = target.startswith("ep")
self.check_benchmark(target)
self.benchmarks[target].terminate()
self.benchmarks.pop(target)
if target_endpoint:
self.endpoints[target].benchmark = None
else:
self.branches[target].benchmark = None
def wait(self):
"""
Wait for all the operations to be finished
"""
return self.neon_api.wait_for_operation_to_finish(self.id)
def gen_restore_name(self):
self.restore_num += 1
return f"restore{self.restore_num}"
@pytest.fixture()
def setup_class(
pg_version: PgVersion,
pg_bin: PgBin,
neon_api: NeonAPI,
):
neon_api.retry_if_possible = True
project = NeonProject(neon_api, pg_bin, pg_version)
log.info("Created a project with id %s, name %s", project.id, project.name)
yield pg_bin, project
log.info("Retried 524 errors: %s", neon_api.retries524)
log.info("Retried 4xx errors: %s", neon_api.retries4xx)
if neon_api.retries524 > 0:
print(f"::warning::Retried on 524 error {neon_api.retries524} times")
if neon_api.retries4xx > 0:
print(f"::warning::Retried on 4xx error {neon_api.retries4xx} times")
log.info("Removing the project")
project.delete()
def do_action(project: NeonProject, action: str) -> None:
"""
Runs the action
"""
log.info("Action: %s", action)
if action == "new_branch":
log.info("Trying to create a new branch")
parent = project.branches[
random.choice(list(set(project.branches.keys()) - project.reset_branches))
]
log.info("Parent: %s", parent)
child = parent.create_child_branch()
if child is None:
return
log.info("Created branch %s", child)
child.start_benchmark()
elif action == "delete_branch":
if project.leaf_branches:
target = random.choice(list(project.leaf_branches.values()))
log.info("Trying to delete branch %s", target)
target.delete()
else:
log.info("Leaf branches not found, skipping")
elif action == "new_ro_endpoint":
ep = random.choice(
[br for br in project.branches.values() if br.id not in project.reset_branches]
).create_ro_endpoint()
log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id)
ep.start_benchmark()
elif action == "delete_ro_endpoint":
ro_endpoints: list[NeonEndpoint] = [
endpoint for endpoint in project.endpoints.values() if endpoint.type == "read_only"
]
if ro_endpoints:
target_ep: NeonEndpoint = random.choice(ro_endpoints)
target_ep.delete()
log.info("endpoint %s deleted", target_ep.id)
else:
log.info("no read_only endpoints present, skipping")
elif action == "restore_random_time":
if project.leaf_branches:
br: NeonBranch = random.choice(list(project.leaf_branches.values()))
log.info("Restore %s", br)
br.restore_random_time()
else:
log.info("No leaf branches found")
else:
raise ValueError(f"The action {action} is unknown")
@pytest.mark.timeout(7200)
@pytest.mark.remote_cluster
def test_api_random(
setup_class,
pg_distrib_dir: Path,
test_output_dir: Path,
):
"""
Run the random API tests
"""
if seed_env := os.getenv("RANDOM_SEED"):
seed = int(seed_env)
else:
seed = 0
if seed == 0:
seed = int(time.time())
log.info("Using random seed: %s", seed)
random.seed(seed)
pg_bin, project = setup_class
# Here we can assign weights
ACTIONS = (
("new_branch", 1.5),
("new_ro_endpoint", 1.4),
("delete_ro_endpoint", 0.8),
("delete_branch", 1.0),
("restore_random_time", 1.2),
)
if num_ops_env := os.getenv("NUM_OPERATIONS"):
num_operations = int(num_ops_env)
else:
num_operations = 250
pg_bin.run(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=project.main_branch.connect_env)
for _ in range(num_operations):
log.info("Starting action #%s", _ + 1)
do_action(
project, random.choices([a[0] for a in ACTIONS], weights=[w[1] for w in ACTIONS])[0]
)
project.check_all_benchmarks()
assert True

View File

@@ -72,6 +72,7 @@ PREEMPT_GC_COMPACTION_TENANT_CONF = {
"wal_receiver_protocol",
[PageserverWalReceiverProtocol.VANILLA, PageserverWalReceiverProtocol.INTERPRETED],
)
@pytest.mark.timeout(900)
def test_pageserver_compaction_smoke(
neon_env_builder: NeonEnvBuilder,
wal_receiver_protocol: PageserverWalReceiverProtocol,
@@ -190,6 +191,7 @@ def test_pageserver_compaction_preempt(
@skip_in_debug_build("only run with release build")
@pytest.mark.timeout(600)
def test_pageserver_gc_compaction_preempt(
neon_env_builder: NeonEnvBuilder,
):
@@ -227,7 +229,7 @@ def test_pageserver_gc_compaction_preempt(
@skip_in_debug_build("only run with release build")
@pytest.mark.timeout(900) # This test is slow with sanitizers enabled, especially on ARM
@pytest.mark.timeout(600) # This test is slow with sanitizers enabled, especially on ARM
@pytest.mark.parametrize(
"with_branches",
["with_branches", "no_branches"],

View File

@@ -36,10 +36,8 @@ if TYPE_CHECKING:
# - `test_create_snapshot` a script wrapped in a test that creates a data snapshot.
# - `test_backward_compatibility` checks that the current version of Neon can start/read/interact with a data snapshot created by the previous version.
# The path to the snapshot is configured by COMPATIBILITY_SNAPSHOT_DIR environment variable.
# If the breakage is intentional, the test can be xfaild with setting ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE=true.
# - `test_forward_compatibility` checks that a snapshot created by the current version can be started/read/interacted with by the previous version of Neon.
# Paths to Neon and Postgres are configured by COMPATIBILITY_NEON_BIN and COMPATIBILITY_POSTGRES_DISTRIB_DIR environment variables.
# If the breakage is intentional, the test can be xfaild with setting ALLOW_FORWARD_COMPATIBILITY_BREAKAGE=true.
#
# The file contains a couple of helper functions:
# - check_neon_works performs the test itself, feel free to add more checks there.
@@ -48,7 +46,7 @@ if TYPE_CHECKING:
#
# How to run `test_backward_compatibility` locally:
#
# export DEFAULT_PG_VERSION=16
# export DEFAULT_PG_VERSION=17
# export BUILD_TYPE=release
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_SNAPSHOT_DIR=test_output/compatibility_snapshot_pgv${DEFAULT_PG_VERSION}
@@ -70,7 +68,7 @@ if TYPE_CHECKING:
#
# How to run `test_forward_compatibility` locally:
#
# export DEFAULT_PG_VERSION=16
# export DEFAULT_PG_VERSION=17
# export BUILD_TYPE=release
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
@@ -96,7 +94,7 @@ if TYPE_CHECKING:
#
# How to run `test_version_mismatch` locally:
#
# export DEFAULT_PG_VERSION=16
# export DEFAULT_PG_VERSION=17
# export BUILD_TYPE=release
# export CHECK_ONDISK_DATA_COMPATIBILITY=true
# export COMPATIBILITY_NEON_BIN=neon_previous/target/${BUILD_TYPE}
@@ -208,36 +206,19 @@ def test_backward_compatibility(
"""
Test that the new binaries can read old data
"""
breaking_changes_allowed = (
os.environ.get("ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
env.pageserver.allowed_errors.append(ingest_lag_log_line)
env.start()
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
)
try:
neon_env_builder.num_safekeepers = 3
env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
env.pageserver.allowed_errors.append(ingest_lag_log_line)
env.start()
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
)
env.pageserver.assert_log_contains(ingest_lag_log_line)
except Exception:
if breaking_changes_allowed:
pytest.xfail(
"Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE env var"
)
else:
raise
assert not breaking_changes_allowed, (
"Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
)
env.pageserver.assert_log_contains(ingest_lag_log_line)
@check_ondisk_data_compatibility_if_enabled
@@ -254,72 +235,56 @@ def test_forward_compatibility(
"""
Test that the old binaries can read new data
"""
breaking_changes_allowed = (
os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
)
neon_env_builder.control_plane_hooks_api = compute_reconfigure_listener.control_plane_hooks_api
neon_env_builder.test_may_use_compatibility_snapshot_binaries = True
try:
neon_env_builder.num_safekeepers = 3
neon_env_builder.num_safekeepers = 3
# Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.).
# But always use the current version's neon_local binary.
# This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI.
assert neon_env_builder.compatibility_neon_binpath is not None, (
"the environment variable COMPATIBILITY_NEON_BIN is required"
)
assert neon_env_builder.compatibility_pg_distrib_dir is not None, (
"the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required"
)
neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath
neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir
# Use previous version's production binaries (pageserver, safekeeper, pg_distrib_dir, etc.).
# But always use the current version's neon_local binary.
# This is because we want to test the compatibility of the data format, not the compatibility of the neon_local CLI.
assert neon_env_builder.compatibility_neon_binpath is not None, (
"the environment variable COMPATIBILITY_NEON_BIN is required"
)
assert neon_env_builder.compatibility_pg_distrib_dir is not None, (
"the environment variable COMPATIBILITY_POSTGRES_DISTRIB_DIR is required"
)
neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath
neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir
env = neon_env_builder.from_repo_dir(
compatibility_snapshot_dir / "repo",
)
# there may be an arbitrary number of unrelated tests run between create_snapshot and here
env.pageserver.allowed_errors.append(ingest_lag_log_line)
env = neon_env_builder.from_repo_dir(
compatibility_snapshot_dir / "repo",
)
# there may be an arbitrary number of unrelated tests run between create_snapshot and here
env.pageserver.allowed_errors.append(ingest_lag_log_line)
# not using env.pageserver.version because it was initialized before
prev_pageserver_version_str = env.get_binary_version("pageserver")
prev_pageserver_version_match = re.search(
"Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)",
prev_pageserver_version_str,
)
if prev_pageserver_version_match is not None:
prev_pageserver_version = prev_pageserver_version_match.group(1)
else:
raise AssertionError(
"cannot find git hash in the version string: " + prev_pageserver_version_str
)
# does not include logs from previous runs
assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}")
env.start()
# ensure the specified pageserver is running
assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}")
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
# not using env.pageserver.version because it was initialized before
prev_pageserver_version_str = env.get_binary_version("pageserver")
prev_pageserver_version_match = re.search(
"Neon page server git(?:-env)?:(.*) failpoints: (.*), features: (.*)",
prev_pageserver_version_str,
)
if prev_pageserver_version_match is not None:
prev_pageserver_version = prev_pageserver_version_match.group(1)
else:
raise AssertionError(
"cannot find git hash in the version string: " + prev_pageserver_version_str
)
except Exception:
if breaking_changes_allowed:
pytest.xfail(
"Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE env var"
)
else:
raise
# does not include logs from previous runs
assert not env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}")
assert not breaking_changes_allowed, (
"Breaking changes are allowed by ALLOW_FORWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
env.start()
# ensure the specified pageserver is running
assert env.pageserver.log_contains(f"git(-env)?:{prev_pageserver_version}")
check_neon_works(
env,
test_output_dir=test_output_dir,
sql_dump_path=compatibility_snapshot_dir / "dump.sql",
repo_dir=env.repo_dir,
)

View File

@@ -22,7 +22,12 @@ def test_lfc_working_set_approximation(neon_simple_env: NeonEnv):
log.info("Creating endpoint with 1MB shared_buffers and 64 MB LFC")
endpoint = env.endpoints.create_start(
"main",
config_lines=["neon.max_file_cache_size='128MB'", "neon.file_cache_size_limit='64MB'"],
config_lines=[
"autovacuum=off",
"bgwriter_lru_maxpages=0",
"neon.max_file_cache_size='128MB'",
"neon.file_cache_size_limit='64MB'",
],
)
cur = endpoint.connect().cursor()
@@ -72,7 +77,7 @@ WITH (fillfactor='100');
# verify working set size after some index access of a few select pages only
blocks = query_scalar(cur, "select approximate_working_set_size(true)")
log.info(f"working set size after some index access of a few select pages only {blocks}")
assert blocks < 12
assert blocks < 20
@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
@@ -83,6 +88,7 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
branch_name="main",
config_lines=[
"autovacuum = off",
"bgwriter_lru_maxpages=0",
"shared_buffers=1MB",
"neon.max_file_cache_size=256MB",
"neon.file_cache_size_limit=245MB",
@@ -92,9 +98,9 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
cur = conn.cursor()
cur.execute("create extension neon")
cur.execute(
"create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 128))"
"create table t(pk integer primary key, count integer default 0, payload text default repeat('?', 1000)) with (fillfactor=10)"
)
cur.execute("insert into t (pk) values (generate_series(1,1000000))")
cur.execute("insert into t (pk) values (generate_series(1,100000))")
time.sleep(2)
before_10k = time.monotonic()
cur.execute("select sum(count) from t where pk between 10000 and 20000")
@@ -115,5 +121,5 @@ def test_sliding_working_set_approximation(neon_simple_env: NeonEnv):
size = cur.fetchall()[0][0] // 8192
log.info(f"Table size {size} blocks")
assert estimation_1k >= 20 and estimation_1k <= 40
assert estimation_10k >= 200 and estimation_10k <= 440
assert estimation_1k >= 900 and estimation_1k <= 2000
assert estimation_10k >= 9000 and estimation_10k <= 20000

Some files were not shown because too many files have changed in this diff