From ca9d8761ffb71e9eb12631c9da04ae58b468847b Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Mon, 7 Jul 2025 08:44:41 +0200
Subject: [PATCH 001/163] Move some perf benchmarks  from hetzner to aws arm
 github runners (#12393)

## Problem

We want to move some benchmarks from hetzner runners to aws graviton
runners

## Summary of changes

Adjust the runner labels for some workflows.
Adjust the pagebench number of clients to match the latecny knee at 8
cores of the new instance type
Add `--security-opt seccomp=unconfined` to docker run command to bypass
IO_URING EPERM error.

## New runners


https://us-east-2.console.aws.amazon.com/ec2/home?region=us-east-2#Instances:instanceState=running;search=:github-unit-perf-runner-arm;v=3;$case=tags:true%5C,client:false;$regex=tags:false%5C,client:false;sort=tag:Name

## Important Notes

I added the run-benchmarks label to get this tested **before we merge
it**.
[See](https://github.com/neondatabase/neon/actions/runs/15974141360)

I also test a run of pagebench with the new setup from this branch, see
https://github.com/neondatabase/neon/actions/runs/15972523054
- Update: the benchmarking workflow had failures, [see]
(https://github.com/neondatabase/neon/actions/runs/15974141360/job/45055897591)
- changed docker run command to avoid io_uring EPERM error, new run
[see](https://github.com/neondatabase/neon/actions/runs/15997965633/job/45125689920?pr=12393)

Update: the pagebench test run on the new runner [completed
successfully](https://github.com/neondatabase/neon/actions/runs/15972523054/job/45046772556)

Update 2025-07-07: the latest runs with instance store ext4 have been
successful and resolved the direct I/O issues we have been seeing before
in some runs. We only had one perf testcase failing (shard split) that
had been flaky before. So I think we can merge this now.

## Follow up

if this is merged and works successfully we must create a separate issue
to de-provision the hetzner unit-perf runners defined
[here](https://github.com/neondatabase/runner/blob/91a41729af5b168d42e8acab0a7c38fde031113e/ansible/inventory/hosts_metal#L111)
---
 .github/actionlint.yml                                     | 1 +
 .github/workflows/build_and_test.yml                       | 4 ++--
 .github/workflows/periodic_pagebench.yml                   | 4 ++--
 .github/workflows/proxy-benchmark.yml                      | 4 ++--
 ...test_pageserver_max_throughput_getpage_at_latest_lsn.py | 7 ++++---
 5 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index b7e0be761a..3142a36fa0 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -7,6 +7,7 @@ self-hosted-runner:
     - small-metal
     - small-arm64
     - unit-perf
+    - unit-perf-aws-arm
     - us-east-2
 config-variables:
   - AWS_ECR_REGION
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 456c7b8c92..0ceaa96fb0 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -306,14 +306,14 @@ jobs:
       statuses: write
       contents: write
       pull-requests: write
-    runs-on: [ self-hosted, unit-perf ]
+    runs-on: [ self-hosted, unit-perf-aws-arm ]
     container:
       image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
       credentials:
         username: ${{ github.actor }}
         password: ${{ secrets.GITHUB_TOKEN }}
       # for changed limits, see comments on `options:` earlier in this file
-      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864
+      options: --init --shm-size=512mb --ulimit memlock=67108864:67108864 --ulimit nofile=65536:65536 --security-opt seccomp=unconfined
     strategy:
       fail-fast: false
       matrix:
diff --git a/.github/workflows/periodic_pagebench.yml b/.github/workflows/periodic_pagebench.yml
index 317db94052..728a6d4956 100644
--- a/.github/workflows/periodic_pagebench.yml
+++ b/.github/workflows/periodic_pagebench.yml
@@ -1,4 +1,4 @@
-name: Periodic pagebench performance test on unit-perf hetzner runner
+name: Periodic pagebench performance test on unit-perf-aws-arm runners
 
 on:
   schedule:
@@ -40,7 +40,7 @@ jobs:
       statuses: write
       contents: write
       pull-requests: write
-    runs-on: [ self-hosted, unit-perf ]
+    runs-on: [ self-hosted, unit-perf-aws-arm ]
     container:
       image: ghcr.io/neondatabase/build-tools:pinned-bookworm
       credentials:
diff --git a/.github/workflows/proxy-benchmark.yml b/.github/workflows/proxy-benchmark.yml
index 3a98ad4e8e..0ae93ce295 100644
--- a/.github/workflows/proxy-benchmark.yml
+++ b/.github/workflows/proxy-benchmark.yml
@@ -1,4 +1,4 @@
-name: Periodic proxy performance test on unit-perf hetzner runner
+name: Periodic proxy performance test on unit-perf-aws-arm runners
 
 on:
   push: # TODO: remove after testing
@@ -32,7 +32,7 @@ jobs:
       statuses: write
       contents: write
       pull-requests: write
-    runs-on: [self-hosted, unit-perf]
+    runs-on: [self-hosted, unit-perf-aws-arm]
     timeout-minutes: 60  # 1h timeout
     container:
       image: ghcr.io/neondatabase/build-tools:pinned-bookworm
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 41696bf887..68bfa81b25 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -55,9 +55,10 @@ def test_pageserver_characterize_throughput_with_n_tenants(
 @pytest.mark.parametrize("duration", [20 * 60])
 @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(2048)])
 # we use 1 client to characterize latencies, and 64 clients to characterize throughput/scalability
-# we use 64 clients because typically for a high number of connections we recommend the connection pooler
-# which by default uses 64 connections
-@pytest.mark.parametrize("n_clients", [1, 64])
+# we use 8 clients because we see a latency knee around 6-8 clients on im4gn.2xlarge instance type,
+# which we use for this periodic test - at a cpu utilization of around 70 % - which is considered
+# a good utilization for pageserver.
+@pytest.mark.parametrize("n_clients", [1, 8])
 @pytest.mark.parametrize("n_tenants", [1])
 @pytest.mark.timeout(2400)
 def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant(

From 4b5c75b52f62935f42526954d104467f0828e323 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Mon, 7 Jul 2025 11:25:15 +0400
Subject: [PATCH 002/163] docs: revise safekeeper migration rfc (#12432)

## Problem
The safekeeper migration code/logic slightly diverges from the initial
RFC. This PR aims to address these differences.

- Part of https://github.com/neondatabase/neon/issues/12192

## Summary of changes
- Adjust the RFC to reflect that we implemented the safekeeper
reconciler with in-memory queue.
- Add `sk_set_notified_generation` field to the `timelines` table in the
RFC to address the "finish migration atomically" problem.
- Describe how we are going to make the timeline migration handler fully
retriable with in-memory reconciler queue.
- Unify type/field/method names in the code and RFC.
- Fix typos
---
 ...35-safekeeper-dynamic-membership-change.md | 133 +++++++++++-------
 1 file changed, 79 insertions(+), 54 deletions(-)

diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
index 9b320c7285..8619f83ff5 100644
--- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md
+++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md
@@ -20,7 +20,7 @@ In our case consensus leader is compute (walproposer), and we don't want to wake
 up all computes for the change. Neither we want to fully reimplement the leader
 logic second time outside compute. Because of that the proposed algorithm relies
 for issuing configurations on the external fault tolerant (distributed) strongly
-consisent storage with simple API: CAS (compare-and-swap) on the single key.
+consistent storage with simple API: CAS (compare-and-swap) on the single key.
 Properly configured postgres suits this.
 
 In the system consensus is implemented at the timeline level, so algorithm below
@@ -34,7 +34,7 @@ A configuration is
 
 ```
 struct Configuration {
-    generation: Generation, // a number uniquely identifying configuration
+    generation: SafekeeperGeneration, // a number uniquely identifying configuration
     sk_set: Vec<NodeId>, // current safekeeper set
     new_sk_set: Optional<Vec<NodeId>>,
 }
@@ -81,11 +81,11 @@ configuration generation in them is less than its current one. Namely, it
 refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In
 response it sends its current configuration generation to let walproposer know.
 
-Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration`
-accepting `Configuration`. Safekeeper switches to the given conf it is higher than its
+Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/membership`
+accepting `Configuration`. Safekeeper switches to the given conf if it is higher than its
 current one and ignores it otherwise. In any case it replies with
 ```
-struct ConfigurationSwitchResponse {
+struct TimelineMembershipSwitchResponse {
     conf: Configuration,
     term: Term,
     last_log_term: Term,
@@ -108,7 +108,7 @@ establishes this configuration as its own and moves to voting.
 It should stop talking to safekeepers not listed in the configuration at this
 point, though it is not unsafe to continue doing so.
 
-To be elected it must receive votes from both majorites if `new_sk_set` is present.
+To be elected it must receive votes from both majorities if `new_sk_set` is present.
 Similarly, to commit WAL it must receive flush acknowledge from both majorities.
 
 If walproposer hears from safekeeper configuration higher than his own (i.e.
@@ -130,7 +130,7 @@ storage are reachable.
 1) Fetch current timeline configuration from the configuration storage.
 2) If it is already joint one and `new_set` is different from `desired_set`
    refuse to change. However, assign join conf to (in memory) var
-   `join_conf` and proceed to step 4 to finish the ongoing change.
+   `joint_conf` and proceed to step 4 to finish the ongoing change.
 3) Else, create joint `joint_conf: Configuration`: increment current conf number
    `n` and put `desired_set` to `new_sk_set`. Persist it in the configuration
    storage by doing CAS on the current generation: change happens only if
@@ -161,11 +161,11 @@ storage are reachable.
    because `pull_timeline` already includes it and plus additionally would be
    broadcast by compute. More importantly, we may proceed to the next step
    only when `<last_log_term, flush_lsn>` on the majority of the new set reached
-   `sync_position`. Similarly, on the happy path no waiting is not needed because
+   `sync_position`. Similarly, on the happy path no waiting is needed because
    `pull_timeline` already includes it. However, we should double
     check to be safe. For example, timeline could have been created earlier e.g.
     manually or after try-to-migrate, abort, try-to-migrate-again sequence.
-7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new
+7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having new
    safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
    storage under one more CAS.
 8) Call `PUT` `configuration` on safekeepers from the new set,
@@ -178,12 +178,12 @@ spec of it.
 
 Description above focuses on safety. To make the flow practical and live, here a few more
 considerations.
-1) It makes sense to ping new set to ensure it we are migrating to live node(s) before
+1) It makes sense to ping new set to ensure we are migrating to live node(s) before
   step 3.
 2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed
    it is safe to rollback to the old conf with one more CAS.
 3) On step 4 timeline might be already created on members of the new set for various reasons;
-   the simplest is the procedure restart. There are more complicated scenarious like mentioned
+   the simplest is the procedure restart. There are more complicated scenarios like mentioned
    in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
    generations, so seems simpler to treat existing timeline as success. However, this also
    has a disadvantage: you might imagine an surpassingly unlikely schedule where condition in
@@ -192,7 +192,7 @@ considerations.
 4) In the end timeline should be locally deleted on the safekeeper(s) which are
    in the old set but not in the new one, unless they are unreachable. To be
    safe this also should be done under generation number (deletion proceeds only if
-   current configuration is <= than one in request and safekeeper is not memeber of it).
+   current configuration is <= than one in request and safekeeper is not member of it).
 5) If current conf fetched on step 1 is already not joint and members equal to `desired_set`,
    jump to step 7, using it as `new_conf`.
 
@@ -261,14 +261,14 @@ Timeline (branch) creation in cplane should call storage_controller POST
 Response should be augmented with `safekeepers_generation` and `safekeepers`
 fields like described in `/notify-safekeepers` above. Initially (currently)
 these fields may be absent; in this case cplane chooses safekeepers on its own
-like it currently does. The call should be retried until succeeds.
+like it currently does. The call should be retried until it succeeds.
 
 Timeline deletion and tenant deletion in cplane should call appropriate
 storage_controller endpoints like it currently does for sharded tenants. The
 calls should be retried until they succeed.
 
-When compute receives safekeepers list from control plane it needs to know the
-generation to checked whether it should be updated (note that compute may get
+When compute receives safekeeper list from control plane it needs to know the
+generation to check whether it should be updated (note that compute may get
 safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers`
 GUC is just a comma separates list of `host:port`. Let's prefix it with
 `g#<generation>:` to this end, so it will look like
@@ -305,8 +305,8 @@ enum MigrationRequest {
 ```
 
 `FinishPending` requests to run the procedure to ensure state is clean: current
-configuration is not joint and majority of safekeepers are aware of it, but do
-not attempt to migrate anywhere. If current configuration fetched on step 1 is
+configuration is not joint and the majority of safekeepers are aware of it, but do
+not attempt to migrate anywhere. If the current configuration fetched on step 1 is
 not joint it jumps to step 7. It should be run at startup for all timelines (but
 similarly, in the first version it is ok to trigger it manually).
 
@@ -315,7 +315,7 @@ similarly, in the first version it is ok to trigger it manually).
 `safekeepers` table mirroring current `nodes` should be added, except that for
 `scheduling_policy`: it is enough to have at least in the beginning only 3
 fields: 1) `active` 2) `paused` (initially means only not assign new tlis there
-3) `decomissioned` (node is removed).
+3) `decommissioned` (node is removed).
 
 `timelines` table:
 ```
@@ -326,9 +326,10 @@ table! {
         tenant_id -> Varchar,
         start_lsn -> pg_lsn,
         generation -> Int4,
-        sk_set -> Array<Int4>, // list of safekeeper ids
+        sk_set -> Array<Int8>, // list of safekeeper ids
         new_sk_set -> Nullable<Array<Int8>>, // list of safekeeper ids, null if not joint conf
         cplane_notified_generation -> Int4,
+        sk_set_notified_generation -> Int4, // the generation a quorum of sk_set knows about
         deleted_at -> Nullable<Timestamptz>,
     }
 }
@@ -338,13 +339,23 @@ table! {
 might also want to add ancestor_timeline_id to preserve the hierarchy, but for
 this RFC it is not needed.
 
+`cplane_notified_generation` and `sk_set_notified_generation` fields are used to
+track the last stage of the algorithm, when we need to notify safekeeper set and cplane
+with the final configuration after it's already committed to DB.
+
+The timeline is up-to-date (no migration in progress) if `new_sk_set` is null and
+`*_notified_generation` fields are up to date with `generation`. 
+
+It's possible to replace `*_notified_generation` with one boolean field `migration_completed`,
+but for better observability it's nice to have them separately.
+
 #### API
 
 Node management is similar to pageserver:
-1) POST `/control/v1/safekeepers` inserts safekeeper.
-2) GET `/control/v1/safekeepers` lists safekeepers.
-3) GET `/control/v1/safekeepers/:node_id` gets safekeeper.
-4) PUT `/control/v1/safekepers/:node_id/status` changes status to e.g.
+1) POST `/control/v1/safekeeper` inserts safekeeper.
+2) GET `/control/v1/safekeeper` lists safekeepers.
+3) GET `/control/v1/safekeeper/:node_id` gets safekeeper.
+4) PUT `/control/v1/safekeper/:node_id/scheduling_policy` changes status to e.g.
    `offline` or `decomissioned`. Initially it is simpler not to schedule any
     migrations here.
 
@@ -368,8 +379,8 @@ Migration API: the first version is the simplest and the most imperative:
 all timelines from one safekeeper to another. It accepts json
 ```
 {
-    "src_sk": u32,
-    "dst_sk": u32,
+    "src_sk": NodeId,
+    "dst_sk": NodeId,
     "limit": Optional<u32>,
 }
 ```
@@ -379,12 +390,15 @@ Returns list of scheduled requests.
 2) PUT `/control/v1/tenant/:tenant_id/timeline/:timeline_id/safekeeper_migrate` schedules `MigrationRequest`
    to move single timeline to given set of safekeepers:
 ```
-{
-    "desired_set": Vec<u32>,
+struct TimelineSafekeeperMigrateRequest {
+    "new_sk_set": Vec<NodeId>,
 }
 ```
 
-Returns scheduled request.
+In the first version the handler migrates the timeline to `new_sk_set` synchronously.
+Should be retried until success.
+
+In the future we might change it to asynchronous API and return scheduled request.
 
 Similar call should be added for the tenant.
 
@@ -434,6 +448,9 @@ table! {
 }
 ```
 
+We load all pending ops from the table on startup into the memory.
+The table is needed only to preserve the state between restarts.
+
 `op_type` can be `include` (seed from peers and ensure generation is up to
 date), `exclude` (remove locally) and `delete`. Field is actually not strictly
 needed as it can be computed from current configuration, but gives more explicit
@@ -474,7 +491,7 @@ actions must be idempotent. Now, a tricky point here is timeline start LSN. For
 the initial (tenant creation) call cplane doesn't know it. However, setting
 start_lsn on safekeepers during creation is a good thing -- it provides a
 guarantee that walproposer can always find a common point in WAL histories of
-safekeeper and its own, and so absense of it would be a clear sign of
+safekeeper and its own, and so absence of it would be a clear sign of
 corruption. The following sequence works:
 1) Create timeline (or observe that it exists) on pageserver,
    figuring out last_record_lsn in response.
@@ -497,11 +514,9 @@ corruption. The following sequence works:
    retries the call until 200 response.
 
    There is a small question how request handler (timeline creation in this
-   case) would interact with per sk reconciler. As always I prefer to do the
-   simplest possible thing and here it seems to be just waking it up so it
-   re-reads the db for work to do. Passing work in memory is faster, but
-   that shouldn't matter, and path to scan db for work will exist anyway, 
-   simpler to reuse it.
+   case) would interact with per sk reconciler. In the current implementation
+   we first persist the request in the DB, and then send an in-memory request
+   to each safekeeper reconciler to process it.
 
 For pg version / wal segment size: while we may persist them in `timelines`
 table, it is not necessary as initial creation at step 3 can take them from
@@ -509,30 +524,40 @@ pageserver or cplane creation call and later pull_timeline will carry them
 around.
 
 Timeline migration.
-1) CAS to the db to create joint conf, and in the same transaction create
-   `safekeeper_timeline_pending_ops` `include` entries to initialize new members
-   as well as deliver this conf to current ones; poke per sk reconcilers to work
-   on it. Also any conf change should also poke cplane notifier task(s).
-2) Once it becomes possible per alg description above, get out of joint conf
-   with another CAS. Task should get wakeups from per sk reconcilers because 
-   conf switch is required for advancement; however retries should be sleep
-   based as well as LSN advancement might be needed, though in happy path 
-   it isn't. To see whether further transition is possible on wakup migration
-   executor polls safekeepers per the algorithm. CAS creating new conf with only
-   new members should again insert entries to `safekeeper_timeline_pending_ops`
-   to switch them there, as well as `exclude` rows to remove timeline from 
-   old members.
+1) CAS to the db to create joint conf. Since this moment the migration is considered to be 
+   "in progress". We can detect all "in-progress" migrations looking into the database.
+2) Do steps 4-6 from the algorithm, including `pull_timeline` onto `new_sk_set`, update membership
+   configuration on all safekeepers, notify cplane, etc. All operations are idempotent,
+   so we don't need to persist anything in the database at this stage. If any errors occur,
+   it's safe to retry or abort the migration.
+3) Once it becomes possible per alg description above, get out of joint conf
+   with another CAS. Also should insert `exclude` entries into `safekeeper_timeline_pending_ops`
+   in the same DB transaction. Adding `exclude` entries atomically is nesessary because after
+   CAS we don't have the list of excluded safekeepers in the `timelines` table anymore, but we
+   need to have them persisted somewhere in case the migration is interrupted right after the CAS.
+4) Finish the migration. The final membership configuration is committed to the DB at this stage.
+   So, the migration can not be aborted anymore. But it can still be retried if the migration fails
+   past stage 3. To finish the migration we need to send the new membership configuration to
+   a new quorum of safekeepers, notify cplane with the new safekeeper list and schedule the `exclude`
+   requests to in-memory queue for safekeeper reconciler. If the algrorithm is retried, it's
+   possible that we have already committed `exclude` requests to DB, but didn't send them to
+   the in-memory queue. In this case we need to read them from `safekeeper_timeline_pending_ops`
+   because it's the only place where they are persistent. The fields `sk_set_notified_generation`
+   and `cplane_notified_generation` are updated after each step. The migration is considered
+   fully completed when they match the `generation` field.
+
+In practice, we can report "success" after stage 3 and do the "finish" step in per-timeline
+reconciler (if we implement it). But it's wise to at least try to finish them synchronously,
+so the timeline is always in a "good state" and doesn't require an old quorum to commit
+WAL after the migration reported "success".
 
 Timeline deletion: just set `deleted_at` on the timeline row and insert
 `safekeeper_timeline_pending_ops` entries in the same xact, the rest is done by
 per sk reconcilers.
 
-When node is removed (set to `decomissioned`), `safekeeper_timeline_pending_ops`
+When node is removed (set to `decommissioned`), `safekeeper_timeline_pending_ops`
 for it must be cleared in the same transaction.
 
-One more task pool should infinitely retry notifying control plane about changed
-safekeeper sets (trying making `cplane_notified_generation` equal `generation`).
-
 #### Dealing with multiple instances of storage_controller
 
 Operations described above executed concurrently might create some errors but do
@@ -541,7 +566,7 @@ of storage_controller it is fine to have it temporarily, e.g. during redeploy.
 
 To harden against some controller instance creating some work in
 `safekeeper_timeline_pending_ops` and then disappearing without anyone pickup up
-the job per sk reconcilers apart from explicit wakups should scan for work
+the job per sk reconcilers apart from explicit wakeups should scan for work
 periodically. It is possible to remove that though if all db updates are
 protected with leadership token/term -- then such scans are needed only after
 leadership is acquired.
@@ -563,7 +588,7 @@ There should be following layers of tests:
    safekeeper communication and pull_timeline need to be mocked and main switch
    procedure wrapped to as a node (thread) in simulation tests, using these
    mocks. Test would inject migrations like it currently injects
-   safekeeper/walproposer restars. Main assert is the same -- committed WAL must
+   safekeeper/walproposer restarts. Main assert is the same -- committed WAL must
    not be lost.
 
 3) Since simulation testing injects at relatively high level points (not
@@ -613,7 +638,7 @@ Let's have the following implementation bits for gradual rollout:
   `notify-safekeepers`.
 
 Then the rollout for a region would be:
-- Current situation: safekeepers are choosen by control_plane.
+- Current situation: safekeepers are chosen by control_plane.
 - We manually migrate some timelines, test moving them around.
 - Then we enable `--set-safekeepers` so that all new timelines
   are on storage controller.

From fc10bb9438fa58efebde744179725bd55bbf9ee1 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Mon, 7 Jul 2025 13:22:03 +0400
Subject: [PATCH 003/163] storage: rename term -> last_log_term in
 TimelineMembershipSwitchResponse  (#12481)

## Problem
Names are not consistent between safekeeper migration RFC and the actual
implementation.

It's not used anywhere in production yet, so it's safe to rename. We
don't need to worry about backward compatibility.

- Follow up on https://github.com/neondatabase/neon/pull/12432

## Summary of changes
- rename term -> last_log_term in TimelineMembershipSwitchResponse
- add missing fields to TimelineMembershipSwitchResponse in python
---
 libs/safekeeper_api/src/models.rs                    | 2 +-
 safekeeper/src/timeline.rs                           | 2 +-
 storage_controller/src/http.rs                       | 2 +-
 storage_controller/src/service/safekeeper_service.rs | 6 +++---
 test_runner/fixtures/safekeeper/http.py              | 8 +++++++-
 5 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 1774489c1c..e87232474b 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -221,7 +221,7 @@ pub struct TimelineMembershipSwitchRequest {
 pub struct TimelineMembershipSwitchResponse {
     pub previous_conf: Configuration,
     pub current_conf: Configuration,
-    pub term: Term,
+    pub last_log_term: Term,
     pub flush_lsn: Lsn,
 }
 
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 95b5fe6d5d..dbe510a019 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -197,7 +197,7 @@ impl StateSK {
         Ok(TimelineMembershipSwitchResponse {
             previous_conf: result.previous_conf,
             current_conf: result.current_conf,
-            term: self.state().acceptor_state.term,
+            last_log_term: self.state().acceptor_state.term,
             flush_lsn: self.flush_lsn(),
         })
     }
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index ee446ea65d..e5a3a969d4 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -2371,7 +2371,7 @@ pub fn make_router(
             named_request_span(
                 r,
                 handle_safekeeper_scheduling_policy,
-                RequestName("v1_safekeeper_status"),
+                RequestName("v1_safekeeper_scheduling_policy"),
             )
         })
         // Tenant Shard operations
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 90ea48dd7b..d7179372b2 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -914,13 +914,13 @@ impl Service {
                         // so it isn't counted toward the quorum.
                         if let Some(min_position) = min_position {
                             if let Ok(ok_res) = &res {
-                                if (ok_res.term, ok_res.flush_lsn) < min_position {
+                                if (ok_res.last_log_term, ok_res.flush_lsn) < min_position {
                                     // Use Error::Timeout to make this error retriable.
                                     res = Err(mgmt_api::Error::Timeout(
                                         format!(
                                         "safekeeper {} returned position {:?} which is less than minimum required position {:?}",
                                         client.node_id_label(),
-                                        (ok_res.term, ok_res.flush_lsn),
+                                        (ok_res.last_log_term, ok_res.flush_lsn),
                                         min_position
                                         )
                                     ));
@@ -1216,7 +1216,7 @@ impl Service {
 
         let mut sync_position = (INITIAL_TERM, Lsn::INVALID);
         for res in results.into_iter().flatten() {
-            let sk_position = (res.term, res.flush_lsn);
+            let sk_position = (res.last_log_term, res.flush_lsn);
             if sync_position < sk_position {
                 sync_position = sk_position;
             }
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 839e985419..942b620be6 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -112,12 +112,18 @@ class TimelineCreateRequest:
 class TimelineMembershipSwitchResponse:
     previous_conf: MembershipConfiguration
     current_conf: MembershipConfiguration
+    last_log_term: int
+    flush_lsn: Lsn
 
     @classmethod
     def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse:
         previous_conf = MembershipConfiguration.from_json(d["previous_conf"])
         current_conf = MembershipConfiguration.from_json(d["current_conf"])
-        return TimelineMembershipSwitchResponse(previous_conf, current_conf)
+        last_log_term = d["last_log_term"]
+        flush_lsn = Lsn(d["flush_lsn"])
+        return TimelineMembershipSwitchResponse(
+            previous_conf, current_conf, last_log_term, flush_lsn
+        )
 
 
 class SafekeeperHttpClient(requests.Session, MetricsGetter):

From 1eef961f09c46ede620b223f6ad71ce69d2e6a41 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 7 Jul 2025 14:24:06 +0200
Subject: [PATCH 004/163] pageserver: add gRPC error logging (#12445)

## Problem

We don't log gRPC request errors on the server.

Touches #11728.

## Summary of changes

Automatically log non-OK gRPC response statuses in the observability
middleware, and add corresponding logging for the `get_pages` stream.

Also adds the peer address and gRPC method to the gRPC tracing span.

Example output:

```
2025-07-02T20:18:16.813718Z  WARN grpc:pageservice{peer=127.0.0.1:56698 method=CheckRelExists tenant_id=c7b45faa1924b1958f05c5fdee8b0d04 timeline_id=4a36ee64fd2f97781b9dcc2c3cddd51b shard_id=0000}: request failed with NotFound: Tenant c7b45faa1924b1958f05c5fdee8b0d04 not found
```
---
 pageserver/src/page_service.rs | 81 ++++++++++++++++++++++++++++------
 1 file changed, 67 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 1d824ac846..49928a9036 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -50,6 +50,7 @@ use tokio::io::{AsyncRead, AsyncReadExt as _, AsyncWrite, AsyncWriteExt as _, Bu
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
 use tonic::service::Interceptor as _;
+use tonic::transport::server::TcpConnectInfo;
 use tracing::*;
 use utils::auth::{Claims, Scope, SwappableJwtAuth};
 use utils::id::{TenantId, TenantTimelineId, TimelineId};
@@ -3685,8 +3686,15 @@ impl proto::PageService for GrpcPageServiceHandler {
                 yield match result {
                     Ok(resp) => resp,
                     // Convert per-request errors to GetPageResponses as appropriate, or terminate
-                    // the stream with a tonic::Status.
-                    Err(err) => page_api::GetPageResponse::try_from_status(err, req_id)?.into(),
+                    // the stream with a tonic::Status. Log the error regardless, since
+                    // ObservabilityLayer can't automatically log stream errors.
+                    Err(status) => {
+                        // TODO: it would be nice if we could propagate the get_page() fields here.
+                        span.in_scope(|| {
+                            warn!("request failed with {:?}: {}", status.code(), status.message());
+                        });
+                        page_api::GetPageResponse::try_from_status(status, req_id)?.into()
+                    }
                 }
             }
         };
@@ -3824,40 +3832,85 @@ impl<S: tonic::server::NamedService> tonic::server::NamedService for Observabili
     const NAME: &'static str = S::NAME; // propagate inner service name
 }
 
-impl<S, B> tower::Service<http::Request<B>> for ObservabilityLayerService<S>
+impl<S, Req, Resp> tower::Service<http::Request<Req>> for ObservabilityLayerService<S>
 where
-    S: tower::Service<http::Request<B>>,
+    S: tower::Service<http::Request<Req>, Response = http::Response<Resp>> + Send,
     S::Future: Send + 'static,
 {
     type Response = S::Response;
     type Error = S::Error;
     type Future = BoxFuture<'static, Result<Self::Response, Self::Error>>;
 
-    fn call(&mut self, mut req: http::Request<B>) -> Self::Future {
+    fn call(&mut self, mut req: http::Request<Req>) -> Self::Future {
         // Record the request start time as a request extension.
         //
         // TODO: we should start a timer here instead, but it currently requires a timeline handle
         // and SmgrQueryType, which we don't have yet. Refactor it to provide it later.
         req.extensions_mut().insert(ReceivedAt(Instant::now()));
 
-        // Create a basic tracing span. Enter the span for the current thread (to use it for inner
-        // sync code like interceptors), and instrument the future (to use it for inner async code
-        // like the page service itself).
+        // Extract the peer address and gRPC method.
+        let peer = req
+            .extensions()
+            .get::<TcpConnectInfo>()
+            .and_then(|info| info.remote_addr())
+            .map(|addr| addr.to_string())
+            .unwrap_or_default();
+
+        let method = req
+            .uri()
+            .path()
+            .split('/')
+            .nth(2)
+            .unwrap_or(req.uri().path())
+            .to_string();
+
+        // Create a basic tracing span.
         //
-        // The instrument() call below is not sufficient. It only affects the returned future, and
-        // only takes effect when the caller polls it. Any sync code executed when we call
-        // self.inner.call() below (such as interceptors) runs outside of the returned future, and
-        // is not affected by it. We therefore have to enter the span on the current thread too.
+        // Enter the span for the current thread and instrument the future. It is not sufficient to
+        // only instrument the future, since it only takes effect after the future is returned and
+        // polled, not when the inner service is called below (e.g. during interceptor execution).
         let span = info_span!(
             "grpc:pageservice",
-            // Set by TenantMetadataInterceptor.
+            // These will be populated by TenantMetadataInterceptor.
             tenant_id = field::Empty,
             timeline_id = field::Empty,
             shard_id = field::Empty,
+            // NB: empty fields must be listed first above. Otherwise, the field names will be
+            // clobbered when the empty fields are populated. They will be output last regardless.
+            %peer,
+            %method,
         );
         let _guard = span.enter();
 
-        Box::pin(self.inner.call(req).instrument(span.clone()))
+        // Construct a future for calling the inner service, but don't await it. This avoids having
+        // to clone the inner service into the future below.
+        let call = self.inner.call(req);
+
+        async move {
+            // Await the inner service call.
+            let result = call.await;
+
+            // Log gRPC error statuses. This won't include request info from handler spans, but it
+            // will catch all errors (even those emitted before handler spans are constructed). Only
+            // unary request errors are logged here, not streaming response errors.
+            if let Ok(ref resp) = result
+                && let Some(status) = tonic::Status::from_header_map(resp.headers())
+                && status.code() != tonic::Code::Ok
+            {
+                // TODO: it would be nice if we could propagate the handler span's request fields
+                // here. This could e.g. be done by attaching the request fields to
+                // tonic::Status::metadata via a proc macro.
+                warn!(
+                    "request failed with {:?}: {}",
+                    status.code(),
+                    status.message()
+                );
+            }
+
+            result
+        }
+        .instrument(span.clone())
+        .boxed()
     }
 
     fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {

From 55aef2993d1712dff35045f024baae2b9055e41b Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Mon, 7 Jul 2025 16:12:02 +0100
Subject: [PATCH 005/163] introduce a JSON serialization lib (#12417)

See #11992 and #11961 for some examples of usecases.

This introduces a JSON serialization lib, designed for more flexibility
than serde_json offers.

## Dynamic construction

Sometimes you have dynamic values you want to serialize, that are not
already in a serde-aware model like a struct or a Vec etc. To achieve
this with serde, you need to implement a lot of different traits on a
lot of different new-types. Because of this, it's often easier to
give-in and pull all the data into a serde-aware model
(serde_json::Value or some intermediate struct), but that is often not
very efficient.

This crate allows full control over the JSON encoding without needing to
implement any extra traits. Just call the relevant functions, and it
will guarantee a correctly encoded JSON value.

## Async construction

Similar to the above, sometimes the values arrive asynchronously. Often
collecting those values in memory is more expensive than writing them as
JSON, since the overheads of `Vec` and `String` is much higher, however
there are exceptions.

Serializing to JSON all in one go is also more CPU intensive and can
cause lag spikes, whereas serializing values incrementally spreads out
the CPU load and reduces lag.
---
 .config/hakari.toml           |   1 +
 Cargo.lock                    |  11 +
 Cargo.toml                    |   1 +
 libs/proxy/json/Cargo.toml    |  12 +
 libs/proxy/json/src/lib.rs    | 412 ++++++++++++++++++++++++++++++++++
 libs/proxy/json/src/macros.rs |  86 +++++++
 libs/proxy/json/src/str.rs    | 166 ++++++++++++++
 libs/proxy/json/src/value.rs  | 168 ++++++++++++++
 workspace_hack/Cargo.toml     |   2 +
 9 files changed, 859 insertions(+)
 create mode 100644 libs/proxy/json/Cargo.toml
 create mode 100644 libs/proxy/json/src/lib.rs
 create mode 100644 libs/proxy/json/src/macros.rs
 create mode 100644 libs/proxy/json/src/str.rs
 create mode 100644 libs/proxy/json/src/value.rs

diff --git a/.config/hakari.toml b/.config/hakari.toml
index 3b6d9d8822..9991cd92b0 100644
--- a/.config/hakari.toml
+++ b/.config/hakari.toml
@@ -33,6 +33,7 @@ workspace-members = [
     "compute_api",
     "consumption_metrics",
     "desim",
+    "json",
     "metrics",
     "pageserver_api",
     "postgres_backend",
diff --git a/Cargo.lock b/Cargo.lock
index 0d4dc10149..237defaec3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3489,6 +3489,15 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "json"
+version = "0.1.0"
+dependencies = [
+ "futures",
+ "itoa",
+ "ryu",
+]
+
 [[package]]
 name = "json-structural-diff"
 version = "0.2.0"
@@ -8693,8 +8702,10 @@ dependencies = [
  "fail",
  "form_urlencoded",
  "futures-channel",
+ "futures-core",
  "futures-executor",
  "futures-io",
+ "futures-sink",
  "futures-util",
  "generic-array",
  "getrandom 0.2.11",
diff --git a/Cargo.toml b/Cargo.toml
index 68016a08a9..840e3c6036 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -42,6 +42,7 @@ members = [
     "libs/walproposer",
     "libs/wal_decoder",
     "libs/postgres_initdb",
+    "libs/proxy/json",
     "libs/proxy/postgres-protocol2",
     "libs/proxy/postgres-types2",
     "libs/proxy/tokio-postgres2",
diff --git a/libs/proxy/json/Cargo.toml b/libs/proxy/json/Cargo.toml
new file mode 100644
index 0000000000..2f163c141d
--- /dev/null
+++ b/libs/proxy/json/Cargo.toml
@@ -0,0 +1,12 @@
+[package]
+name = "json"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+ryu = "1"
+itoa = "1"
+
+[dev-dependencies]
+futures = "0.3"
diff --git a/libs/proxy/json/src/lib.rs b/libs/proxy/json/src/lib.rs
new file mode 100644
index 0000000000..a8b2e6b509
--- /dev/null
+++ b/libs/proxy/json/src/lib.rs
@@ -0,0 +1,412 @@
+//! A JSON serialization lib, designed for more flexibility than `serde_json` offers.
+//!
+//! Features:
+//!
+//! ## Dynamic construction
+//!
+//! Sometimes you have dynamic values you want to serialize, that are not already in a serde-aware model like a struct or a Vec etc.
+//! To achieve this with serde, you need to implement a lot of different traits on a lot of different new-types.
+//! Because of this, it's often easier to give-in and pull all the data into a serde-aware model (`serde_json::Value` or some intermediate struct),
+//! but that is often not very efficient.
+//!
+//! This crate allows full control over the JSON encoding without needing to implement any extra traits. Just call the
+//! relevant functions, and it will guarantee a correctly encoded JSON value.
+//!
+//! ## Async construction
+//!
+//! Similar to the above, sometimes the values arrive asynchronously. Often collecting those values in memory
+//! is more expensive than writing them as JSON, since the overheads of `Vec` and `String` is much higher, however
+//! there are exceptions.
+//!
+//! Serializing to JSON all in one go is also more CPU intensive and can cause lag spikes,
+//! whereas serializing values incrementally spreads out the CPU load and reduces lag.
+//!
+//! ## Examples
+//!
+//! To represent the following JSON as a compact string
+//!
+//! ```json
+//! {
+//!   "results": {
+//!     "rows": [
+//!       {
+//!         "id": 1,
+//!         "value": null
+//!       },
+//!       {
+//!         "id": 2,
+//!         "value": "hello"
+//!       }
+//!     ]
+//!   }
+//! }
+//! ```
+//!
+//! We can use the following code:
+//!
+//! ```
+//! // create the outer object
+//! let s = json::value_to_string!(|v| json::value_as_object!(|v| {
+//!     // create an entry with key "results" and start an object value associated with it.
+//!     let results = v.key("results");
+//!     json::value_as_object!(|results| {
+//!         // create an entry with key "rows" and start an list value associated with it.
+//!         let rows = results.key("rows");
+//!         json::value_as_list!(|rows| {
+//!             // create a list entry and start an object value associated with it.
+//!             let row = rows.entry();
+//!             json::value_as_object!(|row| {
+//!                 // add entry "id": 1
+//!                 row.entry("id", 1);
+//!                 // add entry "value": null
+//!                 row.entry("value", json::Null);
+//!             });
+//!
+//!             // create a list entry and start an object value associated with it.
+//!             let row = rows.entry();
+//!             json::value_as_object!(|row| {
+//!                 // add entry "id": 2
+//!                 row.entry("id", 2);
+//!                 // add entry "value": "hello"
+//!                 row.entry("value", "hello");
+//!             });
+//!         });
+//!     });
+//! }));
+//!
+//! assert_eq!(s, r#"{"results":{"rows":[{"id":1,"value":null},{"id":2,"value":"hello"}]}}"#);
+//! ```
+
+mod macros;
+mod str;
+mod value;
+
+pub use value::{Null, ValueEncoder};
+
+#[must_use]
+/// Serialize a single json value.
+pub struct ValueSer<'buf> {
+    buf: &'buf mut Vec<u8>,
+    start: usize,
+}
+
+impl<'buf> ValueSer<'buf> {
+    /// Create a new json value serializer.
+    pub fn new(buf: &'buf mut Vec<u8>) -> Self {
+        Self { buf, start: 0 }
+    }
+
+    /// Borrow the underlying buffer
+    pub fn as_buffer(&self) -> &[u8] {
+        self.buf
+    }
+
+    #[inline]
+    pub fn value(self, e: impl ValueEncoder) {
+        e.encode(self);
+    }
+
+    /// Write raw bytes to the buf. This must be already JSON encoded.
+    #[inline]
+    pub fn write_raw_json(self, data: &[u8]) {
+        self.buf.extend_from_slice(data);
+        self.finish();
+    }
+
+    /// Start a new object serializer.
+    #[inline]
+    pub fn object(self) -> ObjectSer<'buf> {
+        ObjectSer::new(self)
+    }
+
+    /// Start a new list serializer.
+    #[inline]
+    pub fn list(self) -> ListSer<'buf> {
+        ListSer::new(self)
+    }
+
+    /// Finish the value ser.
+    #[inline]
+    fn finish(self) {
+        // don't trigger the drop handler which triggers a rollback.
+        // this won't cause memory leaks because `ValueSet` owns no allocations.
+        std::mem::forget(self);
+    }
+}
+
+impl Drop for ValueSer<'_> {
+    fn drop(&mut self) {
+        self.buf.truncate(self.start);
+    }
+}
+
+#[must_use]
+/// Serialize a json object.
+pub struct ObjectSer<'buf> {
+    value: ValueSer<'buf>,
+    start: usize,
+}
+
+impl<'buf> ObjectSer<'buf> {
+    /// Start a new object serializer.
+    #[inline]
+    pub fn new(value: ValueSer<'buf>) -> Self {
+        value.buf.push(b'{');
+        let start = value.buf.len();
+        Self { value, start }
+    }
+
+    /// Borrow the underlying buffer
+    pub fn as_buffer(&self) -> &[u8] {
+        self.value.as_buffer()
+    }
+
+    /// Start a new object entry with the given string key, returning a [`ValueSer`] for the associated value.
+    #[inline]
+    pub fn key(&mut self, key: impl KeyEncoder) -> ValueSer<'_> {
+        key.write_key(self)
+    }
+
+    /// Write an entry (key-value pair) to the object.
+    #[inline]
+    pub fn entry(&mut self, key: impl KeyEncoder, val: impl ValueEncoder) {
+        self.key(key).value(val);
+    }
+
+    #[inline]
+    fn entry_inner(&mut self, f: impl FnOnce(&mut Vec<u8>)) -> ValueSer<'_> {
+        // track before the separator so we the value is rolled back it also removes the separator.
+        let start = self.value.buf.len();
+
+        // push separator if necessary
+        if self.value.buf.len() > self.start {
+            self.value.buf.push(b',');
+        }
+        // push key
+        f(self.value.buf);
+        // push value separator
+        self.value.buf.push(b':');
+
+        // return value writer.
+        ValueSer {
+            buf: self.value.buf,
+            start,
+        }
+    }
+
+    /// Reset the buffer back to before this object was started.
+    #[inline]
+    pub fn rollback(self) -> ValueSer<'buf> {
+        // Do not fully reset the value, only reset it to before the `{`.
+        // This ensures any `,` before this value are not clobbered.
+        self.value.buf.truncate(self.start - 1);
+        self.value
+    }
+
+    /// Finish the object ser.
+    #[inline]
+    pub fn finish(self) {
+        self.value.buf.push(b'}');
+        self.value.finish();
+    }
+}
+
+pub trait KeyEncoder {
+    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a>;
+}
+
+#[must_use]
+/// Serialize a json object.
+pub struct ListSer<'buf> {
+    value: ValueSer<'buf>,
+    start: usize,
+}
+
+impl<'buf> ListSer<'buf> {
+    /// Start a new list serializer.
+    #[inline]
+    pub fn new(value: ValueSer<'buf>) -> Self {
+        value.buf.push(b'[');
+        let start = value.buf.len();
+        Self { value, start }
+    }
+
+    /// Borrow the underlying buffer
+    pub fn as_buffer(&self) -> &[u8] {
+        self.value.as_buffer()
+    }
+
+    /// Write an value to the list.
+    #[inline]
+    pub fn push(&mut self, val: impl ValueEncoder) {
+        self.entry().value(val);
+    }
+
+    /// Start a new value entry in this list.
+    #[inline]
+    pub fn entry(&mut self) -> ValueSer<'_> {
+        // track before the separator so we the value is rolled back it also removes the separator.
+        let start = self.value.buf.len();
+
+        // push separator if necessary
+        if self.value.buf.len() > self.start {
+            self.value.buf.push(b',');
+        }
+
+        // return value writer.
+        ValueSer {
+            buf: self.value.buf,
+            start,
+        }
+    }
+
+    /// Reset the buffer back to before this object was started.
+    #[inline]
+    pub fn rollback(self) -> ValueSer<'buf> {
+        // Do not fully reset the value, only reset it to before the `[`.
+        // This ensures any `,` before this value are not clobbered.
+        self.value.buf.truncate(self.start - 1);
+        self.value
+    }
+
+    /// Finish the object ser.
+    #[inline]
+    pub fn finish(self) {
+        self.value.buf.push(b']');
+        self.value.finish();
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::{Null, ValueSer};
+
+    #[test]
+    fn object() {
+        let mut buf = vec![];
+        let mut object = ValueSer::new(&mut buf).object();
+        object.entry("foo", "bar");
+        object.entry("baz", Null);
+        object.finish();
+
+        assert_eq!(buf, br#"{"foo":"bar","baz":null}"#);
+    }
+
+    #[test]
+    fn list() {
+        let mut buf = vec![];
+        let mut list = ValueSer::new(&mut buf).list();
+        list.entry().value("bar");
+        list.entry().value(Null);
+        list.finish();
+
+        assert_eq!(buf, br#"["bar",null]"#);
+    }
+
+    #[test]
+    fn object_macro() {
+        let res = crate::value_to_string!(|obj| {
+            crate::value_as_object!(|obj| {
+                obj.entry("foo", "bar");
+                obj.entry("baz", Null);
+            })
+        });
+
+        assert_eq!(res, r#"{"foo":"bar","baz":null}"#);
+    }
+
+    #[test]
+    fn list_macro() {
+        let res = crate::value_to_string!(|list| {
+            crate::value_as_list!(|list| {
+                list.entry().value("bar");
+                list.entry().value(Null);
+            })
+        });
+
+        assert_eq!(res, r#"["bar",null]"#);
+    }
+
+    #[test]
+    fn rollback_on_drop() {
+        let res = crate::value_to_string!(|list| {
+            crate::value_as_list!(|list| {
+                list.entry().value("bar");
+
+                'cancel: {
+                    let nested_list = list.entry();
+                    crate::value_as_list!(|nested_list| {
+                        nested_list.entry().value(1);
+
+                        assert_eq!(nested_list.as_buffer(), br#"["bar",[1"#);
+                        if true {
+                            break 'cancel;
+                        }
+                    })
+                }
+
+                assert_eq!(list.as_buffer(), br#"["bar""#);
+
+                list.entry().value(Null);
+            })
+        });
+
+        assert_eq!(res, r#"["bar",null]"#);
+    }
+
+    #[test]
+    fn rollback_object() {
+        let res = crate::value_to_string!(|obj| {
+            crate::value_as_object!(|obj| {
+                let entry = obj.key("1");
+                entry.value(1_i32);
+
+                let entry = obj.key("2");
+                let entry = {
+                    let mut nested_obj = entry.object();
+                    nested_obj.entry("foo", "bar");
+                    nested_obj.rollback()
+                };
+
+                entry.value(2_i32);
+            })
+        });
+
+        assert_eq!(res, r#"{"1":1,"2":2}"#);
+    }
+
+    #[test]
+    fn rollback_list() {
+        let res = crate::value_to_string!(|list| {
+            crate::value_as_list!(|list| {
+                let entry = list.entry();
+                entry.value(1_i32);
+
+                let entry = list.entry();
+                let entry = {
+                    let mut nested_list = entry.list();
+                    nested_list.push("foo");
+                    nested_list.rollback()
+                };
+
+                entry.value(2_i32);
+            })
+        });
+
+        assert_eq!(res, r#"[1,2]"#);
+    }
+
+    #[test]
+    fn string_escaping() {
+        let mut buf = vec![];
+        let mut object = ValueSer::new(&mut buf).object();
+
+        let key = "hello";
+        let value = "\n world";
+
+        object.entry(format_args!("{key:?}"), value);
+        object.finish();
+
+        assert_eq!(buf, br#"{"\"hello\"":"\n world"}"#);
+    }
+}
diff --git a/libs/proxy/json/src/macros.rs b/libs/proxy/json/src/macros.rs
new file mode 100644
index 0000000000..d3b5cfed10
--- /dev/null
+++ b/libs/proxy/json/src/macros.rs
@@ -0,0 +1,86 @@
+//! # Examples
+//!
+//! ```
+//! use futures::{StreamExt, TryStream, TryStreamExt};
+//!
+//! async fn stream_to_json_list<S, T, E>(mut s: S) -> Result<String, E>
+//! where
+//!     S: TryStream<Ok = T, Error = E> + Unpin,
+//!     T: json::ValueEncoder
+//! {
+//!     Ok(json::value_to_string!(|val| json::value_as_list!(|val| {
+//!         // note how we can use `.await` and `?` in here.
+//!         while let Some(value) = s.try_next().await? {
+//!             val.push(value);
+//!         }
+//!     })))
+//! }
+//!
+//! let stream = futures::stream::iter([1, 2, 3]).map(Ok::<i32, ()>);
+//! let json_string = futures::executor::block_on(stream_to_json_list(stream)).unwrap();
+//! assert_eq!(json_string, "[1,2,3]");
+//! ```
+
+/// A helper to create a new JSON vec.
+///
+/// Implemented as a macro to preserve all control flow.
+#[macro_export]
+macro_rules! value_to_vec {
+    (|$val:ident| $body:expr) => {{
+        let mut buf = vec![];
+        let $val = $crate::ValueSer::new(&mut buf);
+        let _: () = $body;
+        buf
+    }};
+}
+
+/// A helper to create a new JSON string.
+///
+/// Implemented as a macro to preserve all control flow.
+#[macro_export]
+macro_rules! value_to_string {
+    (|$val:ident| $body:expr) => {{
+        ::std::string::String::from_utf8($crate::value_to_vec!(|$val| $body))
+            .expect("json should be valid utf8")
+    }};
+}
+
+/// A helper that ensures the [`ObjectSer::finish`](crate::ObjectSer::finish) method is called on completion.
+///
+/// Consumes `$val` and assigns it as an [`ObjectSer`](crate::ObjectSer) serializer.
+/// The serializer is only 'finished' if the body completes.
+/// The serializer is rolled back if `break`/`return` escapes the body.
+///
+/// Implemented as a macro to preserve all control flow.
+#[macro_export]
+macro_rules! value_as_object {
+    (|$val:ident| $body:expr) => {{
+        let mut obj = $crate::ObjectSer::new($val);
+
+        let $val = &mut obj;
+        let res = $body;
+
+        obj.finish();
+        res
+    }};
+}
+
+/// A helper that ensures the [`ListSer::finish`](crate::ListSer::finish) method is called on completion.
+///
+/// Consumes `$val` and assigns it as an [`ListSer`](crate::ListSer) serializer.
+/// The serializer is only 'finished' if the body completes.
+/// The serializer is rolled back if `break`/`return` escapes the body.
+///
+/// Implemented as a macro to preserve all control flow.
+#[macro_export]
+macro_rules! value_as_list {
+    (|$val:ident| $body:expr) => {{
+        let mut list = $crate::ListSer::new($val);
+
+        let $val = &mut list;
+        let res = $body;
+
+        list.finish();
+        res
+    }};
+}
diff --git a/libs/proxy/json/src/str.rs b/libs/proxy/json/src/str.rs
new file mode 100644
index 0000000000..b092fd50ec
--- /dev/null
+++ b/libs/proxy/json/src/str.rs
@@ -0,0 +1,166 @@
+//! Helpers for serializing escaped strings.
+//!
+//! ## License
+//!
+//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L1514-L1552>
+//! <https://github.com/serde-rs/json/blob/c1826ebcccb1a520389c6b78ad3da15db279220d/src/ser.rs#L2081-L2157>
+//! Licensed by David Tolnay under MIT or Apache-2.0.
+//!
+//! With modifications by Conrad Ludgate on behalf of Databricks.
+
+use std::fmt::{self, Write};
+
+/// Represents a character escape code in a type-safe manner.
+pub enum CharEscape {
+    /// An escaped quote `"`
+    Quote,
+    /// An escaped reverse solidus `\`
+    ReverseSolidus,
+    // /// An escaped solidus `/`
+    // Solidus,
+    /// An escaped backspace character (usually escaped as `\b`)
+    Backspace,
+    /// An escaped form feed character (usually escaped as `\f`)
+    FormFeed,
+    /// An escaped line feed character (usually escaped as `\n`)
+    LineFeed,
+    /// An escaped carriage return character (usually escaped as `\r`)
+    CarriageReturn,
+    /// An escaped tab character (usually escaped as `\t`)
+    Tab,
+    /// An escaped ASCII plane control character (usually escaped as
+    /// `\u00XX` where `XX` are two hex characters)
+    AsciiControl(u8),
+}
+
+impl CharEscape {
+    #[inline]
+    fn from_escape_table(escape: u8, byte: u8) -> CharEscape {
+        match escape {
+            self::BB => CharEscape::Backspace,
+            self::TT => CharEscape::Tab,
+            self::NN => CharEscape::LineFeed,
+            self::FF => CharEscape::FormFeed,
+            self::RR => CharEscape::CarriageReturn,
+            self::QU => CharEscape::Quote,
+            self::BS => CharEscape::ReverseSolidus,
+            self::UU => CharEscape::AsciiControl(byte),
+            _ => unreachable!(),
+        }
+    }
+}
+
+pub(crate) fn format_escaped_str(writer: &mut Vec<u8>, value: &str) {
+    writer.reserve(2 + value.len());
+
+    writer.push(b'"');
+
+    let rest = format_escaped_str_contents(writer, value);
+    writer.extend_from_slice(rest);
+
+    writer.push(b'"');
+}
+
+pub(crate) fn format_escaped_fmt(writer: &mut Vec<u8>, args: fmt::Arguments) {
+    writer.push(b'"');
+
+    Collect { buf: writer }
+        .write_fmt(args)
+        .expect("formatting should not error");
+
+    writer.push(b'"');
+}
+
+struct Collect<'buf> {
+    buf: &'buf mut Vec<u8>,
+}
+
+impl fmt::Write for Collect<'_> {
+    fn write_str(&mut self, s: &str) -> fmt::Result {
+        let last = format_escaped_str_contents(self.buf, s);
+        self.buf.extend(last);
+        Ok(())
+    }
+}
+
+// writes any escape sequences, and returns the suffix still needed to be written.
+fn format_escaped_str_contents<'a>(writer: &mut Vec<u8>, value: &'a str) -> &'a [u8] {
+    let bytes = value.as_bytes();
+
+    let mut start = 0;
+
+    for (i, &byte) in bytes.iter().enumerate() {
+        let escape = ESCAPE[byte as usize];
+        if escape == 0 {
+            continue;
+        }
+
+        writer.extend_from_slice(&bytes[start..i]);
+
+        let char_escape = CharEscape::from_escape_table(escape, byte);
+        write_char_escape(writer, char_escape);
+
+        start = i + 1;
+    }
+
+    &bytes[start..]
+}
+
+const BB: u8 = b'b'; // \x08
+const TT: u8 = b't'; // \x09
+const NN: u8 = b'n'; // \x0A
+const FF: u8 = b'f'; // \x0C
+const RR: u8 = b'r'; // \x0D
+const QU: u8 = b'"'; // \x22
+const BS: u8 = b'\\'; // \x5C
+const UU: u8 = b'u'; // \x00...\x1F except the ones above
+const __: u8 = 0;
+
+// Lookup table of escape sequences. A value of b'x' at index i means that byte
+// i is escaped as "\x" in JSON. A value of 0 means that byte i is not escaped.
+static ESCAPE: [u8; 256] = [
+    //   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
+    UU, UU, UU, UU, UU, UU, UU, UU, BB, TT, NN, UU, FF, RR, UU, UU, // 0
+    UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, UU, // 1
+    __, __, QU, __, __, __, __, __, __, __, __, __, __, __, __, __, // 2
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 3
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 4
+    __, __, __, __, __, __, __, __, __, __, __, __, BS, __, __, __, // 5
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 6
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 7
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 8
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // 9
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // A
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // B
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // C
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // D
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // E
+    __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, __, // F
+];
+
+fn write_char_escape(writer: &mut Vec<u8>, char_escape: CharEscape) {
+    let s = match char_escape {
+        CharEscape::Quote => b"\\\"",
+        CharEscape::ReverseSolidus => b"\\\\",
+        // CharEscape::Solidus => b"\\/",
+        CharEscape::Backspace => b"\\b",
+        CharEscape::FormFeed => b"\\f",
+        CharEscape::LineFeed => b"\\n",
+        CharEscape::CarriageReturn => b"\\r",
+        CharEscape::Tab => b"\\t",
+        CharEscape::AsciiControl(byte) => {
+            static HEX_DIGITS: [u8; 16] = *b"0123456789abcdef";
+            let bytes = &[
+                b'\\',
+                b'u',
+                b'0',
+                b'0',
+                HEX_DIGITS[(byte >> 4) as usize],
+                HEX_DIGITS[(byte & 0xF) as usize],
+            ];
+            return writer.extend_from_slice(bytes);
+        }
+    };
+
+    writer.extend_from_slice(s);
+}
diff --git a/libs/proxy/json/src/value.rs b/libs/proxy/json/src/value.rs
new file mode 100644
index 0000000000..705af9603e
--- /dev/null
+++ b/libs/proxy/json/src/value.rs
@@ -0,0 +1,168 @@
+use core::fmt;
+use std::collections::{BTreeMap, HashMap};
+
+use crate::str::{format_escaped_fmt, format_escaped_str};
+use crate::{KeyEncoder, ObjectSer, ValueSer, value_as_list, value_as_object};
+
+/// Write a value to the underlying json representation.
+pub trait ValueEncoder {
+    fn encode(self, v: ValueSer<'_>);
+}
+
+pub(crate) fn write_int(x: impl itoa::Integer, b: &mut Vec<u8>) {
+    b.extend_from_slice(itoa::Buffer::new().format(x).as_bytes());
+}
+
+pub(crate) fn write_float(x: impl ryu::Float, b: &mut Vec<u8>) {
+    b.extend_from_slice(ryu::Buffer::new().format(x).as_bytes());
+}
+
+impl<T: Copy + ValueEncoder> ValueEncoder for &T {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        T::encode(*self, v);
+    }
+}
+
+impl ValueEncoder for &str {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        format_escaped_str(v.buf, self);
+        v.finish();
+    }
+}
+
+impl ValueEncoder for fmt::Arguments<'_> {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        if let Some(s) = self.as_str() {
+            format_escaped_str(v.buf, s);
+        } else {
+            format_escaped_fmt(v.buf, self);
+        }
+        v.finish();
+    }
+}
+
+macro_rules! int {
+    [$($t:ty),*] => {
+        $(
+            impl ValueEncoder for $t {
+                #[inline]
+                fn encode(self, v: ValueSer<'_>) {
+                    write_int(self, v.buf);
+                    v.finish();
+                }
+            }
+        )*
+    };
+}
+
+int![u8, u16, u32, u64, usize, u128];
+int![i8, i16, i32, i64, isize, i128];
+
+macro_rules! float {
+    [$($t:ty),*] => {
+        $(
+            impl ValueEncoder for $t {
+                #[inline]
+                fn encode(self, v: ValueSer<'_>) {
+                    write_float(self, v.buf);
+                    v.finish();
+                }
+            }
+        )*
+    };
+}
+
+float![f32, f64];
+
+impl ValueEncoder for bool {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        v.write_raw_json(if self { b"true" } else { b"false" });
+    }
+}
+
+impl<T: ValueEncoder> ValueEncoder for Option<T> {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        match self {
+            Some(value) => value.encode(v),
+            None => Null.encode(v),
+        }
+    }
+}
+
+impl KeyEncoder for &str {
+    #[inline]
+    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
+        let obj = &mut *obj;
+        obj.entry_inner(|b| format_escaped_str(b, self))
+    }
+}
+
+impl KeyEncoder for fmt::Arguments<'_> {
+    #[inline]
+    fn write_key<'a>(self, obj: &'a mut ObjectSer) -> ValueSer<'a> {
+        if let Some(key) = self.as_str() {
+            obj.entry_inner(|b| format_escaped_str(b, key))
+        } else {
+            obj.entry_inner(|b| format_escaped_fmt(b, self))
+        }
+    }
+}
+
+/// Represents the JSON null value.
+pub struct Null;
+
+impl ValueEncoder for Null {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        v.write_raw_json(b"null");
+    }
+}
+
+impl<T: ValueEncoder> ValueEncoder for Vec<T> {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        value_as_list!(|v| {
+            for t in self {
+                v.entry().value(t);
+            }
+        });
+    }
+}
+
+impl<T: Copy + ValueEncoder> ValueEncoder for &[T] {
+    #[inline]
+    fn encode(self, v: ValueSer<'_>) {
+        value_as_list!(|v| {
+            for t in self {
+                v.entry().value(t);
+            }
+        });
+    }
+}
+
+impl<K: KeyEncoder, V: ValueEncoder, S> ValueEncoder for HashMap<K, V, S> {
+    #[inline]
+    fn encode(self, o: ValueSer<'_>) {
+        value_as_object!(|o| {
+            for (k, v) in self {
+                o.entry(k, v);
+            }
+        });
+    }
+}
+
+impl<K: KeyEncoder, V: ValueEncoder> ValueEncoder for BTreeMap<K, V> {
+    #[inline]
+    fn encode(self, o: ValueSer<'_>) {
+        value_as_object!(|o| {
+            for (k, v) in self {
+                o.entry(k, v);
+            }
+        });
+    }
+}
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index fb10e27d2a..fc01deb92d 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -40,8 +40,10 @@ env_logger = { version = "0.11" }
 fail = { version = "0.5", default-features = false, features = ["failpoints"] }
 form_urlencoded = { version = "1" }
 futures-channel = { version = "0.3", features = ["sink"] }
+futures-core = { version = "0.3" }
 futures-executor = { version = "0.3" }
 futures-io = { version = "0.3" }
+futures-sink = { version = "0.3" }
 futures-util = { version = "0.3", features = ["channel", "io", "sink"] }
 generic-array = { version = "0.14", default-features = false, features = ["more_lengths", "zeroize"] }
 getrandom = { version = "0.2", default-features = false, features = ["std"] }

From e65d5f73697a55bac9966c9cb91124ba5cadaaaa Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 7 Jul 2025 19:46:33 +0200
Subject: [PATCH 006/163] proxy: Remove the endpoint filter cache (#12488)

## Problem

The endpoint filter cache is still unused because it's not yet reliable
enough to be used. It only consumes a lot of memory.

## Summary of changes

Remove the code. Needs a new design.

neondatabase/cloud#30634
---
 proxy/src/binary/proxy.rs                     |  23 +-
 proxy/src/cache/endpoints.rs                  | 283 ------------------
 proxy/src/cache/mod.rs                        |   1 -
 proxy/src/config.rs                           |  75 +----
 proxy/src/context/mod.rs                      |  48 +--
 .../control_plane/client/cplane_proxy_v1.rs   |  17 --
 proxy/src/control_plane/client/mod.rs         |   7 +-
 proxy/src/control_plane/errors.rs             |  11 -
 proxy/src/rate_limiter/limiter.rs             |  38 ---
 proxy/src/rate_limiter/mod.rs                 |   2 +-
 proxy/src/types.rs                            |  10 -
 11 files changed, 11 insertions(+), 504 deletions(-)
 delete mode 100644 proxy/src/cache/endpoints.rs

diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index c10678dc68..691709ce2a 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -21,7 +21,7 @@ use tokio::net::TcpListener;
 use tokio::sync::Notify;
 use tokio::task::JoinSet;
 use tokio_util::sync::CancellationToken;
-use tracing::{Instrument, error, info, warn};
+use tracing::{error, info, warn};
 use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version};
 
@@ -195,7 +195,9 @@ struct ProxyCliArgs {
     #[clap(long, default_value = config::ProjectInfoCacheOptions::CACHE_DEFAULT_OPTIONS)]
     project_info_cache: String,
     /// cache for all valid endpoints
-    #[clap(long, default_value = config::EndpointCacheConfig::CACHE_DEFAULT_OPTIONS)]
+    // TODO: remove after a couple of releases.
+    #[clap(long, default_value_t = String::new())]
+    #[deprecated]
     endpoint_cache_config: String,
     #[clap(flatten)]
     parquet_upload: ParquetUploadArgs,
@@ -558,13 +560,6 @@ pub async fn run() -> anyhow::Result<()> {
                 }
             }
         }
-
-        // listen for notifications of new projects/endpoints/branches
-        let cache = api.caches.endpoints_cache.clone();
-        let span = tracing::info_span!("endpoints_cache");
-        maintenance_tasks.spawn(
-            async move { cache.do_read(client, cancellation_token.clone()).await }.instrument(span),
-        );
     }
 
     let maintenance = loop {
@@ -712,18 +707,15 @@ fn build_auth_backend(
             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
             let project_info_cache_config: ProjectInfoCacheOptions =
                 args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;
 
             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
             info!(
                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
             );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+
             let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
                 wake_compute_cache_config,
                 project_info_cache_config,
-                endpoint_cache_config,
             )));
 
             let config::ConcurrencyLockOptions {
@@ -793,18 +785,15 @@ fn build_auth_backend(
             let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
             let project_info_cache_config: ProjectInfoCacheOptions =
                 args.project_info_cache.parse()?;
-            let endpoint_cache_config: config::EndpointCacheConfig =
-                args.endpoint_cache_config.parse()?;
 
             info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
             info!(
                 "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}"
             );
-            info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}");
+
             let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new(
                 wake_compute_cache_config,
                 project_info_cache_config,
-                endpoint_cache_config,
             )));
 
             let config::ConcurrencyLockOptions {
diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs
deleted file mode 100644
index 3c88e07484..0000000000
--- a/proxy/src/cache/endpoints.rs
+++ /dev/null
@@ -1,283 +0,0 @@
-use std::convert::Infallible;
-use std::future::pending;
-use std::sync::atomic::{AtomicBool, Ordering};
-use std::sync::{Arc, Mutex};
-
-use clashmap::ClashSet;
-use redis::streams::{StreamReadOptions, StreamReadReply};
-use redis::{AsyncCommands, FromRedisValue, Value};
-use serde::Deserialize;
-use tokio_util::sync::CancellationToken;
-use tracing::info;
-
-use crate::config::EndpointCacheConfig;
-use crate::context::RequestContext;
-use crate::ext::LockExt;
-use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
-use crate::metrics::{Metrics, RedisErrors, RedisEventsCount};
-use crate::rate_limiter::GlobalRateLimiter;
-use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
-use crate::types::EndpointId;
-
-// TODO: this could be an enum, but events in Redis need to be fixed first.
-// ProjectCreated was sent with type:branch_created. So we ignore type.
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct ControlPlaneEvent {
-    endpoint_created: Option<EndpointCreated>,
-    branch_created: Option<BranchCreated>,
-    project_created: Option<ProjectCreated>,
-    #[serde(rename = "type")]
-    _type: Option<String>,
-}
-
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct EndpointCreated {
-    endpoint_id: EndpointIdInt,
-}
-
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct BranchCreated {
-    branch_id: BranchIdInt,
-}
-
-#[derive(Deserialize, Debug, Clone, PartialEq)]
-struct ProjectCreated {
-    project_id: ProjectIdInt,
-}
-
-impl TryFrom<&Value> for ControlPlaneEvent {
-    type Error = anyhow::Error;
-    fn try_from(value: &Value) -> Result<Self, Self::Error> {
-        let json = String::from_redis_value(value)?;
-        Ok(serde_json::from_str(&json)?)
-    }
-}
-
-pub struct EndpointsCache {
-    config: EndpointCacheConfig,
-    endpoints: ClashSet<EndpointIdInt>,
-    branches: ClashSet<BranchIdInt>,
-    projects: ClashSet<ProjectIdInt>,
-    ready: AtomicBool,
-    limiter: Arc<Mutex<GlobalRateLimiter>>,
-}
-
-impl EndpointsCache {
-    pub(crate) fn new(config: EndpointCacheConfig) -> Self {
-        Self {
-            limiter: Arc::new(Mutex::new(GlobalRateLimiter::new(
-                config.limiter_info.clone(),
-            ))),
-            config,
-            endpoints: ClashSet::new(),
-            branches: ClashSet::new(),
-            projects: ClashSet::new(),
-            ready: AtomicBool::new(false),
-        }
-    }
-
-    pub(crate) fn is_valid(&self, ctx: &RequestContext, endpoint: &EndpointId) -> bool {
-        if !self.ready.load(Ordering::Acquire) {
-            // the endpoint cache is not yet fully initialised.
-            return true;
-        }
-
-        if !self.should_reject(endpoint) {
-            ctx.set_rejected(false);
-            return true;
-        }
-
-        // report that we might want to reject this endpoint
-        ctx.set_rejected(true);
-
-        // If cache is disabled, just collect the metrics and return.
-        if self.config.disable_cache {
-            return true;
-        }
-
-        // If the limiter allows, we can pretend like it's valid
-        // (incase it is, due to redis channel lag).
-        if self.limiter.lock_propagate_poison().check() {
-            return true;
-        }
-
-        // endpoint not found, and there's too much load.
-        false
-    }
-
-    fn should_reject(&self, endpoint: &EndpointId) -> bool {
-        if endpoint.is_endpoint() {
-            let Some(endpoint) = EndpointIdInt::get(endpoint) else {
-                // if we haven't interned this endpoint, it's not in the cache.
-                return true;
-            };
-            !self.endpoints.contains(&endpoint)
-        } else if endpoint.is_branch() {
-            let Some(branch) = BranchIdInt::get(endpoint) else {
-                // if we haven't interned this branch, it's not in the cache.
-                return true;
-            };
-            !self.branches.contains(&branch)
-        } else {
-            let Some(project) = ProjectIdInt::get(endpoint) else {
-                // if we haven't interned this project, it's not in the cache.
-                return true;
-            };
-            !self.projects.contains(&project)
-        }
-    }
-
-    fn insert_event(&self, event: ControlPlaneEvent) {
-        if let Some(endpoint_created) = event.endpoint_created {
-            self.endpoints.insert(endpoint_created.endpoint_id);
-            Metrics::get()
-                .proxy
-                .redis_events_count
-                .inc(RedisEventsCount::EndpointCreated);
-        } else if let Some(branch_created) = event.branch_created {
-            self.branches.insert(branch_created.branch_id);
-            Metrics::get()
-                .proxy
-                .redis_events_count
-                .inc(RedisEventsCount::BranchCreated);
-        } else if let Some(project_created) = event.project_created {
-            self.projects.insert(project_created.project_id);
-            Metrics::get()
-                .proxy
-                .redis_events_count
-                .inc(RedisEventsCount::ProjectCreated);
-        }
-    }
-
-    pub async fn do_read(
-        &self,
-        mut con: ConnectionWithCredentialsProvider,
-        cancellation_token: CancellationToken,
-    ) -> anyhow::Result<Infallible> {
-        let mut last_id = "0-0".to_string();
-        loop {
-            if let Err(e) = con.connect().await {
-                tracing::error!("error connecting to redis: {:?}", e);
-                self.ready.store(false, Ordering::Release);
-            }
-            if let Err(e) = self.read_from_stream(&mut con, &mut last_id).await {
-                tracing::error!("error reading from redis: {:?}", e);
-                self.ready.store(false, Ordering::Release);
-            }
-            if cancellation_token.is_cancelled() {
-                info!("cancellation token is cancelled, exiting");
-                // Maintenance tasks run forever. Sleep forever when canceled.
-                pending::<()>().await;
-            }
-            tokio::time::sleep(self.config.retry_interval).await;
-        }
-    }
-
-    async fn read_from_stream(
-        &self,
-        con: &mut ConnectionWithCredentialsProvider,
-        last_id: &mut String,
-    ) -> anyhow::Result<()> {
-        tracing::info!("reading endpoints/branches/projects from redis");
-        self.batch_read(
-            con,
-            StreamReadOptions::default().count(self.config.initial_batch_size),
-            last_id,
-            true,
-        )
-        .await?;
-        tracing::info!("ready to filter user requests");
-        self.ready.store(true, Ordering::Release);
-        self.batch_read(
-            con,
-            StreamReadOptions::default()
-                .count(self.config.default_batch_size)
-                .block(self.config.xread_timeout.as_millis() as usize),
-            last_id,
-            false,
-        )
-        .await
-    }
-
-    async fn batch_read(
-        &self,
-        conn: &mut ConnectionWithCredentialsProvider,
-        opts: StreamReadOptions,
-        last_id: &mut String,
-        return_when_finish: bool,
-    ) -> anyhow::Result<()> {
-        let mut total: usize = 0;
-        loop {
-            let mut res: StreamReadReply = conn
-                .xread_options(&[&self.config.stream_name], &[last_id.as_str()], &opts)
-                .await?;
-
-            if res.keys.is_empty() {
-                if return_when_finish {
-                    if total != 0 {
-                        break;
-                    }
-                    anyhow::bail!(
-                        "Redis stream {} is empty, cannot be used to filter endpoints",
-                        self.config.stream_name
-                    );
-                }
-                // If we are not returning when finish, we should wait for more data.
-                continue;
-            }
-            if res.keys.len() != 1 {
-                anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name);
-            }
-
-            let key = res.keys.pop().expect("Checked length above");
-            let len = key.ids.len();
-            for stream_id in key.ids {
-                total += 1;
-                for value in stream_id.map.values() {
-                    match value.try_into() {
-                        Ok(event) => self.insert_event(event),
-                        Err(err) => {
-                            Metrics::get().proxy.redis_errors_total.inc(RedisErrors {
-                                channel: &self.config.stream_name,
-                            });
-                            tracing::error!("error parsing value {value:?}: {err:?}");
-                        }
-                    }
-                }
-                if total.is_power_of_two() {
-                    tracing::debug!("endpoints read {}", total);
-                }
-                *last_id = stream_id.id;
-            }
-            if return_when_finish && len <= self.config.default_batch_size {
-                break;
-            }
-        }
-        tracing::info!("read {} endpoints/branches/projects from redis", total);
-        Ok(())
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_parse_control_plane_event() {
-        let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#;
-
-        let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into();
-
-        assert_eq!(
-            serde_json::from_str::<ControlPlaneEvent>(s).unwrap(),
-            ControlPlaneEvent {
-                endpoint_created: Some(EndpointCreated {
-                    endpoint_id: endpoint_id.into(),
-                }),
-                branch_created: None,
-                project_created: None,
-                _type: Some("endpoint_created".into()),
-            }
-        );
-    }
-}
diff --git a/proxy/src/cache/mod.rs b/proxy/src/cache/mod.rs
index 6c168144a7..ce7f781213 100644
--- a/proxy/src/cache/mod.rs
+++ b/proxy/src/cache/mod.rs
@@ -1,5 +1,4 @@
 pub(crate) mod common;
-pub(crate) mod endpoints;
 pub(crate) mod project_info;
 mod timed_lru;
 
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index f97006e206..6157dc8a6a 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -18,7 +18,7 @@ use crate::control_plane::locks::ApiLocks;
 use crate::control_plane::messages::{EndpointJwksResponse, JwksSettings};
 use crate::ext::TaskExt;
 use crate::intern::RoleNameInt;
-use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig};
+use crate::rate_limiter::{RateLimitAlgorithm, RateLimiterConfig};
 use crate::scram::threadpool::ThreadPool;
 use crate::serverless::GlobalConnPoolOptions;
 use crate::serverless::cancel_set::CancelSet;
@@ -80,79 +80,6 @@ pub struct AuthenticationConfig {
     pub console_redirect_confirmation_timeout: tokio::time::Duration,
 }
 
-#[derive(Debug)]
-pub struct EndpointCacheConfig {
-    /// Batch size to receive all endpoints on the startup.
-    pub initial_batch_size: usize,
-    /// Batch size to receive endpoints.
-    pub default_batch_size: usize,
-    /// Timeouts for the stream read operation.
-    pub xread_timeout: Duration,
-    /// Stream name to read from.
-    pub stream_name: String,
-    /// Limiter info (to distinguish when to enable cache).
-    pub limiter_info: Vec<RateBucketInfo>,
-    /// Disable cache.
-    /// If true, cache is ignored, but reports all statistics.
-    pub disable_cache: bool,
-    /// Retry interval for the stream read operation.
-    pub retry_interval: Duration,
-}
-
-impl EndpointCacheConfig {
-    /// Default options for [`crate::control_plane::NodeInfoCache`].
-    /// Notice that by default the limiter is empty, which means that cache is disabled.
-    pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s";
-
-    /// Parse cache options passed via cmdline.
-    /// Example: [`Self::CACHE_DEFAULT_OPTIONS`].
-    fn parse(options: &str) -> anyhow::Result<Self> {
-        let mut initial_batch_size = None;
-        let mut default_batch_size = None;
-        let mut xread_timeout = None;
-        let mut stream_name = None;
-        let mut limiter_info = vec![];
-        let mut disable_cache = false;
-        let mut retry_interval = None;
-
-        for option in options.split(',') {
-            let (key, value) = option
-                .split_once('=')
-                .with_context(|| format!("bad key-value pair: {option}"))?;
-
-            match key {
-                "initial_batch_size" => initial_batch_size = Some(value.parse()?),
-                "default_batch_size" => default_batch_size = Some(value.parse()?),
-                "xread_timeout" => xread_timeout = Some(humantime::parse_duration(value)?),
-                "stream_name" => stream_name = Some(value.to_string()),
-                "limiter_info" => limiter_info.push(RateBucketInfo::from_str(value)?),
-                "disable_cache" => disable_cache = value.parse()?,
-                "retry_interval" => retry_interval = Some(humantime::parse_duration(value)?),
-                unknown => bail!("unknown key: {unknown}"),
-            }
-        }
-        RateBucketInfo::validate(&mut limiter_info)?;
-
-        Ok(Self {
-            initial_batch_size: initial_batch_size.context("missing `initial_batch_size`")?,
-            default_batch_size: default_batch_size.context("missing `default_batch_size`")?,
-            xread_timeout: xread_timeout.context("missing `xread_timeout`")?,
-            stream_name: stream_name.context("missing `stream_name`")?,
-            disable_cache,
-            limiter_info,
-            retry_interval: retry_interval.context("missing `retry_interval`")?,
-        })
-    }
-}
-
-impl FromStr for EndpointCacheConfig {
-    type Err = anyhow::Error;
-
-    fn from_str(options: &str) -> Result<Self, Self::Err> {
-        let error = || format!("failed to parse endpoint cache options '{options}'");
-        Self::parse(options).with_context(error)
-    }
-}
 #[derive(Debug)]
 pub struct MetricBackupCollectionConfig {
     pub remote_storage_config: Option<RemoteStorageConfig>,
diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs
index 7b0549e76f..3a8828e70c 100644
--- a/proxy/src/context/mod.rs
+++ b/proxy/src/context/mod.rs
@@ -7,7 +7,7 @@ use once_cell::sync::OnceCell;
 use smol_str::SmolStr;
 use tokio::sync::mpsc;
 use tracing::field::display;
-use tracing::{Span, debug, error, info_span};
+use tracing::{Span, error, info_span};
 use try_lock::TryLock;
 use uuid::Uuid;
 
@@ -15,10 +15,7 @@ use self::parquet::RequestData;
 use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo};
 use crate::error::ErrorKind;
 use crate::intern::{BranchIdInt, ProjectIdInt};
-use crate::metrics::{
-    ConnectOutcome, InvalidEndpointsGroup, LatencyAccumulated, LatencyTimer, Metrics, Protocol,
-    Waiting,
-};
+use crate::metrics::{LatencyAccumulated, LatencyTimer, Metrics, Protocol, Waiting};
 use crate::pqproto::StartupMessageParams;
 use crate::protocol2::{ConnectionInfo, ConnectionInfoExtra};
 use crate::types::{DbName, EndpointId, RoleName};
@@ -70,8 +67,6 @@ struct RequestContextInner {
     // This sender is only used to log the length of session in case of success.
     disconnect_sender: Option<mpsc::UnboundedSender<RequestData>>,
     pub(crate) latency_timer: LatencyTimer,
-    // Whether proxy decided that it's not a valid endpoint end rejected it before going to cplane.
-    rejected: Option<bool>,
     disconnect_timestamp: Option<chrono::DateTime<Utc>>,
 }
 
@@ -106,7 +101,6 @@ impl Clone for RequestContext {
             auth_method: inner.auth_method.clone(),
             jwt_issuer: inner.jwt_issuer.clone(),
             success: inner.success,
-            rejected: inner.rejected,
             cold_start_info: inner.cold_start_info,
             pg_options: inner.pg_options.clone(),
             testodrome_query_id: inner.testodrome_query_id.clone(),
@@ -151,7 +145,6 @@ impl RequestContext {
             auth_method: None,
             jwt_issuer: None,
             success: false,
-            rejected: None,
             cold_start_info: ColdStartInfo::Unknown,
             pg_options: None,
             testodrome_query_id: None,
@@ -183,11 +176,6 @@ impl RequestContext {
         )
     }
 
-    pub(crate) fn set_rejected(&self, rejected: bool) {
-        let mut this = self.0.try_lock().expect("should not deadlock");
-        this.rejected = Some(rejected);
-    }
-
     pub(crate) fn set_cold_start_info(&self, info: ColdStartInfo) {
         self.0
             .try_lock()
@@ -461,38 +449,6 @@ impl RequestContextInner {
     }
 
     fn log_connect(&mut self) {
-        let outcome = if self.success {
-            ConnectOutcome::Success
-        } else {
-            ConnectOutcome::Failed
-        };
-
-        // TODO: get rid of entirely/refactor
-        // check for false positives
-        // AND false negatives
-        if let Some(rejected) = self.rejected {
-            let ep = self
-                .endpoint_id
-                .as_ref()
-                .map(|x| x.as_str())
-                .unwrap_or_default();
-            // This makes sense only if cache is disabled
-            debug!(
-                ?outcome,
-                ?rejected,
-                ?ep,
-                "check endpoint is valid with outcome"
-            );
-            Metrics::get()
-                .proxy
-                .invalid_endpoints_total
-                .inc(InvalidEndpointsGroup {
-                    protocol: self.protocol,
-                    rejected: rejected.into(),
-                    outcome,
-                });
-        }
-
         if let Some(tx) = self.sender.take() {
             // If type changes, this error handling needs to be updated.
             let tx: mpsc::UnboundedSender<RequestData> = tx;
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
index fbacc97661..fc263b73b1 100644
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -159,13 +159,6 @@ impl NeonControlPlaneClient {
         ctx: &RequestContext,
         endpoint: &EndpointId,
     ) -> Result<Vec<AuthRule>, GetEndpointJwksError> {
-        if !self
-            .caches
-            .endpoints_cache
-            .is_valid(ctx, &endpoint.normalize())
-        {
-            return Err(GetEndpointJwksError::EndpointNotFound);
-        }
         let request_id = ctx.session_id().to_string();
         async {
             let request = self
@@ -300,11 +293,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
             return Ok(secret);
         }
 
-        if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
-            info!("endpoint is not valid, skipping the request");
-            return Err(GetAuthInfoError::UnknownEndpoint);
-        }
-
         let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
 
         let control = EndpointAccessControl {
@@ -346,11 +334,6 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
             return Ok(control);
         }
 
-        if !self.caches.endpoints_cache.is_valid(ctx, normalized_ep) {
-            info!("endpoint is not valid, skipping the request");
-            return Err(GetAuthInfoError::UnknownEndpoint);
-        }
-
         let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
 
         let control = EndpointAccessControl {
diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs
index 2ffc589df6..ecd4db29b2 100644
--- a/proxy/src/control_plane/client/mod.rs
+++ b/proxy/src/control_plane/client/mod.rs
@@ -13,9 +13,8 @@ use tracing::{debug, info};
 use super::{EndpointAccessControl, RoleAccessControl};
 use crate::auth::backend::ComputeUserInfo;
 use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError};
-use crate::cache::endpoints::EndpointsCache;
 use crate::cache::project_info::ProjectInfoCacheImpl;
-use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions};
+use crate::config::{CacheOptions, ProjectInfoCacheOptions};
 use crate::context::RequestContext;
 use crate::control_plane::{CachedNodeInfo, ControlPlaneApi, NodeInfoCache, errors};
 use crate::error::ReportableError;
@@ -121,15 +120,12 @@ pub struct ApiCaches {
     pub(crate) node_info: NodeInfoCache,
     /// Cache which stores project_id -> endpoint_ids mapping.
     pub project_info: Arc<ProjectInfoCacheImpl>,
-    /// List of all valid endpoints.
-    pub endpoints_cache: Arc<EndpointsCache>,
 }
 
 impl ApiCaches {
     pub fn new(
         wake_compute_cache_config: CacheOptions,
         project_info_cache_config: ProjectInfoCacheOptions,
-        endpoint_cache_config: EndpointCacheConfig,
     ) -> Self {
         Self {
             node_info: NodeInfoCache::new(
@@ -139,7 +135,6 @@ impl ApiCaches {
                 true,
             ),
             project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)),
-            endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)),
         }
     }
 }
diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs
index 77312c89c5..f640657d90 100644
--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -99,10 +99,6 @@ pub(crate) enum GetAuthInfoError {
 
     #[error(transparent)]
     ApiError(ControlPlaneError),
-
-    /// Proxy does not know about the endpoint in advanced
-    #[error("endpoint not found in endpoint cache")]
-    UnknownEndpoint,
 }
 
 // This allows more useful interactions than `#[from]`.
@@ -119,8 +115,6 @@ impl UserFacingError for GetAuthInfoError {
             Self::BadSecret => REQUEST_FAILED.to_owned(),
             // However, API might return a meaningful error.
             Self::ApiError(e) => e.to_string_client(),
-            // pretend like control plane returned an error.
-            Self::UnknownEndpoint => REQUEST_FAILED.to_owned(),
         }
     }
 }
@@ -130,8 +124,6 @@ impl ReportableError for GetAuthInfoError {
         match self {
             Self::BadSecret => crate::error::ErrorKind::ControlPlane,
             Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
-            // we only apply endpoint filtering if control plane is under high load.
-            Self::UnknownEndpoint => crate::error::ErrorKind::ServiceRateLimit,
         }
     }
 }
@@ -200,9 +192,6 @@ impl CouldRetry for WakeComputeError {
 
 #[derive(Debug, Error)]
 pub enum GetEndpointJwksError {
-    #[error("endpoint not found")]
-    EndpointNotFound,
-
     #[error("failed to build control plane request: {0}")]
     RequestBuild(#[source] reqwest::Error),
 
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 61d4636c2b..fd1b2af023 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -16,44 +16,6 @@ use super::LeakyBucketConfig;
 use crate::ext::LockExt;
 use crate::intern::EndpointIdInt;
 
-pub struct GlobalRateLimiter {
-    data: Vec<RateBucket>,
-    info: Vec<RateBucketInfo>,
-}
-
-impl GlobalRateLimiter {
-    pub fn new(info: Vec<RateBucketInfo>) -> Self {
-        Self {
-            data: vec![
-                RateBucket {
-                    start: Instant::now(),
-                    count: 0,
-                };
-                info.len()
-            ],
-            info,
-        }
-    }
-
-    /// Check that number of connections is below `max_rps` rps.
-    pub fn check(&mut self) -> bool {
-        let now = Instant::now();
-
-        let should_allow_request = self
-            .data
-            .iter_mut()
-            .zip(&self.info)
-            .all(|(bucket, info)| bucket.should_allow_request(info, now, 1));
-
-        if should_allow_request {
-            // only increment the bucket counts if the request will actually be accepted
-            self.data.iter_mut().for_each(|b| b.inc(1));
-        }
-
-        should_allow_request
-    }
-}
-
 // Simple per-endpoint rate limiter.
 //
 // Check that number of connections to the endpoint is below `max_rps` rps.
diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs
index 112b95873a..828bb63aac 100644
--- a/proxy/src/rate_limiter/mod.rs
+++ b/proxy/src/rate_limiter/mod.rs
@@ -8,4 +8,4 @@ pub(crate) use limit_algorithm::aimd::Aimd;
 pub(crate) use limit_algorithm::{
     DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token,
 };
-pub use limiter::{GlobalRateLimiter, RateBucketInfo, WakeComputeRateLimiter};
+pub use limiter::{RateBucketInfo, WakeComputeRateLimiter};
diff --git a/proxy/src/types.rs b/proxy/src/types.rs
index d5952d1d8b..43b8dc5b29 100644
--- a/proxy/src/types.rs
+++ b/proxy/src/types.rs
@@ -107,13 +107,3 @@ smol_str_wrapper!(DbName);
 
 // postgres hostname, will likely be a port:ip addr
 smol_str_wrapper!(Host);
-
-// Endpoints are a bit tricky. Rare they might be branches or projects.
-impl EndpointId {
-    pub(crate) fn is_endpoint(&self) -> bool {
-        self.0.starts_with("ep-")
-    }
-    pub(crate) fn is_branch(&self) -> bool {
-        self.0.starts_with("br-")
-    }
-}

From 2f3fc7cb57d0ed2c844f41f482a8778e973718a4 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Tue, 8 Jul 2025 12:51:50 +0400
Subject: [PATCH 007/163] Fix keep-failing reconciles test & add logs (#12497)

## Problem

Test is flaky due to the following warning in the logs:

```
Keeping extra secondaries: can't determine which of [NodeId(1), NodeId(2)] to remove (some nodes offline?)
```

Some nodes being offline is expected behavior in this test.

## Summary of changes

- Added `Keeping extra secondaries` to the list of allowed errors
- Improved logging for better debugging experience

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
---
 test_runner/regress/test_storage_controller.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 8471ab9f57..10845ef02e 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -1034,16 +1034,19 @@ def test_storage_controller_compute_hook_keep_failing(
     alive_pageservers = [p for p in env.pageservers if p.id != banned_tenant_ps.id]
 
     # Stop pageserver and ban tenant to trigger failed reconciliation
+    log.info(f"Banning tenant {banned_tenant} and stopping pageserver {banned_tenant_ps.id}")
     status_by_tenant[banned_tenant] = 423
     banned_tenant_ps.stop()
     env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
     env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
+    env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
     env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
     env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
 
     # Migrate all allowed tenant shards to the first alive pageserver
     # to trigger storage controller optimizations due to affinity rules
     for shard_number in range(shard_count):
+        log.info(f"Migrating shard {shard_number} of {allowed_tenant} to {alive_pageservers[0].id}")
         env.storage_controller.tenant_shard_migrate(
             TenantShardId(allowed_tenant, shard_number, shard_count),
             alive_pageservers[0].id,

From 18796fd1dd2337592b559ffc2fa745a973a5fe77 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Tue, 8 Jul 2025 13:15:29 +0400
Subject: [PATCH 008/163] tests: more allowed errors for
 test_safekeeper_migration (#12495)

## Problem
Pageserver now writes errors in the log during the safekeeper migration.
Some errors are added to allowed errors, but "timeline not found in
global map" is not.

- Will be properly fixed in
https://github.com/neondatabase/neon/issues/12191

## Summary of changes
Add "timeline not found in global map" error in a list of allowed errors
in `test_safekeeper_migration_simple`
---
 test_runner/regress/test_safekeeper_migration.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py
index 057371175c..b82d7b9bb0 100644
--- a/test_runner/regress/test_safekeeper_migration.py
+++ b/test_runner/regress/test_safekeeper_migration.py
@@ -27,6 +27,7 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
         [
             ".*Timeline .* was cancelled and cannot be used anymore.*",
             ".*Timeline .* has been deleted.*",
+            ".*Timeline .* was not found in global map.*",
             ".*wal receiver task finished with an error.*",
         ]
     )

From 4f16ab3f56ea6fee142e097012d58ad0a5170f12 Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Tue, 8 Jul 2025 10:34:01 +0100
Subject: [PATCH 009/163] add lfc offload and prewarm error metrics (#12486)

Add `compute_ctl_lfc_prewarm_errors_total` and
`compute_ctl_lfc_offload_errors_total` metrics.
Add comments in `test_lfc_prewarm`.
Correction PR for https://github.com/neondatabase/neon/pull/12447
https://github.com/neondatabase/cloud/issues/19011
---
 compute_tools/src/compute_prewarm.rs    |  6 ++-
 compute_tools/src/metrics.rs            | 18 +++++++
 test_runner/regress/test_lfc_prewarm.py | 63 ++++++++++++++++---------
 3 files changed, 62 insertions(+), 25 deletions(-)

diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs
index 4190580e5e..3f6f9a7ecc 100644
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -105,7 +105,8 @@ impl ComputeNode {
                 cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
                 return;
             };
-            error!(%err);
+            crate::metrics::LFC_PREWARM_ERRORS.inc();
+            error!(%err, "prewarming lfc");
             cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
                 error: err.to_string(),
             };
@@ -180,7 +181,8 @@ impl ComputeNode {
             self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
             return;
         };
-        error!(%err);
+        crate::metrics::LFC_OFFLOAD_ERRORS.inc();
+        error!(%err, "offloading lfc");
         self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
             error: err.to_string(),
         };
diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index 8f81675c49..91dedbb42a 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -105,6 +105,14 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "compute_ctl_lfc_prewarm_errors_total",
+        "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "compute_ctl_lfc_offloads_total",
@@ -113,6 +121,14 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "compute_ctl_lfc_offload_errors_total",
+        "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
+    )
+    .expect("failed to define a metric")
+});
+
 pub fn collect() -> Vec<MetricFamily> {
     let mut metrics = COMPUTE_CTL_UP.collect();
     metrics.extend(INSTALLED_EXTENSIONS.collect());
@@ -123,6 +139,8 @@ pub fn collect() -> Vec<MetricFamily> {
     metrics.extend(PG_CURR_DOWNTIME_MS.collect());
     metrics.extend(PG_TOTAL_DOWNTIME_MS.collect());
     metrics.extend(LFC_PREWARMS.collect());
+    metrics.extend(LFC_PREWARM_ERRORS.collect());
     metrics.extend(LFC_OFFLOADS.collect());
+    metrics.extend(LFC_OFFLOAD_ERRORS.collect());
     metrics
 }
diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py
index 1fa1ead034..ae36bbda79 100644
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,7 +1,6 @@
 import random
 import threading
 from enum import StrEnum
-from time import sleep
 from typing import Any
 
 import pytest
@@ -20,28 +19,32 @@ class PrewarmMethod(StrEnum):
 
 
 PREWARM_LABEL = "compute_ctl_lfc_prewarms_total"
+PREWARM_ERR_LABEL = "compute_ctl_lfc_prewarm_errors_total"
 OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
+OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
 METHOD_VALUES = [e for e in PrewarmMethod]
 METHOD_IDS = [e.value for e in PrewarmMethod]
 
 
 def check_pinned_entries(cur: Cursor):
-    # some LFC buffer can be temporary locked by autovacuum or background writer
-    for _ in range(10):
+    """
+    Wait till none of LFC buffers are pinned
+    """
+
+    def none_pinned():
         cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
-        n_pinned = cur.fetchall()[0][0]
-        if n_pinned == 0:
-            break
-        sleep(1)
-    assert n_pinned == 0
+        assert cur.fetchall()[0][0] == 0
+
+    wait_until(none_pinned)
 
 
 def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
+    labels = PREWARM_LABEL, OFFLOAD_LABEL, PREWARM_ERR_LABEL, OFFLOAD_ERR_LABEL
     return {
-        sample.name: sample.value
+        sample.name: int(sample.value)
         for family in prom_parse_impl(client.metrics())
         for sample in family.samples
-        if sample.name in (PREWARM_LABEL, OFFLOAD_LABEL)
+        if sample.name in labels
     }
 
 
@@ -54,7 +57,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
         assert "error" not in status
         client.offload_lfc()
         assert client.prewarm_lfc_status()["status"] == "not_prewarmed"
-        assert prom_parse(client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0}
+        parsed = prom_parse(client)
+        desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
+        assert parsed == desired, f"{parsed=} != {desired=}"
     elif method == PrewarmMethod.POSTGRES:
         cur.execute("select get_local_cache_state()")
         return cur.fetchall()[0][0]
@@ -81,12 +86,17 @@ def check_prewarmed(
         assert prom_parse(client)[PREWARM_LABEL] == 1
     elif method == PrewarmMethod.COMPUTE_CTL:
         assert client.prewarm_lfc_status() == desired_status
-        assert prom_parse(client) == {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1}
+        desired = {OFFLOAD_LABEL: 0, PREWARM_LABEL: 1, PREWARM_ERR_LABEL: 0, OFFLOAD_ERR_LABEL: 0}
+        assert prom_parse(client) == desired
 
 
 @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 @pytest.mark.parametrize("method", METHOD_VALUES, ids=METHOD_IDS)
 def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
+    """
+    Test we can offload endpoint's LFC cache to endpoint storage.
+    Test we can prewarm endpoint with LFC cache loaded from endpoint storage.
+    """
     env = neon_simple_env
     n_records = 1000000
     cfg = [
@@ -140,18 +150,15 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
     lfc_used_pages = pg_cur.fetchall()[0][0]
     log.info(f"Used LFC size: {lfc_used_pages}")
     pg_cur.execute("select * from get_prewarm_info()")
-    prewarm_info = pg_cur.fetchall()[0]
-    log.info(f"Prewarm info: {prewarm_info}")
-    total, prewarmed, skipped, _ = prewarm_info
+    total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
+    log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
     progress = (prewarmed + skipped) * 100 // total
     log.info(f"Prewarm progress: {progress}%")
-
     assert lfc_used_pages > 10000
-    assert (
-        prewarm_info[0] > 0
-        and prewarm_info[1] > 0
-        and prewarm_info[0] == prewarm_info[1] + prewarm_info[2]
-    )
+    assert total > 0
+    assert prewarmed > 0
+    assert total == prewarmed + skipped
+
     lfc_cur.execute("select sum(pk) from t")
     assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
 
@@ -168,6 +175,9 @@ WORKLOAD_IDS = METHOD_IDS[:-1]
 @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
 @pytest.mark.parametrize("method", WORKLOAD_VALUES, ids=WORKLOAD_IDS)
 def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMethod):
+    """
+    Test continiously prewarming endpoint when there is a write-heavy workload going in parallel
+    """
     env = neon_simple_env
     n_records = 10000
     n_threads = 4
@@ -247,5 +257,12 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
     assert total_balance == 0
 
     check_pinned_entries(pg_cur)
-    if method != PrewarmMethod.POSTGRES:
-        assert prom_parse(http_client) == {OFFLOAD_LABEL: 1, PREWARM_LABEL: n_prewarms}
+    if method == PrewarmMethod.POSTGRES:
+        return
+    desired = {
+        OFFLOAD_LABEL: 1,
+        PREWARM_LABEL: n_prewarms,
+        OFFLOAD_ERR_LABEL: 0,
+        PREWARM_ERR_LABEL: 0,
+    }
+    assert prom_parse(http_client) == desired

From f51ed4a2c40b863b0d8e993e690270e7307b879c Mon Sep 17 00:00:00 2001
From: Peter Bendel <peterbendel@neon.tech>
Date: Tue, 8 Jul 2025 11:38:06 +0200
Subject: [PATCH 010/163] "disable" disk eviction in pagebench periodic
 benchmark (#12487)

## Problem

https://github.com/neondatabase/neon/pull/12464 introduced new defaults
for pageserver disk based eviction which activated disk based eviction
for pagebench periodic pagebench.
This caused the testcase to fail.

## Summary of changes

Override the new defaults during testcase execution.

## Test run

https://github.com/neondatabase/neon/actions/runs/16120217757/job/45483869734

Test run was successful, so merging this now
---
 .../test_pageserver_max_throughput_getpage_at_latest_lsn.py   | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index 68bfa81b25..bf998a2a0a 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -104,9 +104,7 @@ def setup_and_run_pagebench_benchmark(
     # configure cache sizes like in prod
     page_cache_size = 16384
     max_file_descriptors = 500000
-    neon_env_builder.pageserver_config_override = (
-        f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}"
-    )
+    neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{max_usage_pct=99, min_avail_bytes=0, period = '999y'}}"
 
     tracing_config = PageserverTracingConfig(
         sampling_ratio=(0, 1000),

From 59e393aef35fea56bbbf5dd1feeebfb3c518731d Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Tue, 8 Jul 2025 13:28:39 +0200
Subject: [PATCH 011/163] Enable parallel execution of extension tests (#12118)

## Problem
Extension tests were previously run sequentially, resulting in
unnecessary wait time and underutilization of available CPU cores.
## Summary of changes
Tests are now executed in a customizable number of parallel threads
using separate database branches. This reduces overall test time by
approximately 50% (e.g., on my laptop, parallel test lasts 173s, while
sequential one lasts 340s) and increases the load on the pageserver,
providing better test coverage.

---------

Co-authored-by: Alexander Bayandin <alexander@neon.tech>
Co-authored-by: Devin AI <158243242+devin-ai-integration[bot]@users.noreply.github.com>
Co-authored-by: Alexey Masterov <alexey.masterov@databricks.com>
---
 .github/workflows/build_and_test.yml          |  1 +
 .gitignore                                    |  1 +
 compute/compute-node.Dockerfile               |  4 +-
 .../compute_wrapper/shell/compute.sh          | 18 ++--
 docker-compose/docker-compose.yml             | 25 +++--
 docker-compose/docker_compose_test.sh         | 92 +++++++++++++------
 docker-compose/run-tests.sh                   | 69 +++++++++-----
 docker-compose/test_extensions_upgrade.sh     |  6 +-
 8 files changed, 144 insertions(+), 72 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 0ceaa96fb0..864abad574 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -986,6 +986,7 @@ jobs:
       - name: Verify docker-compose example and test extensions
         timeout-minutes: 60
         env:
+          PARALLEL_COMPUTES: 3
           TAG: >-
             ${{
               needs.meta.outputs.run-kind == 'compute-rc-pr'
diff --git a/.gitignore b/.gitignore
index 6574d7b9de..4857972f1d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ neon.iml
 /.neon
 /integration_tests/.neon
 compaction-suite-results.*
+docker-compose/docker-compose-parallel.yml
 
 # Coverage
 *.profraw
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 0dd32011fb..39136fe573 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -1915,10 +1915,10 @@ RUN cd /ext-src/pg_repack-src && patch -p1 </ext-src/pg_repack.patch && rm -f /e
 
 COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh
 RUN echo /usr/local/pgsql/lib > /etc/ld.so.conf.d/00-neon.conf && /sbin/ldconfig
-RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq \
+RUN apt-get update && apt-get install -y libtap-parser-sourcehandler-pgtap-perl jq parallel \
    && apt clean && rm -rf /ext-src/*.tar.gz /ext-src/*.patch /var/lib/apt/lists/*
 ENV PATH=/usr/local/pgsql/bin:$PATH
-ENV PGHOST=compute
+ENV PGHOST=compute1
 ENV PGPORT=55433
 ENV PGUSER=cloud_admin
 ENV PGDATABASE=postgres
diff --git a/docker-compose/compute_wrapper/shell/compute.sh b/docker-compose/compute_wrapper/shell/compute.sh
index 1e62e91fd0..6f36b4358e 100755
--- a/docker-compose/compute_wrapper/shell/compute.sh
+++ b/docker-compose/compute_wrapper/shell/compute.sh
@@ -54,14 +54,16 @@ else
     printf '%s\n' "${result}" | jq .
   fi
 
-  echo "Check if a timeline present"
-  PARAMS=(
-       -X GET
-       -H "Content-Type: application/json"
-       "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
-  )
-  timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
-  if [[ -z "${timeline_id}" || "${timeline_id}" = null ]]; then
+  if [[ "${RUN_PARALLEL:-false}" != "true" ]]; then
+    echo "Check if a timeline present"
+    PARAMS=(
+         -X GET
+         -H "Content-Type: application/json"
+        "http://pageserver:9898/v1/tenant/${tenant_id}/timeline"
+    )
+    timeline_id=$(curl "${PARAMS[@]}" | jq -r .[0].timeline_id)
+  fi
+  if [[ -z "${timeline_id:-}" || "${timeline_id:-}" = null ]]; then
     generate_id timeline_id
     PARAMS=(
         -sbf
diff --git a/docker-compose/docker-compose.yml b/docker-compose/docker-compose.yml
index 2519b75c7f..19c3bc74e2 100644
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -142,7 +142,7 @@ services:
       - "storage_broker"
       - "--listen-addr=0.0.0.0:50051"
 
-  compute:
+  compute1:
     restart: always
     build:
       context: ./compute_wrapper/
@@ -152,6 +152,7 @@ services:
         - TAG=${COMPUTE_TAG:-${TAG:-latest}}
         - http_proxy=${http_proxy:-}
         - https_proxy=${https_proxy:-}
+    image: built-compute
     environment:
       - PG_VERSION=${PG_VERSION:-16}
       - TENANT_ID=${TENANT_ID:-}
@@ -166,6 +167,11 @@ services:
       - 3080:3080 # http endpoints
     entrypoint:
       - "/shell/compute.sh"
+    # Ad an alias for compute1 for compatibility
+    networks:
+      default:
+        aliases:
+            - compute
     depends_on:
       - safekeeper1
       - safekeeper2
@@ -174,15 +180,20 @@ services:
 
   compute_is_ready:
     image: postgres:latest
+    environment:
+      - PARALLEL_COMPUTES=1
     entrypoint:
-      - "/bin/bash"
+      - "/bin/sh"
       - "-c"
     command:
-      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
-            echo 'Waiting to start compute...' && sleep 1;
-         done"
+      - "for i in $(seq 1 $${PARALLEL_COMPUTES}); do
+           until pg_isready -h compute$$i -p 55433 -U cloud_admin ; do
+             sleep 1;
+           done;
+         done;
+         echo All computes are started"
     depends_on:
-      - compute
+      - compute1
 
   neon-test-extensions:
     profiles: ["test-extensions"]
@@ -196,4 +207,4 @@ services:
     command:
       - sleep 3600
     depends_on:
-      - compute
+      - compute1
diff --git a/docker-compose/docker_compose_test.sh b/docker-compose/docker_compose_test.sh
index 6edf90ca8d..063b8dee85 100755
--- a/docker-compose/docker_compose_test.sh
+++ b/docker-compose/docker_compose_test.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 
 # A basic test to ensure Docker images are built correctly.
 # Build a wrapper around the compute, start all services and runs a simple SQL query.
@@ -13,9 +13,36 @@
 #
 set -eux -o pipefail
 
+cd "$(dirname "${0}")"
 export COMPOSE_FILE='docker-compose.yml'
 export COMPOSE_PROFILES=test-extensions
-cd "$(dirname "${0}")"
+export PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
+READY_MESSAGE="All computes are started"
+COMPUTES=()
+for i in $(seq 1 "${PARALLEL_COMPUTES}"); do
+  COMPUTES+=("compute${i}")
+done
+CURRENT_TMPDIR=$(mktemp -d)
+trap 'rm -rf ${CURRENT_TMPDIR} docker-compose-parallel.yml' EXIT
+if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
+  export COMPOSE_FILE=docker-compose-parallel.yml
+  cp docker-compose.yml docker-compose-parallel.yml
+  # Replace the environment variable PARALLEL_COMPUTES with the actual value
+  yq eval -i ".services.compute_is_ready.environment |=  map(select(. | test(\"^PARALLEL_COMPUTES=\") | not)) + [\"PARALLEL_COMPUTES=${PARALLEL_COMPUTES}\"]" ${COMPOSE_FILE}
+  for i in $(seq 2 "${PARALLEL_COMPUTES}"); do
+    # Duplicate compute1 as compute${i} for parallel execution
+    yq eval -i ".services.compute${i} = .services.compute1" ${COMPOSE_FILE}
+    # We don't need these sections, so delete them
+    yq eval -i "(del .services.compute${i}.build) | (del .services.compute${i}.ports) | (del .services.compute${i}.networks)" ${COMPOSE_FILE}
+    # Let the compute 1 be the only dependence
+    yq eval -i ".services.compute${i}.depends_on = [\"compute1\"]" ${COMPOSE_FILE}
+    # Set RUN_PARALLEL=true for compute2. They will generate tenant_id and timeline_id to avoid using the same as other computes
+    yq eval -i ".services.compute${i}.environment += [\"RUN_PARALLEL=true\"]" ${COMPOSE_FILE}
+    # Remove TENANT_ID and TIMELINE_ID from the environment variables of the generated computes
+    # They will create new TENANT_ID and TIMELINE_ID anyway.
+    yq eval -i ".services.compute${i}.environment |= map(select(. | (test(\"^TENANT_ID=\") or test(\"^TIMELINE_ID=\")) | not))" ${COMPOSE_FILE}
+  done
+fi
 PSQL_OPTION="-h localhost -U cloud_admin -p 55433 -d postgres"
 
 function cleanup() {
@@ -27,11 +54,11 @@ function cleanup() {
 
 for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
     pg_version=${pg_version/v/}
-    echo "clean up containers if exists"
+    echo "clean up containers if exist"
     cleanup
     PG_TEST_VERSION=$((pg_version < 16 ? 16 : pg_version))
-    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull --build -d
-
+    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose build compute1
+    PG_VERSION=${pg_version} PG_TEST_VERSION=${PG_TEST_VERSION} docker compose up --quiet-pull -d
     echo "wait until the compute is ready. timeout after 60s. "
     cnt=0
     while sleep 3; do
@@ -41,45 +68,50 @@ for pg_version in ${TEST_VERSION_ONLY-14 15 16 17}; do
             echo "timeout before the compute is ready."
             exit 1
         fi
-        if docker compose logs "compute_is_ready" | grep -q "accepting connections"; then
+        if docker compose logs compute_is_ready | grep -q "${READY_MESSAGE}"; then
             echo "OK. The compute is ready to connect."
             echo "execute simple queries."
-            docker compose exec compute /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
+            for compute in "${COMPUTES[@]}"; do
+              docker compose exec "${compute}" /bin/bash -c "psql ${PSQL_OPTION} -c 'SELECT 1'"
+            done
             break
         fi
     done
 
     if [[ ${pg_version} -ge 16 ]]; then
-        # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
-        # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
-        echo Adding dummy config
-        docker compose exec compute touch /var/db/postgres/compute/compute_ctl_temp_override.conf
-        # Prepare for the PostGIS test
-        docker compose exec compute mkdir -p /tmp/pgis_reg/pgis_reg_tmp
-        TMPDIR=$(mktemp -d)
-        docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${TMPDIR}"
-        docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${TMPDIR}"
-        docker compose exec compute mkdir -p /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
-        docker compose cp "${TMPDIR}/test" compute:/ext-src/postgis-src/raster/test
-        docker compose cp "${TMPDIR}/00-regress-install" compute:/ext-src/postgis-src/regress
-        rm -rf "${TMPDIR}"
-        # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
-        TMPDIR=$(mktemp -d)
-        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${TMPDIR}/data"
-        docker compose cp "${TMPDIR}/data" compute:/ext-src/pg_hint_plan-src/
-        rm -rf "${TMPDIR}"
-        # The following block does the same for the contrib/file_fdw test
-        TMPDIR=$(mktemp -d)
-        docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${TMPDIR}/data"
-        docker compose cp "${TMPDIR}/data" compute:/postgres/contrib/file_fdw/data
-        rm -rf "${TMPDIR}"
+        mkdir "${CURRENT_TMPDIR}"/{pg_hint_plan-src,file_fdw,postgis-src}
+        docker compose cp neon-test-extensions:/ext-src/postgis-src/raster/test "${CURRENT_TMPDIR}/postgis-src/test"
+        docker compose cp neon-test-extensions:/ext-src/postgis-src/regress/00-regress-install "${CURRENT_TMPDIR}/postgis-src/00-regress-install"
+        docker compose cp neon-test-extensions:/ext-src/pg_hint_plan-src/data "${CURRENT_TMPDIR}/pg_hint_plan-src/data"
+        docker compose cp neon-test-extensions:/postgres/contrib/file_fdw/data "${CURRENT_TMPDIR}/file_fdw/data"
+
+        for compute in "${COMPUTES[@]}"; do
+          # This is required for the pg_hint_plan test, to prevent flaky log message causing the test to fail
+          # It cannot be moved to Dockerfile now because the database directory is created after the start of the container
+          echo Adding dummy config on "${compute}"
+          docker compose exec "${compute}" touch /var/db/postgres/compute/compute_ctl_temp_override.conf
+          # Prepare for the PostGIS test
+          docker compose exec "${compute}" mkdir -p /tmp/pgis_reg/pgis_reg_tmp /ext-src/postgis-src/raster /ext-src/postgis-src/regress /ext-src/postgis-src/regress/00-regress-install
+          docker compose cp "${CURRENT_TMPDIR}/postgis-src/test" "${compute}":/ext-src/postgis-src/raster/test
+          docker compose cp "${CURRENT_TMPDIR}/postgis-src/00-regress-install" "${compute}":/ext-src/postgis-src/regress
+          # The following block copies the files for the pg_hintplan test to the compute node for the extension test in an isolated docker-compose environment
+          docker compose cp "${CURRENT_TMPDIR}/pg_hint_plan-src/data" "${compute}":/ext-src/pg_hint_plan-src/
+          # The following block does the same for the contrib/file_fdw test
+          docker compose cp "${CURRENT_TMPDIR}/file_fdw/data" "${compute}":/postgres/contrib/file_fdw/data
+        done
         # Apply patches
         docker compose exec -T neon-test-extensions bash -c "(cd /postgres && patch -p1)" <"../compute/patches/contrib_pg${pg_version}.patch"
         # We are running tests now
         rm -f testout.txt testout_contrib.txt
+        # We want to run the longest tests first to better utilize parallelization and reduce overall test time.
+        # Tests listed in the RUN_FIRST variable will be run before others.
+        # If parallelization is not used, this environment variable will be ignored.
+
         docker compose exec -e USE_PGXS=1 -e SKIP=timescaledb-src,rdkit-src,pg_jsonschema-src,kq_imcx-src,wal2json_2_5-src,rag_jina_reranker_v1_tiny_en-src,rag_bge_small_en_v15-src \
+        -e RUN_FIRST=hll-src,postgis-src,pgtap-src -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
         neon-test-extensions /run-tests.sh /ext-src | tee testout.txt && EXT_SUCCESS=1 || EXT_SUCCESS=0
         docker compose exec -e SKIP=start-scripts,postgres_fdw,ltree_plpython,jsonb_plpython,jsonb_plperl,hstore_plpython,hstore_plperl,dblink,bool_plperl \
+        -e PARALLEL_COMPUTES="${PARALLEL_COMPUTES}" \
         neon-test-extensions /run-tests.sh /postgres/contrib | tee testout_contrib.txt && CONTRIB_SUCCESS=1 || CONTRIB_SUCCESS=0
         if [[ ${EXT_SUCCESS} -eq 0 || ${CONTRIB_SUCCESS} -eq 0 ]]; then
             CONTRIB_FAILED=
diff --git a/docker-compose/run-tests.sh b/docker-compose/run-tests.sh
index 930402ce66..b37b9363fa 100755
--- a/docker-compose/run-tests.sh
+++ b/docker-compose/run-tests.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -x
 
 if [[ -v BENCHMARK_CONNSTR ]]; then
@@ -26,8 +26,9 @@ if [[ -v BENCHMARK_CONNSTR ]]; then
   fi
 fi
 REGULAR_USER=false
-while getopts r arg; do
-  case $arg in
+PARALLEL_COMPUTES=${PARALLEL_COMPUTES:-1}
+while getopts pr arg; do
+  case ${arg} in
   r)
     REGULAR_USER=true
     shift $((OPTIND-1))
@@ -41,26 +42,49 @@ extdir=${1}
 
 cd "${extdir}" || exit 2
 FAILED=
-LIST=$( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
-for d in ${LIST}; do
-    [ -d "${d}" ] || continue
-    if ! psql -w -c "select 1" >/dev/null; then
-      FAILED="${d} ${FAILED}"
-      break
-    fi
-    if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
-       "${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
-       continue
-    fi
+export FAILED_FILE=/tmp/failed
+rm -f ${FAILED_FILE}
+mapfile -t LIST < <( (echo -e "${SKIP//","/"\n"}"; ls) | sort | uniq -u)
+if [[ ${PARALLEL_COMPUTES} -gt 1 ]]; then
+  # Avoid errors if RUN_FIRST is not defined
+  RUN_FIRST=${RUN_FIRST:-}
+  # Move entries listed in the RUN_FIRST variable to the beginning
+  ORDERED_LIST=$(printf "%s\n" "${LIST[@]}" | grep -x -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"); printf "%s\n" "${LIST[@]}" | grep -vx -Ff <(echo -e "${RUN_FIRST//,/$'\n'}"))
+  parallel -j"${PARALLEL_COMPUTES}" "[[ -d {} ]] || exit 0
+                export PGHOST=compute{%}
+                if ! psql -c 'select 1'>/dev/null; then
+                  exit 1
+                fi
+                echo Running on \${PGHOST}
+                if [[ -f ${extdir}/{}/neon-test.sh ]]; then
+                  echo Running from script
+                  ${extdir}/{}/neon-test.sh || echo {} >> ${FAILED_FILE};
+                else
+                  echo Running using make;
+                  USE_PGXS=1 make -C {} installcheck || echo {} >> ${FAILED_FILE};
+                fi" ::: ${ORDERED_LIST}
+  [[ ! -f ${FAILED_FILE} ]] && exit 0
+else
+  for d in "${LIST[@]}"; do
+      [ -d "${d}" ] || continue
+      if ! psql -w -c "select 1" >/dev/null; then
+        FAILED="${d} ${FAILED}"
+        break
+      fi
+      if [[ ${REGULAR_USER} = true ]] && [ -f "${d}"/regular-test.sh ]; then
+        "${d}/regular-test.sh" || FAILED="${d} ${FAILED}"
+        continue
+      fi
 
-    if [ -f "${d}/neon-test.sh" ]; then
-       "${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
-    else
-       USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
-    fi
-done
-[ -z "${FAILED}" ] && exit 0
-for d in ${FAILED}; do
+      if [ -f "${d}/neon-test.sh" ]; then
+        "${d}/neon-test.sh" || FAILED="${d} ${FAILED}"
+      else
+        USE_PGXS=1 make -C "${d}" installcheck || FAILED="${d} ${FAILED}"
+      fi
+  done
+  [[ -z ${FAILED} ]]  && exit 0
+fi
+for d in ${FAILED} $([[ ! -f ${FAILED_FILE} ]] || cat ${FAILED_FILE}); do
   cat "$(find $d -name regression.diffs)"
 done
 for postgis_diff in /tmp/pgis_reg/*_diff; do
@@ -68,4 +92,5 @@ for postgis_diff in /tmp/pgis_reg/*_diff; do
   cat "${postgis_diff}"
 done
 echo "${FAILED}"
+cat ${FAILED_FILE}
 exit 1
diff --git a/docker-compose/test_extensions_upgrade.sh b/docker-compose/test_extensions_upgrade.sh
index f1cf17f531..1d39fc029e 100755
--- a/docker-compose/test_extensions_upgrade.sh
+++ b/docker-compose/test_extensions_upgrade.sh
@@ -1,4 +1,4 @@
-#!/bin/bash
+#!/usr/bin/env bash
 set -eux -o pipefail
 cd "$(dirname "${0}")"
 # Takes a variable name as argument. The result is stored in that variable.
@@ -60,8 +60,8 @@ function check_timeline() {
 # Restarts the compute node with the required compute tag and timeline.
 # Accepts the tag for the compute node and the timeline as parameters.
 function restart_compute() {
-  docker compose down compute compute_is_ready
-  COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute compute_is_ready
+  docker compose down compute1 compute_is_ready
+  COMPUTE_TAG=${1} TENANT_ID=${tenant_id} TIMELINE_ID=${2} docker compose up --quiet-pull -d --build compute1 compute_is_ready
   wait_for_ready
   check_timeline ${2}
 }

From 2b2a54767183da8784557b5c4ab714674a6694c8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 8 Jul 2025 14:56:22 +0200
Subject: [PATCH 012/163] fix(tests): periodic and immediate gc is effectively
 a no-op in tests (#12431)

The introduction of the default lease deadline feature 9 months ago made
it so
that after PS restart, `.timeline_gc()` calls in Python tests are no-ops
for 10 minute after pageserver startup: the `gc_iteration()` bails with
`Skipping GC because lsn lease deadline is not reached`.

I did some impact analysis in the following PR. About 30 Python tests
are affected:
- https://github.com/neondatabase/neon/pull/12411

Rust tests that don't explicitly enable periodic GC or invoke GC
manually
are unaffected because we disable periodic GC by default in
the `TenantHarness`'s tenant config.
Two tests explicitly did `start_paused=true` + `tokio::time::advance()`,
but it would add cognitive and code bloat to each existing and future
test case that uses TenantHarness if we took that route.

So, this PR sets the default lease deadline feature in both Python
and Rust tests to zero by default. Tests that test the feature were
thus identified by failing the test:
- Python test `test_readonly_node_gc` + `test_lsn_lease_size`
- Rust test `test_lsn_lease`.

To accomplish the above, I changed the code that computes the initial
lease
deadline to respect the pageserver.toml's default tenant config, which
it didn't before (and I would consider a bug). The Python test harness
and the Rust TenantHarness test harness then simply set the default
tenant
config field to zero.

Drive-by:
- `test_lsn_lease_size` was writing a lot of data unnecessarily; reduce
the amount and speed up the test

refs
- PR that introduced default lease deadline:
https://github.com/neondatabase/neon/pull/9055/files
- fixes https://databricks.atlassian.net/browse/LKB-92

---------

Co-authored-by: Christian Schwarz <Christian Schwarz>
---
 pageserver/src/config.rs                  | 11 ++++-
 pageserver/src/tenant.rs                  | 59 ++++++++++++++---------
 pageserver/src/tenant/mgr.rs              | 15 +++---
 test_runner/fixtures/neon_fixtures.py     |  8 +++
 test_runner/regress/test_readonly_node.py |  4 +-
 test_runner/regress/test_tenant_size.py   | 20 +++-----
 6 files changed, 72 insertions(+), 45 deletions(-)

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 75b41b9b60..6e22f9f36e 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -635,7 +635,7 @@ impl PageServerConf {
     pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
         let pg_distrib_dir = Utf8PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("../pg_install");
 
-        let config_toml = pageserver_api::config::ConfigToml {
+        let mut config_toml = pageserver_api::config::ConfigToml {
             wait_lsn_timeout: Duration::from_secs(60),
             wal_redo_timeout: Duration::from_secs(60),
             pg_distrib_dir: Some(pg_distrib_dir),
@@ -647,6 +647,15 @@ impl PageServerConf {
             control_plane_api: Some(Url::parse("http://localhost:6666").unwrap()),
             ..Default::default()
         };
+
+        // Test authors tend to forget about the default 10min initial lease deadline
+        // when writing tests, which turns their immediate gc requests via mgmt API
+        // into no-ops. Override the binary default here, such that there is no initial
+        // lease deadline by default in tests. Tests that care can always override it
+        // themselves.
+        // Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
+        config_toml.tenant_config.lsn_lease_length = Duration::from_secs(0);
+
         PageServerConf::parse_and_validate(NodeId(0), config_toml, &repo_dir).unwrap()
     }
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f4877fd763..9aabd6341f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -34,7 +34,7 @@ use once_cell::sync::Lazy;
 pub use pageserver_api::models::TenantState;
 use pageserver_api::models::{self, RelSizeMigration};
 use pageserver_api::models::{
-    CompactInfoResponse, LsnLease, TimelineArchivalState, TimelineState, TopTenantShardItem,
+    CompactInfoResponse, TimelineArchivalState, TimelineState, TopTenantShardItem,
     WalRedoManagerStatus,
 };
 use pageserver_api::shard::{ShardIdentity, ShardStripeSize, TenantShardId};
@@ -180,6 +180,7 @@ pub(super) struct AttachedTenantConf {
 
 impl AttachedTenantConf {
     fn new(
+        conf: &'static PageServerConf,
         tenant_conf: pageserver_api::models::TenantConfig,
         location: AttachedLocationConfig,
     ) -> Self {
@@ -191,9 +192,7 @@ impl AttachedTenantConf {
         let lsn_lease_deadline = if location.attach_mode == AttachmentMode::Single {
             Some(
                 tokio::time::Instant::now()
-                    + tenant_conf
-                        .lsn_lease_length
-                        .unwrap_or(LsnLease::DEFAULT_LENGTH),
+                    + TenantShard::get_lsn_lease_length_impl(conf, &tenant_conf),
             )
         } else {
             // We don't use `lsn_lease_deadline` to delay GC in AttachedMulti and AttachedStale
@@ -208,10 +207,13 @@ impl AttachedTenantConf {
         }
     }
 
-    fn try_from(location_conf: LocationConf) -> anyhow::Result<Self> {
+    fn try_from(
+        conf: &'static PageServerConf,
+        location_conf: LocationConf,
+    ) -> anyhow::Result<Self> {
         match &location_conf.mode {
             LocationMode::Attached(attach_conf) => {
-                Ok(Self::new(location_conf.tenant_conf, *attach_conf))
+                Ok(Self::new(conf, location_conf.tenant_conf, *attach_conf))
             }
             LocationMode::Secondary(_) => {
                 anyhow::bail!(
@@ -4205,10 +4207,16 @@ impl TenantShard {
     }
 
     pub fn get_lsn_lease_length(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        Self::get_lsn_lease_length_impl(self.conf, &self.tenant_conf.load().tenant_conf)
+    }
+
+    pub fn get_lsn_lease_length_impl(
+        conf: &'static PageServerConf,
+        tenant_conf: &pageserver_api::models::TenantConfig,
+    ) -> Duration {
         tenant_conf
             .lsn_lease_length
-            .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length)
+            .unwrap_or(conf.default_tenant_conf.lsn_lease_length)
     }
 
     pub fn get_timeline_offloading_enabled(&self) -> bool {
@@ -6020,11 +6028,14 @@ pub(crate) mod harness {
             let tenant = Arc::new(TenantShard::new(
                 TenantState::Attaching,
                 self.conf,
-                AttachedTenantConf::try_from(LocationConf::attached_single(
-                    self.tenant_conf.clone(),
-                    self.generation,
-                    ShardParameters::default(),
-                ))
+                AttachedTenantConf::try_from(
+                    self.conf,
+                    LocationConf::attached_single(
+                        self.tenant_conf.clone(),
+                        self.generation,
+                        ShardParameters::default(),
+                    ),
+                )
                 .unwrap(),
                 self.shard_identity,
                 Some(walredo_mgr),
@@ -6125,7 +6136,7 @@ mod tests {
     use pageserver_api::keyspace::KeySpace;
     #[cfg(feature = "testing")]
     use pageserver_api::keyspace::KeySpaceRandomAccum;
-    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings};
+    use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings, LsnLease};
     use pageserver_compaction::helpers::overlaps_with;
     #[cfg(feature = "testing")]
     use rand::SeedableRng;
@@ -6675,17 +6686,13 @@ mod tests {
         tline.freeze_and_flush().await.map_err(|e| e.into())
     }
 
-    #[tokio::test(start_paused = true)]
+    #[tokio::test]
     async fn test_prohibit_branch_creation_on_garbage_collected_data() -> anyhow::Result<()> {
         let (tenant, ctx) =
             TenantHarness::create("test_prohibit_branch_creation_on_garbage_collected_data")
                 .await?
                 .load()
                 .await;
-        // Advance to the lsn lease deadline so that GC is not blocked by
-        // initial transition into AttachedSingle.
-        tokio::time::advance(tenant.get_lsn_lease_length()).await;
-        tokio::time::resume();
         let tline = tenant
             .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
             .await?;
@@ -9384,17 +9391,21 @@ mod tests {
         Ok(())
     }
 
-    #[tokio::test(start_paused = true)]
+    #[tokio::test]
     async fn test_lsn_lease() -> anyhow::Result<()> {
         let (tenant, ctx) = TenantHarness::create("test_lsn_lease")
             .await
             .unwrap()
             .load()
             .await;
-        // Advance to the lsn lease deadline so that GC is not blocked by
-        // initial transition into AttachedSingle.
-        tokio::time::advance(tenant.get_lsn_lease_length()).await;
-        tokio::time::resume();
+        // set a non-zero lease length to test the feature
+        tenant
+            .update_tenant_config(|mut conf| {
+                conf.lsn_lease_length = Some(LsnLease::DEFAULT_LENGTH);
+                Ok(conf)
+            })
+            .unwrap();
+
         let key = Key::from_hex("010000000033333333444444445500000000").unwrap();
 
         let end_lsn = Lsn(0x100);
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 95f5c60170..be18b40862 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -664,7 +664,7 @@ pub async fn init_tenant_mgr(
                     tenant_shard_id,
                     &tenant_dir_path,
                     resources.clone(),
-                    AttachedTenantConf::new(location_conf.tenant_conf, attached_conf),
+                    AttachedTenantConf::new(conf, location_conf.tenant_conf, attached_conf),
                     shard_identity,
                     Some(init_order.clone()),
                     SpawnMode::Lazy,
@@ -842,8 +842,11 @@ impl TenantManager {
                             // take our fast path and just provide the updated configuration
                             // to the tenant.
                             tenant.set_new_location_config(
-                                AttachedTenantConf::try_from(new_location_config.clone())
-                                    .map_err(UpsertLocationError::BadRequest)?,
+                                AttachedTenantConf::try_from(
+                                    self.conf,
+                                    new_location_config.clone(),
+                                )
+                                .map_err(UpsertLocationError::BadRequest)?,
                             );
 
                             Some(FastPathModified::Attached(tenant.clone()))
@@ -1046,7 +1049,7 @@ impl TenantManager {
                 // Testing hack: if we are configured with no control plane, then drop the generation
                 // from upserts.  This enables creating generation-less tenants even though neon_local
                 // always uses generations when calling the location conf API.
-                let attached_conf = AttachedTenantConf::try_from(new_location_config)
+                let attached_conf = AttachedTenantConf::try_from(self.conf, new_location_config)
                     .map_err(UpsertLocationError::BadRequest)?;
 
                 let tenant = tenant_spawn(
@@ -1250,7 +1253,7 @@ impl TenantManager {
             tenant_shard_id,
             &tenant_path,
             self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
+            AttachedTenantConf::try_from(self.conf, config)?,
             shard_identity,
             None,
             SpawnMode::Eager,
@@ -2131,7 +2134,7 @@ impl TenantManager {
                 tenant_shard_id,
                 &tenant_path,
                 self.resources.clone(),
-                AttachedTenantConf::try_from(config).map_err(Error::DetachReparent)?,
+                AttachedTenantConf::try_from(self.conf, config).map_err(Error::DetachReparent)?,
                 shard_identity,
                 None,
                 SpawnMode::Eager,
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f2ec022666..f54d5be635 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1315,6 +1315,14 @@ class NeonEnv:
             # This feature is pending rollout.
             # tenant_config["rel_size_v2_enabled"] = True
 
+            # Test authors tend to forget about the default 10min initial lease deadline
+            # when writing tests, which turns their immediate gc requests via mgmt API
+            # into no-ops. Override the binary default here, such that there is no initial
+            # lease deadline by default in tests. Tests that care can always override it
+            # themselves.
+            # Cf https://databricks.atlassian.net/browse/LKB-92?focusedCommentId=6722329
+            tenant_config["lsn_lease_length"] = "0s"
+
             if self.pageserver_remote_storage is not None:
                 ps_cfg["remote_storage"] = remote_storage_to_toml_dict(
                     self.pageserver_remote_storage
diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py
index ee934a900d..5612236250 100644
--- a/test_runner/regress/test_readonly_node.py
+++ b/test_runner/regress/test_readonly_node.py
@@ -201,11 +201,11 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder):
         for shard, ps in tenant_get_shards(env, env.initial_tenant):
             client = ps.http_client()
             layers_guarded_before_gc = get_layers_protected_by_lease(
-                client, shard, env.initial_timeline, lease_lsn=lsn
+                client, shard, env.initial_timeline, lease_lsn=lease_lsn
             )
             gc_result = client.timeline_gc(shard, env.initial_timeline, 0)
             layers_guarded_after_gc = get_layers_protected_by_lease(
-                client, shard, env.initial_timeline, lease_lsn=lsn
+                client, shard, env.initial_timeline, lease_lsn=lease_lsn
             )
 
             # Note: cannot assert on `layers_removed` here because it could be layers
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 190dd914ee..8b291b7cbe 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -740,6 +740,10 @@ def test_lsn_lease_size(neon_env_builder: NeonEnvBuilder, test_output_dir: Path,
         "pitr_interval": "0s" if zero_gc else "3600s",
         "gc_period": "0s",
         "compaction_period": "0s",
+        # The test exercises leases API, so we need non-zero lease length.
+        # If this tests ever does GC, we need to accomodate for the initial lease deadline
+        # after tenant attach, which is also controlled by this variable.
+        "lsn_lease_length": "600s",
     }
 
     env = neon_env_builder.init_start(initial_tenant_conf=conf)
@@ -824,9 +828,7 @@ def insert_with_action(
         log.info(f"initial size: {initial_size}")
 
         with ep.cursor() as cur:
-            cur.execute(
-                "CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
-            )
+            cur.execute("CREATE TABLE t0 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
         last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
 
         if action == "lease":
@@ -841,15 +843,9 @@ def insert_with_action(
             raise AssertionError("Invalid action type, only `lease` and `branch`are accepted")
 
         with ep.cursor() as cur:
-            cur.execute(
-                "CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
-            )
-            cur.execute(
-                "CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
-            )
-            cur.execute(
-                "CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 1000000) s(i)"
-            )
+            cur.execute("CREATE TABLE t1 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
+            cur.execute("CREATE TABLE t2 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
+            cur.execute("CREATE TABLE t3 AS SELECT i::bigint n FROM generate_series(0, 10000) s(i)")
 
         last_flush_lsn = wait_for_last_flush_lsn(env, ep, tenant, timeline)
 

From 38384c37ac50f04fe992795c27f0d5cf4291c2a5 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Tue, 8 Jul 2025 17:15:14 +0400
Subject: [PATCH 013/163] Make node deletion context-aware (#12494)

## Problem

Deletion process does not calculate preferred nodes correctly - it
doesn't consider current tenant-shard layout among all pageservers.

## Summary of changes

- Added a schedule context calculation for node deletion

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
---
 storage_controller/src/service.rs | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 9360225396..403ae15b59 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -7208,6 +7208,12 @@ impl Service {
                 let mut locked = self.inner.write().unwrap();
                 let (nodes, tenants, scheduler) = locked.parts_mut();
 
+                // Calculate a schedule context here to avoid borrow checker issues.
+                let mut schedule_context = ScheduleContext::default();
+                for (_, shard) in tenants.range(TenantShardId::tenant_range(tid.tenant_id)) {
+                    schedule_context.avoid(&shard.intent.all_pageservers());
+                }
+
                 let tenant_shard = match tenants.get_mut(&tid) {
                     Some(tenant_shard) => tenant_shard,
                     None => {
@@ -7233,9 +7239,6 @@ impl Service {
                 }
 
                 if tenant_shard.deref_node(node_id) {
-                    // TODO(ephemeralsad): we should process all shards in a tenant at once, so
-                    // we can avoid settling the tenant unevenly.
-                    let mut schedule_context = ScheduleContext::new(ScheduleMode::Normal);
                     if let Err(e) = tenant_shard.schedule(scheduler, &mut schedule_context) {
                         tracing::error!(
                             "Refusing to delete node, shard {} can't be rescheduled: {e}",

From 7458d031b1f7adfc6cbc652cc0903d58141f6746 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 8 Jul 2025 15:59:45 +0200
Subject: [PATCH 014/163] clippy: fix unfounded warning on macOS (#12501)

Before this PR, macOS builds would get clippy warning

```
warning: `tokio_epoll_uring::thread_local_system` does not refer to an existing function
```

The reason is that the `thread_local_system` function is only defined on
Linux.

Add `allow-invalid = true` to make macOS clippy pass, and manually test
that on Linux builds, clippy still fails when we use it.

refs
- https://databricks.slack.com/archives/C09254R641L/p1751917655527099

Co-authored-by: Christian Schwarz <Christian Schwarz>
---
 clippy.toml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/clippy.toml b/clippy.toml
index 408232488c..c03059053a 100644
--- a/clippy.toml
+++ b/clippy.toml
@@ -1,9 +1,12 @@
 disallowed-methods = [
     "tokio::task::block_in_place",
+
     # Allow this for now, to deny it later once we stop using Handle::block_on completely
     # "tokio::runtime::Handle::block_on",
-    # use tokio_epoll_uring_ext instead
-    "tokio_epoll_uring::thread_local_system",
+
+    # tokio-epoll-uring:
+    # - allow-invalid because the method doesn't exist on macOS
+    { path = "tokio_epoll_uring::thread_local_system", replacement = "tokio_epoll_uring_ext module inside pageserver crate", allow-invalid = true }
 ]
 
 disallowed-macros = [

From ccf88e9375d8c51bc7a3b43bc505f199c4aada96 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 8 Jul 2025 17:16:09 +0300
Subject: [PATCH 015/163] Improve debug logging by printing IO request details

---
 pgxn/neon/communicator_new.c | 122 ++++++++++++++++++++++++++++++++++-
 1 file changed, 120 insertions(+), 2 deletions(-)

diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index b809358c45..15643b822a 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -131,6 +131,8 @@ static void *bounce_write_if_needed(void *buffer);
 PGDLLEXPORT void communicator_new_bgworker_main(Datum main_arg);
 static void communicator_new_backend_exit(int code, Datum arg);
 
+static char *print_neon_io_request(NeonIORequest *request);
+
 /*
  * Request ID assignment.
  *
@@ -605,7 +607,7 @@ start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p)
 	if (request_idx == -1)
 	{
 		/* -1 means the request was satisfied immediately. */
-		elog(DEBUG4, "communicator request %lu was satisfied immediately", request->rel_exists.request_id);
+		elog(DEBUG4, "communicator request %s was satisfied immediately", print_neon_io_request(request));
 		return -1;
 	}
 	Assert(request_idx == my_next_slot_idx);
@@ -615,7 +617,8 @@ start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p)
 	inflight_requests[num_inflight_requests] = request_idx;
 	num_inflight_requests++;
 
-	elog(LOG, "started communicator request %lu at slot %d", request->rel_exists.request_id, request_idx);
+	elog(LOG, "started communicator request %s at slot %d", print_neon_io_request(request), request_idx);
+
 	return request_idx;
 }
 
@@ -1154,6 +1157,121 @@ communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
 	}
 }
 
+
+/* Debugging functions */
+
+static char *
+print_neon_io_request(NeonIORequest *request)
+{
+	switch (request->tag)
+	{
+		case NeonIORequest_Empty:
+			return pstrdup("Empty");
+			break;
+		case NeonIORequest_RelExists:
+			{
+				CRelExistsRequest *r = &request->rel_exists;
+
+				return psprintf("RelExists: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+			}
+		case NeonIORequest_RelSize:
+			{
+				CRelSizeRequest *r = &request->rel_size;
+
+				return psprintf("RelSize: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+			}
+		case NeonIORequest_GetPageV:
+			{
+				CGetPageVRequest *r = &request->get_page_v;
+
+				return psprintf("GetPageV: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %d-%d",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks);
+			}
+		case NeonIORequest_PrefetchV:
+			{
+				CPrefetchVRequest *r = &request->prefetch_v;
+
+				return psprintf("PrefetchV: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %d-%d",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks);
+			}
+		case NeonIORequest_DbSize:
+			{
+				CDbSizeRequest *r = &request->db_size;
+
+				return psprintf("PrefetchV: req " UINT64_FORMAT " db %u",
+								r->request_id, r->db_oid);
+			}
+		case NeonIORequest_WritePage:
+			{
+				CWritePageRequest *r = &request->write_page;
+
+				return psprintf("WritePage: req " UINT64_FORMAT " rel %u/%u/%u.%u blk %u lsn %X/%X",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number,
+								LSN_FORMAT_ARGS(r->lsn));
+			}
+		case NeonIORequest_RelExtend:
+			{
+				CRelExtendRequest *r = &request->rel_extend;
+
+				return psprintf("RelExtend: req " UINT64_FORMAT " rel %u/%u/%u.%u blk %u lsn %X/%X",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number,
+								LSN_FORMAT_ARGS(r->lsn));
+			}
+		case NeonIORequest_RelZeroExtend:
+			{
+				CRelZeroExtendRequest *r = &request->rel_zero_extend;
+
+				return psprintf("RelZeroExtend: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %u-%u lsn %X/%X",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks,
+								LSN_FORMAT_ARGS(r->lsn));
+			}
+		case NeonIORequest_RelCreate:
+			{
+				CRelCreateRequest *r = &request->rel_create;
+
+				return psprintf("RelCreate: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+			}
+		case NeonIORequest_RelTruncate:
+			{
+				CRelTruncateRequest *r = &request->rel_truncate;
+
+				return psprintf("RelTruncate: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %u",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->nblocks);
+			}
+		case NeonIORequest_RelUnlink:
+			{
+				CRelUnlinkRequest *r = &request->rel_unlink;
+
+				return psprintf("RelUnlink: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+			}
+		case NeonIORequest_ForgetCache:
+			{
+				CForgetCacheRequest *r = &request->forget_cache;
+
+				return psprintf("ForgetCache: req " UINT64_FORMAT " rel %u/%u/%u.%u blocks: %u",
+								r->request_id,
+								r->spc_oid, r->db_oid, r->rel_number, r->fork_number,
+					r->nblocks);
+			}
+	}
+	return psprintf("Unknown request type %u", request->tag);
+}
+
+
 /*
  * The worker process can read / write shared buffers directly. But if smgrread() or
  * smgrwrite() is called with a private temporary buffer, we need to copy it to the

From 4053092408f43ed78bb1e8cdba96c74433fd565a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 8 Jul 2025 17:22:24 +0300
Subject: [PATCH 016/163] Fix LSN tracking on "unlogged index builds"

Fixes the test_gin_redo.py test failure, and probably some others
---
 .../neon/communicator/src/integrated_cache.rs | 15 +++-
 pgxn/neon/communicator/src/neon_request.rs    | 34 +++++++++-
 .../src/worker_process/main_loop.rs           | 24 ++++---
 pgxn/neon/communicator_new.c                  | 42 +++++++++++-
 pgxn/neon/communicator_new.h                  |  7 +-
 pgxn/neon/pagestore_client.h                  |  1 +
 pgxn/neon/pagestore_smgr.c                    | 68 +++++++++++++++----
 test_runner/regress/test_gin_redo.py          |  1 +
 8 files changed, 160 insertions(+), 32 deletions(-)

diff --git a/pgxn/neon/communicator/src/integrated_cache.rs b/pgxn/neon/communicator/src/integrated_cache.rs
index 5f0ca5f510..a7009f0eb5 100644
--- a/pgxn/neon/communicator/src/integrated_cache.rs
+++ b/pgxn/neon/communicator/src/integrated_cache.rs
@@ -192,6 +192,10 @@ struct RelEntry {
     /// cached size of the relation
     /// u32::MAX means 'not known' (that's InvalidBlockNumber in Postgres)
     nblocks: AtomicU32,
+
+    /// This is the last time the "metadata" of this relation changed, not
+    /// the contents of the blocks. That is, the size of the relation.
+    lw_lsn: AtomicLsn,
 }
 
 impl std::fmt::Debug for RelEntry {
@@ -338,7 +342,7 @@ impl<'t> IntegratedCacheWriteAccess<'t> {
         CacheResult::NotFound(lsn)
     }
 
-    pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32) {
+    pub fn remember_rel_size(&'t self, rel: &RelTag, nblocks: u32, lsn: Lsn) {
         match self.relsize_cache.entry(RelKey::from(rel)) {
             Entry::Vacant(e) => {
                 tracing::info!("inserting rel entry for {rel:?}, {nblocks} blocks");
@@ -346,12 +350,14 @@ impl<'t> IntegratedCacheWriteAccess<'t> {
                 _ = e
                     .insert(RelEntry {
                         nblocks: AtomicU32::new(nblocks),
+                        lw_lsn: AtomicLsn::new(lsn.0),
                     })
                     .expect("out of memory");
             }
             Entry::Occupied(e) => {
                 tracing::info!("updating rel entry for {rel:?}, {nblocks} blocks");
                 e.get().nblocks.store(nblocks, Ordering::Relaxed);
+                e.get().lw_lsn.store(lsn);
             }
         };
     }
@@ -515,10 +521,15 @@ impl<'t> IntegratedCacheWriteAccess<'t> {
     }
 
     /// Forget information about given relation in the cache. (For DROP TABLE and such)
-    pub fn forget_rel(&'t self, rel: &RelTag) {
+    pub fn forget_rel(&'t self, rel: &RelTag, _nblocks: Option<u32>, flush_lsn: Lsn) {
         tracing::info!("forgetting rel entry for {rel:?}");
         self.relsize_cache.remove(&RelKey::from(rel));
 
+        // update with flush LSN
+        let _ = self
+            .global_lw_lsn
+            .fetch_max(flush_lsn.0, Ordering::Relaxed);
+
         // also forget all cached blocks for the relation
         // FIXME
         /*
diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs
index f54dcd9222..32a02cd8c3 100644
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -28,6 +28,9 @@ pub enum NeonIORequest {
     RelCreate(CRelCreateRequest),
     RelTruncate(CRelTruncateRequest),
     RelUnlink(CRelUnlinkRequest),
+
+    // Other requests
+    ForgetCache(CForgetCacheRequest),
 }
 
 #[repr(C)]
@@ -72,6 +75,7 @@ impl NeonIORequest {
             RelCreate(req) => req.request_id,
             RelTruncate(req) => req.request_id,
             RelUnlink(req) => req.request_id,
+            ForgetCache(req) => req.request_id,
         }
     }
 }
@@ -187,7 +191,6 @@ pub struct CPrefetchVRequest {
 pub struct CDbSizeRequest {
     pub request_id: u64,
     pub db_oid: COid,
-    pub request_lsn: CLsn,
 }
 
 #[repr(C)]
@@ -241,6 +244,7 @@ pub struct CRelCreateRequest {
     pub db_oid: COid,
     pub rel_number: u32,
     pub fork_number: u8,
+    pub lsn: CLsn,
 }
 
 #[repr(C)]
@@ -252,6 +256,7 @@ pub struct CRelTruncateRequest {
     pub rel_number: u32,
     pub fork_number: u8,
     pub nblocks: u32,
+    pub lsn: CLsn,
 }
 
 #[repr(C)]
@@ -262,8 +267,7 @@ pub struct CRelUnlinkRequest {
     pub db_oid: COid,
     pub rel_number: u32,
     pub fork_number: u8,
-    pub block_number: u32,
-    pub nblocks: u32,
+    pub lsn: CLsn,
 }
 
 impl CRelExistsRequest {
@@ -375,3 +379,27 @@ impl CRelUnlinkRequest {
         }
     }
 }
+
+
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CForgetCacheRequest {
+    pub request_id: u64,
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub nblocks: u32,
+    pub lsn: CLsn,
+}
+
+impl CForgetCacheRequest {
+    pub fn reltag(&self) -> page_api::RelTag {
+        page_api::RelTag {
+            spcnode: self.spc_oid,
+            dbnode: self.db_oid,
+            relnode: self.rel_number,
+            forknum: self.fork_number,
+        }
+    }
+}
diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index fe6acbf049..2eacd13609 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -359,8 +359,8 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                 {
                     Ok(nblocks) => {
                         // update the cache
-                        tracing::info!("updated relsize for {:?} in cache: {}", rel, nblocks);
-                        self.cache.remember_rel_size(&rel, nblocks);
+                        tracing::info!("updated relsize for {:?} in cache: {}, lsn {}", rel, nblocks, read_lsn);
+                        self.cache.remember_rel_size(&rel, nblocks, not_modified_since);
 
                         NeonIOResult::RelSize(nblocks)
                     }
@@ -457,7 +457,7 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                     .remember_page(&rel, req.block_number, req.src, Lsn(req.lsn), true)
                     .await;
                 self.cache
-                    .remember_rel_size(&req.reltag(), req.block_number + 1);
+                    .remember_rel_size(&req.reltag(), req.block_number + 1, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
             NeonIORequest::RelZeroExtend(req) => {
@@ -466,31 +466,37 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                     .inc_by(req.nblocks as u64);
 
                 // TODO: need to grab an io-in-progress lock for this? I guess not
-                // TODO: I think we should put the empty pages to the cache, or at least
-                // update the last-written LSN.
+                // TODO: We could put the empty pages to the cache. Maybe have
+                // a marker on the block entries for all-zero pages, instead of
+                // actually storing the empty pages.
                 self.cache
-                    .remember_rel_size(&req.reltag(), req.block_number + req.nblocks);
+                    .remember_rel_size(&req.reltag(), req.block_number + req.nblocks, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
             NeonIORequest::RelCreate(req) => {
                 self.request_rel_create_counter.inc();
 
                 // TODO: need to grab an io-in-progress lock for this? I guess not
-                self.cache.remember_rel_size(&req.reltag(), 0);
+                self.cache.remember_rel_size(&req.reltag(), 0, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
             NeonIORequest::RelTruncate(req) => {
                 self.request_rel_truncate_counter.inc();
 
                 // TODO: need to grab an io-in-progress lock for this? I guess not
-                self.cache.remember_rel_size(&req.reltag(), req.nblocks);
+                self.cache.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
             NeonIORequest::RelUnlink(req) => {
                 self.request_rel_unlink_counter.inc();
 
                 // TODO: need to grab an io-in-progress lock for this? I guess not
-                self.cache.forget_rel(&req.reltag());
+                self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
+                NeonIOResult::WriteOK
+            }
+            NeonIORequest::ForgetCache(req) => {
+                // TODO: need to grab an io-in-progress lock for this? I guess not
+                self.cache.forget_rel(&req.reltag(), Some(req.nblocks), Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
         }
diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index 15643b822a..44070dd72d 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -1061,7 +1061,7 @@ communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum, BlockNum
 }
 
 void
-communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum)
+communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn)
 {
 	NeonIORequest request = {
 		.tag = NeonIORequest_RelCreate,
@@ -1071,6 +1071,7 @@ communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum)
 			.db_oid = NInfoGetDbOid(rinfo),
 			.rel_number = NInfoGetRelNumber(rinfo),
 			.fork_number = forkNum,
+			.lsn = lsn,
 		}
 	};
 	NeonIOResult result;
@@ -1093,7 +1094,7 @@ communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum)
 }
 
 void
-communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
+communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn)
 {
 	NeonIORequest request = {
 		.tag = NeonIORequest_RelTruncate,
@@ -1104,6 +1105,7 @@ communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumbe
 			.rel_number = NInfoGetRelNumber(rinfo),
 			.fork_number = forkNum,
 			.nblocks = nblocks,
+			.lsn = lsn,
 		}
 	};
 	NeonIOResult result;
@@ -1126,7 +1128,7 @@ communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumbe
 }
 
 void
-communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
+communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn)
 {
 	NeonIORequest request = {
 		.tag = NeonIORequest_RelUnlink,
@@ -1136,6 +1138,7 @@ communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
 			.db_oid = NInfoGetDbOid(rinfo),
 			.rel_number = NInfoGetRelNumber(rinfo),
 			.fork_number = forkNum,
+			.lsn = lsn,
 		}
 	};
 	NeonIOResult result;
@@ -1157,6 +1160,39 @@ communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum)
 	}
 }
 
+void
+communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn)
+{
+	NeonIORequest request = {
+		.tag = NeonIORequest_ForgetCache,
+		.forget_cache = {
+			.request_id = assign_request_id(),
+			.spc_oid = NInfoGetSpcOid(rinfo),
+			.db_oid = NInfoGetDbOid(rinfo),
+			.rel_number = NInfoGetRelNumber(rinfo),
+			.fork_number = forkNum,
+			.nblocks = nblocks,
+			.lsn = lsn,
+		}
+	};
+	NeonIOResult result;
+
+	perform_request(&request, &result);
+	switch (result.tag)
+	{
+		case NeonIOResult_WriteOK:
+			return;
+		case NeonIOResult_Error:
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not forget cache for rel %u/%u/%u.%u: %s",
+							RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
+			break;
+		default:
+			elog(ERROR, "unexpected result for ForgetCache operation: %d", result.tag);
+			break;
+	}
+}
 
 /* Debugging functions */
 
diff --git a/pgxn/neon/communicator_new.h b/pgxn/neon/communicator_new.h
index bbab3f8f5a..5b636b687a 100644
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -49,8 +49,9 @@ extern void communicator_new_rel_extend(NRelFileInfo rinfo, ForkNumber forkNum,
 extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkNum,
 											BlockNumber blockno, BlockNumber nblocks,
 											XLogRecPtr lsn);
-extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum);
-extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
-extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum);
+extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
+extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
+extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
+extern void communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
 
 #endif							/* COMMUNICATOR_NEW_H */
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index c2727e232b..eb3c80702e 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -292,6 +292,7 @@ extern int64 neon_dbsize(Oid dbNode);
 extern void neon_get_request_lsns(NRelFileInfo rinfo, ForkNumber forknum,
 								  BlockNumber blkno, neon_request_lsns *output,
 								  BlockNumber nblocks);
+extern XLogRecPtr neon_get_write_lsn(void);
 
 /* utils for neon relsize cache */
 extern void relsize_hash_init(void);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 4189af4d32..9ef393b8ff 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -502,6 +502,37 @@ nm_adjust_lsn(XLogRecPtr lsn)
 	return lsn;
 }
 
+/*
+ * Get a LSN to use to stamp an operation like relation create or truncate.
+ * On operations on individual pages we use the LSN of the page, but when
+ * e.g. smgrcreate() is called, we have to do something else.
+ */
+XLogRecPtr
+neon_get_write_lsn(void)
+{
+	XLogRecPtr	lsn;
+
+	if (RecoveryInProgress())
+	{
+		/*
+		 * FIXME: v14 doesn't have GetCurrentReplayRecPtr(). Options:
+		 * - add it in our fork
+		 * - store a magic value that means that you must use
+		 *   current latest possible LSN at the time that the request
+		 *   on this thing is made again (or some other recent enough
+		 *   lsn).
+		 */
+#if PG_VERSION_NUM >= 150000
+		lsn = GetCurrentReplayRecPtr(NULL);
+#else
+		lsn = GetXLogReplayRecPtr(NULL); /* FIXME: this is wrong, see above */
+#endif
+	}
+	else
+		lsn = GetXLogInsertRecPtr();
+
+	return lsn;
+}
 
 /*
  * Return LSN for requesting pages and number of blocks from page server
@@ -824,13 +855,15 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
 	if (neon_enable_new_communicator)
 	{
+		XLogRecPtr	lsn = neon_get_write_lsn();
+
 		if (isRedo)
 		{
 			if (!communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum))
-				communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
+				communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
 		}
 		else
-			communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum);
+			communicator_new_rel_create(InfoFromSMgrRel(reln), forkNum, lsn);
 	}
 	else
 	{
@@ -902,7 +935,9 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 	{
 		if (neon_enable_new_communicator)
 		{
-			communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum);
+			XLogRecPtr	lsn = neon_get_write_lsn();
+
+			communicator_new_rel_unlink(InfoFromNInfoB(rinfo), forkNum, lsn);
 		}
 		else
 			forget_cached_relsize(InfoFromNInfoB(rinfo), forkNum);
@@ -1962,7 +1997,9 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 
 	if (neon_enable_new_communicator)
 	{
-		communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks);
+		XLogRecPtr	lsn = neon_get_write_lsn();
+
+		communicator_new_rel_truncate(InfoFromSMgrRel(reln), forknum, nblocks, lsn);
 	}
 	else
 	{
@@ -2226,12 +2263,15 @@ neon_end_unlogged_build(SMgrRelation reln)
 		nblocks = mdnblocks(reln, MAIN_FORKNUM);
 		recptr = GetXLogInsertRecPtr();
 
-		neon_set_lwlsn_block_range(recptr,
-								   InfoFromNInfoB(rinfob),
-								   MAIN_FORKNUM, 0, nblocks);
-		neon_set_lwlsn_relation(recptr,
-								InfoFromNInfoB(rinfob),
-								MAIN_FORKNUM);
+		if (!neon_enable_new_communicator)
+		{
+			neon_set_lwlsn_block_range(recptr,
+									   InfoFromNInfoB(rinfob),
+									   MAIN_FORKNUM, 0, nblocks);
+			neon_set_lwlsn_relation(recptr,
+									InfoFromNInfoB(rinfob),
+									MAIN_FORKNUM);
+		}
 
 		/* Remove local copy */
 		for (int forknum = 0; forknum <= MAX_FORKNUM; forknum++)
@@ -2240,8 +2280,12 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
 				 forknum);
 
-			// FIXME: also do this with the new communicator
-			if (!neon_enable_new_communicator)
+			if (neon_enable_new_communicator)
+			{
+				/* TODO: we could update the cache with the size, since we have it at hand */
+				communicator_new_forget_cache(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
+			}
+			else
 			{
 				forget_cached_relsize(InfoFromNInfoB(rinfob), forknum);
 				lfc_invalidate(InfoFromNInfoB(rinfob), forknum, nblocks);
diff --git a/test_runner/regress/test_gin_redo.py b/test_runner/regress/test_gin_redo.py
index 71382990dc..3ec2163203 100644
--- a/test_runner/regress/test_gin_redo.py
+++ b/test_runner/regress/test_gin_redo.py
@@ -16,6 +16,7 @@ def test_gin_redo(neon_simple_env: NeonEnv):
     secondary = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
     con = primary.connect()
     cur = con.cursor()
+    cur.execute("select pg_switch_wal()")
     cur.execute("create table gin_test_tbl(id integer, i int4[])")
     cur.execute("create index gin_test_idx on gin_test_tbl using gin (i)")
     cur.execute("insert into gin_test_tbl select g,array[3, 1, g] from generate_series(1, 10000) g")

From f72115d0a99c148953a15ec5fbf5a49f86ca741f Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Tue, 8 Jul 2025 15:37:24 +0100
Subject: [PATCH 017/163] Endpoint storage openapi spec (#12361)

https://github.com/neondatabase/cloud/issues/19011
---
 endpoint_storage/src/app.rs           |   2 +
 endpoint_storage/src/openapi_spec.yml | 146 ++++++++++++++++++++++++++
 2 files changed, 148 insertions(+)
 create mode 100644 endpoint_storage/src/openapi_spec.yml

diff --git a/endpoint_storage/src/app.rs b/endpoint_storage/src/app.rs
index 42431c0066..a7a18743ef 100644
--- a/endpoint_storage/src/app.rs
+++ b/endpoint_storage/src/app.rs
@@ -13,6 +13,8 @@ use utils::backoff::retry;
 pub fn app(state: Arc<Storage>) -> Router<()> {
     use axum::routing::{delete as _delete, get as _get};
     let delete_prefix = _delete(delete_prefix);
+    // NB: On any changes do not forget to update the OpenAPI spec
+    // in /endpoint_storage/src/openapi_spec.yml.
     Router::new()
         .route(
             "/{tenant_id}/{timeline_id}/{endpoint_id}/{*path}",
diff --git a/endpoint_storage/src/openapi_spec.yml b/endpoint_storage/src/openapi_spec.yml
new file mode 100644
index 0000000000..8d9abf902c
--- /dev/null
+++ b/endpoint_storage/src/openapi_spec.yml
@@ -0,0 +1,146 @@
+openapi: "3.0.2"
+info:
+  title: Endpoint Storage API
+  description: Endpoint Storage API
+  version: "1.0"
+  license:
+    name: "Apache"
+    url: https://github.com/neondatabase/neon/blob/main/LICENSE
+servers:
+  - url: ""
+paths:
+  /status:
+    description: Healthcheck endpoint
+    get:
+      description: Healthcheck
+      security: []
+      responses:
+        "200":
+          description: OK
+
+  /{tenant_id}/{timeline_id}/{endpoint_id}/{key}:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: endpoint_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: key
+        in: path
+        required: true
+        schema:
+          type: string
+    get:
+      description: Get file from blob storage
+      responses:
+        "200":
+          description: "File stream from blob storage"
+          content:
+            application/octet-stream:
+              schema:
+                type: string
+                format: binary
+        "400":
+          description: File was not found
+        "403":
+          description: JWT does not authorize request to this route
+    put:
+      description: Insert file into blob storage. If file exists, override it
+      requestBody:
+        content:
+          application/octet-stream:
+            schema:
+              type: string
+              format: binary
+      responses:
+        "200":
+          description: File was inserted successfully
+        "403":
+          description: JWT does not authorize request to this route
+    delete:
+      description: Delete file from blob storage
+      responses:
+        "200":
+          description: File was successfully deleted or not found
+        "403":
+          description: JWT does not authorize request to this route
+
+  /{tenant_id}/{timeline_id}/{endpoint_id}:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: endpoint_id
+        in: path
+        required: true
+        schema:
+          type: string
+    delete:
+      description: Delete endpoint data from blob storage
+      responses:
+        "200":
+          description: Endpoint data was deleted
+        "403":
+          description: JWT does not authorize request to this route
+
+  /{tenant_id}/{timeline_id}:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+      - name: timeline_id
+        in: path
+        required: true
+        schema:
+          type: string
+    delete:
+      description: Delete timeline data from blob storage
+      responses:
+        "200":
+          description: Timeline data was deleted
+        "403":
+          description: JWT does not authorize request to this route
+
+  /{tenant_id}:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+    delete:
+      description: Delete tenant data from blob storage
+      responses:
+        "200":
+          description: Tenant data was deleted
+        "403":
+          description: JWT does not authorize request to this route
+
+components:
+  securitySchemes:
+    JWT:
+      type: http
+      scheme: bearer
+      bearerFormat: JWT
+
+security:
+  - JWT: []

From 8a042fb8ed62d56641836577fb01ccee4103dfb8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 8 Jul 2025 17:03:32 +0200
Subject: [PATCH 018/163] refactor(compaction): eliminate
 `CompactionError::Offload` variant, map to `::Other` (#12505)

Looks can be deceiving: the match blocks in
`maybe_trip_compaction_breaker`
and at the end of `compact_with_options` seem like differentiated error
handling, but in reality, these branches are unreachable at runtime
because the only source of `CompactionError::Offload` within the
compaction code is at the end of `Tenant::compaction_iteration`.

We can simply map offload cancellation to CompactionError::Cancelled and
all other offload errors to ::Other, since there's no differentiated
handling for them in the compaction code.

Also, the OffloadError::RemoteStorage variant has no differentiated
handling, but was wrapping the remote storage anyhow::Error in a
`anyhow(thiserror(anyhow))` sandwich. This PR removes that variant,
mapping all RemoteStorage errors to `OffloadError::Other`.
Thereby, the sandwich is gone and we will get a proper anyhow backtrace
to the remote storage error location if when we debug-print the
OffloadError (or the CompactionError if we map it to that).

refs
- https://databricks.atlassian.net/browse/LKB-182
- the observation that there's no need for differentiated handling of
CompactionError::Offload was made in
https://databricks.slack.com/archives/C09254R641L/p1751286453930269?thread_ts=1751284317.955159&cid=C09254R641L
---
 pageserver/src/http/routes.rs             |  1 -
 pageserver/src/tenant.rs                  |  7 +++----
 pageserver/src/tenant/tasks.rs            |  1 -
 pageserver/src/tenant/timeline.rs         | 17 -----------------
 pageserver/src/tenant/timeline/offload.rs |  4 +---
 5 files changed, 4 insertions(+), 26 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 02094e6aa9..23a090045a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2501,7 +2501,6 @@ async fn timeline_checkpoint_handler(
                 .map_err(|e|
                     match e {
                         CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
                         CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
                         CompactionError::Other(e) => ApiError::InternalServerError(e),
                         CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 9aabd6341f..ad767a1672 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3288,7 +3288,9 @@ impl TenantShard {
                         // Ignore this, we likely raced with unarchival.
                         OffloadError::NotArchived => Ok(()),
                         OffloadError::AlreadyInProgress => Ok(()),
-                        err => Err(err),
+                        OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
+                        // don't break the anyhow chain
+                        OffloadError::Other(err) => Err(CompactionError::Other(err)),
                     })?;
             }
 
@@ -3319,9 +3321,6 @@ impl TenantShard {
         match err {
             err if err.is_cancel() => {}
             CompactionError::ShuttingDown => (),
-            // Offload failures don't trip the circuit breaker, since they're cheap to retry and
-            // shouldn't block compaction.
-            CompactionError::Offload(_) => {}
             CompactionError::CollectKeySpaceError(err) => {
                 // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
                 self.compaction_circuit_breaker
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 954dd38bb4..2ba1ad2674 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error(
     let level = match err {
         e if e.is_cancel() => return,
         ShuttingDown => return,
-        Offload(_) => Level::ERROR,
         AlreadyRunning(_) => Level::ERROR,
         CollectKeySpaceError(_) => Level::ERROR,
         _ if task_cancelled => Level::INFO,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index aeced98859..c2b49c0296 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -40,7 +40,6 @@ use layer_manager::{
     Shutdown,
 };
 
-use offload::OffloadError;
 use once_cell::sync::Lazy;
 use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL;
 use pageserver_api::key::{
@@ -2078,9 +2077,6 @@ impl Timeline {
                 // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
                 self.compaction_failed.store(true, AtomicOrdering::Relaxed)
             }
-            // Don't change the current value on offload failure or shutdown. We don't want to
-            // abruptly stall nor resume L0 flushes in these cases.
-            Err(CompactionError::Offload(_)) => {}
         };
 
         result
@@ -6017,9 +6013,6 @@ impl Drop for Timeline {
 pub(crate) enum CompactionError {
     #[error("The timeline or pageserver is shutting down")]
     ShuttingDown,
-    /// Compaction tried to offload a timeline and failed
-    #[error("Failed to offload timeline: {0}")]
-    Offload(OffloadError),
     /// Compaction cannot be done right now; page reconstruction and so on.
     #[error("Failed to collect keyspace: {0}")]
     CollectKeySpaceError(#[from] CollectKeySpaceError),
@@ -6040,7 +6033,6 @@ impl CompactionError {
                 | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
                     PageReconstructError::Cancelled
                 ))
-                | Self::Offload(OffloadError::Cancelled)
         )
     }
 
@@ -6058,15 +6050,6 @@ impl CompactionError {
     }
 }
 
-impl From<OffloadError> for CompactionError {
-    fn from(e: OffloadError) -> Self {
-        match e {
-            OffloadError::Cancelled => Self::ShuttingDown,
-            _ => Self::Offload(e),
-        }
-    }
-}
-
 impl From<super::upload_queue::NotInitialized> for CompactionError {
     fn from(value: super::upload_queue::NotInitialized) -> Self {
         match value {
diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs
index 9464f034c7..e9cf2e9aa7 100644
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -17,8 +17,6 @@ pub(crate) enum OffloadError {
     Cancelled,
     #[error("Timeline is not archived")]
     NotArchived,
-    #[error(transparent)]
-    RemoteStorage(anyhow::Error),
     #[error("Offload or deletion already in progress")]
     AlreadyInProgress,
     #[error("Unexpected offload error: {0}")]
@@ -29,7 +27,7 @@ impl From<TenantManifestError> for OffloadError {
     fn from(e: TenantManifestError) -> Self {
         match e {
             TenantManifestError::Cancelled => Self::Cancelled,
-            TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e),
+            TenantManifestError::RemoteStorage(e) => Self::Other(e),
         }
     }
 }

From 29d73e140453cc18acf96475c8a1dbf57f354468 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Tue, 8 Jul 2025 17:49:42 +0200
Subject: [PATCH 019/163] http-utils: Temporarily accept duplicate params
 (#12504)

## Problem

Grafana Alloy in cluster mode seems to send duplicate "seconds" scrape
URL parameters
when one of its instances is disrupted.

## Summary of changes

Temporarily accept duplicate parameters as long as their value is
identical.
---
 libs/http-utils/src/request.rs | 68 ++++++++++++++++++++++++++++++----
 1 file changed, 61 insertions(+), 7 deletions(-)

diff --git a/libs/http-utils/src/request.rs b/libs/http-utils/src/request.rs
index 9024a90a82..afb2ae8f47 100644
--- a/libs/http-utils/src/request.rs
+++ b/libs/http-utils/src/request.rs
@@ -41,17 +41,35 @@ pub fn get_query_param<'a>(
         Some(q) => q,
         None => return Ok(None),
     };
-    let mut values = url::form_urlencoded::parse(query.as_bytes())
+    let values = url::form_urlencoded::parse(query.as_bytes())
         .filter_map(|(k, v)| if k == param_name { Some(v) } else { None })
         // we call .next() twice below. If it's None the first time, .fuse() ensures it's None afterwards
         .fuse();
 
-    let value1 = values.next();
-    if values.next().is_some() {
-        return Err(ApiError::BadRequest(anyhow!(
-            "param {param_name} specified more than once"
-        )));
-    }
+    // Work around an issue with Alloy's pyroscope scrape where the "seconds"
+    // parameter is added several times. https://github.com/grafana/alloy/issues/3026
+    // TODO: revert after Alloy is fixed.
+    let value1 = values
+        .map(Ok)
+        .reduce(|acc, i| {
+            match acc {
+                Err(_) => acc,
+
+                // It's okay to have duplicates as along as they have the same value.
+                Ok(ref a) if a == &i.unwrap() => acc,
+
+                _ => Err(ApiError::BadRequest(anyhow!(
+                    "param {param_name} specified more than once"
+                ))),
+            }
+        })
+        .transpose()?;
+    // if values.next().is_some() {
+    //     return Err(ApiError::BadRequest(anyhow!(
+    //         "param {param_name} specified more than once"
+    //     )));
+    // }
+
     Ok(value1)
 }
 
@@ -92,3 +110,39 @@ pub async fn ensure_no_body(request: &mut Request<Body>) -> Result<(), ApiError>
         None => Ok(()),
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_get_query_param_duplicate() {
+        let req = Request::builder()
+            .uri("http://localhost:12345/testuri?testparam=1")
+            .body(hyper::Body::empty())
+            .unwrap();
+        let value = get_query_param(&req, "testparam").unwrap();
+        assert_eq!(value.unwrap(), "1");
+
+        let req = Request::builder()
+            .uri("http://localhost:12345/testuri?testparam=1&testparam=1")
+            .body(hyper::Body::empty())
+            .unwrap();
+        let value = get_query_param(&req, "testparam").unwrap();
+        assert_eq!(value.unwrap(), "1");
+
+        let req = Request::builder()
+            .uri("http://localhost:12345/testuri")
+            .body(hyper::Body::empty())
+            .unwrap();
+        let value = get_query_param(&req, "testparam").unwrap();
+        assert!(value.is_none());
+
+        let req = Request::builder()
+            .uri("http://localhost:12345/testuri?testparam=1&testparam=2&testparam=3")
+            .body(hyper::Body::empty())
+            .unwrap();
+        let value = get_query_param(&req, "testparam");
+        assert!(value.is_err());
+    }
+}

From f9b05a42d7a408bc98f485463c13f58496101160 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 8 Jul 2025 18:45:34 +0200
Subject: [PATCH 020/163] refactor(compaction): remove
 `CompactionError::AlreadyRunning` variant, use `::Other` instead (#12512)

The only call stack that can emit the `::AlreadyRunning` variant is
```
-> iteration_inner
	-> iteration
		-> compaction_iteration
			-> compaction_loop
				-> start_background_loops
```

And on that call stack, the only differentiated handling of it is its
invocations of
`log_compaction_error -> CompactionError::is_cancel()`, which returns
`true` for
`::AlreadyRunning`.

I think the condition of `AlreadyRunning` is severe; it really shouldn't
happen.
So, this PR starts treating it as something that is to be logged at
`ERROR` / `WARN`
level, depending on the `degrate_to_warning` argument to
`log_compaction_error`.

refs
- https://databricks.atlassian.net/browse/LKB-182
---
 pageserver/src/http/routes.rs                | 3 +--
 pageserver/src/tenant.rs                     | 1 -
 pageserver/src/tenant/tasks.rs               | 1 -
 pageserver/src/tenant/timeline.rs            | 6 ------
 pageserver/src/tenant/timeline/compaction.rs | 8 ++++----
 5 files changed, 5 insertions(+), 14 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 23a090045a..55582659df 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2503,7 +2503,6 @@ async fn timeline_checkpoint_handler(
                         CompactionError::ShuttingDown => ApiError::ShuttingDown,
                         CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
                         CompactionError::Other(e) => ApiError::InternalServerError(e),
-                        CompactionError::AlreadyRunning(_) => ApiError::InternalServerError(anyhow::anyhow!(e)),
                     }
                 )?;
         }
@@ -3697,7 +3696,7 @@ async fn tenant_evaluate_feature_flag(
         let tenant = state
             .tenant_manager
             .get_attached_tenant_shard(tenant_shard_id)?;
-        // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s) 
+        // TODO: the properties we get here might be stale right after it is collected. But such races are rare (updated every 10s)
         // and we don't need to worry about it for now.
         let properties = tenant.feature_resolver.collect_properties();
         if as_type.as_deref() == Some("boolean") {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index ad767a1672..49b92915da 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3334,7 +3334,6 @@ impl TenantShard {
                     .unwrap()
                     .fail(&CIRCUIT_BREAKERS_BROKEN, err);
             }
-            CompactionError::AlreadyRunning(_) => {}
         }
     }
 
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 2ba1ad2674..356f495972 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -303,7 +303,6 @@ pub(crate) fn log_compaction_error(
     let level = match err {
         e if e.is_cancel() => return,
         ShuttingDown => return,
-        AlreadyRunning(_) => Level::ERROR,
         CollectKeySpaceError(_) => Level::ERROR,
         _ if task_cancelled => Level::INFO,
         Other(err) => {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index c2b49c0296..296b922599 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2067,9 +2067,6 @@ impl Timeline {
             Err(CompactionError::ShuttingDown) => {
                 // Covered by the `Err(e) if e.is_cancel()` branch.
             }
-            Err(CompactionError::AlreadyRunning(_)) => {
-                // Covered by the `Err(e) if e.is_cancel()` branch.
-            }
             Err(CompactionError::Other(_)) => {
                 self.compaction_failed.store(true, AtomicOrdering::Relaxed)
             }
@@ -6018,8 +6015,6 @@ pub(crate) enum CompactionError {
     CollectKeySpaceError(#[from] CollectKeySpaceError),
     #[error(transparent)]
     Other(anyhow::Error),
-    #[error("Compaction already running: {0}")]
-    AlreadyRunning(&'static str),
 }
 
 impl CompactionError {
@@ -6028,7 +6023,6 @@ impl CompactionError {
         matches!(
             self,
             Self::ShuttingDown
-                | Self::AlreadyRunning(_)
                 | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
                 | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
                     PageReconstructError::Cancelled
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index ac3930fb71..2c0b98c1e2 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -572,7 +572,7 @@ impl GcCompactionQueue {
         match res {
             Ok(res) => Ok(res),
             Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
-            Err(_) => {
+            Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => {
                 // There are some cases where traditional gc might collect some layer
                 // files causing gc-compaction cannot read the full history of the key.
                 // This needs to be resolved in the long-term by improving the compaction
@@ -591,9 +591,9 @@ impl GcCompactionQueue {
         timeline: &Arc<Timeline>,
     ) -> Result<CompactionOutcome, CompactionError> {
         let Ok(_one_op_at_a_time_guard) = self.consumer_lock.try_lock() else {
-            return Err(CompactionError::AlreadyRunning(
-                "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue.",
-            ));
+            return Err(CompactionError::Other(anyhow::anyhow!(
+                "cannot run gc-compaction because another gc-compaction is running. This should not happen because we only call this function from the gc-compaction queue."
+            )));
         };
         let has_pending_tasks;
         let mut yield_for_l0 = false;

From 477ab12b691be1e44f557d45dbe64294009259d4 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 8 Jul 2025 17:46:55 +0100
Subject: [PATCH 021/163] pageserver: touch up broker subscription reset
 (#12503)

## Problem

The goal of this code was to test out if resetting the broker
subscription helps alleviate the issues we've been seeing in staging.
Looks like it did the trick. However, the original version was too
eager.

## Summary of Changes

Only reset the stream when:
* we are waiting for WAL
* there's no connection candidates lined up
* we're not already connected to a safekeeper
---
 .../timeline/walreceiver/connection_manager.rs    | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index 9b151d2449..aba94244a3 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -182,12 +182,19 @@ pub(super) async fn connection_manager_loop_step(
                 }
             },
 
+            // If we've not received any updates from the broker from a while, are waiting for WAL
+            // and have no safekeeper connection or connection candidates, then it might be that
+            // the broker subscription is wedged. Drop the currrent subscription and re-subscribe
+            // with the goal of unblocking it.
             _ = broker_reset_interval.tick() => {
-                if wait_lsn_status.borrow().is_some() {
-                    tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...")
-                }
+                let awaiting_lsn = wait_lsn_status.borrow().is_some();
+                let no_candidates = connection_manager_state.wal_stream_candidates.is_empty();
+                let no_connection = connection_manager_state.wal_connection.is_none();
 
-                broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
+                if awaiting_lsn && no_candidates && no_connection {
+                    tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
+                    broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
+                }
             },
 
             new_event = async {

From a06c560ad05ecec0c13901f97916807259665bfa Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 8 Jul 2025 12:55:00 -0400
Subject: [PATCH 022/163] feat(pageserver): critical path feature flags
 (#12449)

## Problem

Some feature flags are used heavily on the critical path and we want the
"get feature flag" operation as cheap as possible.

## Summary of changes

Add a `test_remote_size_flag` as an example of such flags. In the
future, we can use macro to generate all those fields. The flag is
updated in the housekeeping loop. The retrieval of the flag is simply
reading an atomic flag.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/feature_resolver.rs | 29 ++++++++++++++++++++++++-----
 pageserver/src/tenant.rs           |  8 ++++----
 pageserver/src/tenant/timeline.rs  |  6 +++---
 3 files changed, 31 insertions(+), 12 deletions(-)

diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs
index 6ce4522080..65cac8eea1 100644
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -1,4 +1,8 @@
-use std::{collections::HashMap, sync::Arc, time::Duration};
+use std::{
+    collections::HashMap,
+    sync::{Arc, atomic::AtomicBool},
+    time::Duration,
+};
 
 use arc_swap::ArcSwap;
 use pageserver_api::config::NodeMetadata;
@@ -355,11 +359,17 @@ impl PerTenantProperties {
     }
 }
 
-#[derive(Clone)]
 pub struct TenantFeatureResolver {
     inner: FeatureResolver,
     tenant_id: TenantId,
-    cached_tenant_properties: Arc<ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>>,
+    cached_tenant_properties: ArcSwap<HashMap<String, PostHogFlagFilterPropertyValue>>,
+
+    // Add feature flag on the critical path below.
+    //
+    // If a feature flag will be used on the critical path, we will update it in the tenant housekeeping loop insetad of
+    // resolving directly by calling `evaluate_multivariate` or `evaluate_boolean`. Remember to update the flag in the
+    // housekeeping loop. The user should directly read this atomic flag instead of using the set of evaluate functions.
+    pub feature_test_remote_size_flag: AtomicBool,
 }
 
 impl TenantFeatureResolver {
@@ -367,7 +377,8 @@ impl TenantFeatureResolver {
         Self {
             inner,
             tenant_id,
-            cached_tenant_properties: Arc::new(ArcSwap::new(Arc::new(HashMap::new()))),
+            cached_tenant_properties: ArcSwap::new(Arc::new(HashMap::new())),
+            feature_test_remote_size_flag: AtomicBool::new(false),
         }
     }
 
@@ -396,7 +407,8 @@ impl TenantFeatureResolver {
         self.inner.is_feature_flag_boolean(flag_key)
     }
 
-    pub fn update_cached_tenant_properties(&self, tenant_shard: &TenantShard) {
+    /// Refresh the cached properties and flags on the critical path.
+    pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
         let mut remote_size_mb = None;
         for timeline in tenant_shard.list_timelines() {
             let size = timeline.metrics.resident_physical_size_get();
@@ -410,5 +422,12 @@ impl TenantFeatureResolver {
         self.cached_tenant_properties.store(Arc::new(
             PerTenantProperties { remote_size_mb }.into_posthog_properties(),
         ));
+
+        // BEGIN: Update the feature flag on the critical path.
+        self.feature_test_remote_size_flag.store(
+            self.evaluate_boolean("test-remote-size-flag").is_ok(),
+            std::sync::atomic::Ordering::Relaxed,
+        );
+        // END: Update the feature flag on the critical path.
     }
 }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 49b92915da..96ed4672a6 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -388,7 +388,7 @@ pub struct TenantShard {
 
     l0_flush_global_state: L0FlushGlobalState,
 
-    pub(crate) feature_resolver: TenantFeatureResolver,
+    pub(crate) feature_resolver: Arc<TenantFeatureResolver>,
 }
 impl std::fmt::Debug for TenantShard {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
@@ -3411,7 +3411,7 @@ impl TenantShard {
         }
 
         // Update the feature resolver with the latest tenant-spcific data.
-        self.feature_resolver.update_cached_tenant_properties(self);
+        self.feature_resolver.refresh_properties_and_flags(self);
     }
 
     pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool {
@@ -4500,10 +4500,10 @@ impl TenantShard {
             gc_block: Default::default(),
             l0_flush_global_state,
             basebackup_cache,
-            feature_resolver: TenantFeatureResolver::new(
+            feature_resolver: Arc::new(TenantFeatureResolver::new(
                 feature_resolver,
                 tenant_shard_id.tenant_id,
-            ),
+            )),
         }
     }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 296b922599..44a4f1e911 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -201,7 +201,7 @@ pub struct TimelineResources {
     pub l0_compaction_trigger: Arc<Notify>,
     pub l0_flush_global_state: l0_flush::L0FlushGlobalState,
     pub basebackup_cache: Arc<BasebackupCache>,
-    pub feature_resolver: TenantFeatureResolver,
+    pub feature_resolver: Arc<TenantFeatureResolver>,
 }
 
 pub struct Timeline {
@@ -449,7 +449,7 @@ pub struct Timeline {
     /// A channel to send async requests to prepare a basebackup for the basebackup cache.
     basebackup_cache: Arc<BasebackupCache>,
 
-    feature_resolver: TenantFeatureResolver,
+    feature_resolver: Arc<TenantFeatureResolver>,
 }
 
 pub(crate) enum PreviousHeatmap {
@@ -3122,7 +3122,7 @@ impl Timeline {
 
                 basebackup_cache: resources.basebackup_cache,
 
-                feature_resolver: resources.feature_resolver,
+                feature_resolver: resources.feature_resolver.clone(),
             };
 
             result.repartition_threshold =

From 81e7218c278288cd8e95ed3f50f4e4a423391dc8 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 8 Jul 2025 20:15:13 +0200
Subject: [PATCH 023/163] pageserver: tighten up gRPC `page_api::Client`
 (#12396)

This patch tightens up `page_api::Client`. It's mostly superficial
changes, but also adds a new constructor that takes an existing gRPC
channel, for use with the communicator connection pool.
---
 compute_tools/src/compute.rs                  |   2 +-
 compute_tools/src/lsn_lease.rs                |   2 +-
 pageserver/page_api/src/client.rs             | 327 +++++++++---------
 pageserver/pagebench/src/cmd/basebackup.rs    |   2 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |   2 +-
 5 files changed, 159 insertions(+), 176 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index e870cecc58..f25aff1110 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1057,7 +1057,7 @@ impl ComputeNode {
         };
 
         let (reader, connected) = tokio::runtime::Handle::current().block_on(async move {
-            let mut client = page_api::Client::new(
+            let mut client = page_api::Client::connect(
                 shard0_connstr,
                 spec.tenant_id,
                 spec.timeline_id,
diff --git a/compute_tools/src/lsn_lease.rs b/compute_tools/src/lsn_lease.rs
index 3346c18c0d..bb0828429d 100644
--- a/compute_tools/src/lsn_lease.rs
+++ b/compute_tools/src/lsn_lease.rs
@@ -192,7 +192,7 @@ fn acquire_lsn_lease_grpc(
     lsn: Lsn,
 ) -> Result<Option<SystemTime>> {
     tokio::runtime::Handle::current().block_on(async move {
-        let mut client = page_api::Client::new(
+        let mut client = page_api::Client::connect(
             connstring.to_string(),
             tenant_shard_id.tenant_id,
             timeline_id,
diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs
index 65e41540b8..6523d00d3d 100644
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,23 +1,151 @@
-use anyhow::Result;
+use anyhow::Context as _;
 use futures::{Stream, StreamExt as _, TryStreamExt as _};
 use tokio::io::AsyncRead;
 use tokio_util::io::StreamReader;
+use tonic::codec::CompressionEncoding;
 use tonic::metadata::AsciiMetadataValue;
-use tonic::metadata::errors::InvalidMetadataValue;
-use tonic::transport::Channel;
-use tonic::{Request, Streaming};
+use tonic::service::Interceptor;
+use tonic::service::interceptor::InterceptedService;
+use tonic::transport::{Channel, Endpoint};
 
-use utils::id::TenantId;
-use utils::id::TimelineId;
+use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;
 
-use crate::model;
+use crate::model::*;
 use crate::proto;
 
-///
-/// AuthInterceptor adds tenant, timeline, and auth header to the channel. These
-/// headers are required at the pageserver.
-///
+/// A basic Pageserver gRPC client, for a single tenant shard. This API uses native Rust domain
+/// types from `model` rather than generated Protobuf types.
+pub struct Client {
+    inner: proto::PageServiceClient<InterceptedService<Channel, AuthInterceptor>>,
+}
+
+impl Client {
+    /// Connects to the given gRPC endpoint.
+    pub async fn connect<E>(
+        endpoint: E,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self>
+    where
+        E: TryInto<Endpoint> + Send + Sync + 'static,
+        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
+    {
+        let endpoint: Endpoint = endpoint.try_into().context("invalid endpoint")?;
+        let channel = endpoint.connect().await?;
+        Self::new(
+            channel,
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+            compression,
+        )
+    }
+
+    /// Creates a new client using the given gRPC channel.
+    pub fn new(
+        channel: Channel,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
+    ) -> anyhow::Result<Self> {
+        let auth = AuthInterceptor::new(tenant_id, timeline_id, shard_id, auth_token)?;
+        let mut inner = proto::PageServiceClient::with_interceptor(channel, auth);
+
+        if let Some(compression) = compression {
+            // TODO: benchmark this (including network latency).
+            inner = inner
+                .accept_compressed(compression)
+                .send_compressed(compression);
+        }
+
+        Ok(Self { inner })
+    }
+
+    /// Returns whether a relation exists.
+    pub async fn check_rel_exists(
+        &mut self,
+        req: CheckRelExistsRequest,
+    ) -> tonic::Result<CheckRelExistsResponse> {
+        let req = proto::CheckRelExistsRequest::from(req);
+        let resp = self.inner.check_rel_exists(req).await?.into_inner();
+        Ok(resp.into())
+    }
+
+    /// Fetches a base backup.
+    pub async fn get_base_backup(
+        &mut self,
+        req: GetBaseBackupRequest,
+    ) -> tonic::Result<impl AsyncRead + use<>> {
+        let req = proto::GetBaseBackupRequest::from(req);
+        let chunks = self.inner.get_base_backup(req).await?.into_inner();
+        Ok(StreamReader::new(
+            chunks
+                .map_ok(|resp| resp.chunk)
+                .map_err(std::io::Error::other),
+        ))
+    }
+
+    /// Returns the total size of a database, as # of bytes.
+    pub async fn get_db_size(&mut self, req: GetDbSizeRequest) -> tonic::Result<GetDbSizeResponse> {
+        let req = proto::GetDbSizeRequest::from(req);
+        let resp = self.inner.get_db_size(req).await?.into_inner();
+        Ok(resp.into())
+    }
+
+    /// Fetches pages.
+    ///
+    /// This is implemented as a bidirectional streaming RPC for performance. Per-request errors are
+    /// typically returned as status_code instead of errors, to avoid tearing down the entire stream
+    /// via a tonic::Status error.
+    pub async fn get_pages(
+        &mut self,
+        reqs: impl Stream<Item = GetPageRequest> + Send + 'static,
+    ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
+        let reqs = reqs.map(proto::GetPageRequest::from);
+        let resps = self.inner.get_pages(reqs).await?.into_inner();
+        Ok(resps.map_ok(GetPageResponse::from))
+    }
+
+    /// Returns the size of a relation, as # of blocks.
+    pub async fn get_rel_size(
+        &mut self,
+        req: GetRelSizeRequest,
+    ) -> tonic::Result<GetRelSizeResponse> {
+        let req = proto::GetRelSizeRequest::from(req);
+        let resp = self.inner.get_rel_size(req).await?.into_inner();
+        Ok(resp.into())
+    }
+
+    /// Fetches an SLRU segment.
+    pub async fn get_slru_segment(
+        &mut self,
+        req: GetSlruSegmentRequest,
+    ) -> tonic::Result<GetSlruSegmentResponse> {
+        let req = proto::GetSlruSegmentRequest::from(req);
+        let resp = self.inner.get_slru_segment(req).await?.into_inner();
+        Ok(resp.try_into()?)
+    }
+
+    /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
+    /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
+    ///
+    /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
+    /// acquired because the LSN has already been garbage collected.
+    pub async fn lease_lsn(&mut self, req: LeaseLsnRequest) -> tonic::Result<LeaseLsnResponse> {
+        let req = proto::LeaseLsnRequest::from(req);
+        let resp = self.inner.lease_lsn(req).await?.into_inner();
+        Ok(resp.try_into()?)
+    }
+}
+
+/// Adds authentication metadata to gRPC requests.
 #[derive(Clone)]
 struct AuthInterceptor {
     tenant_id: AsciiMetadataValue,
@@ -30,174 +158,29 @@ impl AuthInterceptor {
     fn new(
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        auth_token: Option<String>,
         shard_id: ShardIndex,
-    ) -> Result<Self, InvalidMetadataValue> {
-        let tenant_ascii: AsciiMetadataValue = tenant_id.to_string().try_into()?;
-        let timeline_ascii: AsciiMetadataValue = timeline_id.to_string().try_into()?;
-        let shard_ascii: AsciiMetadataValue = shard_id.to_string().try_into()?;
-
-        let auth_header: Option<AsciiMetadataValue> = match auth_token {
-            Some(token) => Some(format!("Bearer {token}").try_into()?),
-            None => None,
-        };
-
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
         Ok(Self {
-            tenant_id: tenant_ascii,
-            shard_id: shard_ascii,
-            timeline_id: timeline_ascii,
-            auth_header,
+            tenant_id: tenant_id.to_string().try_into()?,
+            timeline_id: timeline_id.to_string().try_into()?,
+            shard_id: shard_id.to_string().try_into()?,
+            auth_header: auth_token
+                .map(|token| format!("Bearer {token}").try_into())
+                .transpose()?,
         })
     }
 }
 
-impl tonic::service::Interceptor for AuthInterceptor {
-    fn call(&mut self, mut req: tonic::Request<()>) -> Result<tonic::Request<()>, tonic::Status> {
-        req.metadata_mut()
-            .insert("neon-tenant-id", self.tenant_id.clone());
-        req.metadata_mut()
-            .insert("neon-shard-id", self.shard_id.clone());
-        req.metadata_mut()
-            .insert("neon-timeline-id", self.timeline_id.clone());
-        if let Some(auth_header) = &self.auth_header {
-            req.metadata_mut()
-                .insert("authorization", auth_header.clone());
+impl Interceptor for AuthInterceptor {
+    fn call(&mut self, mut req: tonic::Request<()>) -> tonic::Result<tonic::Request<()>> {
+        let metadata = req.metadata_mut();
+        metadata.insert("neon-tenant-id", self.tenant_id.clone());
+        metadata.insert("neon-timeline-id", self.timeline_id.clone());
+        metadata.insert("neon-shard-id", self.shard_id.clone());
+        if let Some(ref auth_header) = self.auth_header {
+            metadata.insert("authorization", auth_header.clone());
         }
         Ok(req)
     }
 }
-
-#[derive(Clone)]
-pub struct Client {
-    client: proto::PageServiceClient<
-        tonic::service::interceptor::InterceptedService<Channel, AuthInterceptor>,
-    >,
-}
-
-impl Client {
-    pub async fn new<T: TryInto<tonic::transport::Endpoint> + Send + Sync + 'static>(
-        into_endpoint: T,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        shard_id: ShardIndex,
-        auth_header: Option<String>,
-        compression: Option<tonic::codec::CompressionEncoding>,
-    ) -> anyhow::Result<Self> {
-        let endpoint: tonic::transport::Endpoint = into_endpoint
-            .try_into()
-            .map_err(|_e| anyhow::anyhow!("failed to convert endpoint"))?;
-        let channel = endpoint.connect().await?;
-        let auth = AuthInterceptor::new(tenant_id, timeline_id, auth_header, shard_id)
-            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
-        let mut client = proto::PageServiceClient::with_interceptor(channel, auth);
-
-        if let Some(compression) = compression {
-            // TODO: benchmark this (including network latency).
-            client = client
-                .accept_compressed(compression)
-                .send_compressed(compression);
-        }
-
-        Ok(Self { client })
-    }
-
-    /// Returns whether a relation exists.
-    pub async fn check_rel_exists(
-        &mut self,
-        req: model::CheckRelExistsRequest,
-    ) -> Result<model::CheckRelExistsResponse, tonic::Status> {
-        let proto_req = proto::CheckRelExistsRequest::from(req);
-
-        let response = self.client.check_rel_exists(proto_req).await?;
-
-        let proto_resp = response.into_inner();
-        Ok(proto_resp.into())
-    }
-
-    /// Fetches a base backup.
-    pub async fn get_base_backup(
-        &mut self,
-        req: model::GetBaseBackupRequest,
-    ) -> Result<impl AsyncRead + use<>, tonic::Status> {
-        let req = proto::GetBaseBackupRequest::from(req);
-        let chunks = self.client.get_base_backup(req).await?.into_inner();
-        let reader = StreamReader::new(
-            chunks
-                .map_ok(|resp| resp.chunk)
-                .map_err(std::io::Error::other),
-        );
-        Ok(reader)
-    }
-
-    /// Returns the total size of a database, as # of bytes.
-    pub async fn get_db_size(
-        &mut self,
-        req: model::GetDbSizeRequest,
-    ) -> Result<u64, tonic::Status> {
-        let proto_req = proto::GetDbSizeRequest::from(req);
-
-        let response = self.client.get_db_size(proto_req).await?;
-        Ok(response.into_inner().into())
-    }
-
-    /// Fetches pages.
-    ///
-    /// This is implemented as a bidirectional streaming RPC for performance.
-    /// Per-request errors are often returned as status_code instead of errors,
-    /// to avoid tearing down the entire stream via tonic::Status.
-    pub async fn get_pages<ReqSt>(
-        &mut self,
-        inbound: ReqSt,
-    ) -> Result<
-        impl Stream<Item = Result<model::GetPageResponse, tonic::Status>> + Send + 'static,
-        tonic::Status,
-    >
-    where
-        ReqSt: Stream<Item = model::GetPageRequest> + Send + 'static,
-    {
-        let outbound_proto = inbound.map(|domain_req| domain_req.into());
-
-        let req_new = Request::new(outbound_proto);
-
-        let response_stream: Streaming<proto::GetPageResponse> =
-            self.client.get_pages(req_new).await?.into_inner();
-
-        let domain_stream = response_stream.map_ok(model::GetPageResponse::from);
-
-        Ok(domain_stream)
-    }
-
-    /// Returns the size of a relation, as # of blocks.
-    pub async fn get_rel_size(
-        &mut self,
-        req: model::GetRelSizeRequest,
-    ) -> Result<model::GetRelSizeResponse, tonic::Status> {
-        let proto_req = proto::GetRelSizeRequest::from(req);
-        let response = self.client.get_rel_size(proto_req).await?;
-        let proto_resp = response.into_inner();
-        Ok(proto_resp.into())
-    }
-
-    /// Fetches an SLRU segment.
-    pub async fn get_slru_segment(
-        &mut self,
-        req: model::GetSlruSegmentRequest,
-    ) -> Result<model::GetSlruSegmentResponse, tonic::Status> {
-        let proto_req = proto::GetSlruSegmentRequest::from(req);
-        let response = self.client.get_slru_segment(proto_req).await?;
-        Ok(response.into_inner().try_into()?)
-    }
-
-    /// Acquires or extends a lease on the given LSN. This guarantees that the Pageserver won't
-    /// garbage collect the LSN until the lease expires. Must be acquired on all relevant shards.
-    ///
-    /// Returns the lease expiration time, or a FailedPrecondition status if the lease could not be
-    /// acquired because the LSN has already been garbage collected.
-    pub async fn lease_lsn(
-        &mut self,
-        req: model::LeaseLsnRequest,
-    ) -> Result<model::LeaseLsnResponse, tonic::Status> {
-        let req = proto::LeaseLsnRequest::from(req);
-        Ok(self.client.lease_lsn(req).await?.into_inner().try_into()?)
-    }
-}
diff --git a/pageserver/pagebench/src/cmd/basebackup.rs b/pageserver/pagebench/src/cmd/basebackup.rs
index 4b7a70504a..c14bb73136 100644
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -326,7 +326,7 @@ impl GrpcClient {
         ttid: TenantTimelineId,
         compression: bool,
     ) -> anyhow::Result<Self> {
-        let inner = page_api::Client::new(
+        let inner = page_api::Client::connect(
             connstring.to_string(),
             ttid.tenant_id,
             ttid.timeline_id,
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index a297819e9b..f14caf548c 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -625,7 +625,7 @@ impl GrpcClient {
         ttid: TenantTimelineId,
         compression: bool,
     ) -> anyhow::Result<Self> {
-        let mut client = page_api::Client::new(
+        let mut client = page_api::Client::connect(
             connstring.to_string(),
             ttid.tenant_id,
             ttid.timeline_id,

From d63f1d259adeab70fc307dd9736f90c5db513409 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 8 Jul 2025 21:33:25 +0300
Subject: [PATCH 024/163] avoid assertion failure about calling palloc() in
 critical section

---
 pgxn/neon/communicator_new.c | 45 ++++++++++++++++++++++++------------
 1 file changed, 30 insertions(+), 15 deletions(-)

diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index 44070dd72d..bc27942bb1 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -1199,112 +1199,127 @@ communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumbe
 static char *
 print_neon_io_request(NeonIORequest *request)
 {
+	static char buf[100];
+
 	switch (request->tag)
 	{
 		case NeonIORequest_Empty:
-			return pstrdup("Empty");
-			break;
+			snprintf(buf, sizeof(buf), "Empty");
+			return buf;
 		case NeonIORequest_RelExists:
 			{
 				CRelExistsRequest *r = &request->rel_exists;
 
-				return psprintf("RelExists: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+				snprintf(buf, sizeof(buf), "RelExists: req " UINT64_FORMAT " rel %u/%u/%u.%u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+				return buf;
 			}
 		case NeonIORequest_RelSize:
 			{
 				CRelSizeRequest *r = &request->rel_size;
 
-				return psprintf("RelSize: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+				snprintf(buf, sizeof(buf), "RelSize: req " UINT64_FORMAT " rel %u/%u/%u.%u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+				return buf;
 			}
 		case NeonIORequest_GetPageV:
 			{
 				CGetPageVRequest *r = &request->get_page_v;
 
-				return psprintf("GetPageV: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %d-%d",
+				snprintf(buf, sizeof(buf), "GetPageV: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %d-%d",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks);
+				return buf;
 			}
 		case NeonIORequest_PrefetchV:
 			{
 				CPrefetchVRequest *r = &request->prefetch_v;
 
-				return psprintf("PrefetchV: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %d-%d",
+				snprintf(buf, sizeof(buf), "PrefetchV: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %d-%d",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks);
+				return buf;
 			}
 		case NeonIORequest_DbSize:
 			{
 				CDbSizeRequest *r = &request->db_size;
 
-				return psprintf("PrefetchV: req " UINT64_FORMAT " db %u",
+				snprintf(buf, sizeof(buf), "PrefetchV: req " UINT64_FORMAT " db %u",
 								r->request_id, r->db_oid);
+				return buf;
 			}
 		case NeonIORequest_WritePage:
 			{
 				CWritePageRequest *r = &request->write_page;
 
-				return psprintf("WritePage: req " UINT64_FORMAT " rel %u/%u/%u.%u blk %u lsn %X/%X",
+				snprintf(buf, sizeof(buf), "WritePage: req " UINT64_FORMAT " rel %u/%u/%u.%u blk %u lsn %X/%X",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number,
 								LSN_FORMAT_ARGS(r->lsn));
+				return buf;
 			}
 		case NeonIORequest_RelExtend:
 			{
 				CRelExtendRequest *r = &request->rel_extend;
 
-				return psprintf("RelExtend: req " UINT64_FORMAT " rel %u/%u/%u.%u blk %u lsn %X/%X",
+				snprintf(buf, sizeof(buf), "RelExtend: req " UINT64_FORMAT " rel %u/%u/%u.%u blk %u lsn %X/%X",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number,
 								LSN_FORMAT_ARGS(r->lsn));
+				return buf;
 			}
 		case NeonIORequest_RelZeroExtend:
 			{
 				CRelZeroExtendRequest *r = &request->rel_zero_extend;
 
-				return psprintf("RelZeroExtend: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %u-%u lsn %X/%X",
+				snprintf(buf, sizeof(buf), "RelZeroExtend: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %u-%u lsn %X/%X",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks,
 								LSN_FORMAT_ARGS(r->lsn));
+				return buf;
 			}
 		case NeonIORequest_RelCreate:
 			{
 				CRelCreateRequest *r = &request->rel_create;
 
-				return psprintf("RelCreate: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+				snprintf(buf, sizeof(buf), "RelCreate: req " UINT64_FORMAT " rel %u/%u/%u.%u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+				return buf;
 			}
 		case NeonIORequest_RelTruncate:
 			{
 				CRelTruncateRequest *r = &request->rel_truncate;
 
-				return psprintf("RelTruncate: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %u",
+				snprintf(buf, sizeof(buf), "RelTruncate: req " UINT64_FORMAT " rel %u/%u/%u.%u blks %u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->nblocks);
+				return buf;
 			}
 		case NeonIORequest_RelUnlink:
 			{
 				CRelUnlinkRequest *r = &request->rel_unlink;
 
-				return psprintf("RelUnlink: req " UINT64_FORMAT " rel %u/%u/%u.%u",
+				snprintf(buf, sizeof(buf), "RelUnlink: req " UINT64_FORMAT " rel %u/%u/%u.%u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
+				return buf;
 			}
 		case NeonIORequest_ForgetCache:
 			{
 				CForgetCacheRequest *r = &request->forget_cache;
 
-				return psprintf("ForgetCache: req " UINT64_FORMAT " rel %u/%u/%u.%u blocks: %u",
+				snprintf(buf, sizeof(buf), "ForgetCache: req " UINT64_FORMAT " rel %u/%u/%u.%u blocks: %u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number,
 					r->nblocks);
+				return buf;
 			}
 	}
-	return psprintf("Unknown request type %u", request->tag);
+	snprintf(buf, sizeof(buf), "Unknown request type %d", (int) request->tag);
+	return buf;
 }
 
 
From 3dad4698ec134e5e5491fa7c9c5195041216e248 Mon Sep 17 00:00:00 2001
From: HaoyuHuang <haoyu.huang.68@gmail.com>
Date: Tue, 8 Jul 2025 12:43:01 -0700
Subject: [PATCH 025/163] PS changes #1 (#12467)

# TLDR
All changes are no-op except
1. publishing additional metrics.
2. problem VI

## Problem I

It has come to my attention that the Neon Storage Controller doesn't
correctly update its "observed" state of tenants previously associated
with PSs that has come back up after a local data loss. It would still
think that the old tenants are still attached to page servers and won't
ask more questions. The pageserver has enough information from the
reattach request/response to tell that something is wrong, but it
doesn't do anything about it either. We need to detect this situation in
production while I work on a fix.

(I think there is just some misunderstanding about how Neon manages
their pageserver deployments which got me confused about all the
invariants.)

## Summary of changes I

Added a `pageserver_local_data_loss_suspected` gauge metric that will be
set to 1 if we detect a problematic situation from the reattch response.
The problematic situation is when the PS doesn't have any local tenants
but received a reattach response containing tenants.

We can set up an alert using this metric. The alert should be raised
whenever this metric reports non-zero number.

Also added a HTTP PUT
`http://pageserver/hadron-internal/reset_alert_gauges` API on the
pageserver that can be used to reset the gauge and the alert once we
manually rectify the situation (by restarting the HCC).

## Problem II
Azure upload is 3x slower than AWS. -> 3x slower ingestion.

The reason for the slower upload is that Azure upload in page server is
much slower => higher flush latency => higher disk consistent LSN =>
higher back pressure.

## Summary of changes II
Use Azure put_block API to uploads a 1 GB layer file in 8 blocks in
parallel.

I set the put_block block size to be 128 MB by default in azure config.

To minimize neon changes, upload function passes the layer file path to
the azure upload code through the storage metadata. This allows the
azure put block to use FileChunkStreamRead to stream read from one
partition in the file instead of loading all file data in memory and
split it into 8 128 MB chunks.

## How is this tested? II
1. rust test_real_azure tests the put_block change.
3. I deployed the change in azure dev and saw flush latency reduces from
~30 seconds to 10 seconds.
4. I also did a bunch of stress test using sqlsmith and 100 GB TPCDS
runs.

## Problem III
Currently Neon limits the compaction tasks as 3/4 * CPU cores. This
limits the overall compaction throughput and it can easily cause
head-of-the-line blocking problems when a few large tenants are
compacting.

## Summary of changes III
This PR increases the limit of compaction tasks as `BG_TASKS_PER_THREAD`
(default 4) * CPU cores. Note that `CONCURRENT_BACKGROUND_TASKS` also
limits some other tasks `logical_size_calculation` and `layer eviction`
. But compaction should be the most frequent and time-consuming task.

## Summary of changes IV
This PR adds the following PageServer metrics:
1. `pageserver_disk_usage_based_eviction_evicted_bytes_total`: captures
the total amount of bytes evicted. It's more straightforward to see the
bytes directly instead of layers.
2. `pageserver_active_storage_operations_count`: captures the active
storage operation, e.g., flush, L0 compaction, image creation etc. It's
useful to visualize these active operations to get a better idea of what
PageServers are spending cycles on in the background.

## Summary of changes V
When investigating data corruptions, it's useful to search the base
image and all WAL records of a page up to an LSN, i.e., a breakdown of
GetPage@LSN request. This PR implements this functionality with two
tools:

1. Extended `pagectl` with a new command to search the layer files for a
given key up to a given LSN from the `index_part.json` file. The output
can be used to download the files from S3 and then search the file
contents using the second tool.
Example usage:
```
cargo run --bin pagectl index-part search --tenant-id 09b99ea3239bbb3b2d883a59f087659d --timeline-id 7bedf4a6995baff7c0421ff9aebbcdab --path ~/Downloads/corruption/index_part.json-0000000c-formatted --key 000000067F000080140000802100000D61BD --lsn 70C/BF3D61D8
```
Example output:
```
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F0000801400008028000002FEFF__000007089F0B5381-0000070C7679EEB9-0000000c
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000000000000000000000000000000000-000000067F0000801400008028000002F3F1__000006DD95B6F609-000006E2BA14C369-0000000c
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F000080140000802100001B0973__000006D33429F539-000006DD95B6F609-0000000c
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F00008014000080210000164D81__000006C6343B2D31-000006D33429F539-0000000b
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F0000801400008021000017687B__000006BA344FA7F1-000006C6343B2D31-0000000b
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F00008014000080210000165BAB__000006AD34613D19-000006BA344FA7F1-0000000b
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F0000801400000B180000000002-000000067F00008014000080210000137A39__0000069F34773461-000006AD34613D19-0000000b
tenants/09b99ea3239bbb3b2d883a59f087659d-0304/timelines/7bedf4a6995baff7c0421ff9aebbcdab/000000067F000080140000802100000D4000-000000067F000080140000802100000F0000__0000069F34773460-0000000b
```

2. Added a unit test to search the layer file contents. It's not
implemented part of `pagectl` because it depends on some test harness
code, which can only be used by unit tests.

Example usage:
```
cargo test --package pageserver --lib -- tenant::debug::test_search_key --exact --nocapture -- --tenant-id 09b99ea3239bbb3b2d883a59f087659d --timeline-id 7bedf4a6995baff7c0421ff9aebbcdab --data-dir /Users/chen.luo/Downloads/corruption --key 000000067F000080140000802100000D61BD --lsn 70C/BF3D61D8
```
Example output:
```
# omitted image for brievity
delta: 69F/769D8180: will_init: false, "OgAAALGkuwXwYp12nwYAAECGAAASIqLHAAAAAH8GAAAUgAAAIYAAAL1hDQD/DLGkuwUDAAAAEAAWAA=="
delta: 69F/769CB6D8: will_init: false, "PQAAALGkuwXotZx2nwYAABAJAAAFk7tpACAGAH8GAAAUgAAAIYAAAL1hDQD/CQUAEAASALExuwUBAAAAAA=="
```

## Problem VI
Currently when page service resolves shards from page numbers, it
doesn't fully support the case that the shard could be split in the
middle. This will lead to query failures during the tenant split for
either commit or abort cases (it's mostly for abort).

## Summary of changes VI
This PR adds retry logic in `Cache::get()` to deal with shard resolution
errors more gracefully. Specifically, it'll clear the cache and retry,
instead of failing the query immediately. It also reduces the internal
timeout to make retries faster.

The PR also fixes a very obvious bug in
`TenantManager::resolve_attached_shard` where the code tries to cache
the computed the shard number, but forgot to recompute when the shard
count is different.

---------

Co-authored-by: William Huang <william.huang@databricks.com>
Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
Co-authored-by: Chen Luo <chen.luo@databricks.com>
Co-authored-by: Vlad Lazar <vlad.lazar@databricks.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 Cargo.lock                                    |   3 +
 libs/remote_storage/Cargo.toml                |   3 +
 libs/remote_storage/src/azure_blob.rs         | 143 ++++++-
 libs/remote_storage/src/config.rs             |  18 +
 libs/remote_storage/tests/common/mod.rs       |  34 +-
 libs/remote_storage/tests/test_real_azure.rs  |   3 +
 pageserver/Cargo.toml                         |   1 +
 pageserver/ctl/src/index_part.rs              | 102 ++++-
 pageserver/src/disk_usage_eviction_task.rs    |   3 +
 pageserver/src/http/routes.rs                 |  15 +
 pageserver/src/metrics.rs                     |  91 ++++-
 pageserver/src/page_service.rs                |   6 +-
 pageserver/src/tenant.rs                      |  16 +-
 pageserver/src/tenant/debug.rs                | 366 ++++++++++++++++++
 pageserver/src/tenant/mgr.rs                  |  17 +-
 .../tenant/remote_timeline_client/upload.rs   |  22 +-
 pageserver/src/tenant/tasks.rs                |  15 +
 pageserver/src/tenant/timeline.rs             |   2 +-
 pageserver/src/tenant/timeline/handle.rs      |  49 ++-
 pageserver/src/walredo.rs                     |  70 ++--
 test_runner/fixtures/metrics.py               |   3 +
 .../fixtures/pageserver/allowed_errors.py     |   8 +
 test_runner/regress/test_sharding.py          | 168 ++++++++
 23 files changed, 1097 insertions(+), 61 deletions(-)
 create mode 100644 pageserver/src/tenant/debug.rs

diff --git a/Cargo.lock b/Cargo.lock
index 237defaec3..39c43d94a3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4339,6 +4339,7 @@ dependencies = [
  "arc-swap",
  "async-compression",
  "async-stream",
+ "base64 0.22.1",
  "bincode",
  "bit_field",
  "byteorder",
@@ -5684,6 +5685,8 @@ dependencies = [
  "azure_identity",
  "azure_storage",
  "azure_storage_blobs",
+ "base64 0.22.1",
+ "byteorder",
  "bytes",
  "camino",
  "camino-tempfile",
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index bd18d80915..69316fd493 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -13,6 +13,7 @@ aws-smithy-async.workspace = true
 aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
+base64.workspace = true
 bytes.workspace = true
 camino = { workspace = true, features = ["serde1"] }
 humantime-serde.workspace = true
@@ -41,6 +42,8 @@ http-body-util.workspace = true
 itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }
 
+byteorder = "1.4"
+
 [dev-dependencies]
 camino-tempfile.workspace = true
 test-context.workspace = true
diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs
index e9c24ac723..db30829216 100644
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -14,17 +14,25 @@ use anyhow::{Context, Result, anyhow};
 use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range};
 use azure_core::{Continuable, HttpClient, RetryOptions, TransportOptions};
 use azure_storage::StorageCredentials;
-use azure_storage_blobs::blob::operations::GetBlobBuilder;
+use azure_storage_blobs::blob::BlobBlockType;
+use azure_storage_blobs::blob::BlockList;
 use azure_storage_blobs::blob::{Blob, CopyStatus};
 use azure_storage_blobs::container::operations::ListBlobsBuilder;
-use azure_storage_blobs::prelude::{ClientBuilder, ContainerClient};
+use azure_storage_blobs::prelude::ClientBuilder;
+use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use base64::{Engine as _, engine::general_purpose::URL_SAFE};
+use byteorder::{BigEndian, ByteOrder};
 use bytes::Bytes;
+use camino::Utf8Path;
 use futures::FutureExt;
 use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::{StreamExt, TryStreamExt};
 use http_types::{StatusCode, Url};
 use scopeguard::ScopeGuard;
+use tokio::fs::File;
+use tokio::io::AsyncReadExt;
+use tokio::io::AsyncSeekExt;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;
 use utils::backoff;
@@ -51,6 +59,9 @@ pub struct AzureBlobStorage {
 
     // Alternative timeout used for metadata objects which are expected to be small
     pub small_timeout: Duration,
+    /* BEGIN_HADRON */
+    pub put_block_size_mb: Option<usize>,
+    /* END_HADRON */
 }
 
 impl AzureBlobStorage {
@@ -107,6 +118,9 @@ impl AzureBlobStorage {
             concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
             timeout,
             small_timeout,
+            /* BEGIN_HADRON */
+            put_block_size_mb: azure_config.put_block_size_mb,
+            /* END_HADRON */
         })
     }
 
@@ -583,31 +597,137 @@ impl RemoteStorage for AzureBlobStorage {
 
         let started_at = start_measuring_requests(kind);
 
-        let op = async {
+        let mut metadata_map = metadata.unwrap_or([].into());
+        let timeline_file_path = metadata_map.0.remove("databricks_azure_put_block");
+
+        /* BEGIN_HADRON */
+        let op = async move {
             let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+            let put_block_size = self.put_block_size_mb.unwrap_or(0) * 1024 * 1024;
+            if timeline_file_path.is_none() || put_block_size == 0 {
+                // Use put_block_blob directly.
+                let from: Pin<
+                    Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
+                > = Box::pin(from);
+                let from = NonSeekableStream::new(from, data_size_bytes);
+                let body = azure_core::Body::SeekableStream(Box::new(from));
 
-            let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
-                Box::pin(from);
+                let mut builder = blob_client.put_block_blob(body);
+                if !metadata_map.0.is_empty() {
+                    builder = builder.metadata(to_azure_metadata(metadata_map));
+                }
+                let fut = builder.into_future();
+                let fut = tokio::time::timeout(self.timeout, fut);
+                let result = fut.await;
+                match result {
+                    Ok(Ok(_response)) => return Ok(()),
+                    Ok(Err(azure)) => return Err(azure.into()),
+                    Err(_timeout) => return Err(TimeoutOrCancel::Timeout.into()),
+                };
+            }
+            // Upload chunks concurrently using Put Block.
+            // Each PutBlock uploads put_block_size bytes of the file.
+            let mut upload_futures: Vec<tokio::task::JoinHandle<Result<(), azure_core::Error>>> =
+                vec![];
+            let mut block_list = BlockList::default();
+            let mut start_bytes = 0u64;
+            let mut remaining_bytes = data_size_bytes;
+            let mut block_list_count = 0;
 
-            let from = NonSeekableStream::new(from, data_size_bytes);
+            while remaining_bytes > 0 {
+                let block_size = std::cmp::min(remaining_bytes, put_block_size);
+                let end_bytes = start_bytes + block_size as u64;
+                let block_id = block_list_count;
+                let timeout = self.timeout;
+                let blob_client = blob_client.clone();
+                let timeline_file = timeline_file_path.clone().unwrap().clone();
 
-            let body = azure_core::Body::SeekableStream(Box::new(from));
+                let mut encoded_block_id = [0u8; 8];
+                BigEndian::write_u64(&mut encoded_block_id, block_id);
+                URL_SAFE.encode(encoded_block_id);
 
-            let mut builder = blob_client.put_block_blob(body);
+                // Put one block.
+                let part_fut = async move {
+                    let mut file = File::open(Utf8Path::new(&timeline_file.clone())).await?;
+                    file.seek(io::SeekFrom::Start(start_bytes)).await?;
+                    let limited_reader = file.take(block_size as u64);
+                    let file_chunk_stream =
+                        tokio_util::io::ReaderStream::with_capacity(limited_reader, 1024 * 1024);
+                    let file_chunk_stream_pin: Pin<
+                        Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>,
+                    > = Box::pin(file_chunk_stream);
+                    let stream_wrapper = NonSeekableStream::new(file_chunk_stream_pin, block_size);
+                    let body = azure_core::Body::SeekableStream(Box::new(stream_wrapper));
+                    // Azure put block takes URL-encoded block ids and all blocks must have the same byte length.
+                    // https://learn.microsoft.com/en-us/rest/api/storageservices/put-block?tabs=microsoft-entra-id#uri-parameters
+                    let builder = blob_client.put_block(encoded_block_id.to_vec(), body);
+                    let fut = builder.into_future();
+                    let fut = tokio::time::timeout(timeout, fut);
+                    let result = fut.await;
+                    tracing::debug!(
+                        "azure put block id-{} size {} start {} end {} file {} response {:#?}",
+                        block_id,
+                        block_size,
+                        start_bytes,
+                        end_bytes,
+                        timeline_file,
+                        result
+                    );
+                    match result {
+                        Ok(Ok(_response)) => Ok(()),
+                        Ok(Err(azure)) => Err(azure),
+                        Err(_timeout) => Err(azure_core::Error::new(
+                            azure_core::error::ErrorKind::Io,
+                            std::io::Error::new(
+                                std::io::ErrorKind::TimedOut,
+                                "Operation timed out",
+                            ),
+                        )),
+                    }
+                };
+                upload_futures.push(tokio::spawn(part_fut));
 
-            if let Some(metadata) = metadata {
-                builder = builder.metadata(to_azure_metadata(metadata));
+                block_list_count += 1;
+                remaining_bytes -= block_size;
+                start_bytes += block_size as u64;
+
+                block_list
+                    .blocks
+                    .push(BlobBlockType::Uncommitted(encoded_block_id.to_vec().into()));
             }
 
+            tracing::debug!(
+                "azure put blocks {} total MB: {} chunk size MB: {}",
+                block_list_count,
+                data_size_bytes / 1024 / 1024,
+                put_block_size / 1024 / 1024
+            );
+            // Wait for all blocks to be uploaded.
+            let upload_results = futures::future::try_join_all(upload_futures).await;
+            if upload_results.is_err() {
+                return Err(anyhow::anyhow!(format!(
+                    "Failed to upload all blocks {:#?}",
+                    upload_results.unwrap_err()
+                )));
+            }
+
+            // Commit the blocks.
+            let mut builder = blob_client.put_block_list(block_list);
+            if !metadata_map.0.is_empty() {
+                builder = builder.metadata(to_azure_metadata(metadata_map));
+            }
             let fut = builder.into_future();
             let fut = tokio::time::timeout(self.timeout, fut);
+            let result = fut.await;
+            tracing::debug!("azure put block list response {:#?}", result);
 
-            match fut.await {
+            match result {
                 Ok(Ok(_response)) => Ok(()),
                 Ok(Err(azure)) => Err(azure.into()),
                 Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
             }
         };
+        /* END_HADRON */
 
         let res = tokio::select! {
             res = op => res,
@@ -622,7 +742,6 @@ impl RemoteStorage for AzureBlobStorage {
         crate::metrics::BUCKET_METRICS
             .req_seconds
             .observe_elapsed(kind, outcome, started_at);
-
         res
     }
 
diff --git a/libs/remote_storage/src/config.rs b/libs/remote_storage/src/config.rs
index 5bc1f678ae..e13e17d544 100644
--- a/libs/remote_storage/src/config.rs
+++ b/libs/remote_storage/src/config.rs
@@ -195,8 +195,19 @@ pub struct AzureConfig {
     pub max_keys_per_list_response: Option<i32>,
     #[serde(default = "default_azure_conn_pool_size")]
     pub conn_pool_size: usize,
+    /* BEGIN_HADRON */
+    #[serde(default = "default_azure_put_block_size_mb")]
+    pub put_block_size_mb: Option<usize>,
+    /* END_HADRON */
 }
 
+/* BEGIN_HADRON */
+fn default_azure_put_block_size_mb() -> Option<usize> {
+    // Disable parallel upload by default.
+    Some(0)
+}
+/* END_HADRON */
+
 fn default_remote_storage_azure_concurrency_limit() -> NonZeroUsize {
     NonZeroUsize::new(DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT).unwrap()
 }
@@ -213,6 +224,9 @@ impl Debug for AzureConfig {
                 "max_keys_per_list_response",
                 &self.max_keys_per_list_response,
             )
+            /* BEGIN_HADRON */
+            .field("put_block_size_mb", &self.put_block_size_mb)
+            /* END_HADRON */
             .finish()
     }
 }
@@ -352,6 +366,7 @@ timeout = '5s'";
     upload_storage_class = 'INTELLIGENT_TIERING'
     timeout = '7s'
     conn_pool_size = 8
+    put_block_size_mb = 1024
     ";
 
         let config = parse(toml).unwrap();
@@ -367,6 +382,9 @@ timeout = '5s'";
                     concurrency_limit: default_remote_storage_azure_concurrency_limit(),
                     max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
                     conn_pool_size: 8,
+                    /* BEGIN_HADRON */
+                    put_block_size_mb: Some(1024),
+                    /* END_HADRON */
                 }),
                 timeout: Duration::from_secs(7),
                 small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT
diff --git a/libs/remote_storage/tests/common/mod.rs b/libs/remote_storage/tests/common/mod.rs
index daab05d91a..fb7d6fd482 100644
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -165,10 +165,42 @@ pub(crate) async fn upload_remote_data(
 
             let (data, data_len) =
                 upload_stream(format!("remote blob data {i}").into_bytes().into());
+
+            /* BEGIN_HADRON */
+            let mut metadata = None;
+            if matches!(&*task_client, GenericRemoteStorage::AzureBlob(_)) {
+                let file_path = "/tmp/dbx_upload_tmp_file.txt";
+                {
+                    // Open the file in append mode
+                    let mut file = std::fs::OpenOptions::new()
+                        .append(true)
+                        .create(true) // Create the file if it doesn't exist
+                        .open(file_path)?;
+                    // Append some bytes to the file
+                    std::io::Write::write_all(
+                        &mut file,
+                        &format!("remote blob data {i}").into_bytes(),
+                    )?;
+                    file.sync_all()?;
+                }
+                metadata = Some(remote_storage::StorageMetadata::from([(
+                    "databricks_azure_put_block",
+                    file_path,
+                )]));
+            }
+            /* END_HADRON */
+
             task_client
-                .upload(data, data_len, &blob_path, None, &cancel)
+                .upload(data, data_len, &blob_path, metadata, &cancel)
                 .await?;
 
+            // TODO: Check upload is using the put_block upload.
+            // We cannot consume data here since data is moved inside the upload.
+            // let total_bytes = data.fold(0, |acc, chunk| async move {
+            //     acc + chunk.map(|bytes| bytes.len()).unwrap_or(0)
+            // }).await;
+            // assert_eq!(total_bytes, data_len);
+
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
     }
diff --git a/libs/remote_storage/tests/test_real_azure.rs b/libs/remote_storage/tests/test_real_azure.rs
index 31c9ca3200..4d7caabd39 100644
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -219,6 +219,9 @@ async fn create_azure_client(
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
             max_keys_per_list_response,
             conn_pool_size: 8,
+            /* BEGIN_HADRON */
+            put_block_size_mb: Some(1),
+            /* END_HADRON */
         }),
         timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
         small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 8a2e2ed3be..1fd0dccff0 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -112,6 +112,7 @@ twox-hash.workspace = true
 procfs.workspace = true
 
 [dev-dependencies]
+base64.workspace = true
 criterion.workspace = true
 hex-literal.workspace = true
 tokio = { workspace = true, features = ["process", "sync", "fs", "rt", "io-util", "time", "test-util"] }
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 6cce2844c7..838d00e490 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,10 +1,101 @@
+use std::str::FromStr;
+
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use pageserver::tenant::IndexPart;
+use pageserver::tenant::{
+    IndexPart,
+    layer_map::{LayerMap, SearchResult},
+    remote_timeline_client::remote_layer_path,
+    storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
+};
+use pageserver_api::key::Key;
+use utils::{
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+    shard::TenantShardId,
+};
 
 #[derive(clap::Subcommand)]
 pub(crate) enum IndexPartCmd {
-    Dump { path: Utf8PathBuf },
+    Dump {
+        path: Utf8PathBuf,
+    },
+    /// Find all layers that need to be searched to construct the given page at the given LSN.
+    Search {
+        #[arg(long)]
+        tenant_id: String,
+        #[arg(long)]
+        timeline_id: String,
+        #[arg(long)]
+        path: Utf8PathBuf,
+        #[arg(long)]
+        key: String,
+        #[arg(long)]
+        lsn: String,
+    },
+}
+
+async fn search_layers(
+    tenant_id: &str,
+    timeline_id: &str,
+    path: &Utf8PathBuf,
+    key: &str,
+    lsn: &str,
+) -> anyhow::Result<()> {
+    let tenant_id = TenantId::from_str(tenant_id).unwrap();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let timeline_id = TimelineId::from_str(timeline_id).unwrap();
+    let index_json = {
+        let bytes = tokio::fs::read(path).await?;
+        IndexPart::from_json_bytes(&bytes).unwrap()
+    };
+    let mut layer_map = LayerMap::default();
+    {
+        let mut updates = layer_map.batch_update();
+        for (key, value) in index_json.layer_metadata.iter() {
+            updates.insert_historic(PersistentLayerDesc::from_filename(
+                tenant_shard_id,
+                timeline_id,
+                key.clone(),
+                value.file_size,
+            ));
+        }
+    }
+    let key = Key::from_hex(key)?;
+
+    let lsn = Lsn::from_str(lsn).unwrap();
+    let mut end_lsn = lsn;
+    loop {
+        let result = layer_map.search(key, end_lsn);
+        match result {
+            Some(SearchResult { layer, lsn_floor }) => {
+                let disk_layer = match layer {
+                    ReadableLayerWeak::PersistentLayer(layer) => layer,
+                    ReadableLayerWeak::InMemoryLayer(_) => {
+                        anyhow::bail!("unexpected in-memory layer")
+                    }
+                };
+
+                let metadata = index_json
+                    .layer_metadata
+                    .get(&disk_layer.layer_name())
+                    .unwrap();
+                println!(
+                    "{}",
+                    remote_layer_path(
+                        &tenant_id,
+                        &timeline_id,
+                        metadata.shard,
+                        &disk_layer.layer_name(),
+                        metadata.generation
+                    )
+                );
+                end_lsn = lsn_floor;
+            }
+            None => break,
+        }
+    }
+    Ok(())
 }
 
 pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
@@ -16,5 +107,12 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
             println!("{output}");
             Ok(())
         }
+        IndexPartCmd::Search {
+            tenant_id,
+            timeline_id,
+            path,
+            key,
+            lsn,
+        } => search_layers(tenant_id, timeline_id, path, key, lsn).await,
     }
 }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index f13b3709f5..e6529fb201 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -458,6 +458,9 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                 match next {
                     Ok(Ok(file_size)) => {
                         METRICS.layers_evicted.inc();
+                        /*BEGIN_HADRON */
+                        METRICS.bytes_evicted.inc_by(file_size);
+                        /*END_HADRON */
                         usage_assumed.add_available_bytes(file_size);
                     }
                     Ok(Err((
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 55582659df..0e40dbcd15 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -61,6 +61,7 @@ use crate::context;
 use crate::context::{DownloadBehavior, RequestContext, RequestContextBuilder};
 use crate::deletion_queue::DeletionQueueClient;
 use crate::feature_resolver::FeatureResolver;
+use crate::metrics::LOCAL_DATA_LOSS_SUSPECTED;
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::LocationConf;
@@ -3628,6 +3629,17 @@ async fn activate_post_import_handler(
     .await
 }
 
+// [Hadron] Reset gauge metrics that are used to raised alerts. We need this API as a stop-gap measure to reset alerts
+// after we manually rectify situations such as local SSD data loss. We will eventually automate this.
+async fn hadron_reset_alert_gauges(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    LOCAL_DATA_LOSS_SUSPECTED.set(0);
+    json_response(StatusCode::OK, ())
+}
+
 /// Read the end of a tar archive.
 ///
 /// A tar archive normally ends with two consecutive blocks of zeros, 512 bytes each.
@@ -4154,5 +4166,8 @@ pub fn make_router(
         .post("/v1/feature_flag_spec", |r| {
             api_handler(r, update_feature_flag_spec)
         })
+        .post("/hadron-internal/reset_alert_gauges", |r| {
+            api_handler(r, hadron_reset_alert_gauges)
+        })
         .any(handler_404))
 }
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 21faceef49..eb89e166b2 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,3 +1,4 @@
+use std::cell::Cell;
 use std::collections::HashMap;
 use std::num::NonZeroUsize;
 use std::os::fd::RawFd;
@@ -102,7 +103,18 @@ pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::n
     .expect("failed to define a metric")
 });
 
-// Buckets for background operation duration in seconds, like compaction, GC, size calculation.
+/* BEGIN_HADRON */
+pub(crate) static STORAGE_ACTIVE_COUNT_PER_TIMELINE: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "pageserver_active_storage_operations_count",
+        "Count of active storage operations with operation, tenant and timeline dimensions",
+        &["operation", "tenant_id", "shard_id", "timeline_id"],
+    )
+    .expect("failed to define a metric")
+});
+/*END_HADRON */
+
+// Buckets for background operations like compaction, GC, size calculation
 const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0];
 
 pub(crate) static STORAGE_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
@@ -2810,6 +2822,31 @@ pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy<IntCounter> =
 pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy<IntCounter> =
     Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"]));
 
+pub(crate) static LOCAL_DATA_LOSS_SUSPECTED: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "pageserver_local_data_loss_suspected",
+        "Non-zero value indicates that pageserver local data loss is suspected (and highly likely)."
+    )
+    .expect("failed to define a metric")
+});
+
+// Counter keeping track of misrouted PageStream requests. Spelling out PageStream requests here to distinguish
+// it from other types of reqeusts (SK wal replication, http requests, etc.). PageStream requests are used by
+// Postgres compute to fetch data from pageservers.
+// A misrouted PageStream request is registered if the pageserver cannot find the tenant identified in the
+// request, or if the pageserver is not the "primary" serving the tenant shard. These error almost always identify
+// issues with compute configuration, caused by either the compute node itself being stuck in the wrong
+// configuration or Storage Controller reconciliation bugs. Misrouted requests are expected during tenant migration
+// and/or during recovery following a pageserver failure, but persistently high rates of misrouted requests
+// are indicative of bugs (and unavailability).
+pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "pageserver_misrouted_pagestream_requests_total",
+        "Number of pageserver pagestream requests that were routed to the wrong pageserver"
+    )
+    .expect("failed to define a metric")
+});
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
@@ -3048,13 +3085,19 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
 pub(crate) struct StorageTimeMetricsTimer {
     metrics: StorageTimeMetrics,
     start: Instant,
+    stopped: Cell<bool>,
 }
 
 impl StorageTimeMetricsTimer {
     fn new(metrics: StorageTimeMetrics) -> Self {
+        /*BEGIN_HADRON */
+        // record the active operation as the timer starts
+        metrics.timeline_active_count.inc();
+        /*END_HADRON */
         Self {
             metrics,
             start: Instant::now(),
+            stopped: Cell::new(false),
         }
     }
 
@@ -3070,6 +3113,10 @@ impl StorageTimeMetricsTimer {
         self.metrics.timeline_sum.inc_by(seconds);
         self.metrics.timeline_count.inc();
         self.metrics.global_histogram.observe(seconds);
+        /* BEGIN_HADRON*/
+        self.stopped.set(true);
+        self.metrics.timeline_active_count.dec();
+        /*END_HADRON */
         duration
     }
 
@@ -3080,6 +3127,16 @@ impl StorageTimeMetricsTimer {
     }
 }
 
+/*BEGIN_HADRON */
+impl Drop for StorageTimeMetricsTimer {
+    fn drop(&mut self) {
+        if !self.stopped.get() {
+            self.metrics.timeline_active_count.dec();
+        }
+    }
+}
+/*END_HADRON */
+
 pub(crate) struct AlwaysRecordingStorageTimeMetricsTimer(Option<StorageTimeMetricsTimer>);
 
 impl Drop for AlwaysRecordingStorageTimeMetricsTimer {
@@ -3105,6 +3162,10 @@ pub(crate) struct StorageTimeMetrics {
     timeline_sum: Counter,
     /// Number of oeprations, per operation, tenant_id and timeline_id
     timeline_count: IntCounter,
+    /*BEGIN_HADRON */
+    /// Number of active operations per operation, tenant_id, and timeline_id
+    timeline_active_count: IntGauge,
+    /*END_HADRON */
     /// Global histogram having only the "operation" label.
     global_histogram: Histogram,
 }
@@ -3124,6 +3185,11 @@ impl StorageTimeMetrics {
         let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
             .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
             .unwrap();
+        /*BEGIN_HADRON */
+        let timeline_active_count = STORAGE_ACTIVE_COUNT_PER_TIMELINE
+            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
+            .unwrap();
+        /*END_HADRON */
         let global_histogram = STORAGE_TIME_GLOBAL
             .get_metric_with_label_values(&[operation])
             .unwrap();
@@ -3131,6 +3197,7 @@ impl StorageTimeMetrics {
         StorageTimeMetrics {
             timeline_sum,
             timeline_count,
+            timeline_active_count,
             global_histogram,
         }
     }
@@ -3544,6 +3611,14 @@ impl TimelineMetrics {
                 shard_id,
                 timeline_id,
             ]);
+            /* BEGIN_HADRON */
+            let _ = STORAGE_ACTIVE_COUNT_PER_TIMELINE.remove_label_values(&[
+                op,
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
+            /*END_HADRON */
         }
 
         for op in StorageIoSizeOperation::VARIANTS {
@@ -4336,6 +4411,9 @@ pub(crate) mod disk_usage_based_eviction {
         pub(crate) layers_collected: IntCounter,
         pub(crate) layers_selected: IntCounter,
         pub(crate) layers_evicted: IntCounter,
+        /*BEGIN_HADRON */
+        pub(crate) bytes_evicted: IntCounter,
+        /*END_HADRON */
     }
 
     impl Default for Metrics {
@@ -4372,12 +4450,21 @@ pub(crate) mod disk_usage_based_eviction {
             )
             .unwrap();
 
+            /*BEGIN_HADRON */
+            let bytes_evicted = register_int_counter!(
+                "pageserver_disk_usage_based_eviction_evicted_bytes_total",
+                "Amount of bytes successfully evicted"
+            )
+            .unwrap();
+            /*END_HADRON */
+
             Self {
                 tenant_collection_time,
                 tenant_layer_count,
                 layers_collected,
                 layers_selected,
                 layers_evicted,
+                bytes_evicted,
             }
         }
     }
@@ -4497,6 +4584,7 @@ pub fn preinitialize_metrics(
         &CIRCUIT_BREAKERS_UNBROKEN,
         &PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL,
         &WAIT_LSN_IN_PROGRESS_GLOBAL_MICROS,
+        &MISROUTED_PAGESTREAM_REQUESTS,
     ]
     .into_iter()
     .for_each(|c| {
@@ -4534,6 +4622,7 @@ pub fn preinitialize_metrics(
 
     // gauges
     WALRECEIVER_ACTIVE_MANAGERS.get();
+    LOCAL_DATA_LOSS_SUSPECTED.get();
 
     // histograms
     [
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 49928a9036..6b614deac8 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -70,7 +70,7 @@ use crate::context::{
 };
 use crate::metrics::{
     self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
-    SmgrOpTimer, TimelineMetrics,
+    MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
@@ -91,7 +91,8 @@ use crate::{CancellableTask, PERF_TRACE_TARGET, timed_after_cancellation};
 /// is not yet in state [`TenantState::Active`].
 ///
 /// NB: this is a different value than [`crate::http::routes::ACTIVE_TENANT_TIMEOUT`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+/// HADRON: reduced timeout and we will retry in Cache::get().
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
 
 /// Threshold at which to log slow GetPage requests.
 const LOG_SLOW_GETPAGE_THRESHOLD: Duration = Duration::from_secs(30);
@@ -1128,6 +1129,7 @@ impl PageServerHandler {
                                 // Closing the connection by returning ``::Reconnect` has the side effect of rate-limiting above message, via
                                 // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
                                 // and talk to a different pageserver.
+                                MISROUTED_PAGESTREAM_REQUESTS.inc();
                                 return respond_error!(
                                     span,
                                     PageStreamError::Reconnect(
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 96ed4672a6..b0969a96c1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -142,6 +142,9 @@ mod gc_block;
 mod gc_result;
 pub(crate) mod throttle;
 
+#[cfg(test)]
+pub mod debug;
+
 pub(crate) use timeline::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 
 pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -6015,12 +6018,11 @@ pub(crate) mod harness {
         }
 
         #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-        pub(crate) async fn do_try_load(
+        pub(crate) async fn do_try_load_with_redo(
             &self,
+            walredo_mgr: Arc<WalRedoManager>,
             ctx: &RequestContext,
         ) -> anyhow::Result<Arc<TenantShard>> {
-            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
-
             let (basebackup_cache, _) = BasebackupCache::new(Utf8PathBuf::new(), None);
 
             let tenant = Arc::new(TenantShard::new(
@@ -6058,6 +6060,14 @@ pub(crate) mod harness {
             Ok(tenant)
         }
 
+        pub(crate) async fn do_try_load(
+            &self,
+            ctx: &RequestContext,
+        ) -> anyhow::Result<Arc<TenantShard>> {
+            let walredo_mgr = Arc::new(WalRedoManager::from(TestRedoManager));
+            self.do_try_load_with_redo(walredo_mgr, ctx).await
+        }
+
         pub fn timeline_path(&self, timeline_id: &TimelineId) -> Utf8PathBuf {
             self.conf.timeline_path(&self.tenant_shard_id, timeline_id)
         }
diff --git a/pageserver/src/tenant/debug.rs b/pageserver/src/tenant/debug.rs
new file mode 100644
index 0000000000..604f7f265e
--- /dev/null
+++ b/pageserver/src/tenant/debug.rs
@@ -0,0 +1,366 @@
+use std::{ops::Range, str::FromStr, sync::Arc};
+
+use crate::walredo::RedoAttemptType;
+use base64::{Engine as _, engine::general_purpose::STANDARD};
+use bytes::{Bytes, BytesMut};
+use camino::Utf8PathBuf;
+use clap::Parser;
+use itertools::Itertools;
+use pageserver_api::{
+    key::Key,
+    keyspace::KeySpace,
+    shard::{ShardIdentity, ShardStripeSize},
+};
+use postgres_ffi::PgMajorVersion;
+use postgres_ffi::{BLCKSZ, page_is_new, page_set_lsn};
+use tracing::Instrument;
+use utils::{
+    generation::Generation,
+    id::{TenantId, TimelineId},
+    lsn::Lsn,
+    shard::{ShardCount, ShardIndex, ShardNumber},
+};
+use wal_decoder::models::record::NeonWalRecord;
+
+use crate::{
+    context::{DownloadBehavior, RequestContext},
+    task_mgr::TaskKind,
+    tenant::storage_layer::ValueReconstructState,
+    walredo::harness::RedoHarness,
+};
+
+use super::{
+    WalRedoManager, WalredoManagerId,
+    harness::TenantHarness,
+    remote_timeline_client::LayerFileMetadata,
+    storage_layer::{AsLayerDesc, IoConcurrency, Layer, LayerName, ValuesReconstructState},
+};
+
+fn process_page_image(next_record_lsn: Lsn, is_fpw: bool, img_bytes: Bytes) -> Bytes {
+    // To match the logic in libs/wal_decoder/src/serialized_batch.rs
+    let mut new_image: BytesMut = img_bytes.into();
+    if is_fpw && !page_is_new(&new_image) {
+        page_set_lsn(&mut new_image, next_record_lsn);
+    }
+    assert_eq!(new_image.len(), BLCKSZ as usize);
+    new_image.freeze()
+}
+
+async fn redo_wals(input: &str, key: Key) -> anyhow::Result<()> {
+    let tenant_id = TenantId::generate();
+    let timeline_id = TimelineId::generate();
+    let redo_harness = RedoHarness::new()?;
+    let span = redo_harness.span();
+    let tenant_conf = pageserver_api::models::TenantConfig {
+        ..Default::default()
+    };
+
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+    let tenant = TenantHarness::create_custom(
+        "search_key",
+        tenant_conf,
+        tenant_id,
+        ShardIdentity::unsharded(),
+        Generation::new(1),
+    )
+    .await?
+    .do_try_load_with_redo(
+        Arc::new(WalRedoManager::Prod(
+            WalredoManagerId::next(),
+            redo_harness.manager,
+        )),
+        &ctx,
+    )
+    .await
+    .unwrap();
+    let timeline = tenant
+        .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
+        .await?;
+    let contents = tokio::fs::read_to_string(input)
+        .await
+        .map_err(|e| anyhow::Error::msg(format!("Failed to read input file {input}: {e}")))
+        .unwrap();
+    let lines = contents.lines();
+    let mut last_wal_lsn: Option<Lsn> = None;
+    let state = {
+        let mut state = ValueReconstructState::default();
+        let mut is_fpw = false;
+        let mut is_first_line = true;
+        for line in lines {
+            if is_first_line {
+                is_first_line = false;
+                if line.trim() == "FPW" {
+                    is_fpw = true;
+                }
+                continue; // Skip the first line.
+            }
+            // Each input line is in the "<next_record_lsn>,<base64>" format.
+            let (lsn_str, payload_b64) = line
+                .split_once(',')
+                .expect("Invalid input format: expected '<lsn>,<base64>'");
+
+            // Parse the LSN and decode the payload.
+            let lsn = Lsn::from_str(lsn_str.trim()).expect("Invalid LSN format");
+            let bytes = Bytes::from(
+                STANDARD
+                    .decode(payload_b64.trim())
+                    .expect("Invalid base64 payload"),
+            );
+
+            // The first line is considered the base image, the rest are WAL records.
+            if state.img.is_none() {
+                state.img = Some((lsn, process_page_image(lsn, is_fpw, bytes)));
+            } else {
+                let wal_record = NeonWalRecord::Postgres {
+                    will_init: false,
+                    rec: bytes,
+                };
+                state.records.push((lsn, wal_record));
+                last_wal_lsn.replace(lsn);
+            }
+        }
+        state
+    };
+
+    assert!(state.img.is_some(), "No base image found");
+    assert!(!state.records.is_empty(), "No WAL records found");
+    let result = timeline
+        .reconstruct_value(key, last_wal_lsn.unwrap(), state, RedoAttemptType::ReadPage)
+        .instrument(span.clone())
+        .await?;
+
+    eprintln!("final image: {:?}", STANDARD.encode(result));
+
+    Ok(())
+}
+
+async fn search_key(
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    dir: String,
+    key: Key,
+    lsn: Lsn,
+) -> anyhow::Result<()> {
+    let shard_index = ShardIndex {
+        shard_number: ShardNumber(0),
+        shard_count: ShardCount(4),
+    };
+
+    let redo_harness = RedoHarness::new()?;
+    let span = redo_harness.span();
+    let tenant_conf = pageserver_api::models::TenantConfig {
+        ..Default::default()
+    };
+    let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+    let tenant = TenantHarness::create_custom(
+        "search_key",
+        tenant_conf,
+        tenant_id,
+        ShardIdentity::new(
+            shard_index.shard_number,
+            shard_index.shard_count,
+            ShardStripeSize(32768),
+        )
+        .unwrap(),
+        Generation::new(1),
+    )
+    .await?
+    .do_try_load_with_redo(
+        Arc::new(WalRedoManager::Prod(
+            WalredoManagerId::next(),
+            redo_harness.manager,
+        )),
+        &ctx,
+    )
+    .await
+    .unwrap();
+
+    let timeline = tenant
+        .create_test_timeline(timeline_id, Lsn(0x10), PgMajorVersion::PG16, &ctx)
+        .await?;
+
+    let mut delta_layers: Vec<Layer> = Vec::new();
+    let mut img_layer: Option<Layer> = Option::None;
+    let mut dir = tokio::fs::read_dir(dir).await?;
+    loop {
+        let entry = dir.next_entry().await?;
+        if entry.is_none() || !entry.as_ref().unwrap().file_type().await?.is_file() {
+            break;
+        }
+        let path = Utf8PathBuf::from_path_buf(entry.unwrap().path()).unwrap();
+        let layer_name = match LayerName::from_str(path.file_name().unwrap()) {
+            Ok(name) => name,
+            Err(_) => {
+                eprintln!("Skipped invalid layer: {path}");
+                continue;
+            }
+        };
+        let layer = Layer::for_resident(
+            tenant.conf,
+            &timeline,
+            path.clone(),
+            layer_name,
+            LayerFileMetadata::new(
+                tokio::fs::metadata(path.clone()).await?.len(),
+                Generation::new(1),
+                shard_index,
+            ),
+        );
+        if layer.layer_desc().is_delta() {
+            delta_layers.push(layer.into());
+        } else if img_layer.is_none() {
+            img_layer = Some(layer.into());
+        } else {
+            anyhow::bail!("Found multiple image layers");
+        }
+    }
+    // sort delta layers based on the descending order of LSN
+    delta_layers.sort_by(|a, b| {
+        b.layer_desc()
+            .get_lsn_range()
+            .start
+            .cmp(&a.layer_desc().get_lsn_range().start)
+    });
+
+    let mut state = ValuesReconstructState::new(IoConcurrency::Sequential);
+
+    let key_space = KeySpace::single(Range {
+        start: key,
+        end: key.next(),
+    });
+    let lsn_range = Range {
+        start: img_layer
+            .as_ref()
+            .map_or(Lsn(0x00), |img| img.layer_desc().image_layer_lsn()),
+        end: lsn,
+    };
+    for delta_layer in delta_layers.iter() {
+        delta_layer
+            .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
+            .await?;
+    }
+
+    img_layer
+        .as_ref()
+        .unwrap()
+        .get_values_reconstruct_data(key_space.clone(), lsn_range.clone(), &mut state, &ctx)
+        .await?;
+
+    for (_key, result) in std::mem::take(&mut state.keys) {
+        let state = result.collect_pending_ios().await?;
+        if state.img.is_some() {
+            eprintln!(
+                "image: {}: {:x?}",
+                state.img.as_ref().unwrap().0,
+                STANDARD.encode(state.img.as_ref().unwrap().1.clone())
+            );
+        }
+        for delta in state.records.iter() {
+            match &delta.1 {
+                NeonWalRecord::Postgres { will_init, rec } => {
+                    eprintln!(
+                        "delta: {}: will_init: {}, {:x?}",
+                        delta.0,
+                        will_init,
+                        STANDARD.encode(rec)
+                    );
+                }
+                _ => {
+                    eprintln!("delta: {}: {:x?}", delta.0, delta.1);
+                }
+            }
+        }
+
+        let result = timeline
+            .reconstruct_value(key, lsn_range.end, state, RedoAttemptType::ReadPage)
+            .instrument(span.clone())
+            .await?;
+        eprintln!("final image: {lsn} : {result:?}");
+    }
+
+    Ok(())
+}
+
+/// Redo all WALs against the base image in the input file. Return the base64 encoded final image.
+/// Each line in the input file must be in the form "<lsn>,<base64>" where:
+///   * `<lsn>` is a PostgreSQL LSN in hexadecimal notation, e.g. `0/16ABCDE`.
+///   * `<base64>` is the base64‐encoded page image (first line) or WAL record (subsequent lines).
+///
+/// The first line provides the base image of a page. The LSN is the LSN of "next record" following
+/// the record containing the FPI. For example, if the FPI was extracted from a WAL record occuping
+/// [0/1, 0/200) in the WAL stream, the LSN appearing along side the page image here should be 0/200.
+///
+/// The subsequent lines are WAL records, ordered from the oldest to the newest. The LSN is the
+/// record LSN of the WAL record, not the "next record" LSN. For example, if the WAL record here
+/// occupies [0/1, 0/200) in the WAL stream, the LSN appearing along side the WAL record here should
+/// be 0/1.
+#[derive(Parser)]
+struct RedoWalsCmd {
+    #[clap(long)]
+    input: String,
+    #[clap(long)]
+    key: String,
+}
+
+#[tokio::test]
+async fn test_redo_wals() -> anyhow::Result<()> {
+    let args = std::env::args().collect_vec();
+    let pos = args
+        .iter()
+        .position(|arg| arg == "--")
+        .unwrap_or(args.len());
+    let slice = &args[pos..args.len()];
+    let cmd = match RedoWalsCmd::try_parse_from(slice) {
+        Ok(cmd) => cmd,
+        Err(err) => {
+            eprintln!("{err}");
+            return Ok(());
+        }
+    };
+
+    let key = Key::from_hex(&cmd.key).unwrap();
+    redo_wals(&cmd.input, key).await?;
+
+    Ok(())
+}
+
+/// Search for a page at the given LSN in all layers of the data_dir.
+/// Return the base64-encoded image and all WAL records, as well as the final reconstructed image.
+#[derive(Parser)]
+struct SearchKeyCmd {
+    #[clap(long)]
+    tenant_id: String,
+    #[clap(long)]
+    timeline_id: String,
+    #[clap(long)]
+    data_dir: String,
+    #[clap(long)]
+    key: String,
+    #[clap(long)]
+    lsn: String,
+}
+
+#[tokio::test]
+async fn test_search_key() -> anyhow::Result<()> {
+    let args = std::env::args().collect_vec();
+    let pos = args
+        .iter()
+        .position(|arg| arg == "--")
+        .unwrap_or(args.len());
+    let slice = &args[pos..args.len()];
+    let cmd = match SearchKeyCmd::try_parse_from(slice) {
+        Ok(cmd) => cmd,
+        Err(err) => {
+            eprintln!("{err}");
+            return Ok(());
+        }
+    };
+
+    let tenant_id = TenantId::from_str(&cmd.tenant_id).unwrap();
+    let timeline_id = TimelineId::from_str(&cmd.timeline_id).unwrap();
+    let key = Key::from_hex(&cmd.key).unwrap();
+    let lsn = Lsn::from_str(&cmd.lsn).unwrap();
+    search_key(tenant_id, timeline_id, cmd.data_dir, key, lsn).await?;
+
+    Ok(())
+}
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index be18b40862..15853d3614 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -43,7 +43,7 @@ use crate::controller_upcall_client::{
 };
 use crate::deletion_queue::DeletionQueueClient;
 use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
-use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
+use crate::metrics::{LOCAL_DATA_LOSS_SUSPECTED, TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{BACKGROUND_RUNTIME, TaskKind};
 use crate::tenant::config::{
     AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, SecondaryLocationConfig,
@@ -538,6 +538,21 @@ pub async fn init_tenant_mgr(
     // Determine which tenants are to be secondary or attached, and in which generation
     let tenant_modes = init_load_generations(conf, &tenant_configs, resources, cancel).await?;
 
+    // Hadron local SSD check: Raise an alert if our local filesystem does not contain any tenants but the re-attach request returned tenants.
+    // This can happen if the PS suffered a Kubernetes node failure resulting in loss of all local data, but recovered quickly on another node
+    // so the Storage Controller has not had the time to move tenants out.
+    let data_loss_suspected = if let Some(tenant_modes) = &tenant_modes {
+        tenant_configs.is_empty() && !tenant_modes.is_empty()
+    } else {
+        false
+    };
+    if data_loss_suspected {
+        tracing::error!(
+            "Local data loss suspected: no tenants found on local filesystem, but re-attach request returned tenants"
+        );
+    }
+    LOCAL_DATA_LOSS_SUSPECTED.set(if data_loss_suspected { 1 } else { 0 });
+
     tracing::info!(
         "Attaching {} tenants at startup, warming up {} at a time",
         tenant_configs.len(),
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index ffb4717d9f..f2fbf656a6 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -141,11 +141,29 @@ pub(super) async fn upload_timeline_layer<'a>(
 
     let fs_size = usize::try_from(fs_size)
         .with_context(|| format!("convert {local_path:?} size {fs_size} usize"))?;
-
+    /* BEGIN_HADRON */
+    let mut metadata = None;
+    match storage {
+        // Pass the file path as a storage metadata to minimize changes to neon.
+        // Otherwise, we need to change the upload interface.
+        GenericRemoteStorage::AzureBlob(s) => {
+            let block_size_mb = s.put_block_size_mb.unwrap_or(0);
+            if block_size_mb > 0 && fs_size > block_size_mb * 1024 * 1024 {
+                metadata = Some(remote_storage::StorageMetadata::from([(
+                    "databricks_azure_put_block",
+                    local_path.as_str(),
+                )]));
+            }
+        }
+        GenericRemoteStorage::LocalFs(_) => {}
+        GenericRemoteStorage::AwsS3(_) => {}
+        GenericRemoteStorage::Unreliable(_) => {}
+    };
+    /* END_HADRON */
     let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
 
     storage
-        .upload(reader, fs_size, remote_path, None, cancel)
+        .upload(reader, fs_size, remote_path, metadata, cancel)
         .await
         .with_context(|| format!("upload layer from local path '{local_path}'"))
 }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 356f495972..2ae6b7ff3d 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -34,6 +34,21 @@ use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
 /// We use 3/4 Tokio threads, to avoid blocking all threads in case we do any CPU-heavy work.
 static CONCURRENT_BACKGROUND_TASKS: Lazy<Semaphore> = Lazy::new(|| {
     let total_threads = TOKIO_WORKER_THREADS.get();
+
+    /*BEGIN_HADRON*/
+    // ideally we should run at least one compaction task per tenant in order to (1) maximize
+    // compaction throughput (2) avoid head-of-line blocking of large compactions. However doing
+    // that may create too many compaction tasks with lots of memory overheads. So we limit the
+    // number of compaction tasks based on the available CPU core count.
+    // Need to revisit.
+    // let tasks_per_thread = std::env::var("BG_TASKS_PER_THREAD")
+    //     .ok()
+    //     .and_then(|s| s.parse().ok())
+    //     .unwrap_or(4);
+    // let permits = usize::max(1, total_threads * tasks_per_thread);
+    // // assert!(permits < total_threads, "need threads for other work");
+    /*END_HADRON*/
+
     let permits = max(1, (total_threads * 3).checked_div(4).unwrap_or(0));
     assert_ne!(permits, 0, "we will not be adding in permits later");
     assert!(permits < total_threads, "need threads for other work");
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 44a4f1e911..293f3c484d 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -6742,7 +6742,7 @@ impl Timeline {
     }
 
     /// Reconstruct a value, using the given base image and WAL records in 'data'.
-    async fn reconstruct_value(
+    pub(crate) async fn reconstruct_value(
         &self,
         key: Key,
         request_lsn: Lsn,
diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 2dbff20ab2..33c97287c0 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -212,8 +212,12 @@
 //! to the parent shard during a shard split. Eventually, the shard split task will
 //! shut down the parent => case (1).
 
-use std::collections::{HashMap, hash_map};
-use std::sync::{Arc, Mutex, Weak};
+use std::collections::HashMap;
+use std::collections::hash_map;
+use std::sync::Arc;
+use std::sync::Mutex;
+use std::sync::Weak;
+use std::time::Duration;
 
 use pageserver_api::shard::ShardIdentity;
 use tracing::{instrument, trace};
@@ -333,6 +337,44 @@ enum RoutingResult<T: Types> {
 }
 
 impl<T: Types> Cache<T> {
+    /* BEGIN_HADRON */
+    /// A wrapper of do_get to resolve the tenant shard for a get page request.
+    #[instrument(level = "trace", skip_all)]
+    pub(crate) async fn get(
+        &mut self,
+        timeline_id: TimelineId,
+        shard_selector: ShardSelector,
+        tenant_manager: &T::TenantManager,
+    ) -> Result<Handle<T>, GetError<T>> {
+        const GET_MAX_RETRIES: usize = 10;
+        const RETRY_BACKOFF: Duration = Duration::from_millis(100);
+        let mut attempt = 0;
+        loop {
+            attempt += 1;
+            match self
+                .do_get(timeline_id, shard_selector, tenant_manager)
+                .await
+            {
+                Ok(handle) => return Ok(handle),
+                Err(e) => {
+                    // Retry on tenant manager error to handle tenant split more gracefully
+                    if attempt < GET_MAX_RETRIES {
+                        tracing::warn!(
+                            "Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
+                            attempt,
+                            e
+                        );
+                        tokio::time::sleep(RETRY_BACKOFF).await;
+                        continue;
+                    } else {
+                        return Err(e);
+                    }
+                }
+            }
+        }
+    }
+    /* END_HADRON */
+
     /// See module-level comment for details.
     ///
     /// Does NOT check for the shutdown state of [`Types::Timeline`].
@@ -341,7 +383,7 @@ impl<T: Types> Cache<T> {
     /// and if so, return an error that causes the page service to
     /// close the connection.
     #[instrument(level = "trace", skip_all)]
-    pub(crate) async fn get(
+    async fn do_get(
         &mut self,
         timeline_id: TimelineId,
         shard_selector: ShardSelector,
@@ -879,6 +921,7 @@ mod tests {
             .await
             .err()
             .expect("documented behavior: can't get new handle after shutdown");
+
         assert_eq!(cache.map.len(), 1, "next access cleans up the cache");
 
         cache
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index b17b5a15f9..c6d3cafe9a 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -566,22 +566,55 @@ impl PostgresRedoManager {
     }
 }
 
+#[cfg(test)]
+pub(crate) mod harness {
+    use super::PostgresRedoManager;
+    use crate::config::PageServerConf;
+    use utils::{id::TenantId, shard::TenantShardId};
+
+    pub struct RedoHarness {
+        // underscored because unused, except for removal at drop
+        _repo_dir: camino_tempfile::Utf8TempDir,
+        pub manager: PostgresRedoManager,
+        tenant_shard_id: TenantShardId,
+    }
+
+    impl RedoHarness {
+        pub fn new() -> anyhow::Result<Self> {
+            crate::tenant::harness::setup_logging();
+
+            let repo_dir = camino_tempfile::tempdir()?;
+            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
+            let conf = Box::leak(Box::new(conf));
+            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
+
+            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
+
+            Ok(RedoHarness {
+                _repo_dir: repo_dir,
+                manager,
+                tenant_shard_id,
+            })
+        }
+        pub fn span(&self) -> tracing::Span {
+            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use std::str::FromStr;
 
     use bytes::Bytes;
     use pageserver_api::key::Key;
-    use pageserver_api::shard::TenantShardId;
     use postgres_ffi::PgMajorVersion;
     use tracing::Instrument;
-    use utils::id::TenantId;
     use utils::lsn::Lsn;
     use wal_decoder::models::record::NeonWalRecord;
 
-    use super::PostgresRedoManager;
-    use crate::config::PageServerConf;
     use crate::walredo::RedoAttemptType;
+    use crate::walredo::harness::RedoHarness;
 
     #[tokio::test]
     async fn test_ping() {
@@ -692,33 +725,4 @@ mod tests {
             )
         ]
     }
-
-    struct RedoHarness {
-        // underscored because unused, except for removal at drop
-        _repo_dir: camino_tempfile::Utf8TempDir,
-        manager: PostgresRedoManager,
-        tenant_shard_id: TenantShardId,
-    }
-
-    impl RedoHarness {
-        fn new() -> anyhow::Result<Self> {
-            crate::tenant::harness::setup_logging();
-
-            let repo_dir = camino_tempfile::tempdir()?;
-            let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
-            let conf = Box::leak(Box::new(conf));
-            let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());
-
-            let manager = PostgresRedoManager::new(conf, tenant_shard_id);
-
-            Ok(RedoHarness {
-                _repo_dir: repo_dir,
-                manager,
-                tenant_shard_id,
-            })
-        }
-        fn span(&self) -> tracing::Span {
-            tracing::info_span!("RedoHarness", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug())
-        }
-    }
 }
diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py
index 1dd4fe8316..6e600b5a86 100644
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -159,6 +159,9 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = (
 )
 
 PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = (
+    # BEGIN_HADRON
+    "pageserver_active_storage_operations_count",
+    # END_HADRON
     "pageserver_current_logical_size",
     "pageserver_resident_physical_size",
     "pageserver_io_operations_bytes_total",
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 6a715c4b93..0e4dd571c0 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -111,6 +111,14 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*stalling layer flushes for compaction backpressure.*",
     ".*layer roll waiting for flush due to compaction backpressure.*",
     ".*BatchSpanProcessor.*",
+    # Can happen in tests that purposely wipe pageserver "local disk" data.
+    ".*Local data loss suspected.*",
+    # Too many frozen layers error is normal during intensive benchmarks
+    ".*too many frozen layers.*",
+    # Transient errors when resolving tenant shards by page service
+    ".*Fail to resolve tenant shard in attempt.*",
+    # Expected warnings when pageserver has not refreshed GC info yet
+    ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
     ".*No broker updates received for a while.*",
     *(
         [
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 93c621f564..8ff767eca4 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1,8 +1,11 @@
 from __future__ import annotations
 
 import os
+import random
+import threading
 import time
 from collections import defaultdict
+from threading import Event
 from typing import TYPE_CHECKING, Any
 
 import pytest
@@ -1505,6 +1508,171 @@ def test_sharding_split_failures(
     env.storage_controller.consistency_check()
 
 
+@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
+def test_back_pressure_during_split(neon_env_builder: NeonEnvBuilder):
+    """
+    Test backpressure can ignore new shards during tenant split so that if we abort the split,
+    PG can continue without being blocked.
+    """
+    DBNAME = "regression"
+
+    init_shard_count = 4
+    neon_env_builder.num_pageservers = init_shard_count
+    stripe_size = 32
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when then enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # Tolerate any error lots that mention a failpoint
+            ".*failpoint.*",
+        ]
+    )
+
+    endpoint = env.endpoints.create(
+        "main",
+        config_lines=[
+            "max_replication_write_lag = 1MB",
+            "databricks.max_wal_mb_per_second = 1",
+            "neon.max_cluster_size = 10GB",
+        ],
+    )
+    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
+    endpoint.start()
+
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
+
+    endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
+    write_done = Event()
+
+    def write_data(write_done):
+        while not write_done.is_set():
+            endpoint.safe_psql(
+                "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+            )
+        log.info("write_data thread exiting")
+
+    writer_thread = threading.Thread(target=write_data, args=(write_done,))
+    writer_thread.start()
+
+    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
+    # split the tenant
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)
+
+    write_done.set()
+    writer_thread.join()
+
+    # writing more data to page servers after split is aborted
+    for _i in range(5000):
+        endpoint.safe_psql(
+            "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+        )
+
+    # wait until write lag becomes 0
+    def check_write_lag_is_zero():
+        res = endpoint.safe_psql(
+            """
+            SELECT
+                pg_wal_lsn_diff(pg_current_wal_flush_lsn(), received_lsn) as received_lsn_lag
+                FROM neon.backpressure_lsns();
+            """,
+            dbname="databricks_system",
+            log_query=False,
+        )
+        log.info(f"received_lsn_lag = {res[0][0]}")
+        assert res[0][0] == 0
+
+    wait_until(check_write_lag_is_zero)
+    endpoint.stop_and_destroy()
+
+
+# BEGIN_HADRON
+def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that page service is able to resolve the correct shard during tenant split without causing query errors
+    """
+    DBNAME = "regression"
+    WORKER_THREADS = 16
+    ROW_COUNT = 10000
+
+    init_shard_count = 4
+    neon_env_builder.num_pageservers = 1
+    stripe_size = 16
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count, initial_tenant_shard_stripe_size=stripe_size
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when then enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # Tolerate any error lots that mention a failpoint
+            ".*failpoint.*",
+        ]
+    )
+
+    endpoint = env.endpoints.create("main")
+    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
+    endpoint.start()
+
+    endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
+
+    # generate 10MB of data
+    endpoint.safe_psql(
+        f"CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, {ROW_COUNT}) s;"
+    )
+    read_done = Event()
+
+    def read_data(read_done):
+        i = 0
+        while not read_done.is_set() or i < 10:
+            endpoint.safe_psql(
+                f"SELECT * FROM usertable where KEY = {random.randint(1, ROW_COUNT)}",
+                log_query=False,
+            )
+            i += 1
+        log.info(f"read_data thread exiting. Executed {i} queries.")
+
+    reader_threads = []
+    for _i in range(WORKER_THREADS):
+        reader_thread = threading.Thread(target=read_data, args=(read_done,))
+        reader_thread.start()
+        reader_threads.append(reader_thread)
+
+    env.storage_controller.configure_failpoints(("shard-split-pre-complete", "return(1)"))
+    # split the tenant
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=16)
+
+    # wait until abort is done
+    def check_tenant_status():
+        active_count = 0
+        for i in range(init_shard_count):
+            status = env.pageserver.http_client().tenant_status(
+                TenantShardId(env.initial_tenant, i, init_shard_count)
+            )
+            if status["state"]["slug"] == "Active":
+                active_count += 1
+        assert active_count == 4
+
+    wait_until(check_tenant_status)
+
+    read_done.set()
+    for thread in reader_threads:
+        thread.join()
+
+    endpoint.stop()
+
+
+# END_HADRON
+
+
 def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
     """
     Check a scenario when one of the shards is much slower than others.

From 8223c1ba9d9806ad4bf267d418d065717d70a71f Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Tue, 8 Jul 2025 22:58:18 +0200
Subject: [PATCH 026/163] pageserver/client_grpc: add initial gRPC client pools
 (#12434)

## Problem

The communicator will need gRPC channel/client/stream pools for
efficient reuse across many backends.

Touches #11735.
Requires #12396.

## Summary of changes

Adds three nested resource pools:

* `ChannelPool` for gRPC channels (i.e. TCP connections).
* `ClientPool` for gRPC clients (i.e. `page_api::Client`). Acquires
channels from `ChannelPool`.
* `StreamPool` for gRPC GetPage streams. Acquires clients from
`ClientPool`.

These are minimal functional implementations that will need further
improvements and performance optimization. However, the overall
structure is expected to be roughly final, so reviews should focus on
that.

The pools are not yet in use, but will form the foundation of a rich
gRPC Pageserver client used by the communicator (see #12462). This PR
also adds the initial crate scaffolding for that client.

See doc comments for details.
---
 Cargo.lock                         |  15 +
 Cargo.toml                         |   1 +
 pageserver/client_grpc/Cargo.toml  |  16 +
 pageserver/client_grpc/src/lib.rs  |  14 +
 pageserver/client_grpc/src/pool.rs | 586 +++++++++++++++++++++++++++++
 5 files changed, 632 insertions(+)
 create mode 100644 pageserver/client_grpc/Cargo.toml
 create mode 100644 pageserver/client_grpc/src/lib.rs
 create mode 100644 pageserver/client_grpc/src/pool.rs

diff --git a/Cargo.lock b/Cargo.lock
index 39c43d94a3..893932fb9d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4493,6 +4493,21 @@ dependencies = [
  "workspace_hack",
 ]
 
+[[package]]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "futures",
+ "pageserver_page_api",
+ "tokio",
+ "tokio-stream",
+ "tonic 0.13.1",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "pageserver_compaction"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index 840e3c6036..14f2cfcb56 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -8,6 +8,7 @@ members = [
     "pageserver/compaction",
     "pageserver/ctl",
     "pageserver/client",
+    "pageserver/client_grpc",
     "pageserver/pagebench",
     "pageserver/page_api",
     "proxy",
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
new file mode 100644
index 0000000000..5a3a2761c2
--- /dev/null
+++ b/pageserver/client_grpc/Cargo.toml
@@ -0,0 +1,16 @@
+[package]
+name = "pageserver_client_grpc"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+futures.workspace = true
+pageserver_page_api.workspace = true
+tokio.workspace = true
+tokio-stream.workspace = true
+tonic.workspace = true
+tracing.workspace = true
+utils.workspace = true
+workspace_hack.workspace = true
diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs
new file mode 100644
index 0000000000..c900e1a939
--- /dev/null
+++ b/pageserver/client_grpc/src/lib.rs
@@ -0,0 +1,14 @@
+//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client`
+//! gRPC client, and supports:
+//!
+//! * Sharded tenants across multiple Pageservers.
+//! * Pooling of connections, clients, and streams for efficient resource use.
+//! * Concurrent use by many callers.
+//! * Internal handling of GetPage bidirectional streams.
+//! * Automatic retries.
+//! * Observability.
+//!
+//! The client is under development, this package is just a shell.
+
+#[allow(unused)]
+mod pool;
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
new file mode 100644
index 0000000000..518e4e5b84
--- /dev/null
+++ b/pageserver/client_grpc/src/pool.rs
@@ -0,0 +1,586 @@
+//! This module provides various Pageserver gRPC client resource pools.
+//!
+//! These pools are designed to reuse gRPC resources (connections, clients, and streams) across
+//! multiple concurrent callers (i.e. Postgres backends). This avoids the resource cost and latency
+//! of creating dedicated TCP connections and server tasks for every Postgres backend.
+//!
+//! Each resource has its own, nested pool. The pools are custom-built for the properties of each
+//! resource -- they are different enough that a generic pool isn't suitable.
+//!
+//! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
+//!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
+//!   per-channel client limit. Channels may be closed when they are no longer used by any clients.
+//!
+//! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
+//!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
+//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
+//!   from the pool after some time, to free up the channel.
+//!
+//! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
+//!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
+//!   returns a guard that can be used to send a single request, to properly enforce queue depth and
+//!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
+//!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
+//!   queue depth). Idle streams may be removed from the pool after a while to free up the client.
+//!
+//! Each channel corresponds to one TCP connection. Each client unary request and each stream
+//! corresponds to one HTTP/2 stream and server task.
+//!
+//! TODO: error handling (including custom error types).
+//! TODO: observability.
+
+use std::collections::{BTreeMap, HashMap};
+use std::ops::{Deref, DerefMut};
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex, Weak};
+
+use futures::StreamExt as _;
+use tokio::sync::mpsc::{Receiver, Sender};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use tonic::transport::{Channel, Endpoint};
+use tracing::{error, warn};
+
+use pageserver_page_api as page_api;
+use utils::id::{TenantId, TimelineId};
+use utils::shard::ShardIndex;
+
+/// Max number of concurrent clients per channel.
+///
+/// TODO: tune these constants, and make them configurable.
+/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
+/// with only streams.
+const CLIENTS_PER_CHANNEL: usize = 16;
+
+/// Maximum number of concurrent clients per `ClientPool`.
+const CLIENT_LIMIT: usize = 64;
+
+/// Max number of pipelined requests per gRPC GetPage stream.
+const STREAM_QUEUE_DEPTH: usize = 2;
+
+/// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
+/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of
+/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients.
+///
+/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
+///
+/// TODO: reap idle channels.
+/// TODO: consider prewarming a set of channels, to avoid initial connection latency.
+/// TODO: consider adding a circuit breaker for errors and fail fast.
+pub struct ChannelPool {
+    /// Pageserver endpoint to connect to.
+    endpoint: Endpoint,
+    /// Open channels.
+    channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
+    /// Channel ID generator.
+    next_channel_id: AtomicUsize,
+}
+
+type ChannelID = usize;
+
+struct ChannelEntry {
+    /// The gRPC channel (i.e. TCP connection). Shared by multiple clients.
+    channel: Channel,
+    /// Number of clients using this channel.
+    clients: usize,
+}
+
+impl ChannelPool {
+    /// Creates a new channel pool for the given Pageserver endpoint.
+    pub fn new<E>(endpoint: E) -> anyhow::Result<Arc<Self>>
+    where
+        E: TryInto<Endpoint> + Send + Sync + 'static,
+        <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
+    {
+        Ok(Arc::new(Self {
+            endpoint: endpoint.try_into()?,
+            channels: Mutex::default(),
+            next_channel_id: AtomicUsize::default(),
+        }))
+    }
+
+    /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
+    ///
+    /// This never blocks (except for mutex acquisition). The channel is connected lazily on first
+    /// use, and the `ChannelPool` does not have a channel limit. Channels will be re-established
+    /// automatically on failure (TODO: verify).
+    ///
+    /// Callers should not clone the returned channel, and must hold onto the returned guard as long
+    /// as the channel is in use. It is unfortunately not possible to enforce this: the Protobuf
+    /// client requires an owned `Channel` and we don't have access to the channel's internal
+    /// refcount.
+    ///
+    /// This is not performance-sensitive. It is only called when creating a new client, and clients
+    /// are pooled and reused by `ClientPool`. The total number of channels will also be small. O(n)
+    /// performance is therefore okay.
+    pub fn get(self: &Arc<Self>) -> ChannelGuard {
+        let mut channels = self.channels.lock().unwrap();
+
+        // Try to find an existing channel with available capacity. We check entries in BTreeMap
+        // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
+        // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
+        // channels, and free up higher-ordered channels such that they can be reaped.
+        for (&id, entry) in channels.iter_mut() {
+            assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow");
+            if entry.clients < CLIENTS_PER_CHANNEL {
+                entry.clients += 1;
+                return ChannelGuard {
+                    pool: Arc::downgrade(self),
+                    id,
+                    channel: Some(entry.channel.clone()),
+                };
+            }
+        }
+
+        // Create a new channel. We connect lazily on first use, such that we don't block here and
+        // other clients can join onto the same channel while it's connecting.
+        let channel = self.endpoint.connect_lazy();
+
+        let id = self.next_channel_id.fetch_add(1, Ordering::Relaxed);
+        let entry = ChannelEntry {
+            channel: channel.clone(),
+            clients: 1, // account for the guard below
+        };
+        channels.insert(id, entry);
+
+        ChannelGuard {
+            pool: Arc::downgrade(self),
+            id,
+            channel: Some(channel),
+        }
+    }
+}
+
+/// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
+/// since the gRPC client requires an owned `Channel`.
+pub struct ChannelGuard {
+    pool: Weak<ChannelPool>,
+    id: ChannelID,
+    channel: Option<Channel>,
+}
+
+impl ChannelGuard {
+    /// Returns the inner owned channel. Panics if called more than once. The caller must hold onto
+    /// the guard as long as the channel is in use, and should not clone it.
+    pub fn take(&mut self) -> Channel {
+        self.channel.take().expect("channel already taken")
+    }
+}
+
+/// Returns the channel to the pool.
+impl Drop for ChannelGuard {
+    fn drop(&mut self) {
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+        let mut channels = pool.channels.lock().unwrap();
+        let entry = channels.get_mut(&self.id).expect("unknown channel");
+        assert!(entry.clients > 0, "channel underflow");
+        entry.clients -= 1;
+    }
+}
+
+/// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
+/// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total
+/// number of concurrent clients to `CLIENT_LIMIT` via semaphore.
+///
+/// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
+///
+/// TODO: reap idle clients.
+pub struct ClientPool {
+    /// Tenant ID.
+    tenant_id: TenantId,
+    /// Timeline ID.
+    timeline_id: TimelineId,
+    /// Shard ID.
+    shard_id: ShardIndex,
+    /// Authentication token, if any.
+    auth_token: Option<String>,
+    /// Channel pool to acquire channels from.
+    channel_pool: Arc<ChannelPool>,
+    /// Limits the max number of concurrent clients for this pool.
+    limiter: Arc<Semaphore>,
+    /// Idle pooled clients. Acquired clients are removed from here and returned on drop.
+    ///
+    /// The first client in the map will be acquired next. The map is sorted by client ID, which in
+    /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
+    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
+    /// clients are reaped.
+    idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Unique client ID generator.
+    next_client_id: AtomicUsize,
+}
+
+type ClientID = (ChannelID, usize);
+
+struct ClientEntry {
+    /// The pooled gRPC client.
+    client: page_api::Client,
+    /// The channel guard for the channel used by the client.
+    channel_guard: ChannelGuard,
+}
+
+impl ClientPool {
+    /// Creates a new client pool for the given tenant shard. Channels are acquired from the given
+    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard.
+    pub fn new(
+        channel_pool: Arc<ChannelPool>,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+            channel_pool,
+            idle: Mutex::default(),
+            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)),
+            next_client_id: AtomicUsize::default(),
+        })
+    }
+
+    /// Gets a client from the pool, or creates a new one if necessary. Connections are established
+    /// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The
+    /// client is returned to the pool when the guard is dropped.
+    ///
+    /// This is moderately performance-sensitive. It is called for every unary request, but these
+    /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
+    /// the `StreamPool` instead.
+    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
+        let permit = self
+            .limiter
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("never closed");
+
+        // Fast path: acquire an idle client from the pool.
+        if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
+            return Ok(ClientGuard {
+                pool: Arc::downgrade(self),
+                id,
+                client: Some(entry.client),
+                channel_guard: Some(entry.channel_guard),
+                permit,
+            });
+        }
+
+        // Slow path: construct a new client.
+        let mut channel_guard = self.channel_pool.get();
+        let client = page_api::Client::new(
+            channel_guard.take(),
+            self.tenant_id,
+            self.timeline_id,
+            self.shard_id,
+            self.auth_token.clone(),
+            None,
+        )?;
+
+        Ok(ClientGuard {
+            pool: Arc::downgrade(self),
+            id: (
+                channel_guard.id,
+                self.next_client_id.fetch_add(1, Ordering::Relaxed),
+            ),
+            client: Some(client),
+            channel_guard: Some(channel_guard),
+            permit,
+        })
+    }
+}
+
+/// A client acquired from the pool. The inner client can be accessed via Deref. The client is
+/// returned to the pool when dropped.
+pub struct ClientGuard {
+    pool: Weak<ClientPool>,
+    id: ClientID,
+    client: Option<page_api::Client>,    // Some until dropped
+    channel_guard: Option<ChannelGuard>, // Some until dropped
+    permit: OwnedSemaphorePermit,
+}
+
+impl Deref for ClientGuard {
+    type Target = page_api::Client;
+
+    fn deref(&self) -> &Self::Target {
+        self.client.as_ref().expect("not dropped")
+    }
+}
+
+impl DerefMut for ClientGuard {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        self.client.as_mut().expect("not dropped")
+    }
+}
+
+/// Returns the client to the pool.
+impl Drop for ClientGuard {
+    fn drop(&mut self) {
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+        let entry = ClientEntry {
+            client: self.client.take().expect("dropped once"),
+            channel_guard: self.channel_guard.take().expect("dropped once"),
+        };
+        pool.idle.lock().unwrap().insert(self.id, entry);
+
+        _ = self.permit; // returned on drop, referenced for visibility
+    }
+}
+
+/// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
+/// acquires a client from the inner `ClientPool` for the stream's lifetime.
+///
+/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
+/// a single request and await the response. Internally, requests are multiplexed across streams and
+/// channels. This allows proper queue depth enforcement and response routing.
+///
+/// TODO: reap idle streams.
+/// TODO: consider making this generic over request and response types; not currently needed.
+pub struct StreamPool {
+    /// The client pool to acquire clients from.
+    client_pool: Arc<ClientPool>,
+    /// All pooled streams.
+    ///
+    /// Incoming requests will be sent over an existing stream with available capacity. If all
+    /// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit).
+    /// Each stream has an associated Tokio task that processes requests and responses.
+    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
+    /// Limits the max number of concurrent requests (not streams).
+    limiter: Arc<Semaphore>,
+    /// Stream ID generator.
+    next_stream_id: AtomicUsize,
+}
+
+type StreamID = usize;
+type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
+type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
+type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
+
+struct StreamEntry {
+    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
+    sender: RequestSender,
+    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
+    /// completion without acquiring the `StreamPool::streams` lock.
+    queue_depth: Arc<AtomicUsize>,
+}
+
+impl StreamPool {
+    /// Creates a new stream pool, using the given client pool.
+    ///
+    /// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may
+    /// fill up the client pool and starve out unary requests. Client pools can share the same
+    /// `ChannelPool` though, since the channel pool is unbounded.
+    pub fn new(client_pool: Arc<ClientPool>) -> Arc<Self> {
+        Arc::new(Self {
+            client_pool,
+            streams: Arc::default(),
+            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)),
+            next_stream_id: AtomicUsize::default(),
+        })
+    }
+
+    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
+    /// are full. Returns a guard that can be used to send a single request on the stream and await
+    /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
+    /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
+    ///
+    /// This is very performance-sensitive, as it is on the GetPage hot path.
+    ///
+    /// TODO: this must do something more sophisticated for performance. We want:
+    ///
+    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
+    /// * Quick acquisition of pooled streams with available capacity.
+    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
+    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
+    /// * Don't hold a lock while spinning up new streams.
+    /// * Allow concurrent clients to join onto streams while they're spun up.
+    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
+    ///
+    /// For now, we just do something simple and functional, but very inefficient (linear scan).
+    pub async fn get(&self) -> StreamGuard {
+        let permit = self
+            .limiter
+            .clone()
+            .acquire_owned()
+            .await
+            .expect("never closed");
+        let mut streams = self.streams.lock().unwrap();
+
+        // Look for a pooled stream with available capacity.
+        for entry in streams.values() {
+            assert!(
+                entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH,
+                "stream queue overflow"
+            );
+            if entry
+                .queue_depth
+                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
+                    // Increment the queue depth via compare-and-swap.
+                    // TODO: review ordering.
+                    (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1)
+                })
+                .is_ok()
+            {
+                return StreamGuard {
+                    sender: entry.sender.clone(),
+                    queue_depth: entry.queue_depth.clone(),
+                    permit,
+                };
+            }
+        }
+
+        // No available stream, spin up a new one. We install the stream entry in the pool first and
+        // return the guard, while spinning up the stream task async. This allows other callers to
+        // join onto this stream and also create additional streams concurrently if this fills up.
+        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
+        let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
+        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+        let entry = StreamEntry {
+            sender: req_tx.clone(),
+            queue_depth: queue_depth.clone(),
+        };
+        streams.insert(id, entry);
+
+        // NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT *
+        // STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more
+        // streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen
+        // because we only acquire queue depth under lock and after acquiring a semaphore permit.
+        assert!(streams.len() <= CLIENT_LIMIT, "stream overflow");
+
+        let client_pool = self.client_pool.clone();
+        let streams = self.streams.clone();
+
+        tokio::spawn(async move {
+            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
+                error!("stream failed: {err}");
+            }
+            // Remove stream from pool on exit.
+            let entry = streams.lock().unwrap().remove(&id);
+            assert!(entry.is_some(), "unknown stream ID: {id}");
+        });
+
+        StreamGuard {
+            sender: req_tx,
+            queue_depth,
+            permit,
+        }
+    }
+
+    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
+    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
+    /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
+    /// atomic with pool stream acquisition.
+    ///
+    /// The task exits when the request channel is closed, or on a stream error. The caller is
+    /// responsible for removing the stream from the pool on exit.
+    async fn run_stream(
+        client_pool: Arc<ClientPool>,
+        mut caller_rx: RequestReceiver,
+    ) -> anyhow::Result<()> {
+        // Acquire a client from the pool and create a stream.
+        let mut client = client_pool.get().await?;
+
+        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
+        let mut resp_stream = client.get_pages(req_stream).await?;
+
+        // Track caller response channels by request ID. If the task returns early, these response
+        // channels will be dropped and the waiting callers will receive an error.
+        let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH);
+
+        // Process requests and responses.
+        loop {
+            // NB: this can trip if the server doesn't respond to a request, so only debug_assert.
+            debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow");
+
+            tokio::select! {
+                // Receive requests from callers and send them to the stream.
+                req = caller_rx.recv() => {
+                    // Shut down if request channel is closed.
+                    let Some((req, resp_tx)) = req else {
+                        return Ok(());
+                    };
+
+                    // Store the response channel by request ID.
+                    if callers.contains_key(&req.request_id) {
+                        // Error on request ID duplicates. Ignore callers that went away.
+                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
+                            format!("duplicate request ID: {}", req.request_id),
+                        )));
+                        continue;
+                    }
+                    callers.insert(req.request_id, resp_tx);
+
+                    // Send the request on the stream. Bail out if the send fails.
+                    req_tx.send(req).await.map_err(|_| {
+                        tonic::Status::unavailable("stream closed")
+                    })?;
+                }
+
+                // Receive responses from the stream and send them to callers.
+                resp = resp_stream.next() => {
+                    // Shut down if the stream is closed, and bail out on stream errors.
+                    let Some(resp) = resp.transpose()? else {
+                        return Ok(())
+                    };
+
+                    // Send the response to the caller. Ignore errors if the caller went away.
+                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
+                        warn!("received response for unknown request ID: {}", resp.request_id);
+                        continue;
+                    };
+                    _ = resp_tx.send(Ok(resp));
+                }
+            }
+        }
+    }
+}
+
+/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
+/// depth. Queue depth is already reserved and will be returned on drop.
+pub struct StreamGuard {
+    sender: RequestSender,
+    queue_depth: Arc<AtomicUsize>,
+    permit: OwnedSemaphorePermit,
+}
+
+impl StreamGuard {
+    /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
+    /// valid for a single request (to enforce queue depth). This also drops the guard on return and
+    /// returns the queue depth quota to the pool.
+    ///
+    /// The `GetPageRequest::request_id` must be unique across in-flight requests.
+    ///
+    /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
+    /// to avoid tearing down the stream for per-request errors. Callers must check this.
+    pub async fn send(
+        self,
+        req: page_api::GetPageRequest,
+    ) -> tonic::Result<page_api::GetPageResponse> {
+        let (resp_tx, resp_rx) = oneshot::channel();
+
+        self.sender
+            .send((req, resp_tx))
+            .await
+            .map_err(|_| tonic::Status::unavailable("stream closed"))?;
+
+        resp_rx
+            .await
+            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+    }
+}
+
+impl Drop for StreamGuard {
+    fn drop(&mut self) {
+        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
+        // before the response is received, but that's okay.
+        let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
+        assert!(prev_queue_depth > 0, "stream queue underflow");
+
+        _ = self.permit; // returned on drop, referenced for visibility
+    }
+}

From 09ff22a4d42ee0c97724e61dc08f325b2ea39111 Mon Sep 17 00:00:00 2001
From: Suhas Thalanki <54014218+thesuhas@users.noreply.github.com>
Date: Tue, 8 Jul 2025 17:12:26 -0400
Subject: [PATCH 027/163] fix(compute): removing `NEON_EXT_INT_UPD` log
 statement added for debugging verbosity (#12509)

Removes the `NEON_EXT_INT_UPD` log statement that was added for
debugging verbosity.
---
 compute_tools/src/compute.rs | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index f25aff1110..ec6e6c1634 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -2433,19 +2433,11 @@ LIMIT 100",
         // If the value is -1, we never suspend so set the value to default collection.
         // If the value is 0, it means default, we will just continue to use the default.
         if spec.suspend_timeout_seconds == -1 || spec.suspend_timeout_seconds == 0 {
-            info!(
-                "[NEON_EXT_INT_UPD] Spec Timeout: {}, New Timeout: {}",
-                spec.suspend_timeout_seconds, DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL
-            );
             self.params.installed_extensions_collection_interval.store(
                 DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL,
                 std::sync::atomic::Ordering::SeqCst,
             );
         } else {
-            info!(
-                "[NEON_EXT_INT_UPD] Spec Timeout: {}",
-                spec.suspend_timeout_seconds
-            );
             self.params.installed_extensions_collection_interval.store(
                 spec.suspend_timeout_seconds as u64,
                 std::sync::atomic::Ordering::SeqCst,

From 4dee2bfd82b221f90b934e5d68343caa3a0e531f Mon Sep 17 00:00:00 2001
From: Trung Dinh <dinhanhtrung@gmail.com>
Date: Tue, 8 Jul 2025 14:14:04 -0700
Subject: [PATCH 028/163] pageserver: Introduce config to enable/disable
 eviction task (#12496)

## Problem
We lost capability to explicitly disable the global eviction task (for
testing).

## Summary of changes
Add an `enabled` flag to `DiskUsageEvictionTaskConfig` to indicate
whether we should run the eviction job or not.
---
 libs/pageserver_api/src/config.rs             | 21 ++++-
 pageserver/src/config.rs                      | 85 ++++++++++++++-----
 pageserver/src/disk_usage_eviction_task.rs    |  4 +-
 pageserver/src/utilization.rs                 |  7 +-
 ...er_max_throughput_getpage_at_latest_lsn.py | 18 +++-
 5 files changed, 103 insertions(+), 32 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 00d6b61399..dc7e9aed7f 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -5,6 +5,7 @@ mod tests;
 
 use const_format::formatcp;
 use posthog_client_lite::PostHogClientConfig;
+use utils::serde_percent::Percent;
 pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
 pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
 pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
@@ -223,7 +224,7 @@ pub struct ConfigToml {
     pub metric_collection_bucket: Option<RemoteStorageConfig>,
     #[serde(with = "humantime_serde")]
     pub synthetic_size_calculation_interval: Duration,
-    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+    pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
     pub test_remote_failures: u64,
     pub ondemand_download_behavior_treat_error_as_warn: bool,
     #[serde(with = "humantime_serde")]
@@ -273,6 +274,7 @@ pub struct ConfigToml {
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[serde(default)]
 pub struct DiskUsageEvictionTaskConfig {
     pub max_usage_pct: utils::serde_percent::Percent,
     pub min_avail_bytes: u64,
@@ -283,6 +285,21 @@ pub struct DiskUsageEvictionTaskConfig {
     /// Select sorting for evicted layers
     #[serde(default)]
     pub eviction_order: EvictionOrder,
+    pub enabled: bool,
+}
+
+impl Default for DiskUsageEvictionTaskConfig {
+    fn default() -> Self {
+        Self {
+            max_usage_pct: Percent::new(80).unwrap(),
+            min_avail_bytes: 2_000_000_000,
+            period: Duration::from_secs(60),
+            #[cfg(feature = "testing")]
+            mock_statvfs: None,
+            eviction_order: EvictionOrder::default(),
+            enabled: true,
+        }
+    }
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -738,7 +755,7 @@ impl Default for ConfigToml {
 
             metric_collection_bucket: (None),
 
-            disk_usage_based_eviction: (None),
+            disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
 
             test_remote_failures: (0),
 
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 6e22f9f36e..99d7e0ca3a 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -28,7 +28,6 @@ use reqwest::Url;
 use storage_broker::Uri;
 use utils::id::{NodeId, TimelineId};
 use utils::logging::{LogFormat, SecretString};
-use utils::serde_percent::Percent;
 
 use crate::tenant::storage_layer::inmemory_layer::IndexEntry;
 use crate::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
@@ -146,7 +145,7 @@ pub struct PageServerConf {
     pub metric_collection_bucket: Option<RemoteStorageConfig>,
     pub synthetic_size_calculation_interval: Duration,
 
-    pub disk_usage_based_eviction: Option<DiskUsageEvictionTaskConfig>,
+    pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
 
     pub test_remote_failures: u64,
 
@@ -460,16 +459,7 @@ impl PageServerConf {
             metric_collection_endpoint,
             metric_collection_bucket,
             synthetic_size_calculation_interval,
-            disk_usage_based_eviction: Some(disk_usage_based_eviction.unwrap_or(
-                DiskUsageEvictionTaskConfig {
-                    max_usage_pct: Percent::new(80).unwrap(),
-                    min_avail_bytes: 2_000_000_000,
-                    period: Duration::from_secs(60),
-                    #[cfg(feature = "testing")]
-                    mock_statvfs: None,
-                    eviction_order: Default::default(),
-                },
-            )),
+            disk_usage_based_eviction,
             test_remote_failures,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
@@ -719,8 +709,9 @@ mod tests {
     use std::time::Duration;
 
     use camino::Utf8PathBuf;
+    use pageserver_api::config::{DiskUsageEvictionTaskConfig, EvictionOrder};
     use rstest::rstest;
-    use utils::id::NodeId;
+    use utils::{id::NodeId, serde_percent::Percent};
 
     use super::PageServerConf;
 
@@ -820,19 +811,69 @@ mod tests {
             .expect("parse_and_validate");
     }
 
-    #[test]
-    fn test_config_disk_usage_based_eviction_is_valid() {
-        let input = r#"
+    #[rstest]
+    #[
+        case::omit_the_whole_config(
+            DiskUsageEvictionTaskConfig {
+                max_usage_pct: Percent::new(80).unwrap(),
+                min_avail_bytes: 2_000_000_000,
+                period: Duration::from_secs(60),
+                eviction_order: Default::default(),
+                #[cfg(feature = "testing")]
+                mock_statvfs: None,
+                enabled: true,
+            },
+        r#"
             control_plane_api = "http://localhost:6666"
-        "#;
+        "#,
+    )]
+    #[
+        case::omit_enabled_field(
+            DiskUsageEvictionTaskConfig {
+                max_usage_pct: Percent::new(80).unwrap(),
+                min_avail_bytes: 1_000_000_000,
+                period: Duration::from_secs(60),
+                eviction_order: EvictionOrder::RelativeAccessed {
+                    highest_layer_count_loses_first: true,
+                },
+                #[cfg(feature = "testing")]
+                mock_statvfs: None,
+                enabled: true,
+            },
+        r#"
+            control_plane_api = "http://localhost:6666"
+            disk_usage_based_eviction = { max_usage_pct = 80, min_avail_bytes = 1000000000, period = "60s" }
+        "#,
+    )]
+    #[case::disabled(
+        DiskUsageEvictionTaskConfig {
+            max_usage_pct: Percent::new(80).unwrap(),
+            min_avail_bytes: 2_000_000_000,
+            period: Duration::from_secs(60),
+            eviction_order: EvictionOrder::RelativeAccessed {
+                highest_layer_count_loses_first: true,
+            },
+            #[cfg(feature = "testing")]
+            mock_statvfs: None,
+            enabled: false,
+        },
+        r#"
+            control_plane_api = "http://localhost:6666"
+            disk_usage_based_eviction = { enabled = false }
+        "#
+    )]
+    fn test_config_disk_usage_based_eviction_is_valid(
+        #[case] expected_disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
+        #[case] input: &str,
+    ) {
         let config_toml = toml_edit::de::from_str::<pageserver_api::config::ConfigToml>(input)
             .expect("disk_usage_based_eviction is valid");
         let workdir = Utf8PathBuf::from("/nonexistent");
         let config = PageServerConf::parse_and_validate(NodeId(0), config_toml, &workdir).unwrap();
-        let disk_usage_based_eviction = config.disk_usage_based_eviction.unwrap();
-        assert_eq!(disk_usage_based_eviction.max_usage_pct.get(), 80);
-        assert_eq!(disk_usage_based_eviction.min_avail_bytes, 2_000_000_000);
-        assert_eq!(disk_usage_based_eviction.period, Duration::from_secs(60));
-        assert_eq!(disk_usage_based_eviction.eviction_order, Default::default());
+        let disk_usage_based_eviction = config.disk_usage_based_eviction;
+        assert_eq!(
+            expected_disk_usage_based_eviction,
+            disk_usage_based_eviction
+        );
     }
 }
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index e6529fb201..f1d34664a8 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -171,7 +171,8 @@ pub fn launch_disk_usage_global_eviction_task(
     tenant_manager: Arc<TenantManager>,
     background_jobs_barrier: completion::Barrier,
 ) -> Option<DiskUsageEvictionTask> {
-    let Some(task_config) = &conf.disk_usage_based_eviction else {
+    let task_config = &conf.disk_usage_based_eviction;
+    if !task_config.enabled {
         info!("disk usage based eviction task not configured");
         return None;
     };
@@ -1268,6 +1269,7 @@ mod filesystem_level_usage {
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
                 eviction_order: pageserver_api::config::EvictionOrder::default(),
+                enabled: true,
             },
             total_bytes: 100_000,
             avail_bytes: 0,
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index 29d1a31aaf..ccfad7a391 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -45,9 +45,10 @@ pub(crate) fn regenerate(
     let (disk_wanted_bytes, shard_count) = tenant_manager.calculate_utilization()?;
 
     // Fetch the fraction of disk space which may be used
-    let disk_usable_pct = match conf.disk_usage_based_eviction.clone() {
-        Some(e) => e.max_usage_pct,
-        None => Percent::new(100).unwrap(),
+    let disk_usable_pct = if conf.disk_usage_based_eviction.enabled {
+        conf.disk_usage_based_eviction.max_usage_pct
+    } else {
+        Percent::new(100).unwrap()
     };
 
     // Express a static value for how many shards we may schedule on one node
diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
index bf998a2a0a..8e7055ef78 100644
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -71,7 +71,13 @@ def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_man
     n_clients: int,
 ):
     setup_and_run_pagebench_benchmark(
-        neon_env_builder, zenbenchmark, pg_bin, n_tenants, pgbench_scale, duration, n_clients
+        neon_env_builder,
+        zenbenchmark,
+        pg_bin,
+        n_tenants,
+        pgbench_scale,
+        duration,
+        n_clients,
     )
 
 
@@ -86,7 +92,8 @@ def setup_and_run_pagebench_benchmark(
 ):
     def record(metric, **kwargs):
         zenbenchmark.record(
-            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}", **kwargs
+            metric_name=f"pageserver_max_throughput_getpage_at_latest_lsn.{metric}",
+            **kwargs,
         )
 
     params: dict[str, tuple[Any, dict[str, Any]]] = {}
@@ -104,7 +111,7 @@ def setup_and_run_pagebench_benchmark(
     # configure cache sizes like in prod
     page_cache_size = 16384
     max_file_descriptors = 500000
-    neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{max_usage_pct=99, min_avail_bytes=0, period = '999y'}}"
+    neon_env_builder.pageserver_config_override = f"page_cache_size={page_cache_size}; max_file_descriptors={max_file_descriptors}; disk_usage_based_eviction={{enabled = false}}"
 
     tracing_config = PageserverTracingConfig(
         sampling_ratio=(0, 1000),
@@ -120,7 +127,10 @@ def setup_and_run_pagebench_benchmark(
                 page_cache_size * 8192,
                 {"unit": "byte"},
             ),
-            "pageserver_config_override.max_file_descriptors": (max_file_descriptors, {"unit": ""}),
+            "pageserver_config_override.max_file_descriptors": (
+                max_file_descriptors,
+                {"unit": ""},
+            ),
             "pageserver_config_override.sampling_ratio": (ratio, {"unit": ""}),
         }
     )

From c848b995b296124b686b4eeec54b08aee3e539a1 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Tue, 8 Jul 2025 22:24:59 +0100
Subject: [PATCH 029/163] safekeeper: trim dead senders before adding more
 (#12490)

## Problem

We only trim the senders if we tried to send a message to them and
discovered that the channel is closed. This is problematic if the
pageserver keeps connecting while there's nothing to send back for the
shard. In this scenario we never trim down the senders list and can
panic due to the u8 limit.

## Summary of Changes

Trim down the dead senders before adding a new one.

Closes LKB-178
---
 safekeeper/src/send_interpreted_wal.rs | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs
index 3797ac39d1..72a436e25f 100644
--- a/safekeeper/src/send_interpreted_wal.rs
+++ b/safekeeper/src/send_interpreted_wal.rs
@@ -561,6 +561,20 @@ impl InterpretedWalReader {
                         // Update internal and external state, then reset the WAL stream
                         // if required.
                         let senders = self.shard_senders.entry(shard_id).or_default();
+
+                        // Clean up any shard senders that have dropped out before adding the new
+                        // one. This avoids a build up of dead senders.
+                        senders.retain(|sender| {
+                            let closed = sender.tx.is_closed();
+
+                            if closed {
+                                let sender_id = ShardSenderId::new(shard_id, sender.sender_id);
+                                tracing::info!("Removed shard sender {}", sender_id);
+                            }
+
+                            !closed
+                        });
+
                         let new_sender_id = match senders.last() {
                             Some(sender) => sender.sender_id.next(),
                             None => SenderId::first()

From 43dbded8c871ef51f778b84fd1b1c264ea54ad44 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Tue, 8 Jul 2025 18:32:51 -0400
Subject: [PATCH 030/163] fix(pageserver): disallow lease creation below the
 applied gc cutoff (#12489)

## Problem

close LKB-209

## Summary of changes

- We should not allow lease creation below the applied gc cutoff.
- Also removed the condition for `AttachedSingle`. We should always
check the lease against the gc cutoff in all attach modes.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/tenant/timeline.rs | 42 +++++++++++++++----------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 293f3c484d..4a08172337 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -118,7 +118,6 @@ use crate::pgdatadir_mapping::{
     MAX_AUX_FILE_V2_DELTAS, MetricsUpdate,
 };
 use crate::task_mgr::TaskKind;
-use crate::tenant::config::AttachmentMode;
 use crate::tenant::gc_result::GcResult;
 use crate::tenant::layer_map::LayerMap;
 use crate::tenant::metadata::TimelineMetadata;
@@ -1771,30 +1770,31 @@ impl Timeline {
                     existing_lease.clone()
                 }
                 Entry::Vacant(vacant) => {
-                    // Reject already GC-ed LSN if we are in AttachedSingle and
-                    // not blocked by the lsn lease deadline.
+                    // Never allow a lease to be requested for an LSN below the applied GC cutoff. The data could have been deleted.
+                    let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
+                    if lsn < *latest_gc_cutoff_lsn {
+                        bail!(
+                            "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
+                            lsn,
+                            *latest_gc_cutoff_lsn
+                        );
+                    }
+
+                    // We allow create lease for those below the planned gc cutoff if we are still within the grace period
+                    // of GC blocking.
                     let validate = {
                         let conf = self.tenant_conf.load();
-                        conf.location.attach_mode == AttachmentMode::Single
-                            && !conf.is_gc_blocked_by_lsn_lease_deadline()
+                        !conf.is_gc_blocked_by_lsn_lease_deadline()
                     };
 
-                    if init || validate {
-                        let latest_gc_cutoff_lsn = self.get_applied_gc_cutoff_lsn();
-                        if lsn < *latest_gc_cutoff_lsn {
-                            bail!(
-                                "tried to request an lsn lease for an lsn below the latest gc cutoff. requested at {} gc cutoff {}",
-                                lsn,
-                                *latest_gc_cutoff_lsn
-                            );
-                        }
-                        if lsn < planned_cutoff {
-                            bail!(
-                                "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
-                                lsn,
-                                planned_cutoff
-                            );
-                        }
+                    // Do not allow initial lease creation to be below the planned gc cutoff. The client (compute_ctl) determines
+                    // whether it is a initial lease creation or a renewal.
+                    if (init || validate) && lsn < planned_cutoff {
+                        bail!(
+                            "tried to request an lsn lease for an lsn below the planned gc cutoff. requested at {} planned gc cutoff {}",
+                            lsn,
+                            planned_cutoff
+                        );
                     }
 
                     let dt: DateTime<Utc> = valid_until.into();

From 08399672be3b706a4f388710ed5f5a52de0543ee Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Jul 2025 09:49:15 +0200
Subject: [PATCH 031/163] Temporary workaround for timeout retry errors

---
 pageserver/client_grpc/src/pool.rs | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 0e4bff2f1b..376c97f7e9 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -580,6 +580,14 @@ impl StreamPool {
 
         // Track caller response channels by request ID. If the task returns early, these response
         // channels will be dropped and the waiting callers will receive an error.
+        //
+        // TODO: on timeouts, retries will send additional requests for the same request ID, which
+        // get piled up behind the already sent request. We should, at the very least, add deadlines
+        // for the request such that it's cancelled on the server side. This also means that only
+        // the last caller gets the response, and it may get a response for an earlier attempt. This
+        // needs rethinking.
+        //
+        // TODO: consider allocating separate request IDs for each retry.
         let mut callers = HashMap::new();
 
         // Process requests and responses.
@@ -593,13 +601,6 @@ impl StreamPool {
                     };
 
                     // Store the response channel by request ID.
-                    if callers.contains_key(&req.request_id) {
-                        // Error on request ID duplicates. Ignore callers that went away.
-                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
-                            format!("duplicate request ID: {}", req.request_id),
-                        )));
-                        continue;
-                    }
                     callers.insert(req.request_id, resp_tx);
 
                     // Send the request on the stream. Bail out if the send fails.
@@ -615,9 +616,10 @@ impl StreamPool {
                         return Ok(())
                     };
 
-                    // Send the response to the caller. Ignore errors if the caller went away.
+                    // Send the response to the caller. Ignore errors if the caller went away.  This
+                    // may have happened with e.g. a timeout retry, where multiple requests may have
+                    // been sent for the same ID.
                     let Some(resp_tx) = callers.remove(&resp.request_id) else {
-                        warn!("received response for unknown request ID: {}", resp.request_id);
                         continue;
                     };
                     _ = resp_tx.send(Ok(resp));

From aac1f8efb1086b6db7c599c26912920f08d479b3 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 9 Jul 2025 10:41:36 +0200
Subject: [PATCH 032/163] refactor(compaction): eliminate
 `CompactionError::CollectKeyspaceError` variant (#12517)

The only differentiated handling of it is for `is_critical`, which in
turn is a `matches!()` on several variants of the `enum
CollectKeyspaceError`
which is the value contained insided
`CompactionError::CollectKeyspaceError`.

This PR introduces a new error for `repartition()`, allowing its
immediate
callers to inspect it like `is_critical` did.

A drive-by fix is more precise classification of WaitLsnError::BadState
when mapping to `tonic::Status`.

refs
- https://databricks.atlassian.net/browse/LKB-182
---
 pageserver/src/http/routes.rs                |   1 -
 pageserver/src/pgdatadir_mapping.rs          |  17 +++
 pageserver/src/tenant.rs                     |   7 -
 pageserver/src/tenant/tasks.rs               |   3 +-
 pageserver/src/tenant/timeline.rs            | 129 ++++++++++++-------
 pageserver/src/tenant/timeline/compaction.rs |  43 +++++--
 6 files changed, 130 insertions(+), 70 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0e40dbcd15..2995a37089 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2502,7 +2502,6 @@ async fn timeline_checkpoint_handler(
                 .map_err(|e|
                     match e {
                         CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::CollectKeySpaceError(e) => ApiError::InternalServerError(anyhow::anyhow!(e)),
                         CompactionError::Other(e) => ApiError::InternalServerError(e),
                     }
                 )?;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 31f38d485f..8532a6938f 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -141,6 +141,23 @@ pub(crate) enum CollectKeySpaceError {
     Cancelled,
 }
 
+impl CollectKeySpaceError {
+    pub(crate) fn is_cancel(&self) -> bool {
+        match self {
+            CollectKeySpaceError::Decode(_) => false,
+            CollectKeySpaceError::PageRead(e) => e.is_cancel(),
+            CollectKeySpaceError::Cancelled => true,
+        }
+    }
+    pub(crate) fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            CollectKeySpaceError::Decode(e) => anyhow::Error::new(e),
+            CollectKeySpaceError::PageRead(e) => anyhow::Error::new(e),
+            CollectKeySpaceError::Cancelled => anyhow::Error::new(self),
+        }
+    }
+}
+
 impl From<PageReconstructError> for CollectKeySpaceError {
     fn from(err: PageReconstructError) -> Self {
         match err {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index b0969a96c1..f576119db8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3324,13 +3324,6 @@ impl TenantShard {
         match err {
             err if err.is_cancel() => {}
             CompactionError::ShuttingDown => (),
-            CompactionError::CollectKeySpaceError(err) => {
-                // CollectKeySpaceError::Cancelled and PageRead::Cancelled are handled in `err.is_cancel` branch.
-                self.compaction_circuit_breaker
-                    .lock()
-                    .unwrap()
-                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
-            }
             CompactionError::Other(err) => {
                 self.compaction_circuit_breaker
                     .lock()
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 2ae6b7ff3d..bcece5589a 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -318,7 +318,6 @@ pub(crate) fn log_compaction_error(
     let level = match err {
         e if e.is_cancel() => return,
         ShuttingDown => return,
-        CollectKeySpaceError(_) => Level::ERROR,
         _ if task_cancelled => Level::INFO,
         Other(err) => {
             let root_cause = err.root_cause();
@@ -328,7 +327,7 @@ pub(crate) fn log_compaction_error(
                 .is_some_and(|e| e.is_stopping());
             let timeline = root_cause
                 .downcast_ref::<PageReconstructError>()
-                .is_some_and(|e| e.is_stopping());
+                .is_some_and(|e| e.is_cancel());
             let buffered_writer_flush_task_canelled = root_cause
                 .downcast_ref::<FlushTaskError>()
                 .is_some_and(|e| e.is_cancel());
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 4a08172337..6088f40669 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -585,6 +585,28 @@ pub(crate) enum PageReconstructError {
     MissingKey(Box<MissingKeyError>),
 }
 
+impl PageReconstructError {
+    pub(crate) fn is_cancel(&self) -> bool {
+        match self {
+            PageReconstructError::Other(_) => false,
+            PageReconstructError::AncestorLsnTimeout(e) => e.is_cancel(),
+            PageReconstructError::Cancelled => true,
+            PageReconstructError::WalRedo(_) => false,
+            PageReconstructError::MissingKey(_) => false,
+        }
+    }
+    #[allow(dead_code)] // we use the is_cancel + into_anyhow pattern in quite a few places, this one will follow soon enough
+    pub(crate) fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            PageReconstructError::Other(e) => e,
+            PageReconstructError::AncestorLsnTimeout(e) => e.into_anyhow(),
+            PageReconstructError::Cancelled => anyhow::Error::new(self),
+            PageReconstructError::WalRedo(e) => e,
+            PageReconstructError::MissingKey(_) => anyhow::Error::new(self),
+        }
+    }
+}
+
 impl From<anyhow::Error> for PageReconstructError {
     fn from(value: anyhow::Error) -> Self {
         // with walingest.rs many PageReconstructError are wrapped in as anyhow::Error
@@ -738,17 +760,6 @@ impl std::fmt::Display for MissingKeyError {
     }
 }
 
-impl PageReconstructError {
-    /// Returns true if this error indicates a tenant/timeline shutdown alike situation
-    pub(crate) fn is_stopping(&self) -> bool {
-        use PageReconstructError::*;
-        match self {
-            Cancelled => true,
-            Other(_) | AncestorLsnTimeout(_) | WalRedo(_) | MissingKey(_) => false,
-        }
-    }
-}
-
 #[derive(thiserror::Error, Debug)]
 pub(crate) enum CreateImageLayersError {
     #[error("timeline shutting down")]
@@ -951,13 +962,35 @@ pub enum WaitLsnError {
     Timeout(String),
 }
 
+impl WaitLsnError {
+    pub(crate) fn is_cancel(&self) -> bool {
+        match self {
+            WaitLsnError::Shutdown => true,
+            WaitLsnError::BadState(timeline_state) => match timeline_state {
+                TimelineState::Loading => false,
+                TimelineState::Active => false,
+                TimelineState::Stopping => true,
+                TimelineState::Broken { .. } => false,
+            },
+            WaitLsnError::Timeout(_) => false,
+        }
+    }
+    pub(crate) fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            WaitLsnError::Shutdown => anyhow::Error::new(self),
+            WaitLsnError::BadState(_) => anyhow::Error::new(self),
+            WaitLsnError::Timeout(_) => anyhow::Error::new(self),
+        }
+    }
+}
+
 impl From<WaitLsnError> for tonic::Status {
     fn from(err: WaitLsnError) -> Self {
         use tonic::Code;
-        let code = match &err {
-            WaitLsnError::Timeout(_) => Code::Internal,
-            WaitLsnError::BadState(_) => Code::Internal,
-            WaitLsnError::Shutdown => Code::Unavailable,
+        let code = if err.is_cancel() {
+            Code::Unavailable
+        } else {
+            Code::Internal
         };
         tonic::Status::new(code, err.to_string())
     }
@@ -1084,6 +1117,26 @@ enum ImageLayerCreationOutcome {
     Skip,
 }
 
+enum RepartitionError {
+    Other(anyhow::Error),
+    CollectKeyspace(CollectKeySpaceError),
+}
+
+impl RepartitionError {
+    fn is_cancel(&self) -> bool {
+        match self {
+            RepartitionError::Other(_) => false,
+            RepartitionError::CollectKeyspace(e) => e.is_cancel(),
+        }
+    }
+    fn into_anyhow(self) -> anyhow::Error {
+        match self {
+            RepartitionError::Other(e) => e,
+            RepartitionError::CollectKeyspace(e) => e.into_anyhow(),
+        }
+    }
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -2070,10 +2123,6 @@ impl Timeline {
             Err(CompactionError::Other(_)) => {
                 self.compaction_failed.store(true, AtomicOrdering::Relaxed)
             }
-            Err(CompactionError::CollectKeySpaceError(_)) => {
-                // Cancelled errors are covered by the `Err(e) if e.is_cancel()` branch.
-                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
-            }
         };
 
         result
@@ -4963,7 +5012,7 @@ impl Timeline {
                     ctx,
                 )
                 .await
-                .map_err(|e| FlushLayerError::from_anyhow(self, e.into()))?;
+                .map_err(|e| FlushLayerError::from_anyhow(self, e.into_anyhow()))?;
 
             if self.cancel.is_cancelled() {
                 return Err(FlushLayerError::Cancelled);
@@ -5213,18 +5262,18 @@ impl Timeline {
         partition_size: u64,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
-    ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> {
+    ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), RepartitionError> {
         let Ok(mut guard) = self.partitioning.try_write_guard() else {
             // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline.
             // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()`
             // and hence before the compaction task starts.
-            return Err(CompactionError::Other(anyhow!(
+            return Err(RepartitionError::Other(anyhow!(
                 "repartition() called concurrently"
             )));
         };
         let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read();
         if lsn < *partition_lsn {
-            return Err(CompactionError::Other(anyhow!(
+            return Err(RepartitionError::Other(anyhow!(
                 "repartition() called with LSN going backwards, this should not happen"
             )));
         }
@@ -5245,7 +5294,10 @@ impl Timeline {
             ));
         }
 
-        let (dense_ks, sparse_ks) = self.collect_keyspace(lsn, ctx).await?;
+        let (dense_ks, sparse_ks) = self
+            .collect_keyspace(lsn, ctx)
+            .await
+            .map_err(RepartitionError::CollectKeyspace)?;
         let dense_partitioning = dense_ks.partition(
             &self.shard_identity,
             partition_size,
@@ -6010,9 +6062,6 @@ impl Drop for Timeline {
 pub(crate) enum CompactionError {
     #[error("The timeline or pageserver is shutting down")]
     ShuttingDown,
-    /// Compaction cannot be done right now; page reconstruction and so on.
-    #[error("Failed to collect keyspace: {0}")]
-    CollectKeySpaceError(#[from] CollectKeySpaceError),
     #[error(transparent)]
     Other(anyhow::Error),
 }
@@ -6020,27 +6069,15 @@ pub(crate) enum CompactionError {
 impl CompactionError {
     /// Errors that can be ignored, i.e., cancel and shutdown.
     pub fn is_cancel(&self) -> bool {
-        matches!(
-            self,
-            Self::ShuttingDown
-                | Self::CollectKeySpaceError(CollectKeySpaceError::Cancelled)
-                | Self::CollectKeySpaceError(CollectKeySpaceError::PageRead(
-                    PageReconstructError::Cancelled
-                ))
-        )
+        matches!(self, Self::ShuttingDown)
     }
 
-    /// Critical errors that indicate data corruption.
-    pub fn is_critical(&self) -> bool {
-        matches!(
-            self,
-            Self::CollectKeySpaceError(
-                CollectKeySpaceError::Decode(_)
-                    | CollectKeySpaceError::PageRead(
-                        PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
-                    )
-            )
-        )
+    pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
+        if err.is_cancel() {
+            Self::ShuttingDown
+        } else {
+            Self::Other(err.into_anyhow())
+        }
     }
 }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 2c0b98c1e2..c263df1eb2 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -16,7 +16,8 @@ use super::{
     Timeline,
 };
 
-use crate::tenant::timeline::DeltaEntry;
+use crate::pgdatadir_mapping::CollectKeySpaceError;
+use crate::tenant::timeline::{DeltaEntry, RepartitionError};
 use crate::walredo::RedoAttemptType;
 use anyhow::{Context, anyhow};
 use bytes::Bytes;
@@ -64,7 +65,7 @@ use crate::tenant::timeline::{
     DeltaLayerWriter, ImageLayerCreationOutcome, ImageLayerWriter, IoConcurrency, Layer,
     ResidentLayer, drop_layer_manager_rlock,
 };
-use crate::tenant::{DeltaLayer, MaybeOffloaded};
+use crate::tenant::{DeltaLayer, MaybeOffloaded, PageReconstructError};
 use crate::virtual_file::{MaybeFatalIo, VirtualFile};
 
 /// Maximum number of deltas before generating an image layer in bottom-most compaction.
@@ -572,7 +573,7 @@ impl GcCompactionQueue {
         match res {
             Ok(res) => Ok(res),
             Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
-            Err(CompactionError::CollectKeySpaceError(_) | CompactionError::Other(_)) => {
+            Err(CompactionError::Other(_)) => {
                 // There are some cases where traditional gc might collect some layer
                 // files causing gc-compaction cannot read the full history of the key.
                 // This needs to be resolved in the long-term by improving the compaction
@@ -1417,22 +1418,33 @@ impl Timeline {
             }
 
             // Suppress errors when cancelled.
-            Err(_) if self.cancel.is_cancelled() => {}
+            //
+            // Log other errors but continue. Failure to repartition is normal, if the timeline was just created
+            // as an empty timeline. Also in unit tests, when we use the timeline as a simple
+            // key-value store, ignoring the datadir layout. Log the error but continue.
+            //
+            // TODO:
+            // 1. shouldn't we return early here if we observe cancellation
+            // 2. Experiment: can we stop checking self.cancel here?
+            Err(_) if self.cancel.is_cancelled() => {} // TODO: try how we fare removing this branch
             Err(err) if err.is_cancel() => {}
-
-            // Alert on critical errors that indicate data corruption.
-            Err(err) if err.is_critical() => {
+            Err(RepartitionError::CollectKeyspace(
+                e @ CollectKeySpaceError::Decode(_)
+                | e @ CollectKeySpaceError::PageRead(
+                    PageReconstructError::MissingKey(_) | PageReconstructError::WalRedo(_),
+                ),
+            )) => {
+                // Alert on critical errors that indicate data corruption.
                 critical_timeline!(
                     self.tenant_shard_id,
                     self.timeline_id,
-                    "could not compact, repartitioning keyspace failed: {err:?}"
+                    "could not compact, repartitioning keyspace failed: {e:?}"
                 );
             }
-
-            // Log other errors. No partitioning? This is normal, if the timeline was just created
-            // as an empty timeline. Also in unit tests, when we use the timeline as a simple
-            // key-value store, ignoring the datadir layout. Log the error but continue.
-            Err(err) => error!("could not compact, repartitioning keyspace failed: {err:?}"),
+            Err(e) => error!(
+                "could not compact, repartitioning keyspace failed: {:?}",
+                e.into_anyhow()
+            ),
         };
 
         let partition_count = self.partitioning.read().0.0.parts.len();
@@ -2518,7 +2530,10 @@ impl Timeline {
             return Err(CompactionError::ShuttingDown);
         }
 
-        let (dense_ks, _sparse_ks) = self.collect_keyspace(end_lsn, ctx).await?;
+        let (dense_ks, _sparse_ks) = self
+            .collect_keyspace(end_lsn, ctx)
+            .await
+            .map_err(CompactionError::from_collect_keyspace)?;
         // TODO(chi): ignore sparse_keyspace for now, compact it in the future.
         let mut adaptor = TimelineAdaptor::new(self, (end_lsn, dense_ks));
 

From 5ea0bb2d4fa1a9bccac73321512554e3a148427f Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Wed, 9 Jul 2025 11:58:46 +0200
Subject: [PATCH 033/163] proxy: Drop unused metrics (#12521)

* proxy_control_plane_token_acquire_seconds
* proxy_allowed_ips_cache_misses
* proxy_vpc_endpoint_id_cache_stats
* proxy_access_blocker_flags_cache_stats
* proxy_requests_auth_rate_limits_total
* proxy_endpoints_auth_rate_limits
* proxy_invalid_endpoints_total
---
 proxy/src/metrics.rs | 26 +-------------------------
 1 file changed, 1 insertion(+), 25 deletions(-)

diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 7a21e4ecee..9d1a3d4358 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -10,7 +10,7 @@ use measured::{
     Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup,
     MetricGroup,
 };
-use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec};
+use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLogVec};
 use tokio::time::{self, Instant};
 
 use crate::control_plane::messages::ColdStartInfo;
@@ -36,7 +36,6 @@ impl Metrics {
         metrics.proxy.redis_errors_total.init_all_dense();
         metrics.proxy.redis_events_count.init_all_dense();
         metrics.proxy.retries_metric.init_all_dense();
-        metrics.proxy.invalid_endpoints_total.init_all_dense();
         metrics.proxy.connection_failures_total.init_all_dense();
 
         SELF.set(metrics)
@@ -80,11 +79,6 @@ pub struct ProxyMetrics {
     )]
     pub console_request_latency: HistogramVec<ConsoleRequestSet, 16>,
 
-    /// Time it takes to acquire a token to call console plane.
-    // largest bucket = 3^16 * 0.05ms = 2.15s
-    #[metric(metadata = Thresholds::exponential_buckets(0.00005, 3.0))]
-    pub control_plane_token_acquire_seconds: Histogram<16>,
-
     /// Size of the HTTP request body lengths.
     // smallest bucket = 16 bytes
     // largest bucket = 4^12 * 16 bytes = 256MB
@@ -98,19 +92,10 @@ pub struct ProxyMetrics {
     /// Number of opened connections to a database.
     pub http_pool_opened_connections: Gauge,
 
-    /// Number of cache hits/misses for allowed ips.
-    pub allowed_ips_cache_misses: CounterVec<StaticLabelSet<CacheOutcome>>,
-
     /// Number of allowed ips
     #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
     pub allowed_ips_number: Histogram<10>,
 
-    /// Number of cache hits/misses for VPC endpoint IDs.
-    pub vpc_endpoint_id_cache_stats: CounterVec<StaticLabelSet<CacheOutcome>>,
-
-    /// Number of cache hits/misses for access blocker flags.
-    pub access_blocker_flags_cache_stats: CounterVec<StaticLabelSet<CacheOutcome>>,
-
     /// Number of allowed VPC endpoints IDs
     #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0]))]
     pub allowed_vpc_endpoint_ids: Histogram<10>,
@@ -139,21 +124,12 @@ pub struct ProxyMetrics {
     /// Number of TLS handshake failures
     pub tls_handshake_failures: Counter,
 
-    /// Number of connection requests affected by authentication rate limits
-    pub requests_auth_rate_limits_total: Counter,
-
     /// HLL approximate cardinality of endpoints that are connecting
     pub connecting_endpoints: HyperLogLogVec<StaticLabelSet<Protocol>, 32>,
 
     /// Number of endpoints affected by errors of a given classification
     pub endpoints_affected_by_errors: HyperLogLogVec<StaticLabelSet<crate::error::ErrorKind>, 32>,
 
-    /// Number of endpoints affected by authentication rate limits
-    pub endpoints_auth_rate_limits: HyperLogLog<32>,
-
-    /// Number of invalid endpoints (per protocol, per rejected).
-    pub invalid_endpoints_total: CounterVec<InvalidEndpointsSet>,
-
     /// Number of retries (per outcome, per retry_type).
     #[metric(metadata = Thresholds::with_buckets([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0]))]
     pub retries_metric: HistogramVec<RetriesMetricSet, 9>,

From 39159955306d505f9bae5d3677192daba9d005be Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Jul 2025 13:42:46 +0200
Subject: [PATCH 034/163] pageserver/client_grpc: add rich Pageserver gRPC
 client (#12462)

## Problem

For the communicator, we need a rich Pageserver gRPC client.

Touches #11735.
Requires #12434.

## Summary of changes

This patch adds an initial rich Pageserver gRPC client. It supports:

* Sharded tenants across multiple Pageservers.
* Pooling of connections, clients, and streams for efficient resource
use.
* Concurrent use by many callers.
* Internal handling of GetPage bidirectional streams, with pipelining
and error handling.
* Automatic retries.
* Observability.

The client is still under development. In particular, it needs GetPage
batch splitting, shard map updates, and performance optimization. This
will be addressed in follow-up PRs.
---
 Cargo.lock                           |   2 +
 libs/compute_api/src/spec.rs         |   2 +-
 libs/pageserver_api/src/shard.rs     |   6 +-
 libs/utils/src/shard.rs              |   6 +
 pageserver/client_grpc/Cargo.toml    |   2 +
 pageserver/client_grpc/src/client.rs | 303 +++++++++++++++++++++++++++
 pageserver/client_grpc/src/lib.rs    |  17 +-
 pageserver/client_grpc/src/retry.rs  | 151 +++++++++++++
 pageserver/page_api/src/model.rs     |  15 ++
 pageserver/src/page_service.rs       |   2 +
 10 files changed, 491 insertions(+), 15 deletions(-)
 create mode 100644 pageserver/client_grpc/src/client.rs
 create mode 100644 pageserver/client_grpc/src/retry.rs

diff --git a/Cargo.lock b/Cargo.lock
index 893932fb9d..558e8e2295 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4498,7 +4498,9 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "compute_api",
  "futures",
+ "pageserver_api",
  "pageserver_page_api",
  "tokio",
  "tokio-stream",
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 60311aa3e6..0eeab2bebc 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -442,7 +442,7 @@ pub struct JwksSettings {
 }
 
 /// Protocol used to connect to a Pageserver. Parsed from the connstring scheme.
-#[derive(Clone, Copy, Debug, Default)]
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
 pub enum PageserverProtocol {
     /// The original protocol based on libpq and COPY. Uses postgresql:// or postgres:// scheme.
     #[default]
diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs
index 5a13aace64..d6f4cd5e66 100644
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -332,7 +332,11 @@ fn hash_combine(mut a: u32, mut b: u32) -> u32 {
 ///
 /// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
 /// and will be handled at higher levels when shards are split.
-fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+pub fn key_to_shard_number(
+    count: ShardCount,
+    stripe_size: ShardStripeSize,
+    key: &Key,
+) -> ShardNumber {
     // Fast path for un-sharded tenants or broadcast keys
     if count < ShardCount(2) || key_is_shard0(key) {
         return ShardNumber(0);
diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs
index f2b81373e2..5a0edf8cea 100644
--- a/libs/utils/src/shard.rs
+++ b/libs/utils/src/shard.rs
@@ -171,6 +171,12 @@ impl std::fmt::Display for ShardNumber {
     }
 }
 
+impl std::fmt::Display for ShardCount {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.fmt(f)
+    }
+}
+
 impl std::fmt::Display for ShardSlug<'_> {
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         write!(
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
index 5a3a2761c2..0a8bcad2ef 100644
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -6,7 +6,9 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
+compute_api.workspace = true
 futures.workspace = true
+pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
new file mode 100644
index 0000000000..5bccdeede3
--- /dev/null
+++ b/pageserver/client_grpc/src/client.rs
@@ -0,0 +1,303 @@
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use anyhow::anyhow;
+use tracing::instrument;
+
+use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
+use crate::retry::Retry;
+use compute_api::spec::PageserverProtocol;
+use pageserver_api::key::{Key, rel_block_to_key};
+use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
+use pageserver_page_api as page_api;
+use utils::id::{TenantId, TimelineId};
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
+
+/// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
+/// basic `page_api::Client` gRPC client, and supports:
+///
+/// * Sharded tenants across multiple Pageservers.
+/// * Pooling of connections, clients, and streams for efficient resource use.
+/// * Concurrent use by many callers.
+/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
+/// * Automatic retries.
+/// * Observability.
+///
+/// TODO: this client does not support base backups or LSN leases, as these are only used by
+/// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
+pub struct PageserverClient {
+    // TODO: support swapping out the shard map, e.g. via an ArcSwap.
+    shards: Shards,
+    retry: Retry,
+}
+
+impl PageserverClient {
+    /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
+    /// in the shard map, which must be complete and must use gRPC URLs.
+    pub fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_map: HashMap<ShardIndex, String>,
+        stripe_size: ShardStripeSize,
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
+        let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
+        Ok(Self {
+            shards,
+            retry: Retry,
+        })
+    }
+
+    /// Returns whether a relation exists.
+    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
+    pub async fn check_rel_exists(
+        &self,
+        req: page_api::CheckRelExistsRequest,
+    ) -> tonic::Result<page_api::CheckRelExistsResponse> {
+        self.retry
+            .with(async || {
+                // Relation metadata is only available on shard 0.
+                let mut client = self.shards.get_zero().client().await?;
+                client.check_rel_exists(req).await
+            })
+            .await
+    }
+
+    /// Returns the total size of a database, as # of bytes.
+    #[instrument(skip_all, fields(db_oid=%req.db_oid, lsn=%req.read_lsn))]
+    pub async fn get_db_size(
+        &self,
+        req: page_api::GetDbSizeRequest,
+    ) -> tonic::Result<page_api::GetDbSizeResponse> {
+        self.retry
+            .with(async || {
+                // Relation metadata is only available on shard 0.
+                let mut client = self.shards.get_zero().client().await?;
+                client.get_db_size(req).await
+            })
+            .await
+    }
+
+    /// Fetches a page. The `request_id` must be unique across all in-flight requests.
+    ///
+    /// Unlike the `page_api::Client`, this client automatically converts `status_code` into
+    /// `tonic::Status` errors. All responses will have `GetPageStatusCode::Ok`.
+    #[instrument(skip_all, fields(
+        req_id = %req.request_id,
+        rel = %req.rel,
+        blkno = %req.block_numbers[0],
+        blks = %req.block_numbers.len(),
+        lsn = %req.read_lsn,
+    ))]
+    pub async fn get_page(
+        &self,
+        req: page_api::GetPageRequest,
+    ) -> tonic::Result<page_api::GetPageResponse> {
+        // TODO: this needs to split batch requests across shards and reassemble responses into a
+        // single response. It must also re-split the batch in case the shard map changes. For now,
+        // just use the first page.
+        let key = rel_block_to_key(
+            req.rel,
+            req.block_numbers
+                .first()
+                .copied()
+                .ok_or_else(|| tonic::Status::invalid_argument("no block numbers provided"))?,
+        );
+
+        self.retry
+            .with(async || {
+                let stream = self.shards.get_for_key(key).stream().await;
+                let resp = stream.send(req.clone()).await?;
+
+                if resp.status_code != page_api::GetPageStatusCode::Ok {
+                    return Err(tonic::Status::new(
+                        resp.status_code.into(),
+                        resp.reason.unwrap_or_else(|| String::from("unknown error")),
+                    ));
+                }
+
+                Ok(resp)
+            })
+            .await
+    }
+
+    /// Returns the size of a relation, as # of blocks.
+    #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
+    pub async fn get_rel_size(
+        &self,
+        req: page_api::GetRelSizeRequest,
+    ) -> tonic::Result<page_api::GetRelSizeResponse> {
+        self.retry
+            .with(async || {
+                // Relation metadata is only available on shard 0.
+                let mut client = self.shards.get_zero().client().await?;
+                client.get_rel_size(req).await
+            })
+            .await
+    }
+
+    /// Fetches an SLRU segment.
+    #[instrument(skip_all, fields(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn))]
+    pub async fn get_slru_segment(
+        &self,
+        req: page_api::GetSlruSegmentRequest,
+    ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
+        self.retry
+            .with(async || {
+                // SLRU segments are only available on shard 0.
+                let mut client = self.shards.get_zero().client().await?;
+                client.get_slru_segment(req).await
+            })
+            .await
+    }
+}
+
+/// Tracks the tenant's shards.
+struct Shards {
+    /// The shard count.
+    ///
+    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
+    count: ShardCount,
+    /// The stripe size. Only used for sharded tenants.
+    stripe_size: ShardStripeSize,
+    /// Shards by shard index.
+    ///
+    /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`.
+    ///
+    /// INVARIANT: every shard 0..count is present.
+    /// INVARIANT: shard 0 is always present.
+    map: HashMap<ShardIndex, Shard>,
+}
+
+impl Shards {
+    /// Creates a new set of shards based on a shard map.
+    fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_map: HashMap<ShardIndex, String>,
+        stripe_size: ShardStripeSize,
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
+        let count = match shard_map.len() {
+            0 => return Err(anyhow!("no shards provided")),
+            1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
+            n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
+            n => ShardCount::new(n as u8),
+        };
+
+        let mut map = HashMap::new();
+        for (shard_id, url) in shard_map {
+            // The shard index must match the computed shard count, even for unsharded tenants.
+            if shard_id.shard_count != count {
+                return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
+            }
+            // The shard index' number and count must be consistent.
+            if !shard_id.is_unsharded() && shard_id.shard_number.0 >= shard_id.shard_count.0 {
+                return Err(anyhow!("invalid shard index {shard_id}"));
+            }
+            // The above conditions guarantee that we have all shards 0..count: len() matches count,
+            // shard number < count, and numbers are unique (via hashmap).
+            let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?;
+            map.insert(shard_id, shard);
+        }
+
+        Ok(Self {
+            count,
+            stripe_size,
+            map,
+        })
+    }
+
+    /// Looks up the given shard.
+    #[allow(clippy::result_large_err)] // TODO: check perf impact
+    fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
+        self.map
+            .get(&shard_id)
+            .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
+    }
+
+    /// Looks up the shard that owns the given key.
+    fn get_for_key(&self, key: Key) -> &Shard {
+        let shard_number = key_to_shard_number(self.count, self.stripe_size, &key);
+        self.get(ShardIndex::new(shard_number, self.count))
+            .expect("must exist")
+    }
+
+    /// Returns shard 0.
+    fn get_zero(&self) -> &Shard {
+        self.get(ShardIndex::new(ShardNumber(0), self.count))
+            .expect("always present")
+    }
+}
+
+/// A single shard.
+///
+/// TODO: consider separate pools for normal and bulk traffic, with different settings.
+struct Shard {
+    /// Dedicated channel pool for this shard. Shared by all clients/streams in this shard.
+    _channel_pool: Arc<ChannelPool>,
+    /// Unary gRPC client pool for this shard. Uses the shared channel pool.
+    client_pool: Arc<ClientPool>,
+    /// GetPage stream pool for this shard. Uses a dedicated client pool, but shares the channel
+    /// pool with unary clients.
+    stream_pool: Arc<StreamPool>,
+}
+
+impl Shard {
+    /// Creates a new shard. It has its own dedicated resource pools.
+    fn new(
+        url: String,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_id: ShardIndex,
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
+        // Sanity-check that the URL uses gRPC.
+        if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc {
+            return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
+        }
+
+        // Use a common channel pool for all clients, to multiplex unary and stream requests across
+        // the same TCP connections. The channel pool is unbounded (but client pools are bounded).
+        let channel_pool = ChannelPool::new(url)?;
+
+        // Dedicated client pool for unary requests.
+        let client_pool = ClientPool::new(
+            channel_pool.clone(),
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token.clone(),
+        );
+
+        // Stream pool with dedicated client pool. If this shared a client pool with unary requests,
+        // long-lived streams could fill up the client pool and starve out unary requests. It shares
+        // the same underlying channel pool with unary clients though, which is unbounded.
+        let stream_pool = StreamPool::new(ClientPool::new(
+            channel_pool.clone(),
+            tenant_id,
+            timeline_id,
+            shard_id,
+            auth_token,
+        ));
+
+        Ok(Self {
+            _channel_pool: channel_pool,
+            client_pool,
+            stream_pool,
+        })
+    }
+
+    /// Returns a pooled client for this shard.
+    async fn client(&self) -> tonic::Result<ClientGuard> {
+        self.client_pool
+            .get()
+            .await
+            .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
+    }
+
+    /// Returns a pooled stream for this shard.
+    async fn stream(&self) -> StreamGuard {
+        self.stream_pool.get().await
+    }
+}
diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs
index c900e1a939..2a59f9868c 100644
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -1,14 +1,5 @@
-//! A rich Pageserver gRPC client. This client is more capable than the basic `page_api::Client`
-//! gRPC client, and supports:
-//!
-//! * Sharded tenants across multiple Pageservers.
-//! * Pooling of connections, clients, and streams for efficient resource use.
-//! * Concurrent use by many callers.
-//! * Internal handling of GetPage bidirectional streams.
-//! * Automatic retries.
-//! * Observability.
-//!
-//! The client is under development, this package is just a shell.
-
-#[allow(unused)]
+mod client;
 mod pool;
+mod retry;
+
+pub use client::PageserverClient;
diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs
new file mode 100644
index 0000000000..b0473204d7
--- /dev/null
+++ b/pageserver/client_grpc/src/retry.rs
@@ -0,0 +1,151 @@
+use std::time::Duration;
+
+use tokio::time::Instant;
+use tracing::{error, info, warn};
+
+use utils::backoff::exponential_backoff_duration;
+
+/// A retry handler for Pageserver gRPC requests.
+///
+/// This is used instead of backoff::retry for better control and observability.
+pub struct Retry;
+
+impl Retry {
+    /// The per-request timeout.
+    // TODO: tune these, and/or make them configurable. Should we retry forever?
+    const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+    /// The total timeout across all attempts
+    const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
+    /// The initial backoff duration.
+    const BASE_BACKOFF: Duration = Duration::from_millis(10);
+    /// The maximum backoff duration.
+    const MAX_BACKOFF: Duration = Duration::from_secs(10);
+    /// If true, log successful requests. For debugging.
+    const LOG_SUCCESS: bool = false;
+
+    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
+    /// using the current tracing span for context.
+    ///
+    /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
+    /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
+    pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
+    where
+        F: FnMut() -> O,
+        O: Future<Output = tonic::Result<T>>,
+    {
+        let started = Instant::now();
+        let deadline = started + Self::TOTAL_TIMEOUT;
+        let mut last_error = None;
+        let mut retries = 0;
+        loop {
+            // Set up a future to wait for the backoff (if any) and run the request with a timeout.
+            let backoff_and_try = async {
+                // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
+                // https://github.com/tokio-rs/tokio/issues/6866
+                if let Some(backoff) = Self::backoff_duration(retries) {
+                    tokio::time::sleep(backoff).await;
+                }
+
+                let request_started = Instant::now();
+                tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
+                    .await
+                    .map_err(|_| {
+                        tonic::Status::deadline_exceeded(format!(
+                            "request timed out after {:.3}s",
+                            request_started.elapsed().as_secs_f64()
+                        ))
+                    })?
+            };
+
+            // Wait for the backoff and request, or bail out if the total timeout is exceeded.
+            let result = tokio::select! {
+                result = backoff_and_try => result,
+
+                _ = tokio::time::sleep_until(deadline) => {
+                    let last_error = last_error.unwrap_or_else(|| {
+                        tonic::Status::deadline_exceeded(format!(
+                            "request timed out after {:.3}s",
+                            started.elapsed().as_secs_f64()
+                        ))
+                    });
+                    error!(
+                        "giving up after {:.3}s and {retries} retries, last error {:?}: {}",
+                        started.elapsed().as_secs_f64(), last_error.code(), last_error.message(),
+                    );
+                    return Err(last_error);
+                }
+            };
+
+            match result {
+                // Success, return the result.
+                Ok(result) => {
+                    if retries > 0 || Self::LOG_SUCCESS {
+                        info!(
+                            "request succeeded after {retries} retries in {:.3}s",
+                            started.elapsed().as_secs_f64(),
+                        );
+                    }
+
+                    return Ok(result);
+                }
+
+                // Error, retry or bail out.
+                Err(status) => {
+                    let (code, message) = (status.code(), status.message());
+                    let attempt = retries + 1;
+
+                    if !Self::should_retry(code) {
+                        // NB: include the attempt here too. This isn't necessarily the first
+                        // attempt, because the error may change between attempts.
+                        error!(
+                            "request failed with {code:?}: {message}, not retrying (attempt {attempt})"
+                        );
+                        return Err(status);
+                    }
+
+                    warn!("request failed with {code:?}: {message}, retrying (attempt {attempt})");
+
+                    retries += 1;
+                    last_error = Some(status);
+                }
+            }
+        }
+    }
+
+    /// Returns the backoff duration for the given retry attempt, or None for no backoff.
+    fn backoff_duration(retry: usize) -> Option<Duration> {
+        let backoff = exponential_backoff_duration(
+            retry as u32,
+            Self::BASE_BACKOFF.as_secs_f64(),
+            Self::MAX_BACKOFF.as_secs_f64(),
+        );
+        (!backoff.is_zero()).then_some(backoff)
+    }
+
+    /// Returns true if the given status code should be retries.
+    fn should_retry(code: tonic::Code) -> bool {
+        match code {
+            tonic::Code::Ok => panic!("unexpected Ok status code"),
+
+            // These codes are transient, so retry them.
+            tonic::Code::Aborted => true,
+            tonic::Code::Cancelled => true,
+            tonic::Code::DeadlineExceeded => true, // maybe transient slowness
+            tonic::Code::Internal => true,         // maybe transient failure?
+            tonic::Code::ResourceExhausted => true,
+            tonic::Code::Unavailable => true,
+
+            // The following codes will like continue to fail, so don't retry.
+            tonic::Code::AlreadyExists => false,
+            tonic::Code::DataLoss => false,
+            tonic::Code::FailedPrecondition => false,
+            tonic::Code::InvalidArgument => false,
+            tonic::Code::NotFound => false,
+            tonic::Code::OutOfRange => false,
+            tonic::Code::PermissionDenied => false,
+            tonic::Code::Unauthenticated => false,
+            tonic::Code::Unimplemented => false,
+            tonic::Code::Unknown => false,
+        }
+    }
+}
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index 4497fc6fc7..c5b6f06879 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -602,6 +602,21 @@ impl TryFrom<tonic::Code> for GetPageStatusCode {
     }
 }
 
+impl From<GetPageStatusCode> for tonic::Code {
+    fn from(status_code: GetPageStatusCode) -> Self {
+        use tonic::Code;
+
+        match status_code {
+            GetPageStatusCode::Unknown => Code::Unknown,
+            GetPageStatusCode::Ok => Code::Ok,
+            GetPageStatusCode::NotFound => Code::NotFound,
+            GetPageStatusCode::InvalidRequest => Code::InvalidArgument,
+            GetPageStatusCode::InternalError => Code::Internal,
+            GetPageStatusCode::SlowDown => Code::ResourceExhausted,
+        }
+    }
+}
+
 // Fetches the size of a relation at a given LSN, as # of blocks. Only valid on shard 0, other
 // shards will error.
 #[derive(Clone, Copy, Debug)]
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 6b614deac8..70fdb2e789 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3353,6 +3353,8 @@ impl GrpcPageServiceHandler {
     /// NB: errors returned from here are intercepted in get_pages(), and may be converted to a
     /// GetPageResponse with an appropriate status code to avoid terminating the stream.
     ///
+    /// TODO: verify that the requested pages belong to this shard.
+    ///
     /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
     /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
     /// split them up in the client or server.

From 7049003cf790768eddf38a2e004d046e282adfb2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Wed, 9 Jul 2025 14:02:44 +0200
Subject: [PATCH 035/163] storcon: print viability of
 --timelines-onto-safekeepers (#12485)

The `--timelines-onto-safekeepers` flag is very consequential in the
sense that it controls every single timeline creation. However, we don't
have any automatic insight whether enabling the option will break things
or not.

The main way things can break is by misconfigured safekeepers, say they
are marked as paused in the storcon db. The best input so far we can
obtain via manually connecting via storcon_cli and listing safekeepers,
but this is cumbersome and manual so prone to human error.

So at storcon startup, do a simulated "test creation" in which we call
`timelines_onto_safekeepers` with the configuration provided to us, and
print whether it was successful or not. No actual timeline is created,
and nothing is written into the storcon db. The heartbeat info will not
have reached us at that point yet, but that's okay, because we still
fall back to safekeepers that don't have any heartbeat.

Also print some general scheduling policy stats on initial safekeeper
load.

Part of #11670.
---
 libs/pageserver_api/src/models.rs |  2 +-
 storage_controller/src/service.rs | 24 +++++++++++++++++++++++-
 2 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 16545364c1..6735320484 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -384,7 +384,7 @@ pub struct SafekeepersInfo {
     pub safekeepers: Vec<SafekeeperInfo>,
 }
 
-#[derive(Serialize, Deserialize, Clone)]
+#[derive(Serialize, Deserialize, Clone, Debug)]
 pub struct SafekeeperInfo {
     pub id: NodeId,
     pub hostname: String,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 403ae15b59..ed6643d641 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1677,7 +1677,21 @@ impl Service {
             .collect::<anyhow::Result<Vec<_>>>()?;
         let safekeepers: HashMap<NodeId, Safekeeper> =
             safekeepers.into_iter().map(|n| (n.get_id(), n)).collect();
-        tracing::info!("Loaded {} safekeepers from database.", safekeepers.len());
+        let count_policy = |policy| {
+            safekeepers
+                .iter()
+                .filter(|sk| sk.1.scheduling_policy() == policy)
+                .count()
+        };
+        let active_sk_count = count_policy(SkSchedulingPolicy::Active);
+        let activating_sk_count = count_policy(SkSchedulingPolicy::Activating);
+        let pause_sk_count = count_policy(SkSchedulingPolicy::Pause);
+        let decom_sk_count = count_policy(SkSchedulingPolicy::Decomissioned);
+        tracing::info!(
+            "Loaded {} safekeepers from database. Active {active_sk_count}, activating {activating_sk_count}, \
+            paused {pause_sk_count}, decomissioned {decom_sk_count}.",
+            safekeepers.len()
+        );
         metrics::METRICS_REGISTRY
             .metrics_group
             .storage_controller_safekeeper_nodes
@@ -1969,6 +1983,14 @@ impl Service {
             }
         });
 
+        // Check that there is enough safekeepers configured that we can create new timelines
+        let test_sk_res = this.safekeepers_for_new_timeline().await;
+        tracing::info!(
+            timeline_safekeeper_count = config.timeline_safekeeper_count,
+            timelines_onto_safekeepers = config.timelines_onto_safekeepers,
+            "viability test result (test timeline creation on safekeepers): {test_sk_res:?}",
+        );
+
         Ok(this)
     }
 

From 4ee0da0a2056d31d6d2194d862354e43193cbe23 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Wed, 9 Jul 2025 15:49:21 +0300
Subject: [PATCH 036/163] Check prefetch response before assignment to slot
 (#12371)

## Problem

See [Slack
Channel](https://databricks.enterprise.slack.com/archives/C091LHU6NNB)

Dropping connection without resetting prefetch state can cause
request/response mismatch.
And lack of check response correctness in communicator_prefetch_lookupv
can cause data corruption.

## Summary of changes

1. Validate response before assignment to prefetch slot.
2. Consume prefetch requests before sending any other requests.

---------

Co-authored-by: Kosntantin Knizhnik <konstantin.knizhnik@databricks.com>
Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
---
 pgxn/neon/communicator.c | 219 ++++++++++++++++++++++++---------------
 1 file changed, 135 insertions(+), 84 deletions(-)

diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c
index 7c84be7d15..bd53855eab 100644
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -65,6 +65,7 @@
 #include "port/pg_iovec.h"
 #include "postmaster/interrupt.h"
 #include "replication/walsender.h"
+#include "storage/ipc.h"
 #include "utils/timeout.h"
 
 #include "bitmap.h"
@@ -412,6 +413,47 @@ compact_prefetch_buffers(void)
 	return false;
 }
 
+/*
+ * Check that prefetch response matches the slot
+ */
+static void
+check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
+{
+	if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse)
+	{
+		neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld",
+					   resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused);
+	}
+	if (neon_protocol_version >= 3)
+	{
+		NRelFileInfo rinfo = BufTagGetNRelFileInfo(slot->buftag);
+		if (resp->tag == T_NeonGetPageResponse)
+		{
+			NeonGetPageResponse * getpage_resp = (NeonGetPageResponse *)resp;
+			if (resp->reqid != slot->reqid ||
+				resp->lsn != slot->request_lsns.request_lsn ||
+				resp->not_modified_since != slot->request_lsns.not_modified_since ||
+				!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
+				getpage_resp->req.forknum != slot->buftag.forkNum ||
+				getpage_resp->req.blkno != slot->buftag.blockNum)
+			{
+				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
+											"Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
+											resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
+											slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum);
+			}
+		}
+		else if (resp->reqid != slot->reqid ||
+				 resp->lsn != slot->request_lsns.request_lsn ||
+				 resp->not_modified_since != slot->request_lsns.not_modified_since)
+		{
+			elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+				 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
+				 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
+		}
+	}
+}
+
 /*
  * If there might be responses still in the TCP buffer, then we should try to
  * use those, to reduce any TCP backpressure on the OS/PS side.
@@ -446,15 +488,18 @@ communicator_prefetch_pump_state(void)
 		if (response == NULL)
 			break;
 
+		check_getpage_response(slot, response);
+
 		/* The slot should still be valid */
 		if (slot->status != PRFS_REQUESTED ||
 			slot->response != NULL ||
 			slot->my_ring_index != MyPState->ring_receive)
-			neon_shard_log(slot->shard_no, ERROR,
+		{
+			neon_shard_log(slot->shard_no, PANIC,
 						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
 						   slot->status, slot->response,
 						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
-
+		}
 		/* update prefetch state */
 		MyPState->n_responses_buffered += 1;
 		MyPState->n_requests_inflight -= 1;
@@ -593,6 +638,21 @@ readahead_buffer_resize(int newsize, void *extra)
 }
 
 
+/*
+ * Callback to be called on backend exit to ensure correct state of compute-PS communication
+ * in case of backend cancel
+ */
+static void
+prefetch_on_exit(int code, Datum arg)
+{
+	if (code != 0) /* do disconnect only on abnormal backend termination */
+	{
+		shardno_t shard_no = DatumGetInt32(arg);
+		prefetch_on_ps_disconnect();
+		page_server->disconnect(shard_no);
+	}
+}
+
 
 /*
  * Make sure that there are no responses still in the buffer.
@@ -605,6 +665,11 @@ consume_prefetch_responses(void)
 {
 	if (MyPState->ring_receive < MyPState->ring_unused)
 		prefetch_wait_for(MyPState->ring_unused - 1);
+	/*
+	 * We know for sure we're not working on any prefetch pages after
+	 * this.
+	 */
+	END_PREFETCH_RECEIVE_WORK();
 }
 
 static void
@@ -722,10 +787,12 @@ prefetch_read(PrefetchRequest *slot)
 	if (slot->status != PRFS_REQUESTED ||
 		slot->response != NULL ||
 		slot->my_ring_index != MyPState->ring_receive)
-		neon_shard_log(slot->shard_no, ERROR,
+	{
+		neon_shard_log(slot->shard_no, PANIC,
 					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
 					   slot->status, slot->response,
 					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+	}
 
 	/*
 	 * Copy the request info so that if an error happens and the prefetch
@@ -741,14 +808,18 @@ prefetch_read(PrefetchRequest *slot)
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
+		check_getpage_response(slot, response);
+
 		/* The slot should still be valid */
 		if (slot->status != PRFS_REQUESTED ||
 			slot->response != NULL ||
 			slot->my_ring_index != MyPState->ring_receive)
-			neon_shard_log(shard_no, ERROR,
+		{
+			neon_shard_log(shard_no, PANIC,
 						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
 						   slot->status, slot->response,
 						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+		}
 
 		/* update prefetch state */
 		MyPState->n_responses_buffered += 1;
@@ -820,11 +891,10 @@ communicator_prefetch_receive(BufferTag tag)
 void
 prefetch_on_ps_disconnect(void)
 {
-	bool save_readpage_reentrant_guard = readpage_reentrant_guard;
 	MyPState->ring_flush = MyPState->ring_unused;
 
-	/* Prohibit callig of prefetch_pump_state */
-	START_PREFETCH_RECEIVE_WORK();
+	/* Nothing should cancel disconnect: we should not leave connection in opaque state */
+	HOLD_INTERRUPTS();
 
 	while (MyPState->ring_receive < MyPState->ring_unused)
 	{
@@ -854,9 +924,6 @@ prefetch_on_ps_disconnect(void)
 		MyNeonCounters->getpage_prefetch_discards_total += 1;
 	}
 
-	/* Restore guard */
-	readpage_reentrant_guard = save_readpage_reentrant_guard;
-
 	/*
 	 * We can have gone into retry due to network error, so update stats with
 	 * the latest available
@@ -865,6 +932,8 @@ prefetch_on_ps_disconnect(void)
 		MyPState->n_requests_inflight;
 	MyNeonCounters->getpage_prefetches_buffered =
 		MyPState->n_responses_buffered;
+
+	RESUME_INTERRUPTS();
 }
 
 /*
@@ -1027,16 +1096,11 @@ communicator_prefetch_lookupv(NRelFileInfo rinfo, ForkNumber forknum, BlockNumbe
 			/*
 			 * Ignore errors
 			 */
-			if (slot->response->tag != T_NeonGetPageResponse)
+			if (slot->response->tag == T_NeonErrorResponse)
 			{
-				if (slot->response->tag != T_NeonErrorResponse)
-				{
-					NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Expected GetPage (0x%02x) or Error (0x%02x) response to GetPageRequest, but got 0x%02x",
-											T_NeonGetPageResponse, T_NeonErrorResponse, slot->response->tag);
-				}
 				continue;
 			}
+			Assert(slot->response->tag == T_NeonGetPageResponse); /* checked by check_getpage_response when response was assigned to the slot */
 			memcpy(buffers[i], ((NeonGetPageResponse*)slot->response)->page, BLCKSZ);
 
 
@@ -1351,7 +1415,7 @@ equal_requests(NeonRequest* a, NeonRequest* b)
 static NeonResponse *
 page_server_request(void const *req)
 {
-	NeonResponse *resp;
+	NeonResponse *resp = NULL;
 	BufferTag tag = {0};
 	shardno_t shard_no;
 
@@ -1371,7 +1435,7 @@ page_server_request(void const *req)
 			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
 			break;
 		default:
-			neon_log(ERROR, "Unexpected request tag: %d", messageTag(req));
+			neon_log(PANIC, "Unexpected request tag: %d", messageTag(req));
 	}
 	shard_no = get_shard_number(&tag);
 
@@ -1384,9 +1448,12 @@ page_server_request(void const *req)
 		shard_no = 0;
 	}
 
-	do
+	consume_prefetch_responses();
+
+	PG_TRY();
 	{
-		PG_TRY();
+		before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
+		do
 		{
 			while (!page_server->send(shard_no, (NeonRequest *) req)
 				   || !page_server->flush(shard_no))
@@ -1394,30 +1461,24 @@ page_server_request(void const *req)
 				/* do nothing */
 			}
 			MyNeonCounters->pageserver_open_requests++;
-			consume_prefetch_responses();
 			resp = page_server->receive(shard_no);
 			MyNeonCounters->pageserver_open_requests--;
-		}
-		PG_CATCH();
-		{
-			/*
-			 * Cancellation in this code needs to be handled better at some
-			 * point, but this currently seems fine for now.
-			 */
-			page_server->disconnect(shard_no);
-			MyNeonCounters->pageserver_open_requests = 0;
+		} while (resp == NULL);
+		cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
+	}
+	PG_CATCH();
+	{
+		cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
+		/* Nothing should cancel disconnect: we should not leave connection in opaque state */
+		HOLD_INTERRUPTS();
+		page_server->disconnect(shard_no);
+		MyNeonCounters->pageserver_open_requests = 0;
+		RESUME_INTERRUPTS();
 
-			/*
-			 * We know for sure we're not working on any prefetch pages after
-			 * this.
-			 */
-			END_PREFETCH_RECEIVE_WORK();
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
 
-			PG_RE_THROW();
-		}
-		PG_END_TRY();
-
-	} while (resp == NULL);
 
 	return resp;
 }
@@ -1502,7 +1563,7 @@ nm_pack_request(NeonRequest *msg)
 		case T_NeonDbSizeResponse:
 		case T_NeonGetSlruSegmentResponse:
 		default:
-			neon_log(ERROR, "unexpected neon message tag 0x%02x", msg->tag);
+			neon_log(PANIC, "unexpected neon message tag 0x%02x", msg->tag);
 			break;
 	}
 	return s;
@@ -1654,7 +1715,7 @@ nm_unpack_response(StringInfo s)
 		case T_NeonDbSizeRequest:
 		case T_NeonGetSlruSegmentRequest:
 		default:
-			neon_log(ERROR, "unexpected neon message tag 0x%02x", tag);
+			neon_log(PANIC, "unexpected neon message tag 0x%02x", tag);
 			break;
 	}
 
@@ -1983,7 +2044,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
 						!RelFileInfoEquals(exists_resp->req.rinfo, request.rinfo) ||
 						exists_resp->req.forknum != request.forknum)
 					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+						NEON_PANIC_CONNECTION_STATE(0, PANIC,
 													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
@@ -2014,7 +2075,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
 				break;
 
 			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+				NEON_PANIC_CONNECTION_STATE(0, PANIC,
 											"Expected Exists (0x%02x) or Error (0x%02x) response to ExistsRequest, but got 0x%02x",
 											T_NeonExistsResponse, T_NeonErrorResponse, resp->tag);
 		}
@@ -2158,6 +2219,7 @@ Retry:
 		Assert(memcmp(&hashkey.buftag, &slot->buftag, sizeof(BufferTag)) == 0);
 		Assert(hashkey.buftag.blockNum == base_blockno + i);
 
+		/* We already checked that response match request when storing it in slot */
 		resp = slot->response;
 
 		switch (resp->tag)
@@ -2165,21 +2227,6 @@ Retry:
 			case T_NeonGetPageResponse:
 			{
 				NeonGetPageResponse* getpage_resp = (NeonGetPageResponse *) resp;
-				if (neon_protocol_version >= 3)
-				{
-					if (resp->reqid != slot->reqid ||
-						resp->lsn != slot->request_lsns.request_lsn ||
-						resp->not_modified_since != slot->request_lsns.not_modified_since ||
-						!RelFileInfoEquals(getpage_resp->req.rinfo, rinfo) ||
-						getpage_resp->req.forknum != forkNum ||
-						getpage_resp->req.blkno != base_blockno + i)
-					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
-													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
-													slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), forkNum, base_blockno + i);
-					}
-				}
 				memcpy(buffer, getpage_resp->page, BLCKSZ);
 
 				/*
@@ -2192,17 +2239,6 @@ Retry:
 				break;
 			}
 			case T_NeonErrorResponse:
-				if (neon_protocol_version >= 3)
-				{
-					if (resp->reqid != slot->reqid ||
-						resp->lsn != slot->request_lsns.request_lsn ||
-						resp->not_modified_since != slot->request_lsns.not_modified_since)
-					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
-							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
-							 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
-					}
-				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
 						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
@@ -2257,7 +2293,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
 						!RelFileInfoEquals(relsize_resp->req.rinfo, request.rinfo) ||
 						relsize_resp->req.forknum != forknum)
 					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+						NEON_PANIC_CONNECTION_STATE(0, PANIC,
 													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
@@ -2288,7 +2324,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
 				break;
 
 			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+				NEON_PANIC_CONNECTION_STATE(0, PANIC,
 											"Expected Nblocks (0x%02x) or Error (0x%02x) response to NblocksRequest, but got 0x%02x",
 											T_NeonNblocksResponse, T_NeonErrorResponse, resp->tag);
 		}
@@ -2327,7 +2363,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
 					if (!equal_requests(resp, &request.hdr) ||
 						dbsize_resp->req.dbNode != dbNode)
 					{
-						NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+						NEON_PANIC_CONNECTION_STATE(0, PANIC,
 													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
@@ -2356,7 +2392,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
 				break;
 
 			default:
-				NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+				NEON_PANIC_CONNECTION_STATE(0, PANIC,
 											"Expected DbSize (0x%02x) or Error (0x%02x) response to DbSizeRequest, but got 0x%02x",
 											T_NeonDbSizeResponse, T_NeonErrorResponse, resp->tag);
 		}
@@ -2372,7 +2408,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 {
 	int			n_blocks;
 	shardno_t	shard_no = 0; /* All SLRUs are at shard 0 */
-	NeonResponse *resp;
+	NeonResponse *resp = NULL;
 	NeonGetSlruSegmentRequest request;
 
 	request = (NeonGetSlruSegmentRequest) {
@@ -2383,14 +2419,29 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 		.segno = segno
 	};
 
-	do
+	consume_prefetch_responses();
+
+	PG_TRY();
 	{
-		while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
+		before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
+		do
+		{
+			while (!page_server->send(shard_no, &request.hdr) || !page_server->flush(shard_no));
+			resp = page_server->receive(shard_no);
+		} while (resp == NULL);
+		cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
+	}
+	PG_CATCH();
+	{
+		cancel_before_shmem_exit(prefetch_on_exit, Int32GetDatum(shard_no));
+		/* Nothing should cancel disconnect: we should not leave connection in opaque state */
+		HOLD_INTERRUPTS();
+		page_server->disconnect(shard_no);
+		RESUME_INTERRUPTS();
 
-		consume_prefetch_responses();
-
-		resp = page_server->receive(shard_no);
-	} while (resp == NULL);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
 
 	switch (resp->tag)
 	{
@@ -2403,7 +2454,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 					slru_resp->req.kind != kind ||
 					slru_resp->req.segno != segno)
 				{
-					NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+					NEON_PANIC_CONNECTION_STATE(0, PANIC,
 												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
 												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
 												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
@@ -2435,7 +2486,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 			break;
 
 		default:
-			NEON_PANIC_CONNECTION_STATE(-1, PANIC,
+			NEON_PANIC_CONNECTION_STATE(0, PANIC,
 										"Expected GetSlruSegment (0x%02x) or Error (0x%02x) response to GetSlruSegmentRequest, but got 0x%02x",
 										T_NeonGetSlruSegmentResponse, T_NeonErrorResponse, resp->tag);
 	}

From e7d18bc1884ae7f5448fff99bd02f6d9390d5566 Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Wed, 9 Jul 2025 13:55:10 +0100
Subject: [PATCH 037/163] Replica promotion in compute_ctl (#12183)

Add `/promote` method for `compute_ctl` promoting secondary replica to
primary,
depends on secondary being prewarmed.
Add `compute-ctl` mode to `test_replica_promotes`, testing happy path
only (no corner cases yet)
Add openapi spec for `/promote` and `/lfc` handlers

https://github.com/neondatabase/cloud/issues/19011
Resolves: https://github.com/neondatabase/cloud/issues/29807
---
 Cargo.lock                                   |   1 +
 compute_tools/Cargo.toml                     |   2 +-
 compute_tools/src/compute.rs                 |   7 +-
 compute_tools/src/compute_promote.rs         | 132 +++++++++++++++
 compute_tools/src/http/openapi_spec.yaml     | 145 +++++++++++++++++
 compute_tools/src/http/routes/mod.rs         |   1 +
 compute_tools/src/http/routes/promote.rs     |  14 ++
 compute_tools/src/http/server.rs             |   3 +-
 compute_tools/src/lib.rs                     |   1 +
 libs/compute_api/src/responses.rs            |  30 +++-
 test_runner/fixtures/endpoint/http.py        |  15 +-
 test_runner/regress/test_replica_promotes.py | 162 ++++++++++++-------
 12 files changed, 448 insertions(+), 65 deletions(-)
 create mode 100644 compute_tools/src/compute_promote.rs
 create mode 100644 compute_tools/src/http/routes/promote.rs

diff --git a/Cargo.lock b/Cargo.lock
index 558e8e2295..c49a2daba7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1348,6 +1348,7 @@ dependencies = [
  "p256 0.13.2",
  "pageserver_page_api",
  "postgres",
+ "postgres-types",
  "postgres_initdb",
  "postgres_versioninfo",
  "regex",
diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml
index 1a03022d89..910bae3bda 100644
--- a/compute_tools/Cargo.toml
+++ b/compute_tools/Cargo.toml
@@ -66,7 +66,7 @@ url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 x509-cert.workspace = true
-
+postgres-types.workspace = true
 postgres_versioninfo.workspace = true
 postgres_initdb.workspace = true
 compute_api.workspace = true
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index ec6e6c1634..0496d38e67 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -3,7 +3,7 @@ use chrono::{DateTime, Utc};
 use compute_api::privilege::Privilege;
 use compute_api::responses::{
     ComputeConfig, ComputeCtlConfig, ComputeMetrics, ComputeStatus, LfcOffloadState,
-    LfcPrewarmState, TlsConfig,
+    LfcPrewarmState, PromoteState, TlsConfig,
 };
 use compute_api::spec::{
     ComputeAudit, ComputeFeature, ComputeMode, ComputeSpec, ExtVersion, PageserverProtocol, PgIdent,
@@ -29,8 +29,7 @@ use std::sync::atomic::{AtomicU32, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex, RwLock};
 use std::time::{Duration, Instant};
 use std::{env, fs};
-use tokio::task::JoinHandle;
-use tokio::{spawn, time};
+use tokio::{spawn, sync::watch, task::JoinHandle, time};
 use tracing::{Instrument, debug, error, info, instrument, warn};
 use url::Url;
 use utils::id::{TenantId, TimelineId};
@@ -175,6 +174,7 @@ pub struct ComputeState {
     /// WAL flush LSN that is set after terminating Postgres and syncing safekeepers if
     /// mode == ComputeMode::Primary. None otherwise
     pub terminate_flush_lsn: Option<Lsn>,
+    pub promote_state: Option<watch::Receiver<PromoteState>>,
 
     pub metrics: ComputeMetrics,
 }
@@ -192,6 +192,7 @@ impl ComputeState {
             lfc_prewarm_state: LfcPrewarmState::default(),
             lfc_offload_state: LfcOffloadState::default(),
             terminate_flush_lsn: None,
+            promote_state: None,
         }
     }
 
diff --git a/compute_tools/src/compute_promote.rs b/compute_tools/src/compute_promote.rs
new file mode 100644
index 0000000000..42256faa22
--- /dev/null
+++ b/compute_tools/src/compute_promote.rs
@@ -0,0 +1,132 @@
+use crate::compute::ComputeNode;
+use anyhow::{Context, Result, bail};
+use compute_api::{
+    responses::{LfcPrewarmState, PromoteState, SafekeepersLsn},
+    spec::ComputeMode,
+};
+use std::{sync::Arc, time::Duration};
+use tokio::time::sleep;
+use utils::lsn::Lsn;
+
+impl ComputeNode {
+    /// Returns only when promote fails or succeeds. If a network error occurs
+    /// and http client disconnects, this does not stop promotion, and subsequent
+    /// calls block until promote finishes.
+    /// Called by control plane on secondary after primary endpoint is terminated
+    pub async fn promote(self: &Arc<Self>, safekeepers_lsn: SafekeepersLsn) -> PromoteState {
+        let cloned = self.clone();
+        let start_promotion = || {
+            let (tx, rx) = tokio::sync::watch::channel(PromoteState::NotPromoted);
+            tokio::spawn(async move {
+                tx.send(match cloned.promote_impl(safekeepers_lsn).await {
+                    Ok(_) => PromoteState::Completed,
+                    Err(err) => {
+                        tracing::error!(%err, "promoting");
+                        PromoteState::Failed {
+                            error: err.to_string(),
+                        }
+                    }
+                })
+            });
+            rx
+        };
+
+        let mut task;
+        // self.state is unlocked after block ends so we lock it in promote_impl
+        // and task.changed() is reached
+        {
+            task = self
+                .state
+                .lock()
+                .unwrap()
+                .promote_state
+                .get_or_insert_with(start_promotion)
+                .clone()
+        }
+        task.changed().await.expect("promote sender dropped");
+        task.borrow().clone()
+    }
+
+    // Why do we have to supply safekeepers?
+    // For secondary we use primary_connection_conninfo so safekeepers field is empty
+    async fn promote_impl(&self, safekeepers_lsn: SafekeepersLsn) -> Result<()> {
+        {
+            let state = self.state.lock().unwrap();
+            let mode = &state.pspec.as_ref().unwrap().spec.mode;
+            if *mode != ComputeMode::Replica {
+                bail!("{} is not replica", mode.to_type_str());
+            }
+
+            // we don't need to query Postgres so not self.lfc_prewarm_state()
+            match &state.lfc_prewarm_state {
+                LfcPrewarmState::NotPrewarmed | LfcPrewarmState::Prewarming => {
+                    bail!("prewarm not requested or pending")
+                }
+                LfcPrewarmState::Failed { error } => {
+                    tracing::warn!(%error, "replica prewarm failed")
+                }
+                _ => {}
+            }
+        }
+
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
+
+        let primary_lsn = safekeepers_lsn.wal_flush_lsn;
+        let mut last_wal_replay_lsn: Lsn = Lsn::INVALID;
+        const RETRIES: i32 = 20;
+        for i in 0..=RETRIES {
+            let row = client
+                .query_one("SELECT pg_last_wal_replay_lsn()", &[])
+                .await
+                .context("getting last replay lsn")?;
+            let lsn: u64 = row.get::<usize, postgres_types::PgLsn>(0).into();
+            last_wal_replay_lsn = lsn.into();
+            if last_wal_replay_lsn >= primary_lsn {
+                break;
+            }
+            tracing::info!("Try {i}, replica lsn {last_wal_replay_lsn}, primary lsn {primary_lsn}");
+            sleep(Duration::from_secs(1)).await;
+        }
+        if last_wal_replay_lsn < primary_lsn {
+            bail!("didn't catch up with primary in {RETRIES} retries");
+        }
+
+        // using $1 doesn't work with ALTER SYSTEM SET
+        let safekeepers_sql = format!(
+            "ALTER SYSTEM SET neon.safekeepers='{}'",
+            safekeepers_lsn.safekeepers
+        );
+        client
+            .query(&safekeepers_sql, &[])
+            .await
+            .context("setting safekeepers")?;
+        client
+            .query("SELECT pg_reload_conf()", &[])
+            .await
+            .context("reloading postgres config")?;
+        let row = client
+            .query_one("SELECT * FROM pg_promote()", &[])
+            .await
+            .context("pg_promote")?;
+        if !row.get::<usize, bool>(0) {
+            bail!("pg_promote() returned false");
+        }
+
+        let client = ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
+            .await
+            .context("connecting to postgres")?;
+        let row = client
+            .query_one("SHOW transaction_read_only", &[])
+            .await
+            .context("getting transaction_read_only")?;
+        if row.get::<usize, &str>(0) == "on" {
+            bail!("replica in read only mode after promotion");
+        }
+
+        let mut state = self.state.lock().unwrap();
+        state.pspec.as_mut().unwrap().spec.mode = ComputeMode::Primary;
+        Ok(())
+    }
+}
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index bbdb7d0917..eaf33d1f82 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -83,6 +83,87 @@ paths:
               schema:
                 $ref: "#/components/schemas/DbsAndRoles"
 
+  /promote:
+    post:
+      tags:
+        - Promotion
+      summary: Promote secondary replica to primary
+      description: ""
+      operationId: promoteReplica
+      requestBody:
+        description: Promote requests data
+        required: true
+        content:
+          application/json:
+            schema:
+                $ref: "#/components/schemas/SafekeepersLsn"
+      responses:
+        200:
+          description: Promote succeeded or wasn't started
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PromoteState"
+        500:
+          description: Promote failed
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/PromoteState"
+
+  /lfc/prewarm:
+    post:
+      summary: Request LFC Prewarm
+      parameters:
+        - name: from_endpoint
+          in: query
+          schema:
+            type: string
+      description: ""
+      operationId: lfcPrewarm
+      responses:
+        202:
+          description: LFC prewarm started
+        429:
+          description: LFC prewarm ongoing
+    get:
+      tags:
+        - Prewarm
+      summary: Get LFC prewarm state
+      description: ""
+      operationId: getLfcPrewarmState
+      responses:
+        200:
+          description: Prewarm state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LfcPrewarmState"
+
+  /lfc/offload:
+    post:
+      summary: Request LFC offload
+      description: ""
+      operationId: lfcOffload
+      responses:
+        202:
+          description: LFC offload started
+        429:
+          description: LFC offload ongoing
+    get:
+      tags:
+        - Prewarm
+      summary: Get LFC offloading state
+      description: ""
+      operationId: getLfcOffloadState
+      responses:
+        200:
+          description: Offload state
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/LfcOffloadState"
+
   /database_schema:
     get:
       tags:
@@ -497,6 +578,70 @@ components:
           type: string
           example: "1.0.0"
 
+    SafekeepersLsn:
+      type: object
+      required:
+        - safekeepers
+        - wal_flush_lsn
+      properties:
+        safekeepers:
+          description: Primary replica safekeepers
+          type: string
+        wal_flush_lsn:
+          description: Primary last WAL flush LSN
+          type: string
+
+    LfcPrewarmState:
+      type: object
+      required:
+        - status
+        - total
+        - prewarmed
+        - skipped
+      properties:
+        status:
+          description: Lfc prewarm status
+          enum: [not_prewarmed, prewarming, completed, failed]
+          type: string
+        error:
+          description: Lfc prewarm error, if any
+          type: string
+        total:
+          description: Total pages processed
+          type: integer
+        prewarmed:
+          description: Total pages prewarmed
+          type: integer
+        skipped:
+          description: Pages processed but not prewarmed
+          type: integer
+
+    LfcOffloadState:
+      type: object
+      required:
+        - status
+      properties:
+        status:
+          description: Lfc offload status
+          enum: [not_offloaded, offloading, completed, failed]
+          type: string
+        error:
+          description: Lfc offload error, if any
+          type: string
+
+    PromoteState:
+      type: object
+      required:
+        - status
+      properties:
+        status:
+          description: Promote result
+          enum: [not_promoted, completed, failed]
+          type: string
+        error:
+          description: Promote error, if any
+          type: string
+
     InstalledExtensions:
       type: object
       properties:
diff --git a/compute_tools/src/http/routes/mod.rs b/compute_tools/src/http/routes/mod.rs
index 432e66a830..dd71f663eb 100644
--- a/compute_tools/src/http/routes/mod.rs
+++ b/compute_tools/src/http/routes/mod.rs
@@ -14,6 +14,7 @@ pub(in crate::http) mod insights;
 pub(in crate::http) mod lfc;
 pub(in crate::http) mod metrics;
 pub(in crate::http) mod metrics_json;
+pub(in crate::http) mod promote;
 pub(in crate::http) mod status;
 pub(in crate::http) mod terminate;
 
diff --git a/compute_tools/src/http/routes/promote.rs b/compute_tools/src/http/routes/promote.rs
new file mode 100644
index 0000000000..bc5f93b4da
--- /dev/null
+++ b/compute_tools/src/http/routes/promote.rs
@@ -0,0 +1,14 @@
+use crate::http::JsonResponse;
+use axum::Form;
+use http::StatusCode;
+
+pub(in crate::http) async fn promote(
+    compute: axum::extract::State<std::sync::Arc<crate::compute::ComputeNode>>,
+    Form(safekeepers_lsn): Form<compute_api::responses::SafekeepersLsn>,
+) -> axum::response::Response {
+    let state = compute.promote(safekeepers_lsn).await;
+    if let compute_api::responses::PromoteState::Failed { error } = state {
+        return JsonResponse::error(StatusCode::INTERNAL_SERVER_ERROR, error);
+    }
+    JsonResponse::success(StatusCode::OK, state)
+}
diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs
index d5d2427971..17939e39d4 100644
--- a/compute_tools/src/http/server.rs
+++ b/compute_tools/src/http/server.rs
@@ -23,7 +23,7 @@ use super::{
     middleware::authorize::Authorize,
     routes::{
         check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions,
-        grants, insights, lfc, metrics, metrics_json, status, terminate,
+        grants, insights, lfc, metrics, metrics_json, promote, status, terminate,
     },
 };
 use crate::compute::ComputeNode;
@@ -87,6 +87,7 @@ impl From<&Server> for Router<Arc<ComputeNode>> {
                 let authenticated_router = Router::<Arc<ComputeNode>>::new()
                     .route("/lfc/prewarm", get(lfc::prewarm_state).post(lfc::prewarm))
                     .route("/lfc/offload", get(lfc::offload_state).post(lfc::offload))
+                    .route("/promote", post(promote::promote))
                     .route("/check_writability", post(check_writability::is_writable))
                     .route("/configure", post(configure::configure))
                     .route("/database_schema", get(database_schema::get_schema_dump))
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index 3899a1ca76..2d5d4565b7 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -12,6 +12,7 @@ pub mod logger;
 pub mod catalog;
 pub mod compute;
 pub mod compute_prewarm;
+pub mod compute_promote;
 pub mod disk_quota;
 pub mod extension_server;
 pub mod installed_extensions;
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index a54411b06a..e10c381fb4 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,7 +46,7 @@ pub struct ExtensionInstallResponse {
     pub version: ExtVersion,
 }
 
-#[derive(Serialize, Default, Debug, Clone)]
+#[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
     #[default]
@@ -58,6 +58,17 @@ pub enum LfcPrewarmState {
     },
 }
 
+impl Display for LfcPrewarmState {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
+            LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
+            LfcPrewarmState::Completed => f.write_str("Completed"),
+            LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
+        }
+    }
+}
+
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcOffloadState {
@@ -70,6 +81,23 @@ pub enum LfcOffloadState {
     },
 }
 
+#[derive(Serialize, Debug, Clone, PartialEq)]
+#[serde(tag = "status", rename_all = "snake_case")]
+/// Response of /promote
+pub enum PromoteState {
+    NotPromoted,
+    Completed,
+    Failed { error: String },
+}
+
+#[derive(Deserialize, Serialize, Default, Debug, Clone)]
+#[serde(rename_all = "snake_case")]
+/// Result of /safekeepers_lsn
+pub struct SafekeepersLsn {
+    pub safekeepers: String,
+    pub wal_flush_lsn: utils::lsn::Lsn,
+}
+
 /// Response of the /status API
 #[derive(Serialize, Debug, Deserialize)]
 #[serde(rename_all = "snake_case")]
diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py
index 294c52321b..1d278095ce 100644
--- a/test_runner/fixtures/endpoint/http.py
+++ b/test_runner/fixtures/endpoint/http.py
@@ -2,11 +2,12 @@ from __future__ import annotations
 
 import urllib.parse
 from enum import StrEnum
-from typing import TYPE_CHECKING, final
+from typing import TYPE_CHECKING, Any, final
 
 import requests
 from requests.adapters import HTTPAdapter
 from requests.auth import AuthBase
+from requests.exceptions import ReadTimeout
 from typing_extensions import override
 
 from fixtures.log_helper import log
@@ -102,6 +103,18 @@ class EndpointHttpClient(requests.Session):
 
         wait_until(offloaded)
 
+    def promote(self, safekeepers_lsn: dict[str, Any], disconnect: bool = False):
+        url = f"http://localhost:{self.external_port}/promote"
+        if disconnect:
+            try:  # send first request to start promote and disconnect
+                self.post(url, data=safekeepers_lsn, timeout=0.001)
+            except ReadTimeout:
+                pass  # wait on second request which returns on promotion finish
+        res = self.post(url, data=safekeepers_lsn)
+        res.raise_for_status()
+        json: dict[str, str] = res.json()
+        return json
+
     def database_schema(self, database: str):
         res = self.get(
             f"http://localhost:{self.external_port}/database_schema?database={urllib.parse.quote(database, safe='')}",
diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py
index 4486901bae..1f26269f40 100644
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -1,29 +1,51 @@
 """
-File with secondary->primary promotion testing.
-
-This far, only contains a test that we don't break and that the data is persisted.
+Secondary -> primary promotion testing
 """
 
+from enum import StrEnum
 from typing import cast
 
 import psycopg2
+import pytest
 from fixtures.common_types import Lsn
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import Endpoint, NeonEnv, wait_replica_caughtup
-from fixtures.pg_version import PgVersion
+from fixtures.utils import USE_LFC
+from psycopg2.extensions import cursor as Cursor
 from pytest import raises
 
 
 def stop_and_check_lsn(ep: Endpoint, expected_lsn: Lsn | None):
     ep.stop(mode="immediate-terminate")
     lsn = ep.terminate_flush_lsn
-    if expected_lsn is not None:
+    assert (lsn is not None) == (expected_lsn is not None), f"{lsn=}, {expected_lsn=}"
+    if lsn is not None:
         assert lsn >= expected_lsn, f"{expected_lsn=} < {lsn=}"
-    else:
-        assert lsn == expected_lsn, f"{expected_lsn=} != {lsn=}"
 
 
-def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
+def get_lsn_triple(cur: Cursor) -> tuple[str, str, str]:
+    cur.execute(
+        """
+        SELECT pg_current_wal_insert_lsn(),
+               pg_current_wal_lsn(),
+               pg_current_wal_flush_lsn()
+        """
+    )
+    return cast("tuple[str, str, str]", cur.fetchone())
+
+
+class PromoteMethod(StrEnum):
+    COMPUTE_CTL = "compute-ctl"
+    POSTGRES = "postgres"
+
+
+METHOD_OPTIONS = [e for e in PromoteMethod]
+METHOD_IDS = [e.value for e in PromoteMethod]
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+@pytest.mark.parametrize("method", METHOD_OPTIONS, ids=METHOD_IDS)
+def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
     """
     Test that a replica safely promotes, and can commit data updates which
     show up when the primary boots up after the promoted secondary endpoint
@@ -38,29 +60,26 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
 
     with primary.connect() as primary_conn:
         primary_cur = primary_conn.cursor()
+        primary_cur.execute("create extension neon")
         primary_cur.execute(
             "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
         )
         primary_cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
-        primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        lsn_triple = cast("tuple[str, str, str]", primary_cur.fetchone())
+
+        lsn_triple = get_lsn_triple(primary_cur)
         log.info(f"Primary: Current LSN after workload is {lsn_triple}")
         expected_primary_lsn: Lsn = Lsn(lsn_triple[2])
         primary_cur.execute("show neon.safekeepers")
         safekeepers = primary_cur.fetchall()[0][0]
 
-    wait_replica_caughtup(primary, secondary)
+    if method == PromoteMethod.COMPUTE_CTL:
+        primary.http_client().offload_lfc()
+    else:
+        wait_replica_caughtup(primary, secondary)
 
     with secondary.connect() as secondary_conn:
         secondary_cur = secondary_conn.cursor()
         secondary_cur.execute("select count(*) from t")
-
         assert secondary_cur.fetchone() == (100,)
 
         with raises(psycopg2.Error):
@@ -71,28 +90,30 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
         secondary_cur.execute("select count(*) from t")
         assert secondary_cur.fetchone() == (100,)
 
+    primary_endpoint_id = primary.endpoint_id
     stop_and_check_lsn(primary, expected_primary_lsn)
 
     # Reconnect to the secondary to make sure we get a read-write connection
     promo_conn = secondary.connect()
     promo_cur = promo_conn.cursor()
-    promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
-    promo_cur.execute("select pg_reload_conf()")
+    if method == PromoteMethod.COMPUTE_CTL:
+        client = secondary.http_client()
+        client.prewarm_lfc(primary_endpoint_id)
+        # control plane knows safekeepers, simulate it by querying primary
+        assert (lsn := primary.terminate_flush_lsn)
+        safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
+        assert client.promote(safekeepers_lsn)["status"] == "completed"
+    else:
+        promo_cur.execute(f"alter system set neon.safekeepers='{safekeepers}'")
+        promo_cur.execute("select pg_reload_conf()")
+        promo_cur.execute("SELECT * FROM pg_promote()")
+        assert promo_cur.fetchone() == (True,)
 
-    promo_cur.execute("SELECT * FROM pg_promote()")
-    assert promo_cur.fetchone() == (True,)
-    promo_cur.execute(
-        """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-    )
-    log.info(f"Secondary: LSN after promotion is {promo_cur.fetchone()}")
+    lsn_triple = get_lsn_triple(promo_cur)
+    log.info(f"Secondary: LSN after promotion is {lsn_triple}")
 
     # Reconnect to the secondary to make sure we get a read-write connection
-    with secondary.connect() as new_primary_conn:
-        new_primary_cur = new_primary_conn.cursor()
+    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
         new_primary_cur.execute("select count(*) from t")
         assert new_primary_cur.fetchone() == (100,)
 
@@ -101,43 +122,34 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
         )
         assert new_primary_cur.fetchall() == [(it,) for it in range(101, 201)]
 
-        new_primary_cur = new_primary_conn.cursor()
+        new_primary_cur = conn.cursor()
         new_primary_cur.execute("select payload from t")
         assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
 
         new_primary_cur.execute("select count(*) from t")
         assert new_primary_cur.fetchone() == (200,)
-        new_primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        log.info(f"Secondary: LSN after workload is {new_primary_cur.fetchone()}")
 
-    with secondary.connect() as second_viewpoint_conn:
-        new_primary_cur = second_viewpoint_conn.cursor()
+        lsn_triple = get_lsn_triple(new_primary_cur)
+        log.info(f"Secondary: LSN after workload is {lsn_triple}")
+        expected_promoted_lsn = Lsn(lsn_triple[2])
+
+    with secondary.connect() as conn, conn.cursor() as new_primary_cur:
         new_primary_cur.execute("select payload from t")
         assert new_primary_cur.fetchall() == [(it,) for it in range(1, 201)]
 
-    # wait_for_last_flush_lsn(env, secondary, env.initial_tenant, env.initial_timeline)
-
-    # secondaries don't sync safekeepers on finish so LSN will be None
-    stop_and_check_lsn(secondary, None)
+    if method == PromoteMethod.COMPUTE_CTL:
+        # compute_ctl's /promote switches replica type to Primary so it syncs
+        # safekeepers on finish
+        stop_and_check_lsn(secondary, expected_promoted_lsn)
+    else:
+        # on testing postgres, we don't update replica type, secondaries don't
+        # sync so lsn should be None
+        stop_and_check_lsn(secondary, None)
 
     primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary2")
 
-    with primary.connect() as new_primary:
-        new_primary_cur = new_primary.cursor()
-        new_primary_cur.execute(
-            """
-            SELECT pg_current_wal_insert_lsn(),
-                   pg_current_wal_lsn(),
-                   pg_current_wal_flush_lsn()
-            """
-        )
-        lsn_triple = cast("tuple[str, str, str]", new_primary_cur.fetchone())
+    with primary.connect() as new_primary, new_primary.cursor() as new_primary_cur:
+        lsn_triple = get_lsn_triple(new_primary_cur)
         expected_primary_lsn = Lsn(lsn_triple[2])
         log.info(f"New primary: Boot LSN is {lsn_triple}")
 
@@ -146,5 +158,39 @@ def test_replica_promotes(neon_simple_env: NeonEnv, pg_version: PgVersion):
         new_primary_cur.execute("INSERT INTO t (payload) SELECT generate_series(201, 300)")
         new_primary_cur.execute("select count(*) from t")
         assert new_primary_cur.fetchone() == (300,)
-
     stop_and_check_lsn(primary, expected_primary_lsn)
+
+
+@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping")
+def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
+    """
+    Test that if a handler disconnects from /promote route of compute_ctl, promotion still happens
+    once, and no error is thrown
+    """
+    env: NeonEnv = neon_simple_env
+    primary: Endpoint = env.endpoints.create_start(branch_name="main", endpoint_id="primary")
+    secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
+
+    with primary.connect() as conn, conn.cursor() as cur:
+        cur.execute("create extension neon")
+        cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
+        cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
+        cur.execute("show neon.safekeepers")
+        safekeepers = cur.fetchall()[0][0]
+
+    primary.http_client().offload_lfc()
+    primary_endpoint_id = primary.endpoint_id
+    primary.stop(mode="immediate-terminate")
+    assert (lsn := primary.terminate_flush_lsn)
+
+    client = secondary.http_client()
+    client.prewarm_lfc(primary_endpoint_id)
+    safekeepers_lsn = {"safekeepers": safekeepers, "wal_flush_lsn": lsn}
+    assert client.promote(safekeepers_lsn, disconnect=True)["status"] == "completed"
+
+    with secondary.connect() as conn, conn.cursor() as cur:
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (100,)
+        cur.execute("INSERT INTO t (payload) SELECT generate_series(101, 200) RETURNING payload")
+        cur.execute("select count(*) from t")
+        assert cur.fetchone() == (200,)

From 732bd26e7078a5be40f07306e1c59a2f7709a540 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 9 Jul 2025 01:37:24 +0300
Subject: [PATCH 038/163] cargo fmt

---
 .../communicator/src/backend_interface.rs     |  7 ++++--
 pgxn/neon/communicator/src/init.rs            |  5 +---
 .../neon/communicator/src/integrated_cache.rs |  4 +---
 pgxn/neon/communicator/src/neon_request.rs    |  1 -
 .../src/worker_process/main_loop.rs           | 23 ++++++++++++++-----
 5 files changed, 24 insertions(+), 16 deletions(-)

diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs
index 9ed9028b96..fd0081e837 100644
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -161,8 +161,11 @@ pub extern "C" fn bcomm_get_request_slot_status(
             // leave a slot in this state, so if it sees that,
             // something's gone wrong and it's not clear what to do
             // with it.
-            panic!("unexpected Filling state in request slot {}", request_slot_idx);
-        },
+            panic!(
+                "unexpected Filling state in request slot {}",
+                request_slot_idx
+            );
+        }
         NeonIOHandleState::Submitted => true,
         NeonIOHandleState::Processing => true,
         NeonIOHandleState::Completed => true,
diff --git a/pgxn/neon/communicator/src/init.rs b/pgxn/neon/communicator/src/init.rs
index 5f7d593c35..20bb4923e8 100644
--- a/pgxn/neon/communicator/src/init.rs
+++ b/pgxn/neon/communicator/src/init.rs
@@ -46,10 +46,7 @@ impl std::fmt::Debug for CommunicatorInitStruct {
         fmt.debug_struct("CommunicatorInitStruct")
             .field("submission_pipe_read_fd", &self.submission_pipe_read_fd)
             .field("submission_pipe_write_fd", &self.submission_pipe_write_fd)
-            .field(
-                "num_neon_request_slots",
-                &self.num_neon_request_slots,
-            )
+            .field("num_neon_request_slots", &self.num_neon_request_slots)
             .field("neon_request_slots length", &self.neon_request_slots.len())
             .finish()
     }
diff --git a/pgxn/neon/communicator/src/integrated_cache.rs b/pgxn/neon/communicator/src/integrated_cache.rs
index a7009f0eb5..e00e49bf3d 100644
--- a/pgxn/neon/communicator/src/integrated_cache.rs
+++ b/pgxn/neon/communicator/src/integrated_cache.rs
@@ -526,9 +526,7 @@ impl<'t> IntegratedCacheWriteAccess<'t> {
         self.relsize_cache.remove(&RelKey::from(rel));
 
         // update with flush LSN
-        let _ = self
-            .global_lw_lsn
-            .fetch_max(flush_lsn.0, Ordering::Relaxed);
+        let _ = self.global_lw_lsn.fetch_max(flush_lsn.0, Ordering::Relaxed);
 
         // also forget all cached blocks for the relation
         // FIXME
diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs
index 32a02cd8c3..1868147fbf 100644
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -380,7 +380,6 @@ impl CRelUnlinkRequest {
     }
 }
 
-
 #[repr(C)]
 #[derive(Copy, Clone, Debug)]
 pub struct CForgetCacheRequest {
diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index 2eacd13609..aadf9b3a60 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -359,8 +359,14 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                 {
                     Ok(nblocks) => {
                         // update the cache
-                        tracing::info!("updated relsize for {:?} in cache: {}, lsn {}", rel, nblocks, read_lsn);
-                        self.cache.remember_rel_size(&rel, nblocks, not_modified_since);
+                        tracing::info!(
+                            "updated relsize for {:?} in cache: {}, lsn {}",
+                            rel,
+                            nblocks,
+                            read_lsn
+                        );
+                        self.cache
+                            .remember_rel_size(&rel, nblocks, not_modified_since);
 
                         NeonIOResult::RelSize(nblocks)
                     }
@@ -469,8 +475,11 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                 // TODO: We could put the empty pages to the cache. Maybe have
                 // a marker on the block entries for all-zero pages, instead of
                 // actually storing the empty pages.
-                self.cache
-                    .remember_rel_size(&req.reltag(), req.block_number + req.nblocks, Lsn(req.lsn));
+                self.cache.remember_rel_size(
+                    &req.reltag(),
+                    req.block_number + req.nblocks,
+                    Lsn(req.lsn),
+                );
                 NeonIOResult::WriteOK
             }
             NeonIORequest::RelCreate(req) => {
@@ -484,7 +493,8 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                 self.request_rel_truncate_counter.inc();
 
                 // TODO: need to grab an io-in-progress lock for this? I guess not
-                self.cache.remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
+                self.cache
+                    .remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
             NeonIORequest::RelUnlink(req) => {
@@ -496,7 +506,8 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
             }
             NeonIORequest::ForgetCache(req) => {
                 // TODO: need to grab an io-in-progress lock for this? I guess not
-                self.cache.forget_rel(&req.reltag(), Some(req.nblocks), Lsn(req.lsn));
+                self.cache
+                    .forget_rel(&req.reltag(), Some(req.nblocks), Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
         }

From 1ee24602d5f6d0b042b911f9f194b0f84975fd49 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 8 Jul 2025 23:41:05 +0300
Subject: [PATCH 039/163] Implement working set size estimation

---
 pgxn/neon/communicator_new.c | 43 ++++++++++++++++++++++++++++++++-
 pgxn/neon/communicator_new.h |  3 +++
 pgxn/neon/file_cache.c       | 47 +++++++++---------------------------
 pgxn/neon/file_cache.h       |  3 +++
 pgxn/neon/neon.c             | 36 +++++++++++++++++++++++++++
 5 files changed, 95 insertions(+), 37 deletions(-)

diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index bc27942bb1..d361ff7274 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -22,6 +22,7 @@
 #endif
 #include "access/xlog_internal.h"
 #include "access/xlogutils.h"
+#include "common/hashfn.h"
 #include "executor/instrument.h"
 #include "miscadmin.h"
 #include "postmaster/bgworker.h"
@@ -40,6 +41,7 @@
 #include "tcop/tcopprot.h"
 
 #include "communicator_new.h"
+#include "hll.h"
 #include "neon.h"
 #include "neon_perf_counters.h"
 #include "pagestore_client.h"
@@ -98,7 +100,19 @@ typedef struct CommunicatorShmemPerBackendData
 
 typedef struct CommunicatorShmemData
 {
-	int			dummy;
+	/*
+	 * Estimation of working set size.
+	 *
+	 * Note that this is not protected by any locks. That's sloppy, but works
+	 * fine in practice. To "add" a value to the HLL state, we just overwrite
+	 * one of the timestamps. Calculating the estimate reads all the values, but
+	 * it also doesn't depend on seeing a consistent snapshot of the values. We
+	 * could get bogus results if accessing the TimestampTz was not atomic, but
+	 * it on any 64-bit platforms we care about it is, and even if we observed a
+	 * torn read every now and then, it wouldn't affect the overall estimate
+	 * much.
+	 */
+	HyperLogLogState wss_estimation;
 
 	CommunicatorShmemPerBackendData backends[]; /* MaxProcs */
 
@@ -250,6 +264,9 @@ communicator_new_shmem_startup(void)
 	shmem_ptr = (char *) shmem_ptr + communicator_size;
 	shmem_size -= communicator_size;
 
+	/* Initialize hyper-log-log structure for estimating working set size */
+	initSHLL(&communicator_shmem_ptr->wss_estimation);
+
 	for (int i = 0; i < MaxProcs; i++)
 	{
 		InitSharedLatch(&communicator_shmem_ptr->backends[i].io_completion_latch);
@@ -743,6 +760,19 @@ communicator_new_read_at_lsnv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumbe
 		}
 	};
 
+	{
+		BufferTag tag;
+
+		CopyNRelFileInfoToBufTag(tag, rinfo);
+		tag.forkNum = forkNum;
+		for (int i = 0; i < nblocks; i++)
+		{
+			tag.blockNum = blockno;
+			addSHLL(&communicator_shmem_ptr->wss_estimation,
+					hash_bytes((uint8_t *) &tag, sizeof(tag)));
+		}
+	}
+
 	elog(DEBUG5, "getpagev called for rel %u/%u/%u.%u block %u (%u blocks)",
 		 RelFileInfoFmt(rinfo), forkNum, blockno, nblocks);
 
@@ -1357,3 +1387,14 @@ bounce_write_if_needed(void *buffer)
 	memcpy(p, buffer, BLCKSZ);
 	return p;
 }
+
+int32
+communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset)
+{
+	int32		dc;
+
+	dc = (int32) estimateSHLL(&communicator_shmem_ptr->wss_estimation, duration);
+	if (reset)
+		memset(communicator_shmem_ptr->wss_estimation.regs, 0, sizeof(communicator_shmem_ptr->wss_estimation.regs));
+	return dc;
+}
diff --git a/pgxn/neon/communicator_new.h b/pgxn/neon/communicator_new.h
index 5b636b687a..dc38b3059e 100644
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -54,4 +54,7 @@ extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum
 extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
 extern void communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
 
+/* other functions */
+extern int32 communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset);
+
 #endif							/* COMMUNICATOR_NEW_H */
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index e5e2bb9183..847e2ba9f6 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -2159,46 +2159,21 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		SRF_RETURN_DONE(funcctx);
 }
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
 
-Datum
-approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+int32
+lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 {
-	if (neon_enable_new_communicator)
-		elog(ERROR, "TODO: not implemented");
+	int32		dc;
 
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
-		LWLockAcquire(lfc_lock, LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
-}
+	if (lfc_size_limit == 0)
+		return -1;
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size);
-
-Datum
-approximate_working_set_size(PG_FUNCTION_ARGS)
-{
-	if (neon_enable_new_communicator)
-		elog(ERROR, "TODO: not implemented");
-
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		bool reset = PG_GETARG_BOOL(0);
-		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
-		if (reset)
-			memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
+	LWLockAcquire(lfc_lock, LW_SHARED);
+	dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
+	if (reset)
+		memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+	LWLockRelease(lfc_lock);
+	return dc;
 }
 
 PG_FUNCTION_INFO_V1(get_local_cache_state);
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
index 1b6ff36164..a5ffa6ea92 100644
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -52,6 +52,9 @@ extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
 
 PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
 
+extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
+
+
 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		 void *buffer)
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index ab51abc1de..548bdd9bb8 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -583,6 +583,8 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
+PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
+PG_FUNCTION_INFO_V1(approximate_working_set_size);
 
 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -629,6 +631,40 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
 
+Datum
+approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+{
+	time_t		duration;
+	int32		dc;
+
+	duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
+
+	if (neon_enable_new_communicator)
+		dc = communicator_new_approximate_working_set_size_seconds(duration, false);
+	else
+		dc = lfc_approximate_working_set_size_seconds(duration, false);
+	if (dc < 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_INT32(dc);
+}
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+	int32		dc;
+	bool		reset = PG_GETARG_BOOL(0);
+
+	if (neon_enable_new_communicator)
+		dc = communicator_new_approximate_working_set_size_seconds(-1, reset);
+	else
+		dc = lfc_approximate_working_set_size_seconds(-1, reset);
+	if (dc < 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_INT32(dc);
+}
+
 static void
 neon_shmem_request(void)
 {

From 8db138ef64c4a93dadd17c071605b4d0437d935b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 9 Jul 2025 01:28:16 +0300
Subject: [PATCH 040/163] Plumb through the stripe size to the communicator

---
 pageserver/client_grpc/src/client.rs                      | 3 ++-
 pageserver/client_grpc/src/lib.rs                         | 2 +-
 pgxn/neon/communicator/src/worker_process/main_loop.rs    | 6 +++---
 .../communicator/src/worker_process/worker_interface.rs   | 8 ++++++++
 pgxn/neon/communicator_new.c                              | 1 +
 pgxn/neon/libpagestore.c                                  | 8 ++++----
 pgxn/neon/pagestore_client.h                              | 1 +
 7 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 4f3f606935..8b1376ef4a 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -12,11 +12,12 @@ use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool}
 use crate::retry::Retry;
 use crate::split::GetPageSplitter;
 use compute_api::spec::PageserverProtocol;
-use pageserver_api::shard::ShardStripeSize;
 use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 
+pub use pageserver_api::shard::ShardStripeSize;
+
 /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
 /// when full.
 ///
diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs
index 14fb3fbd5a..a9ace8cf98 100644
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -3,4 +3,4 @@ mod pool;
 mod retry;
 mod split;
 
-pub use client::{PageserverClient, ShardSpec};
+pub use client::{PageserverClient, ShardSpec, ShardStripeSize};
diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index aadf9b3a60..2ef82e7746 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -12,7 +12,7 @@ use crate::integrated_cache::{CacheResult, IntegratedCacheWriteAccess};
 use crate::neon_request::{CGetPageVRequest, CPrefetchVRequest};
 use crate::neon_request::{NeonIORequest, NeonIOResult};
 use crate::worker_process::in_progress_ios::{RequestInProgressKey, RequestInProgressTable};
-use pageserver_client_grpc::{PageserverClient, ShardSpec};
+use pageserver_client_grpc::{PageserverClient, ShardSpec, ShardStripeSize};
 use pageserver_page_api as page_api;
 
 use metrics::{IntCounter, IntCounterVec};
@@ -70,6 +70,7 @@ pub(super) async fn init(
     timeline_id: String,
     auth_token: Option<String>,
     shard_map: HashMap<utils::shard::ShardIndex, String>,
+    stripe_size: Option<ShardStripeSize>,
     initial_file_cache_size: u64,
     file_cache_path: Option<PathBuf>,
 ) -> CommunicatorWorkerProcessStruct<'static> {
@@ -91,10 +92,9 @@ pub(super) async fn init(
         .integrated_cache_init_struct
         .worker_process_init(last_lsn, file_cache);
 
-    // TODO: plumb through the stripe size.
     let tenant_id = TenantId::from_str(&tenant_id).expect("invalid tenant ID");
     let timeline_id = TimelineId::from_str(&timeline_id).expect("invalid timeline ID");
-    let shard_spec = ShardSpec::new(shard_map, None).expect("invalid shard spec");
+    let shard_spec = ShardSpec::new(shard_map, stripe_size).expect("invalid shard spec");
     let client = PageserverClient::new(tenant_id, timeline_id, shard_spec, auth_token)
         .expect("could not create client");
 
diff --git a/pgxn/neon/communicator/src/worker_process/worker_interface.rs b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
index 9aaa483c9e..e873555daa 100644
--- a/pgxn/neon/communicator/src/worker_process/worker_interface.rs
+++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
@@ -10,6 +10,8 @@ use crate::init::CommunicatorInitStruct;
 use crate::worker_process::main_loop;
 use crate::worker_process::main_loop::CommunicatorWorkerProcessStruct;
 
+use pageserver_client_grpc::ShardStripeSize;
+
 /// Launch the communicator's tokio tasks, which do most of the work.
 ///
 /// The caller has initialized the process as a regular PostgreSQL
@@ -24,6 +26,7 @@ pub extern "C" fn communicator_worker_process_launch(
     auth_token: *const c_char,
     shard_map: *mut *mut c_char,
     nshards: u32,
+    stripe_size: u32,
     file_cache_path: *const c_char,
     initial_file_cache_size: u64,
 ) -> &'static CommunicatorWorkerProcessStruct<'static> {
@@ -63,6 +66,11 @@ pub extern "C" fn communicator_worker_process_launch(
         timeline_id.to_string(),
         auth_token,
         shard_map,
+        if stripe_size > 0 {
+            Some(ShardStripeSize(stripe_size))
+        } else {
+            None
+        },
         initial_file_cache_size,
         file_cache_path,
     ));
diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index d361ff7274..3f2870621d 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -336,6 +336,7 @@ communicator_new_bgworker_main(Datum main_arg)
 									   neon_auth_token,
 									   connstrs,
 									   num_shards,
+									   neon_stripe_size,
 									   lfc_path,
 									   file_cache_size);
 	cis = NULL;
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index ee17b5d33b..f64e6ee233 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -80,7 +80,7 @@ int         neon_protocol_version = 3;
 
 static int	neon_compute_mode = 0;
 static int	max_reconnect_attempts = 60;
-static int	stripe_size;
+int		neon_stripe_size;
 static int	max_sockets;
 
 static int pageserver_response_log_timeout = 10000;
@@ -454,10 +454,10 @@ get_shard_number(BufferTag *tag)
 
 #if PG_MAJORVERSION_NUM < 16
 	hash = murmurhash32(tag->rnode.relNode);
-	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
 #else
 	hash = murmurhash32(tag->relNumber);
-	hash = hash_combine(hash, murmurhash32(tag->blockNum / stripe_size));
+	hash = hash_combine(hash, murmurhash32(tag->blockNum / neon_stripe_size));
 #endif
 
 	return hash % n_shards;
@@ -1510,7 +1510,7 @@ pg_init_libpagestore(void)
 	DefineCustomIntVariable("neon.stripe_size",
 							"sharding stripe size",
 							NULL,
-							&stripe_size,
+							&neon_stripe_size,
 							32768, 1, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_BLOCKS,
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index eb3c80702e..b3c074c9ee 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -244,6 +244,7 @@ extern char *neon_timeline;
 extern char *neon_tenant;
 extern int32 max_cluster_size;
 extern int  neon_protocol_version;
+extern int	neon_stripe_size;
 
 extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
 extern shardno_t get_shard_number(BufferTag* tag);

From 60d87966b8c81d39568fbb66e1fd9155b1007b76 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 9 Jul 2025 16:39:40 +0300
Subject: [PATCH 041/163] minor comment improvement

---
 .../src/worker_process/main_loop.rs           |  7 +--
 pgxn/neon/pagestore_smgr.c                    | 43 +++++++++++--------
 2 files changed, 28 insertions(+), 22 deletions(-)

diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index 2ef82e7746..04586f302c 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -259,9 +259,10 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
 
         // Is it possible that the last-written LSN is ahead of last flush LSN? Generally not, we
         // shouldn't evict a page from the buffer cache before all its modifications have been
-        // safely flushed. That's the "WAL before data" rule. However, such case does exist at index
-        // building: _bt_blwritepage logs the full page without flushing WAL before smgrextend
-        // (files are fsynced before build ends).
+        // safely flushed. That's the "WAL before data" rule. However, there are a few exceptions:
+        //
+        // - when creation an index: _bt_blwritepage logs the full page without flushing WAL before
+        // smgrextend (files are fsynced before build ends).
         //
         // XXX: If we make a request LSN greater than the current WAL flush LSN, the pageserver would
         // block waiting for the WAL arrive, until we flush it and it propagates through the
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 9ef393b8ff..a3a33e9f4b 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -853,6 +853,30 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 		 RelFileInfoFmt(InfoFromSMgrRel(reln)),
 		 forkNum);
 
+	/*
+	 * Newly created relation is empty, remember that in the relsize cache.
+	 *
+	 * Note that in REDO, this is called to make sure the relation fork
+	 * exists, but it does not truncate the relation. So, we can only update
+	 * the relsize if it didn't exist before.
+	 *
+	 * Also, in redo, we must make sure to update the cached size of the
+	 * relation, as that is the primary source of truth for REDO's file length
+	 * considerations, and as file extension isn't (perfectly) logged, we need
+	 * to take care of that before we hit file size checks.
+	 *
+	 * FIXME: This is currently not just an optimization, but required for
+	 * correctness. Postgres can call smgrnblocks() on the newly-created
+	 * relation. Currently, we don't call SetLastWrittenLSN() when a new
+	 * relation created, so if we didn't remember the size in the relsize
+	 * cache, we might call smgrnblocks() on the newly-created relation before
+	 * the creation WAL record has been received by the page server.
+	 *
+	 * XXX: with the new communicator, similar considerations apply. However,
+	 * during replay, neon_get_write_lsn() returns the (end-)LSN of the record
+	 * that's being replayed, so we should not have the correctness issue
+	 * mentioned in previous paragraph.
+	 */
 	if (neon_enable_new_communicator)
 	{
 		XLogRecPtr	lsn = neon_get_write_lsn();
@@ -867,25 +891,6 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	}
 	else
 	{
-		/*
-		 * Newly created relation is empty, remember that in the relsize cache.
-		 *
-		 * Note that in REDO, this is called to make sure the relation fork
-		 * exists, but it does not truncate the relation. So, we can only update
-		 * the relsize if it didn't exist before.
-		 *
-		 * Also, in redo, we must make sure to update the cached size of the
-		 * relation, as that is the primary source of truth for REDO's file length
-		 * considerations, and as file extension isn't (perfectly) logged, we need
-		 * to take care of that before we hit file size checks.
-		 *
-		 * FIXME: This is currently not just an optimization, but required for
-		 * correctness. Postgres can call smgrnblocks() on the newly-created
-		 * relation. Currently, we don't call SetLastWrittenLSN() when a new
-		 * relation created, so if we didn't remember the size in the relsize
-		 * cache, we might call smgrnblocks() on the newly-created relation before
-		 * the creation WAL record hass been received by the page server.
-		 */
 		if (isRedo)
 		{
 			update_cached_relsize(InfoFromSMgrRel(reln), forkNum, 0);

From 8f3351fa91c5170dbeb42b0fd84fd5ceec7b3ead Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Jul 2025 16:17:22 +0200
Subject: [PATCH 042/163] pageserver/client_grpc: split GetPage batches across
 shards (#12469)

## Problem

The rich gRPC Pageserver client needs to split GetPage batches that
straddle multiple shards.

Touches #11735.
Requires #12462.

## Summary of changes

Adds a `GetPageSplitter` which splits `GetPageRequest` that span
multiple shards, and then reassembles the responses. Dispatches
per-shard requests in parallel.
---
 Cargo.lock                           |   1 +
 pageserver/client_grpc/Cargo.toml    |   1 +
 pageserver/client_grpc/src/client.rs |  90 ++++++++++----
 pageserver/client_grpc/src/lib.rs    |   1 +
 pageserver/client_grpc/src/split.rs  | 172 +++++++++++++++++++++++++++
 5 files changed, 240 insertions(+), 25 deletions(-)
 create mode 100644 pageserver/client_grpc/src/split.rs

diff --git a/Cargo.lock b/Cargo.lock
index c49a2daba7..caed814d5f 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4499,6 +4499,7 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "bytes",
  "compute_api",
  "futures",
  "pageserver_api",
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
index 0a8bcad2ef..84e27abb84 100644
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -6,6 +6,7 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
+bytes.workspace = true
 compute_api.workspace = true
 futures.workspace = true
 pageserver_api.workspace = true
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 5bccdeede3..c21ce2e47d 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -2,13 +2,15 @@ use std::collections::HashMap;
 use std::sync::Arc;
 
 use anyhow::anyhow;
+use futures::stream::FuturesUnordered;
+use futures::{FutureExt as _, StreamExt as _};
 use tracing::instrument;
 
 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
 use crate::retry::Retry;
+use crate::split::GetPageSplitter;
 use compute_api::spec::PageserverProtocol;
-use pageserver_api::key::{Key, rel_block_to_key};
-use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
+use pageserver_api::shard::ShardStripeSize;
 use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::{ShardCount, ShardIndex, ShardNumber};
@@ -78,10 +80,11 @@ impl PageserverClient {
             .await
     }
 
-    /// Fetches a page. The `request_id` must be unique across all in-flight requests.
+    /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
+    /// splits requests that straddle shard boundaries, and assembles the responses.
     ///
-    /// Unlike the `page_api::Client`, this client automatically converts `status_code` into
-    /// `tonic::Status` errors. All responses will have `GetPageStatusCode::Ok`.
+    /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
+    /// errors. All responses will have `GetPageStatusCode::Ok`.
     #[instrument(skip_all, fields(
         req_id = %req.request_id,
         rel = %req.rel,
@@ -93,22 +96,55 @@ impl PageserverClient {
         &self,
         req: page_api::GetPageRequest,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        // TODO: this needs to split batch requests across shards and reassemble responses into a
-        // single response. It must also re-split the batch in case the shard map changes. For now,
-        // just use the first page.
-        let key = rel_block_to_key(
-            req.rel,
-            req.block_numbers
-                .first()
-                .copied()
-                .ok_or_else(|| tonic::Status::invalid_argument("no block numbers provided"))?,
-        );
+        // Make sure we have at least one page.
+        if req.block_numbers.is_empty() {
+            return Err(tonic::Status::invalid_argument("no block number"));
+        }
 
-        self.retry
+        // Fast path: request is for a single shard.
+        if let Some(shard_id) =
+            GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size)
+        {
+            return self.get_page_for_shard(shard_id, req).await;
+        }
+
+        // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
+        // reassemble the responses.
+        //
+        // TODO: when we support shard map updates, we need to detect when it changes and re-split
+        // the request on errors.
+        let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size);
+
+        let mut shard_requests: FuturesUnordered<_> = splitter
+            .drain_requests()
+            .map(|(shard_id, shard_req)| {
+                // NB: each request will retry internally.
+                self.get_page_for_shard(shard_id, shard_req)
+                    .map(move |result| result.map(|resp| (shard_id, resp)))
+            })
+            .collect();
+
+        while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
+            splitter.add_response(shard_id, shard_response)?;
+        }
+
+        splitter.assemble_response()
+    }
+
+    /// Fetches pages that belong to the given shard.
+    #[instrument(skip_all, fields(shard = %shard_id))]
+    async fn get_page_for_shard(
+        &self,
+        shard_id: ShardIndex,
+        req: page_api::GetPageRequest,
+    ) -> tonic::Result<page_api::GetPageResponse> {
+        let resp = self
+            .retry
             .with(async || {
-                let stream = self.shards.get_for_key(key).stream().await;
+                let stream = self.shards.get(shard_id)?.stream().await;
                 let resp = stream.send(req.clone()).await?;
 
+                // Convert per-request errors into a tonic::Status.
                 if resp.status_code != page_api::GetPageStatusCode::Ok {
                     return Err(tonic::Status::new(
                         resp.status_code.into(),
@@ -118,7 +154,18 @@ impl PageserverClient {
 
                 Ok(resp)
             })
-            .await
+            .await?;
+
+        // Make sure we got the right number of pages.
+        // NB: check outside of the retry loop, since we don't want to retry this.
+        let (expected, actual) = (req.block_numbers.len(), resp.page_images.len());
+        if expected != actual {
+            return Err(tonic::Status::internal(format!(
+                "expected {expected} pages for shard {shard_id}, got {actual}",
+            )));
+        }
+
+        Ok(resp)
     }
 
     /// Returns the size of a relation, as # of blocks.
@@ -216,13 +263,6 @@ impl Shards {
             .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
     }
 
-    /// Looks up the shard that owns the given key.
-    fn get_for_key(&self, key: Key) -> &Shard {
-        let shard_number = key_to_shard_number(self.count, self.stripe_size, &key);
-        self.get(ShardIndex::new(shard_number, self.count))
-            .expect("must exist")
-    }
-
     /// Returns shard 0.
     fn get_zero(&self) -> &Shard {
         self.get(ShardIndex::new(ShardNumber(0), self.count))
diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs
index 2a59f9868c..3fc7178be2 100644
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -1,5 +1,6 @@
 mod client;
 mod pool;
 mod retry;
+mod split;
 
 pub use client::PageserverClient;
diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs
new file mode 100644
index 0000000000..5bbcaab393
--- /dev/null
+++ b/pageserver/client_grpc/src/split.rs
@@ -0,0 +1,172 @@
+use std::collections::HashMap;
+
+use bytes::Bytes;
+
+use pageserver_api::key::rel_block_to_key;
+use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
+use pageserver_page_api as page_api;
+use utils::shard::{ShardCount, ShardIndex};
+
+/// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
+/// TODO: add tests for this.
+pub struct GetPageSplitter {
+    /// The original request ID. Used for all shard requests.
+    request_id: page_api::RequestID,
+    /// Split requests by shard index.
+    requests: HashMap<ShardIndex, page_api::GetPageRequest>,
+    /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
+    /// the response pages in the same order as the original request.
+    block_shards: Vec<ShardIndex>,
+    /// Page responses by shard index. Will be assembled into a single response.
+    responses: HashMap<ShardIndex, Vec<Bytes>>,
+}
+
+impl GetPageSplitter {
+    /// Checks if the given request only touches a single shard, and returns the shard ID. This is
+    /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
+    /// The caller must ensure that the request has at least one block number, or this will panic.
+    pub fn is_single_shard(
+        req: &page_api::GetPageRequest,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Option<ShardIndex> {
+        // Fast path: unsharded tenant.
+        if count.is_unsharded() {
+            return Some(ShardIndex::unsharded());
+        }
+
+        // Find the base shard index for the first page, and compare with the rest.
+        let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
+        let shard_number = key_to_shard_number(count, stripe_size, &key);
+
+        req.block_numbers
+            .iter()
+            .skip(1) // computed above
+            .all(|&blkno| {
+                let key = rel_block_to_key(req.rel, blkno);
+                key_to_shard_number(count, stripe_size, &key) == shard_number
+            })
+            .then_some(ShardIndex::new(shard_number, count))
+    }
+
+    /// Splits the given request.
+    pub fn split(
+        req: page_api::GetPageRequest,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Self {
+        // The caller should make sure we don't split requests unnecessarily.
+        debug_assert!(
+            Self::is_single_shard(&req, count, stripe_size).is_none(),
+            "unnecessary request split"
+        );
+
+        // Split the requests by shard index.
+        let mut requests = HashMap::with_capacity(2); // common case
+        let mut block_shards = Vec::with_capacity(req.block_numbers.len());
+        for blkno in req.block_numbers {
+            let key = rel_block_to_key(req.rel, blkno);
+            let shard_number = key_to_shard_number(count, stripe_size, &key);
+            let shard_id = ShardIndex::new(shard_number, count);
+
+            let shard_req = requests
+                .entry(shard_id)
+                .or_insert_with(|| page_api::GetPageRequest {
+                    request_id: req.request_id,
+                    request_class: req.request_class,
+                    rel: req.rel,
+                    read_lsn: req.read_lsn,
+                    block_numbers: Vec::new(),
+                });
+            shard_req.block_numbers.push(blkno);
+            block_shards.push(shard_id);
+        }
+
+        Self {
+            request_id: req.request_id,
+            responses: HashMap::with_capacity(requests.len()),
+            requests,
+            block_shards,
+        }
+    }
+
+    /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
+    pub fn drain_requests(
+        &mut self,
+    ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
+        self.requests.drain()
+    }
+
+    /// Adds a response from the given shard.
+    #[allow(clippy::result_large_err)]
+    pub fn add_response(
+        &mut self,
+        shard_id: ShardIndex,
+        response: page_api::GetPageResponse,
+    ) -> tonic::Result<()> {
+        // The caller should already have converted status codes into tonic::Status.
+        assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok);
+
+        // Make sure the response matches the request ID.
+        if response.request_id != self.request_id {
+            return Err(tonic::Status::internal(format!(
+                "response ID {} does not match request ID {}",
+                response.request_id, self.request_id
+            )));
+        }
+
+        // Add the response data to the map.
+        let old = self.responses.insert(shard_id, response.page_images);
+
+        if old.is_some() {
+            return Err(tonic::Status::internal(format!(
+                "duplicate response for shard {shard_id}",
+            )));
+        }
+
+        Ok(())
+    }
+
+    /// Assembles the shard responses into a single response. Responses must be present for all
+    /// relevant shards, and the total number of pages must match the original request.
+    #[allow(clippy::result_large_err)]
+    pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
+        let mut response = page_api::GetPageResponse {
+            request_id: self.request_id,
+            status_code: page_api::GetPageStatusCode::Ok,
+            reason: None,
+            page_images: Vec::with_capacity(self.block_shards.len()),
+        };
+
+        // Set up per-shard page iterators we can pull from.
+        let mut shard_responses = HashMap::with_capacity(self.responses.len());
+        for (shard_id, responses) in self.responses {
+            shard_responses.insert(shard_id, responses.into_iter());
+        }
+
+        // Reassemble the responses in the same order as the original request.
+        for shard_id in &self.block_shards {
+            let page = shard_responses
+                .get_mut(shard_id)
+                .ok_or_else(|| {
+                    tonic::Status::internal(format!("missing response for shard {shard_id}"))
+                })?
+                .next()
+                .ok_or_else(|| {
+                    tonic::Status::internal(format!("missing page from shard {shard_id}"))
+                })?;
+            response.page_images.push(page);
+        }
+
+        // Make sure there are no additional pages.
+        for (shard_id, mut pages) in shard_responses {
+            if pages.next().is_some() {
+                return Err(tonic::Status::internal(format!(
+                    "extra pages returned from shard {shard_id}"
+                )));
+            }
+        }
+
+        Ok(response)
+    }
+}

From ed4652b65bbd2b711e9d20b59e0c61c0904f1b8a Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Wed, 9 Jul 2025 17:21:06 +0300
Subject: [PATCH 043/163] Update the relsize cache rather than forget it at end
 of index build

This greatly reduces the cases where we make a request to the
pageserver with a very recent LSN. Those cases are slow because the
pageserver needs to wait for the WAL to arrive. This speeds up the
Postgres pg_regress and isolation tests greatly.
---
 pgxn/neon/communicator/src/neon_request.rs    |  8 +++---
 .../src/worker_process/main_loop.rs           | 10 +++++--
 pgxn/neon/communicator_new.c                  | 25 ++++++++++++------
 pgxn/neon/communicator_new.h                  |  2 +-
 pgxn/neon/pagestore_smgr.c                    | 26 +++++++++++++++++--
 5 files changed, 54 insertions(+), 17 deletions(-)

diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs
index 1868147fbf..9f5d134194 100644
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -30,7 +30,7 @@ pub enum NeonIORequest {
     RelUnlink(CRelUnlinkRequest),
 
     // Other requests
-    ForgetCache(CForgetCacheRequest),
+    UpdateCachedRelSize(CUpdateCachedRelSizeRequest),
 }
 
 #[repr(C)]
@@ -75,7 +75,7 @@ impl NeonIORequest {
             RelCreate(req) => req.request_id,
             RelTruncate(req) => req.request_id,
             RelUnlink(req) => req.request_id,
-            ForgetCache(req) => req.request_id,
+            UpdateCachedRelSize(req) => req.request_id,
         }
     }
 }
@@ -382,7 +382,7 @@ impl CRelUnlinkRequest {
 
 #[repr(C)]
 #[derive(Copy, Clone, Debug)]
-pub struct CForgetCacheRequest {
+pub struct CUpdateCachedRelSizeRequest {
     pub request_id: u64,
     pub spc_oid: COid,
     pub db_oid: COid,
@@ -392,7 +392,7 @@ pub struct CForgetCacheRequest {
     pub lsn: CLsn,
 }
 
-impl CForgetCacheRequest {
+impl CUpdateCachedRelSizeRequest {
     pub fn reltag(&self) -> page_api::RelTag {
         page_api::RelTag {
             spcnode: self.spc_oid,
diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index 04586f302c..171bb8fbf4 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -505,10 +505,10 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                 self.cache.forget_rel(&req.reltag(), None, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
-            NeonIORequest::ForgetCache(req) => {
+            NeonIORequest::UpdateCachedRelSize(req) => {
                 // TODO: need to grab an io-in-progress lock for this? I guess not
                 self.cache
-                    .forget_rel(&req.reltag(), Some(req.nblocks), Lsn(req.lsn));
+                    .remember_rel_size(&req.reltag(), req.nblocks, Lsn(req.lsn));
                 NeonIOResult::WriteOK
             }
         }
@@ -597,6 +597,12 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                     );
                     return Err(-1);
                 }
+
+                info!(
+                    "received getpage response for blocks {:?} in rel {:?} lsns {}",
+                    block_numbers, rel, read_lsn
+                );
+
                 for (page_image, (blkno, _lsn, dest, _guard)) in
                     resp.page_images.into_iter().zip(cache_misses)
                 {
diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index 3f2870621d..f2cb23cd4e 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -1107,6 +1107,9 @@ communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr l
 	};
 	NeonIOResult result;
 
+	/* FIXME: see `request_lsns` in main_loop.rs for why this is needed */
+	XLogSetAsyncXactLSN(lsn);
+
 	perform_request(&request, &result);
 	switch (result.tag)
 	{
@@ -1141,6 +1144,9 @@ communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumbe
 	};
 	NeonIOResult result;
 
+	/* FIXME: see `request_lsns` in main_loop.rs for why this is needed */
+	XLogSetAsyncXactLSN(lsn);
+
 	perform_request(&request, &result);
 	switch (result.tag)
 	{
@@ -1174,6 +1180,9 @@ communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr l
 	};
 	NeonIOResult result;
 
+	/* FIXME: see `request_lsns` in main_loop.rs for why this is needed */
+	XLogSetAsyncXactLSN(lsn);
+
 	perform_request(&request, &result);
 	switch (result.tag)
 	{
@@ -1192,11 +1201,11 @@ communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr l
 }
 
 void
-communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn)
+communicator_new_update_cached_rel_size(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn)
 {
 	NeonIORequest request = {
-		.tag = NeonIORequest_ForgetCache,
-		.forget_cache = {
+		.tag = NeonIORequest_UpdateCachedRelSize,
+		.update_cached_rel_size = {
 			.request_id = assign_request_id(),
 			.spc_oid = NInfoGetSpcOid(rinfo),
 			.db_oid = NInfoGetDbOid(rinfo),
@@ -1216,11 +1225,11 @@ communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumbe
 		case NeonIOResult_Error:
 			ereport(ERROR,
 					(errcode_for_file_access(),
-					 errmsg("could not forget cache for rel %u/%u/%u.%u: %s",
+					 errmsg("could not update cached size for rel %u/%u/%u.%u: %s",
 							RelFileInfoFmt(rinfo), forkNum, pg_strerror(result.error))));
 			break;
 		default:
-			elog(ERROR, "unexpected result for ForgetCache operation: %d", result.tag);
+			elog(ERROR, "unexpected result for UpdateCachedRelSize operation: %d", result.tag);
 			break;
 	}
 }
@@ -1338,11 +1347,11 @@ print_neon_io_request(NeonIORequest *request)
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number);
 				return buf;
 			}
-		case NeonIORequest_ForgetCache:
+		case NeonIORequest_UpdateCachedRelSize:
 			{
-				CForgetCacheRequest *r = &request->forget_cache;
+				CUpdateCachedRelSizeRequest *r = &request->update_cached_rel_size;
 
-				snprintf(buf, sizeof(buf), "ForgetCache: req " UINT64_FORMAT " rel %u/%u/%u.%u blocks: %u",
+				snprintf(buf, sizeof(buf), "UpdateCachedRelSize: req " UINT64_FORMAT " rel %u/%u/%u.%u blocks: %u",
 								r->request_id,
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number,
 					r->nblocks);
diff --git a/pgxn/neon/communicator_new.h b/pgxn/neon/communicator_new.h
index dc38b3059e..1323c48e15 100644
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -52,7 +52,7 @@ extern void communicator_new_rel_zeroextend(NRelFileInfo rinfo, ForkNumber forkN
 extern void communicator_new_rel_create(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
 extern void communicator_new_rel_truncate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
 extern void communicator_new_rel_unlink(NRelFileInfo rinfo, ForkNumber forkNum, XLogRecPtr lsn);
-extern void communicator_new_forget_cache(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
+extern void communicator_new_update_cached_rel_size(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks, XLogRecPtr lsn);
 
 /* other functions */
 extern int32 communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset);
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index a3a33e9f4b..9340d49f5a 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -531,6 +531,29 @@ neon_get_write_lsn(void)
 	else
 		lsn = GetXLogInsertRecPtr();
 
+	/*
+	 * If the insert LSN points to just after page header, round it down to
+	 * the beginning of the page, because the page header might not have been
+	 * inserted to the WAL yet, and if we tried to flush it, the WAL flushing
+	 * code gets upset.
+	 */
+	{
+		int			segoff;
+
+		segoff = XLogSegmentOffset(lsn, wal_segment_size);
+		if (segoff == SizeOfXLogLongPHD)
+		{
+			lsn = lsn - segoff;
+		}
+		else
+		{
+			int			offset = lsn % XLOG_BLCKSZ;
+
+			if (offset == SizeOfXLogShortPHD)
+				lsn = lsn - offset;
+		}
+	}
+
 	return lsn;
 }
 
@@ -2287,8 +2310,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 
 			if (neon_enable_new_communicator)
 			{
-				/* TODO: we could update the cache with the size, since we have it at hand */
-				communicator_new_forget_cache(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
+				communicator_new_update_cached_rel_size(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
 			}
 			else
 			{

From bc6a756f1c41145c7eba01b105c7d3a3e08829cf Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Wed, 9 Jul 2025 15:29:45 +0100
Subject: [PATCH 044/163] ci: lint openapi specs using redocly (#12510)

We need to lint specs for pageserver, endpoint storage, and safekeeper
#0000
---
 .github/workflows/build_and_test.yml     | 18 ++++++++++++
 Makefile                                 |  9 ++++++
 compute_tools/src/http/openapi_spec.yaml | 29 -------------------
 pageserver/src/http/openapi_spec.yml     | 37 +++++++++---------------
 4 files changed, 41 insertions(+), 52 deletions(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 864abad574..cc9534f05d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -87,6 +87,24 @@ jobs:
     uses: ./.github/workflows/build-build-tools-image.yml
     secrets: inherit
 
+  lint-openapi-spec:
+    runs-on: ubuntu-22.04
+    needs: [ meta, check-permissions ]
+    # We do need to run this in `.*-rc-pr` because of hotfixes.
+    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    steps:
+      - name: Harden the runner (Audit all outbound calls)
+        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+        with:
+          egress-policy: audit
+      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
+        with:
+          registry: ghcr.io
+          username: ${{ github.actor }}
+          password: ${{ secrets.GITHUB_TOKEN }}
+      - run: make lint-openapi-spec
+
   check-codestyle-python:
     needs: [ meta, check-permissions, build-build-tools-image ]
     # No need to run on `main` because we this in the merge queue. We do need to run this in `.*-rc-pr` because of hotfixes.
diff --git a/Makefile b/Makefile
index 4b31e26810..d07ac907b4 100644
--- a/Makefile
+++ b/Makefile
@@ -220,6 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
 
+.PHONY: lint-openapi-spec
+lint-openapi-spec:
+	# operation-2xx-response: pageserver timeline delete returns 404 on success
+	find . -iname "openapi_spec.y*ml" -exec\
+		docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
+			--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
+			--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
+			lint {} \+
+
 # Targets for building PostgreSQL are defined in postgres.mk.
 #
 # But if the caller has indicated that PostgreSQL is already
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index eaf33d1f82..3c58b284b3 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -416,15 +416,6 @@ components:
         total_startup_ms:
           type: integer
 
-    Info:
-      type: object
-      description: Information about VM/Pod.
-      required:
-        - num_cpus
-      properties:
-        num_cpus:
-          type: integer
-
     DbsAndRoles:
       type: object
       description: Databases and Roles
@@ -642,26 +633,6 @@ components:
           description: Promote error, if any
           type: string
 
-    InstalledExtensions:
-      type: object
-      properties:
-        extensions:
-          description: Contains list of installed extensions.
-          type: array
-          items:
-            type: object
-            properties:
-              extname:
-                type: string
-              version:
-                type: string
-                items:
-                  type: string
-              n_databases:
-                type: integer
-              owned_by_superuser:
-                type: integer
-
     SetRoleGrantsRequest:
       type: object
       required:
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index e8d1367d6c..3ffc80f19a 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -116,26 +116,6 @@ paths:
               schema:
                 type: string
 
-  /v1/tenant/{tenant_id}/timeline:
-    parameters:
-      - name: tenant_id
-        in: path
-        required: true
-        schema:
-          type: string
-    get:
-      description: Get timelines for tenant
-      responses:
-        "200":
-          description: TimelineInfo
-          content:
-            application/json:
-              schema:
-                type: array
-                items:
-                  $ref: "#/components/schemas/TimelineInfo"
-
-
   /v1/tenant/{tenant_id}/timeline/{timeline_id}:
     parameters:
       - name: tenant_id
@@ -618,7 +598,7 @@ paths:
               schema:
                 $ref: "#/components/schemas/SecondaryProgress"
 
-  /v1/tenant/{tenant_id}/timeline/:
+  /v1/tenant/{tenant_id}/timeline:
     parameters:
       - name: tenant_id
         in: path
@@ -685,6 +665,17 @@ paths:
             application/json:
               schema:
                 $ref: "#/components/schemas/Error"
+    get:
+      description: Get timelines for tenant
+      responses:
+        "200":
+          description: TimelineInfo
+          content:
+            application/json:
+              schema:
+                type: array
+                items:
+                  $ref: "#/components/schemas/TimelineInfo"
 
   /v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/detach_ancestor:
     parameters:
@@ -767,7 +758,7 @@ paths:
                 $ref: "#/components/schemas/ServiceUnavailableError"
 
 
-  /v1/tenant/:
+  /v1/tenant:
     get:
       description: Get tenants list
       responses:
@@ -847,7 +838,7 @@ paths:
                 items:
                   $ref: "#/components/schemas/TenantInfo"
 
-  /v1/tenant/{tenant_id}/config/:
+  /v1/tenant/{tenant_id}/config:
     parameters:
       - name: tenant_id
         in: path

From 5c0de4ee8ccc39f67d718c9575897b6ca0c388b8 Mon Sep 17 00:00:00 2001
From: Alexander Lakhin <alexander.lakhin@neon.tech>
Date: Wed, 9 Jul 2025 18:22:54 +0300
Subject: [PATCH 045/163] Fix parameter name in workload for
 test_multiple_subscription_branching (#12522)

## Problem

As discovered in https://github.com/neondatabase/neon/issues/12394,
test_multiple_subscription_branching generates skewed data distribution,
that leads to test failures when the unevenly filled last table receives
even more data.
for table t0: pub_res = (42001,), sub_res = (42001,)
for table t1: pub_res = (29001,), sub_res = (29001,)
for table t2: pub_res = (21001,), sub_res = (21001,)
for table t3: pub_res = (21001,), sub_res = (21001,)
for table t4: pub_res = (1711001,), sub_res = (1711001,)

## Summary of changes
Fix the name of the workload parameter to generate data as expected.
---
 test_runner/regress/test_subscriber_branching.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test_runner/regress/test_subscriber_branching.py b/test_runner/regress/test_subscriber_branching.py
index 83bebc19be..63772f7cd4 100644
--- a/test_runner/regress/test_subscriber_branching.py
+++ b/test_runner/regress/test_subscriber_branching.py
@@ -332,7 +332,7 @@ def test_multiple_subscription_branching(neon_simple_env: NeonEnv):
 
             last_insert_lsn = query_scalar(cursor, "select pg_current_wal_insert_lsn();")
 
-    def start_publisher_workload(table_num: int, duration: int):
+    def start_publisher_workload(i: int, duration: int):
         start = time.time()
         with endpoint.cursor(dbname="publisher_db") as cur:
             while time.time() - start < duration:

From 78a6daa874b216711e3afa204bbbb0eb1af3b64b Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Wed, 9 Jul 2025 17:28:04 +0200
Subject: [PATCH 046/163] Add retrying in Random ops test if parent branch is
 not found. (#12513)

## Problem
Due to a lag in replication, we sometimes cannot get the parent branch
definition just after completion of the Public API restore call. This
leads to the test failures.
https://databricks.atlassian.net/browse/LKB-279
## Summary of changes
The workaround is implemented. Now test retries up to 30 seconds,
waiting for the branch definition to appear.

---------

Co-authored-by: Alexey Masterov <alexey.masterov@databricks.com>
---
 test_runner/random_ops/test_random_ops.py | 103 ++++++++++++----------
 1 file changed, 55 insertions(+), 48 deletions(-)

diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py
index d3815c40bb..5c43b06bc5 100644
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -117,7 +117,9 @@ class NeonBranch:
     def create_child_branch(self) -> NeonBranch | None:
         return self.project.create_branch(self.id)
 
-    def create_ro_endpoint(self) -> NeonEndpoint:
+    def create_ro_endpoint(self) -> NeonEndpoint | None:
+        if not self.project.check_limit_endpoints():
+            return None
         return NeonEndpoint(
             self.project,
             self.neon_api.create_endpoint(self.project_id, self.id, "read_only", {})["endpoint"],
@@ -151,11 +153,26 @@ class NeonBranch:
             return
         self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"])
         parent_id: str = res["branch"]["parent_id"]
+        # XXX Retry get parent details to work around the issue
+        # https://databricks.atlassian.net/browse/LKB-279
+        target_time = datetime.now() + timedelta(seconds=30)
+        while datetime.now() < target_time:
+            try:
+                parent_def = self.neon_api.get_branch_details(self.project_id, parent_id)
+            except HTTPError as he:
+                if he.response.status_code == 404:
+                    log.info("Branch not found, waiting...")
+                    time.sleep(1)
+                else:
+                    raise HTTPError(he) from he
+            else:
+                break
+        else:
+            raise RuntimeError(f"Branch {parent_id} not found")
+
         # Creates an object for the parent branch
         # After the reset operation a new parent branch is created
-        parent = NeonBranch(
-            self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True
-        )
+        parent = NeonBranch(self.project, parent_def, True)
         self.project.branches[parent_id] = parent
         self.parent = parent
         parent.children[self.id] = self
@@ -168,29 +185,21 @@ class NeonBranch:
         source_timestamp: str | None = None,
         preserve_under_name: str | None = None,
     ) -> dict[str, Any] | None:
+        if not self.project.check_limit_branches():
+            return None
         endpoints = [ep for ep in self.endpoints.values() if ep.type == "read_only"]
         # Terminate all the benchmarks running to prevent errors. Errors in benchmark during pgbench are expected
         for ep in endpoints:
             ep.terminate_benchmark()
         self.terminate_benchmark()
-        try:
-            res: dict[str, Any] = self.neon_api.restore_branch(
-                self.project_id,
-                self.id,
-                source_branch_id,
-                source_lsn,
-                source_timestamp,
-                preserve_under_name,
-            )
-        except HTTPError as he:
-            if (
-                he.response.status_code == 422
-                and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED"
-            ):
-                log.info("Branch limit exceeded, skipping")
-                return None
-            else:
-                raise HTTPError(he) from he
+        res: dict[str, Any] = self.neon_api.restore_branch(
+            self.project_id,
+            self.id,
+            source_branch_id,
+            source_lsn,
+            source_timestamp,
+            preserve_under_name,
+        )
         self.project.wait()
         self.start_benchmark()
         for ep in endpoints:
@@ -239,19 +248,30 @@ class NeonProject:
     def delete(self) -> None:
         self.neon_api.delete_project(self.id)
 
+    def check_limit_branches(self) -> bool:
+        if self.limits["max_branches"] == -1 or len(self.branches) < self.limits["max_branches"]:
+            return True
+        log.info("branch limit exceeded (%s/%s)", len(self.branches), self.limits["max_branches"])
+        return False
+
+    def check_limit_endpoints(self) -> bool:
+        if (
+            self.limits["max_read_only_endpoints"] == -1
+            or self.read_only_endpoints_total < self.limits["max_read_only_endpoints"]
+        ):
+            return True
+        log.info(
+            "Maximum read only endpoint limit exceeded (%s/%s)",
+            self.read_only_endpoints_total,
+            self.limits["max_read_only_endpoints"],
+        )
+        return False
+
     def create_branch(self, parent_id: str | None = None) -> NeonBranch | None:
         self.wait()
-        try:
-            branch_def = self.neon_api.create_branch(self.id, parent_id=parent_id)
-        except HTTPError as he:
-            if (
-                he.response.status_code == 422
-                and he.response.json()["code"] == "BRANCHES_LIMIT_EXCEEDED"
-            ):
-                log.info("Branch limit exceeded, skipping")
-                return None
-            else:
-                raise HTTPError(he) from he
+        if not self.check_limit_branches():
+            return None
+        branch_def = self.neon_api.create_branch(self.id, parent_id=parent_id)
         new_branch = NeonBranch(self, branch_def)
         self.wait()
         return new_branch
@@ -388,17 +408,9 @@ def do_action(project: NeonProject, action: str) -> bool:
     log.info("Action: %s", action)
     if action == "new_branch":
         log.info("Trying to create a new branch")
-        if 0 <= project.limits["max_branches"] <= len(project.branches):
-            log.info(
-                "Maximum branch limit exceeded (%s of %s)",
-                len(project.branches),
-                project.limits["max_branches"],
-            )
-            return False
         parent = project.branches[
             random.choice(list(set(project.branches.keys()) - project.reset_branches))
         ]
-        log.info("Parent: %s", parent)
         child = parent.create_child_branch()
         if child is None:
             return False
@@ -413,16 +425,11 @@ def do_action(project: NeonProject, action: str) -> bool:
             log.info("Leaf branches not found, skipping")
             return False
     elif action == "new_ro_endpoint":
-        if 0 <= project.limits["max_read_only_endpoints"] <= project.read_only_endpoints_total:
-            log.info(
-                "Maximum read only endpoint limit exceeded (%s of %s)",
-                project.read_only_endpoints_total,
-                project.limits["max_read_only_endpoints"],
-            )
-            return False
         ep = random.choice(
             [br for br in project.branches.values() if br.id not in project.reset_branches]
         ).create_ro_endpoint()
+        if ep is None:
+            return False
         log.info("Created the RO endpoint with id %s branch: %s", ep.id, ep.branch.id)
         ep.start_benchmark()
     elif action == "delete_ro_endpoint":

From 5ec82105cccf5a8a6c1d28e6f817c703d4b77a26 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 9 Jul 2025 11:35:19 -0400
Subject: [PATCH 047/163] fix(pageserver): ensure remote size gets computed
 (#12520)

## Problem

Follow up of #12400

## Summary of changes

We didn't set remote_size_mb to Some when initialized so it never gets
computed :(

Also added a new API to force refresh the properties.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/feature_resolver.rs       |  3 ++-
 pageserver/src/http/routes.rs            | 20 ++++++++++++++++++++
 test_runner/fixtures/pageserver/http.py  |  7 +++++++
 test_runner/regress/test_feature_flag.py |  9 +++++++++
 4 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/feature_resolver.rs b/pageserver/src/feature_resolver.rs
index 65cac8eea1..f0178fd9b3 100644
--- a/pageserver/src/feature_resolver.rs
+++ b/pageserver/src/feature_resolver.rs
@@ -409,11 +409,12 @@ impl TenantFeatureResolver {
 
     /// Refresh the cached properties and flags on the critical path.
     pub fn refresh_properties_and_flags(&self, tenant_shard: &TenantShard) {
-        let mut remote_size_mb = None;
+        let mut remote_size_mb = Some(0.0);
         for timeline in tenant_shard.list_timelines() {
             let size = timeline.metrics.resident_physical_size_get();
             if size == 0 {
                 remote_size_mb = None;
+                break;
             }
             if let Some(ref mut remote_size_mb) = remote_size_mb {
                 *remote_size_mb += size as f64 / 1024.0 / 1024.0;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 2995a37089..3612686b5d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3691,6 +3691,23 @@ async fn read_tar_eof(mut reader: (impl tokio::io::AsyncRead + Unpin)) -> anyhow
     Ok(())
 }
 
+async fn force_refresh_feature_flag(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let state = get_state(&request);
+    let tenant = state
+        .tenant_manager
+        .get_attached_tenant_shard(tenant_shard_id)?;
+    tenant
+        .feature_resolver
+        .refresh_properties_and_flags(&tenant);
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_evaluate_feature_flag(
     request: Request<Body>,
     _cancel: CancellationToken,
@@ -4156,6 +4173,9 @@ pub fn make_router(
         .get("/v1/tenant/:tenant_shard_id/feature_flag/:flag_key", |r| {
             api_handler(r, tenant_evaluate_feature_flag)
         })
+        .post("/v1/tenant/:tenant_shard_id/force_refresh_feature_flag", |r| {
+            api_handler(r, force_refresh_feature_flag)
+        })
         .put("/v1/feature_flag/:flag_key", |r| {
             testing_api_handler("force override feature flag - put", r, force_override_feature_flag_for_testing_put)
         })
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index d9037f2d08..79cfba8da6 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1247,3 +1247,10 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         )
         self.verbose_error(res)
         return res.json()
+
+    def force_refresh_feature_flag(self, tenant_id: TenantId | TenantShardId):
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/force_refresh_feature_flag",
+        )
+        self.verbose_error(res)
+        return res.json()
diff --git a/test_runner/regress/test_feature_flag.py b/test_runner/regress/test_feature_flag.py
index 2712d13dcc..c6c192b6f1 100644
--- a/test_runner/regress/test_feature_flag.py
+++ b/test_runner/regress/test_feature_flag.py
@@ -49,3 +49,12 @@ def test_feature_flag(neon_env_builder: NeonEnvBuilder):
             env.initial_tenant, "test-feature-flag"
         )["result"]
     )
+
+    env.pageserver.http_client().force_refresh_feature_flag(env.initial_tenant)
+
+    # Check if the properties exist
+    result = env.pageserver.http_client().evaluate_feature_flag_multivariate(
+        env.initial_tenant, "test-feature-flag"
+    )
+    assert "tenant_remote_size_mb" in result["properties"]
+    assert "tenant_id" in result["properties"]

From 2f71eda00fbfea3ee3382fef3024b6376676726e Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Wed, 9 Jul 2025 18:12:59 +0200
Subject: [PATCH 048/163] pageserver/client_grpc: add separate pools for bulk
 requests (#12475)

## Problem

GetPage bulk requests such as prefetches and vacuum can head-of-line
block foreground requests, causing increased latency.

Touches #11735.
Requires #12469.

## Summary of changes

* Use dedicated channel/client/stream pools for bulk GetPage requests.
* Use lower concurrency but higher queue depth for bulk pools.
* Make pool limits configurable.
* Require unbounded client pool for stream pool, to avoid accidental
starvation.
---
 pageserver/client_grpc/src/client.rs | 112 +++++++++++++++-----
 pageserver/client_grpc/src/pool.rs   | 148 +++++++++++++++------------
 pageserver/page_api/src/model.rs     |  15 ++-
 3 files changed, 180 insertions(+), 95 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index c21ce2e47d..63852868c3 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -1,4 +1,5 @@
 use std::collections::HashMap;
+use std::num::NonZero;
 use std::sync::Arc;
 
 use anyhow::anyhow;
@@ -15,6 +16,32 @@ use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 
+/// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
+/// when full.
+///
+/// TODO: tune all of these constants, and consider making them configurable.
+/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
+/// with only streams.
+const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
+
+/// Max number of concurrent unary request clients per shard.
+const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
+
+/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
+/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
+const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
+
+/// Max number of pipelined requests per stream.
+const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
+
+/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
+/// are more throughput-oriented, we have a smaller limit but higher queue depth.
+const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
+
+/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
+/// get a larger queue depth.
+const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
+
 /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
 /// basic `page_api::Client` gRPC client, and supports:
 ///
@@ -87,6 +114,7 @@ impl PageserverClient {
     /// errors. All responses will have `GetPageStatusCode::Ok`.
     #[instrument(skip_all, fields(
         req_id = %req.request_id,
+        class = %req.request_class,
         rel = %req.rel,
         blkno = %req.block_numbers[0],
         blks = %req.block_numbers.len(),
@@ -141,7 +169,11 @@ impl PageserverClient {
         let resp = self
             .retry
             .with(async || {
-                let stream = self.shards.get(shard_id)?.stream().await;
+                let stream = self
+                    .shards
+                    .get(shard_id)?
+                    .stream(req.request_class.is_bulk())
+                    .await;
                 let resp = stream.send(req.clone()).await?;
 
                 // Convert per-request errors into a tonic::Status.
@@ -270,17 +302,22 @@ impl Shards {
     }
 }
 
-/// A single shard.
+/// A single shard. Uses dedicated resource pools with the following structure:
 ///
-/// TODO: consider separate pools for normal and bulk traffic, with different settings.
+/// * Channel pool: unbounded.
+///   * Unary client pool: MAX_UNARY_CLIENTS.
+///   * Stream client pool: unbounded.
+///     * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
+/// * Bulk channel pool: unbounded.
+///   * Bulk client pool: unbounded.
+///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
 struct Shard {
-    /// Dedicated channel pool for this shard. Shared by all clients/streams in this shard.
-    _channel_pool: Arc<ChannelPool>,
-    /// Unary gRPC client pool for this shard. Uses the shared channel pool.
+    /// Unary gRPC client pool.
     client_pool: Arc<ClientPool>,
-    /// GetPage stream pool for this shard. Uses a dedicated client pool, but shares the channel
-    /// pool with unary clients.
+    /// GetPage stream pool.
     stream_pool: Arc<StreamPool>,
+    /// GetPage stream pool for bulk requests, e.g. prefetches.
+    bulk_stream_pool: Arc<StreamPool>,
 }
 
 impl Shard {
@@ -297,34 +334,53 @@ impl Shard {
             return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
         }
 
-        // Use a common channel pool for all clients, to multiplex unary and stream requests across
-        // the same TCP connections. The channel pool is unbounded (but client pools are bounded).
-        let channel_pool = ChannelPool::new(url)?;
+        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
+        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
 
-        // Dedicated client pool for unary requests.
+        // Client pool for unary requests.
         let client_pool = ClientPool::new(
             channel_pool.clone(),
             tenant_id,
             timeline_id,
             shard_id,
             auth_token.clone(),
+            Some(MAX_UNARY_CLIENTS),
         );
 
-        // Stream pool with dedicated client pool. If this shared a client pool with unary requests,
-        // long-lived streams could fill up the client pool and starve out unary requests. It shares
-        // the same underlying channel pool with unary clients though, which is unbounded.
-        let stream_pool = StreamPool::new(ClientPool::new(
-            channel_pool.clone(),
-            tenant_id,
-            timeline_id,
-            shard_id,
-            auth_token,
-        ));
+        // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
+        // but shares a channel pool with it (as it's unbounded).
+        let stream_pool = StreamPool::new(
+            ClientPool::new(
+                channel_pool.clone(),
+                tenant_id,
+                timeline_id,
+                shard_id,
+                auth_token.clone(),
+                None, // unbounded, limited by stream pool
+            ),
+            Some(MAX_STREAMS),
+            MAX_STREAM_QUEUE_DEPTH,
+        );
+
+        // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
+        // to avoid head-of-line blocking of latency-sensitive requests.
+        let bulk_stream_pool = StreamPool::new(
+            ClientPool::new(
+                ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
+                tenant_id,
+                timeline_id,
+                shard_id,
+                auth_token,
+                None, // unbounded, limited by stream pool
+            ),
+            Some(MAX_BULK_STREAMS),
+            MAX_BULK_STREAM_QUEUE_DEPTH,
+        );
 
         Ok(Self {
-            _channel_pool: channel_pool,
             client_pool,
             stream_pool,
+            bulk_stream_pool,
         })
     }
 
@@ -336,8 +392,12 @@ impl Shard {
             .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
     }
 
-    /// Returns a pooled stream for this shard.
-    async fn stream(&self) -> StreamGuard {
-        self.stream_pool.get().await
+    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
+    /// pool (e.g. for prefetches).
+    async fn stream(&self, bulk: bool) -> StreamGuard {
+        match bulk {
+            false => self.stream_pool.get().await,
+            true => self.bulk_stream_pool.get().await,
+        }
     }
 }
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 518e4e5b84..5a50004fd1 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -30,6 +30,7 @@
 //! TODO: observability.
 
 use std::collections::{BTreeMap, HashMap};
+use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
@@ -44,22 +45,10 @@ use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;
 
-/// Max number of concurrent clients per channel.
-///
-/// TODO: tune these constants, and make them configurable.
-/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
-/// with only streams.
-const CLIENTS_PER_CHANNEL: usize = 16;
-
-/// Maximum number of concurrent clients per `ClientPool`.
-const CLIENT_LIMIT: usize = 64;
-
-/// Max number of pipelined requests per gRPC GetPage stream.
-const STREAM_QUEUE_DEPTH: usize = 2;
-
 /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
-/// stream multiplexing), up to `CLIENTS_PER_CHANNEL`. The pool does not limit the number of
-/// channels, and instead relies on `ClientPool` to limit the number of concurrent clients.
+/// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
+/// The pool does not limit the number of channels, and instead relies on `ClientPool` or
+/// `StreamPool` to limit the number of concurrent clients.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
 ///
@@ -69,6 +58,8 @@ const STREAM_QUEUE_DEPTH: usize = 2;
 pub struct ChannelPool {
     /// Pageserver endpoint to connect to.
     endpoint: Endpoint,
+    /// Max number of clients per channel. Beyond this, a new channel will be created.
+    max_clients_per_channel: NonZero<usize>,
     /// Open channels.
     channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
     /// Channel ID generator.
@@ -86,13 +77,14 @@ struct ChannelEntry {
 
 impl ChannelPool {
     /// Creates a new channel pool for the given Pageserver endpoint.
-    pub fn new<E>(endpoint: E) -> anyhow::Result<Arc<Self>>
+    pub fn new<E>(endpoint: E, max_clients_per_channel: NonZero<usize>) -> anyhow::Result<Arc<Self>>
     where
         E: TryInto<Endpoint> + Send + Sync + 'static,
         <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
     {
         Ok(Arc::new(Self {
             endpoint: endpoint.try_into()?,
+            max_clients_per_channel,
             channels: Mutex::default(),
             next_channel_id: AtomicUsize::default(),
         }))
@@ -120,8 +112,11 @@ impl ChannelPool {
         // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
         // channels, and free up higher-ordered channels such that they can be reaped.
         for (&id, entry) in channels.iter_mut() {
-            assert!(entry.clients <= CLIENTS_PER_CHANNEL, "channel overflow");
-            if entry.clients < CLIENTS_PER_CHANNEL {
+            assert!(
+                entry.clients <= self.max_clients_per_channel.get(),
+                "channel overflow"
+            );
+            if entry.clients < self.max_clients_per_channel.get() {
                 entry.clients += 1;
                 return ChannelGuard {
                     pool: Arc::downgrade(self),
@@ -181,7 +176,7 @@ impl Drop for ChannelGuard {
 
 /// A pool of gRPC clients for a single tenant shard. Each client acquires a channel from the inner
 /// `ChannelPool`. A client is only given out to single caller at a time. The pool limits the total
-/// number of concurrent clients to `CLIENT_LIMIT` via semaphore.
+/// number of concurrent clients to `max_clients` via semaphore.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
 ///
@@ -197,8 +192,8 @@ pub struct ClientPool {
     auth_token: Option<String>,
     /// Channel pool to acquire channels from.
     channel_pool: Arc<ChannelPool>,
-    /// Limits the max number of concurrent clients for this pool.
-    limiter: Arc<Semaphore>,
+    /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
+    limiter: Option<Arc<Semaphore>>,
     /// Idle pooled clients. Acquired clients are removed from here and returned on drop.
     ///
     /// The first client in the map will be acquired next. The map is sorted by client ID, which in
@@ -221,13 +216,15 @@ struct ClientEntry {
 
 impl ClientPool {
     /// Creates a new client pool for the given tenant shard. Channels are acquired from the given
-    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard.
+    /// `ChannelPool`, which must point to a Pageserver that hosts the tenant shard. Allows up to
+    /// `max_clients` concurrent clients, or unbounded if None.
     pub fn new(
         channel_pool: Arc<ChannelPool>,
         tenant_id: TenantId,
         timeline_id: TimelineId,
         shard_id: ShardIndex,
         auth_token: Option<String>,
+        max_clients: Option<NonZero<usize>>,
     ) -> Arc<Self> {
         Arc::new(Self {
             tenant_id,
@@ -236,25 +233,24 @@ impl ClientPool {
             auth_token,
             channel_pool,
             idle: Mutex::default(),
-            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT)),
+            limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
             next_client_id: AtomicUsize::default(),
         })
     }
 
     /// Gets a client from the pool, or creates a new one if necessary. Connections are established
-    /// lazily and do not block, but this call can block if the pool is at `CLIENT_LIMIT`. The
-    /// client is returned to the pool when the guard is dropped.
+    /// lazily and do not block, but this call can block if the pool is at `max_clients`. The client
+    /// is returned to the pool when the guard is dropped.
     ///
     /// This is moderately performance-sensitive. It is called for every unary request, but these
     /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
     /// the `StreamPool` instead.
     pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
-        let permit = self
-            .limiter
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("never closed");
+        // Acquire a permit if the pool is bounded.
+        let mut permit = None;
+        if let Some(limiter) = self.limiter.clone() {
+            permit = Some(limiter.acquire_owned().await.expect("never closed"));
+        }
 
         // Fast path: acquire an idle client from the pool.
         if let Some((id, entry)) = self.idle.lock().unwrap().pop_first() {
@@ -296,9 +292,9 @@ impl ClientPool {
 pub struct ClientGuard {
     pool: Weak<ClientPool>,
     id: ClientID,
-    client: Option<page_api::Client>,    // Some until dropped
-    channel_guard: Option<ChannelGuard>, // Some until dropped
-    permit: OwnedSemaphorePermit,
+    client: Option<page_api::Client>,     // Some until dropped
+    channel_guard: Option<ChannelGuard>,  // Some until dropped
+    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }
 
 impl Deref for ClientGuard {
@@ -341,16 +337,21 @@ impl Drop for ClientGuard {
 /// TODO: reap idle streams.
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
-    /// The client pool to acquire clients from.
+    /// The client pool to acquire clients from. Must be unbounded.
     client_pool: Arc<ClientPool>,
     /// All pooled streams.
     ///
     /// Incoming requests will be sent over an existing stream with available capacity. If all
-    /// streams are full, a new one is spun up and added to the pool (up to the `ClientPool` limit).
-    /// Each stream has an associated Tokio task that processes requests and responses.
+    /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
+    /// stream has an associated Tokio task that processes requests and responses.
     streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
-    /// Limits the max number of concurrent requests (not streams).
-    limiter: Arc<Semaphore>,
+    /// The max number of concurrent streams, or None if unbounded.
+    max_streams: Option<NonZero<usize>>,
+    /// The max number of concurrent requests per stream.
+    max_queue_depth: NonZero<usize>,
+    /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
+    /// None if the pool is unbounded.
+    limiter: Option<Arc<Semaphore>>,
     /// Stream ID generator.
     next_stream_id: AtomicUsize,
 }
@@ -369,16 +370,27 @@ struct StreamEntry {
 }
 
 impl StreamPool {
-    /// Creates a new stream pool, using the given client pool.
+    /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
+    /// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
     ///
-    /// NB: the stream pool should use a dedicated client pool. Otherwise, long-lived streams may
-    /// fill up the client pool and starve out unary requests. Client pools can share the same
-    /// `ChannelPool` though, since the channel pool is unbounded.
-    pub fn new(client_pool: Arc<ClientPool>) -> Arc<Self> {
+    /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
+    /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
+    /// The stream pool should generally have its own dedicated client pool (but it can share a
+    /// channel pool with others since these are always unbounded).
+    pub fn new(
+        client_pool: Arc<ClientPool>,
+        max_streams: Option<NonZero<usize>>,
+        max_queue_depth: NonZero<usize>,
+    ) -> Arc<Self> {
+        assert!(client_pool.limiter.is_none(), "bounded client pool");
         Arc::new(Self {
             client_pool,
             streams: Arc::default(),
-            limiter: Arc::new(Semaphore::new(CLIENT_LIMIT * STREAM_QUEUE_DEPTH)),
+            limiter: max_streams.map(|max_streams| {
+                Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
+            }),
+            max_streams,
+            max_queue_depth,
             next_stream_id: AtomicUsize::default(),
         })
     }
@@ -402,18 +414,17 @@ impl StreamPool {
     ///
     /// For now, we just do something simple and functional, but very inefficient (linear scan).
     pub async fn get(&self) -> StreamGuard {
-        let permit = self
-            .limiter
-            .clone()
-            .acquire_owned()
-            .await
-            .expect("never closed");
+        // Acquire a permit if the pool is bounded.
+        let mut permit = None;
+        if let Some(limiter) = self.limiter.clone() {
+            permit = Some(limiter.acquire_owned().await.expect("never closed"));
+        }
         let mut streams = self.streams.lock().unwrap();
 
         // Look for a pooled stream with available capacity.
         for entry in streams.values() {
             assert!(
-                entry.queue_depth.load(Ordering::Relaxed) <= STREAM_QUEUE_DEPTH,
+                entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(),
                 "stream queue overflow"
             );
             if entry
@@ -421,7 +432,7 @@ impl StreamPool {
                 .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
                     // Increment the queue depth via compare-and-swap.
                     // TODO: review ordering.
-                    (queue_depth < STREAM_QUEUE_DEPTH).then_some(queue_depth + 1)
+                    (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1)
                 })
                 .is_ok()
             {
@@ -438,18 +449,16 @@ impl StreamPool {
         // join onto this stream and also create additional streams concurrently if this fills up.
         let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
         let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
-        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
+        let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
         let entry = StreamEntry {
             sender: req_tx.clone(),
             queue_depth: queue_depth.clone(),
         };
         streams.insert(id, entry);
 
-        // NB: make sure we don't overshoot the client limit. The semaphore limit is CLIENT_LIMIT *
-        // STREAM_QUEUE_DEPTH, but if we were to misaccount queue depth we'd try to spin up more
-        // streams than CLIENT_LIMIT and block on the client pool ~forever. This should not happen
-        // because we only acquire queue depth under lock and after acquiring a semaphore permit.
-        assert!(streams.len() <= CLIENT_LIMIT, "stream overflow");
+        if let Some(max_streams) = self.max_streams {
+            assert!(streams.len() <= max_streams.get(), "stream overflow");
+        };
 
         let client_pool = self.client_pool.clone();
         let streams = self.streams.clone();
@@ -484,19 +493,22 @@ impl StreamPool {
         // Acquire a client from the pool and create a stream.
         let mut client = client_pool.get().await?;
 
-        let (req_tx, req_rx) = mpsc::channel(STREAM_QUEUE_DEPTH);
-        let req_stream = tokio_stream::wrappers::ReceiverStream::new(req_rx);
+        // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
+        // theoretically deadlock if both the client and server block on sends (since we're not
+        // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
+        // low queue depths, but it was seen to happen with the libpq protocol so better safe than
+        // sorry. It should never buffer more than the queue depth anyway, but using an unbounded
+        // channel guarantees that it will never block.
+        let (req_tx, req_rx) = mpsc::unbounded_channel();
+        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
         let mut resp_stream = client.get_pages(req_stream).await?;
 
         // Track caller response channels by request ID. If the task returns early, these response
         // channels will be dropped and the waiting callers will receive an error.
-        let mut callers = HashMap::with_capacity(STREAM_QUEUE_DEPTH);
+        let mut callers = HashMap::new();
 
         // Process requests and responses.
         loop {
-            // NB: this can trip if the server doesn't respond to a request, so only debug_assert.
-            debug_assert!(callers.len() <= STREAM_QUEUE_DEPTH, "stream queue overflow");
-
             tokio::select! {
                 // Receive requests from callers and send them to the stream.
                 req = caller_rx.recv() => {
@@ -515,8 +527,8 @@ impl StreamPool {
                     }
                     callers.insert(req.request_id, resp_tx);
 
-                    // Send the request on the stream. Bail out if the send fails.
-                    req_tx.send(req).await.map_err(|_| {
+                    // Send the request on the stream. Bail out if the stream is closed.
+                    req_tx.send(req).map_err(|_| {
                         tonic::Status::unavailable("stream closed")
                     })?;
                 }
@@ -545,7 +557,7 @@ impl StreamPool {
 pub struct StreamGuard {
     sender: RequestSender,
     queue_depth: Arc<AtomicUsize>,
-    permit: OwnedSemaphorePermit,
+    permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }
 
 impl StreamGuard {
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index c5b6f06879..d0d3517d41 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -384,7 +384,7 @@ impl From<GetPageRequest> for proto::GetPageRequest {
 pub type RequestID = u64;
 
 /// A GetPage request class.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, strum_macros::Display)]
 pub enum GetPageClass {
     /// Unknown class. For backwards compatibility: used when an older client version sends a class
     /// that a newer server version has removed.
@@ -397,6 +397,19 @@ pub enum GetPageClass {
     Background,
 }
 
+impl GetPageClass {
+    /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
+    /// latency-sensitive).
+    pub fn is_bulk(&self) -> bool {
+        match self {
+            Self::Unknown => false,
+            Self::Normal => false,
+            Self::Prefetch => true,
+            Self::Background => true,
+        }
+    }
+}
+
 impl From<proto::GetPageClass> for GetPageClass {
     fn from(pb: proto::GetPageClass) -> Self {
         match pb {

From 12c26243fc5e8a4522a8f848923f6e3eb7e0ad7c Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 9 Jul 2025 11:47:21 -0500
Subject: [PATCH 049/163] Fix typo in migration testing related to pg_monitor
 (#12530)

We should be joining on the neon_superuser roleid, not the pg_monitor
roleid.

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
---
 .../tests/0004-grant_pg_monitor_to_neon_superuser.sql           | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
index acb8dd417d..deb7a364af 100644
--- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
@@ -6,7 +6,7 @@ BEGIN
             admin_option AS admin
         INTO monitor
         FROM pg_auth_members
-        WHERE roleid = 'pg_monitor'::regrole
+        WHERE roleid = 'neon_superuser'::regrole
             AND member = 'pg_monitor'::regrole;
 
     IF NOT monitor.member THEN

From 4bbabc092a7b675eb5b624c3d647e216e98dbe2d Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Wed, 9 Jul 2025 21:16:06 +0400
Subject: [PATCH 050/163] tests: wait for flush lsn in
 test_branch_creation_before_gc (#12527)

## Problem
Test `test_branch_creation_before_gc` is flaky in the internal repo.
Pageserver sometimes lags behind write LSN. When we call GC it might not
reach the LSN we try to create the branch at yet.

## Summary of changes
- Wait till flush lsn on pageserver reaches the latest LSN before
calling GC.
---
 test_runner/regress/test_branch_and_gc.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py
index 8447c9bf2d..148f469a95 100644
--- a/test_runner/regress/test_branch_and_gc.py
+++ b/test_runner/regress/test_branch_and_gc.py
@@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 import pytest
 from fixtures.common_types import Lsn, TimelineId
 from fixtures.log_helper import log
+from fixtures.neon_fixtures import wait_for_last_flush_lsn
 from fixtures.pageserver.http import TimelineCreate406
 from fixtures.utils import query_scalar, skip_in_debug_build
 
@@ -162,6 +163,9 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     )
     lsn = Lsn(res[2][0][0])
 
+    # Wait for all WAL to reach the pageserver, so GC cutoff LSN is greater than `lsn`.
+    wait_for_last_flush_lsn(env, endpoint0, tenant, b0)
+
     # Use `failpoint=sleep` and `threading` to make the GC iteration triggers *before* the
     # branch creation task but the individual timeline GC iteration happens *after*
     # the branch creation task.

From fe0ddb7169514b3c755c01a519d1067b696d2925 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 9 Jul 2025 18:41:34 +0100
Subject: [PATCH 051/163] libs: make remote storage failure injection
 probabilistic (#12526)

Change the unreliable storage wrapper to fail by probability when there
are more failure attempts left.

Co-authored-by: Yecheng Yang <carlton.yang@databricks.com>
---
 libs/pageserver_api/src/config.rs            |  2 +
 libs/remote_storage/Cargo.toml               |  1 +
 libs/remote_storage/src/lib.rs               | 10 +++-
 libs/remote_storage/src/simulate_failures.rs | 31 ++++++++--
 libs/utils/src/env.rs                        | 59 ++++++++++++++++++++
 pageserver/src/bin/pageserver.rs             |  7 ++-
 pageserver/src/config.rs                     |  6 ++
 proxy/src/context/parquet.rs                 |  2 +-
 8 files changed, 107 insertions(+), 11 deletions(-)

diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index dc7e9aed7f..22815955c1 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -226,6 +226,7 @@ pub struct ConfigToml {
     pub synthetic_size_calculation_interval: Duration,
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
     pub test_remote_failures: u64,
+    pub test_remote_failures_probability: u64,
     pub ondemand_download_behavior_treat_error_as_warn: bool,
     #[serde(with = "humantime_serde")]
     pub background_task_maximum_delay: Duration,
@@ -758,6 +759,7 @@ impl Default for ConfigToml {
             disk_usage_based_eviction: DiskUsageEvictionTaskConfig::default(),
 
             test_remote_failures: (0),
+            test_remote_failures_probability: (100),
 
             ondemand_download_behavior_treat_error_as_warn: (false),
 
diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml
index 69316fd493..0ae13552b8 100644
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -43,6 +43,7 @@ itertools.workspace = true
 sync_wrapper = { workspace = true, features = ["futures"] }
 
 byteorder = "1.4"
+rand = "0.8.5"
 
 [dev-dependencies]
 camino-tempfile.workspace = true
diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs
index ed416b2811..5885c3e791 100644
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -732,9 +732,15 @@ impl GenericRemoteStorage {
         })
     }
 
-    pub fn unreliable_wrapper(s: Self, fail_first: u64) -> Self {
-        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
+    /* BEGIN_HADRON */
+    pub fn unreliable_wrapper(s: Self, fail_first: u64, fail_probability: u64) -> Self {
+        Self::Unreliable(Arc::new(UnreliableWrapper::new(
+            s,
+            fail_first,
+            fail_probability,
+        )))
     }
+    /* END_HADRON */
 
     /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
     pub async fn upload_storage_object(
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index f9856a5856..30d116f57c 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use rand::Rng;
+use std::cmp;
 use std::collections::HashMap;
 use std::collections::hash_map::Entry;
 use std::num::NonZeroU32;
@@ -25,6 +27,12 @@ pub struct UnreliableWrapper {
 
     // Tracks how many failed attempts of each operation has been made.
     attempts: Mutex<HashMap<RemoteOp, u64>>,
+
+    /* BEGIN_HADRON */
+    // This the probability of failure for each operation, ranged from [0, 100].
+    // The probability is default to 100, which means that all operations will fail.
+    attempt_failure_probability: u64,
+    /* END_HADRON */
 }
 
 /// Used to identify retries of different unique operation.
@@ -40,7 +48,11 @@ enum RemoteOp {
 }
 
 impl UnreliableWrapper {
-    pub fn new(inner: crate::GenericRemoteStorage, attempts_to_fail: u64) -> Self {
+    pub fn new(
+        inner: crate::GenericRemoteStorage,
+        attempts_to_fail: u64,
+        attempt_failure_probability: u64,
+    ) -> Self {
         assert!(attempts_to_fail > 0);
         let inner = match inner {
             GenericRemoteStorage::AwsS3(s) => GenericRemoteStorage::AwsS3(s),
@@ -51,9 +63,11 @@ impl UnreliableWrapper {
                 panic!("Can't wrap unreliable wrapper unreliably")
             }
         };
+        let actual_attempt_failure_probability = cmp::min(attempt_failure_probability, 100);
         UnreliableWrapper {
             inner,
             attempts_to_fail,
+            attempt_failure_probability: actual_attempt_failure_probability,
             attempts: Mutex::new(HashMap::new()),
         }
     }
@@ -66,6 +80,7 @@ impl UnreliableWrapper {
     ///
     fn attempt(&self, op: RemoteOp) -> anyhow::Result<u64> {
         let mut attempts = self.attempts.lock().unwrap();
+        let mut rng = rand::thread_rng();
 
         match attempts.entry(op) {
             Entry::Occupied(mut e) => {
@@ -75,15 +90,19 @@ impl UnreliableWrapper {
                     *p
                 };
 
-                if attempts_before_this >= self.attempts_to_fail {
-                    // let it succeed
-                    e.remove();
-                    Ok(attempts_before_this)
-                } else {
+                /* BEGIN_HADRON */
+                // If there are more attempts to fail, fail the request by probability.
+                if (attempts_before_this < self.attempts_to_fail)
+                    && (rng.gen_range(0..=100) < self.attempt_failure_probability)
+                {
                     let error =
                         anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
                     Err(error)
+                } else {
+                    e.remove();
+                    Ok(attempts_before_this)
                 }
+                /* END_HADRON */
             }
             Entry::Vacant(e) => {
                 let error = anyhow::anyhow!("simulated failure of remote operation {:?}", e.key());
diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs
index 2a85f54a01..cc1cbf8009 100644
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -44,3 +44,62 @@ where
         }
     }
 }
+
+/* BEGIN_HADRON */
+pub enum DeploymentMode {
+    Dev,
+    Staging,
+    Prod,
+}
+
+pub fn get_deployment_mode() -> Option<DeploymentMode> {
+    match std::env::var("DEPLOYMENT_MODE") {
+        Ok(env) => match env.as_str() {
+            "development" => Some(DeploymentMode::Dev),
+            "staging" => Some(DeploymentMode::Staging),
+            "production" => Some(DeploymentMode::Prod),
+            _ => {
+                tracing::error!("Unexpected DEPLOYMENT_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            tracing::error!("DEPLOYMENT_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_dev_or_staging() -> bool {
+    matches!(
+        get_deployment_mode(),
+        Some(DeploymentMode::Dev) | Some(DeploymentMode::Staging)
+    )
+}
+
+pub enum TestingMode {
+    Chaos,
+    Stress,
+}
+
+pub fn get_test_mode() -> Option<TestingMode> {
+    match std::env::var("HADRON_TEST_MODE") {
+        Ok(env) => match env.as_str() {
+            "chaos" => Some(TestingMode::Chaos),
+            "stress" => Some(TestingMode::Stress),
+            _ => {
+                tracing::error!("Unexpected HADRON_TEST_MODE: {}", env);
+                None
+            }
+        },
+        Err(_) => {
+            tracing::error!("HADRON_TEST_MODE not set");
+            None
+        }
+    }
+}
+
+pub fn is_chaos_testing() -> bool {
+    matches!(get_test_mode(), Some(TestingMode::Chaos))
+}
+/* END_HADRON */
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 327384fd82..78aba25d2e 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -889,8 +889,11 @@ async fn create_remote_storage_client(
             "Simulating remote failures for first {} attempts of each op",
             conf.test_remote_failures
         );
-        remote_storage =
-            GenericRemoteStorage::unreliable_wrapper(remote_storage, conf.test_remote_failures);
+        remote_storage = GenericRemoteStorage::unreliable_wrapper(
+            remote_storage,
+            conf.test_remote_failures,
+            conf.test_remote_failures_probability,
+        );
     }
 
     Ok(remote_storage)
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 99d7e0ca3a..15ec31b0a6 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -147,7 +147,11 @@ pub struct PageServerConf {
 
     pub disk_usage_based_eviction: DiskUsageEvictionTaskConfig,
 
+    // The number of allowed failures in remote storage operations.
     pub test_remote_failures: u64,
+    // The probability of failure in remote storage operations. Only works when test_remote_failures > 1.
+    // Use 100 for 100% failure, 0 for no failure.
+    pub test_remote_failures_probability: u64,
 
     pub ondemand_download_behavior_treat_error_as_warn: bool,
 
@@ -392,6 +396,7 @@ impl PageServerConf {
             synthetic_size_calculation_interval,
             disk_usage_based_eviction,
             test_remote_failures,
+            test_remote_failures_probability,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
             control_plane_api,
@@ -461,6 +466,7 @@ impl PageServerConf {
             synthetic_size_calculation_interval,
             disk_usage_based_eviction,
             test_remote_failures,
+            test_remote_failures_probability,
             ondemand_download_behavior_treat_error_as_warn,
             background_task_maximum_delay,
             control_plane_api: control_plane_api
diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs
index b55cc14532..4d8df19476 100644
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -267,7 +267,7 @@ async fn worker_inner(
 ) -> anyhow::Result<()> {
     #[cfg(any(test, feature = "testing"))]
     let storage = if config.test_remote_failures > 0 {
-        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures)
+        GenericRemoteStorage::unreliable_wrapper(storage, config.test_remote_failures, 100)
     } else {
         storage
     };

From 28f604d628bfefa26b3016421756f91c8b6b2817 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 9 Jul 2025 13:45:50 -0500
Subject: [PATCH 052/163] Make pg_monitor neon_superuser test more robust
 (#12532)

Make sure to check for NULL just in case.

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
Co-authored-by: Vikas Jain <vikas.jain@databricks.com>
---
 .../0004-grant_pg_monitor_to_neon_superuser.sql      | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
index deb7a364af..3464a2b1cf 100644
--- a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
@@ -6,14 +6,18 @@ BEGIN
             admin_option AS admin
         INTO monitor
         FROM pg_auth_members
-        WHERE roleid = 'neon_superuser'::regrole
-            AND member = 'pg_monitor'::regrole;
+        WHERE roleid = 'pg_monitor'::regrole
+            AND member = 'neon_superuser'::regrole;
 
-    IF NOT monitor.member THEN
+    IF monitor IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_monitor';
+    END IF;
+
+    IF monitor.admin IS NULL OR NOT monitor.member THEN
         RAISE EXCEPTION 'neon_superuser is not a member of pg_monitor';
     END IF;
 
-    IF NOT monitor.admin THEN
+    IF monitor.admin IS NULL OR NOT monitor.admin THEN
         RAISE EXCEPTION 'neon_superuser cannot grant pg_monitor';
     END IF;
 END $$;

From 0b639ba608d65b90df468346e72777e8953a4f42 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 9 Jul 2025 16:22:55 -0400
Subject: [PATCH 053/163] fix(storcon): correctly pass through lease error code
 (#12519)

## Problem

close LKB-199

## Summary of changes

We always return the error as 500 to the cplane if a LSN lease request
fails. This cause issues for the cplane as they don't retry on 500. This
patch correctly passes through the error and assign the error code so
that cplane can know if it is a retryable error. (TODO: look at the
cplane code and learn the retry logic).

Note that this patch does not resolve LKB-253 -- we need to handle not
found error separately in the lsn lease path, like wait until the tenant
gets attached, or return 503 so that cplane can retry.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_controller/src/service.rs | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ed6643d641..d2f7287be9 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4428,7 +4428,7 @@ impl Service {
                 .await;
 
             let mut failed = 0;
-            for (tid, result) in targeted_tenant_shards.iter().zip(results.into_iter()) {
+            for (tid, (_, result)) in targeted_tenant_shards.iter().zip(results.into_iter()) {
                 match result {
                     Ok(ok) => {
                         if tid.is_shard_zero() {
@@ -4795,7 +4795,7 @@ impl Service {
             .await;
 
         let mut valid_until = None;
-        for r in res {
+        for (node, r) in res {
             match r {
                 Ok(lease) => {
                     if let Some(ref mut valid_until) = valid_until {
@@ -4805,7 +4805,7 @@ impl Service {
                     }
                 }
                 Err(e) => {
-                    return Err(ApiError::InternalServerError(anyhow::anyhow!(e)));
+                    return Err(passthrough_api_error(&node, e));
                 }
             }
         }
@@ -4919,7 +4919,7 @@ impl Service {
         max_retries: u32,
         timeout: Duration,
         cancel: &CancellationToken,
-    ) -> Vec<mgmt_api::Result<T>>
+    ) -> Vec<(Node, mgmt_api::Result<T>)>
     where
         O: Fn(TenantShardId, PageserverClient) -> F + Copy,
         F: std::future::Future<Output = mgmt_api::Result<T>>,
@@ -4940,16 +4940,16 @@ impl Service {
                         cancel,
                     )
                     .await;
-                (idx, r)
+                (idx, node, r)
             });
         }
 
-        while let Some((idx, r)) = futs.next().await {
-            results.push((idx, r.unwrap_or(Err(mgmt_api::Error::Cancelled))));
+        while let Some((idx, node, r)) = futs.next().await {
+            results.push((idx, node, r.unwrap_or(Err(mgmt_api::Error::Cancelled))));
         }
 
-        results.sort_by_key(|(idx, _)| *idx);
-        results.into_iter().map(|(_, r)| r).collect()
+        results.sort_by_key(|(idx, _, _)| *idx);
+        results.into_iter().map(|(_, node, r)| (node, r)).collect()
     }
 
     /// Helper for safely working with the shards in a tenant remotely on pageservers, for example
@@ -5862,7 +5862,7 @@ impl Service {
             return;
         }
 
-        for result in self
+        for (_, result) in self
             .tenant_for_shards_api(
                 attached,
                 |tenant_shard_id, client| async move {
@@ -5881,7 +5881,7 @@ impl Service {
             }
         }
 
-        for result in self
+        for (_, result) in self
             .tenant_for_shards_api(
                 secondary,
                 |tenant_shard_id, client| async move {
@@ -8768,7 +8768,7 @@ impl Service {
             )
             .await;
 
-        for ((tenant_shard_id, node, optimization), secondary_status) in
+        for ((tenant_shard_id, node, optimization), (_, secondary_status)) in
             want_secondary_status.into_iter().zip(results.into_iter())
         {
             match secondary_status {

From 2edd59aefbcd79288735cbbed335a27880597529 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Wed, 9 Jul 2025 23:15:44 +0200
Subject: [PATCH 054/163] impr(compaction): unify checking of `CompactionError`
 for cancellation reason (#12392)

There are a couple of places that call `CompactionError::is_cancel` but
don't check the `::Other` variant via downcasting for root cause being
cancellation.
The only place that does it is `log_compaction_error`.
It's sad we have to do it, but, until we get around cleaning up all the
culprits,
a step forward is to unify the behavior so that all places that inspect
a
`CompactionError` for cancellation reason follow the same behavior.

Thus, this PR ...
- moves the downcasting checks against the `::Other` variant from
  `log_compaction_error` into `is_cancel()` and
- enforces via type system that `.is_cancel()` is used to check whether
  a CompactionError is due to cancellation (matching on the
  `CompactionError::ShuttingDown` will cause a compile-time error).

I don't think there's a _serious_ case right now where matching instead
of using `is_cancel` causes problems.
The worst case I could find is the circuit breaker and
`compaction_failed`,
which don't really matter if we're shutting down the timeline anyway.
But it's unaesthetic and might cause log/alert noise down the line,
so, this PR fixes that at least.

Refs
- https://databricks.atlassian.net/browse/LKB-182
- slack conversation about this PR:
https://databricks.slack.com/archives/C09254R641L/p1751284317955159
---
 pageserver/src/http/routes.rs                |  11 +-
 pageserver/src/tenant.rs                     |  17 ++-
 pageserver/src/tenant/tasks.rs               |  46 +-------
 pageserver/src/tenant/timeline.rs            | 115 ++++++++++++++-----
 pageserver/src/tenant/timeline/compaction.rs |  26 ++---
 5 files changed, 117 insertions(+), 98 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3612686b5d..767bba49e2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -79,8 +79,8 @@ use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerNa
 use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
 use crate::tenant::timeline::{
-    CompactFlags, CompactOptions, CompactRequest, CompactionError, MarkInvisibleRequest, Timeline,
-    WaitLsnTimeout, WaitLsnWaiter, import_pgdata,
+    CompactFlags, CompactOptions, CompactRequest, MarkInvisibleRequest, Timeline, WaitLsnTimeout,
+    WaitLsnWaiter, import_pgdata,
 };
 use crate::tenant::{
     GetTimelineError, LogicalSizeCalculationCause, OffloadedTimeline, PageReconstructError,
@@ -2500,9 +2500,10 @@ async fn timeline_checkpoint_handler(
                 .compact(&cancel, flags, &ctx)
                 .await
                 .map_err(|e|
-                    match e {
-                        CompactionError::ShuttingDown => ApiError::ShuttingDown,
-                        CompactionError::Other(e) => ApiError::InternalServerError(e),
+                    if e.is_cancel() {
+                        ApiError::ShuttingDown
+                    } else {
+                        ApiError::InternalServerError(e.into_anyhow())
                     }
                 )?;
         }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f576119db8..240ba36236 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3291,7 +3291,7 @@ impl TenantShard {
                         // Ignore this, we likely raced with unarchival.
                         OffloadError::NotArchived => Ok(()),
                         OffloadError::AlreadyInProgress => Ok(()),
-                        OffloadError::Cancelled => Err(CompactionError::ShuttingDown),
+                        OffloadError::Cancelled => Err(CompactionError::new_cancelled()),
                         // don't break the anyhow chain
                         OffloadError::Other(err) => Err(CompactionError::Other(err)),
                     })?;
@@ -3321,16 +3321,13 @@ impl TenantShard {
 
     /// Trips the compaction circuit breaker if appropriate.
     pub(crate) fn maybe_trip_compaction_breaker(&self, err: &CompactionError) {
-        match err {
-            err if err.is_cancel() => {}
-            CompactionError::ShuttingDown => (),
-            CompactionError::Other(err) => {
-                self.compaction_circuit_breaker
-                    .lock()
-                    .unwrap()
-                    .fail(&CIRCUIT_BREAKERS_BROKEN, err);
-            }
+        if err.is_cancel() {
+            return;
         }
+        self.compaction_circuit_breaker
+            .lock()
+            .unwrap()
+            .fail(&CIRCUIT_BREAKERS_BROKEN, err);
     }
 
     /// Cancel scheduled compaction tasks
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index bcece5589a..08fc7d61a5 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -17,17 +17,14 @@ use tracing::*;
 use utils::backoff::exponential_backoff_duration;
 use utils::completion::Barrier;
 use utils::pausable_failpoint;
-use utils::sync::gate::GateError;
 
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::{self, BackgroundLoopSemaphoreMetricsRecorder, TENANT_TASK_EVENTS};
 use crate::task_mgr::{self, BACKGROUND_RUNTIME, TOKIO_WORKER_THREADS, TaskKind};
-use crate::tenant::blob_io::WriteBlobError;
 use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::timeline::compaction::CompactionOutcome;
 use crate::tenant::{TenantShard, TenantState};
-use crate::virtual_file::owned_buffers_io::write::FlushTaskError;
 
 /// Semaphore limiting concurrent background tasks (across all tenants).
 ///
@@ -310,45 +307,12 @@ pub(crate) fn log_compaction_error(
     task_cancelled: bool,
     degrade_to_warning: bool,
 ) {
-    use CompactionError::*;
+    let is_cancel = err.is_cancel();
 
-    use crate::tenant::PageReconstructError;
-    use crate::tenant::upload_queue::NotInitialized;
-
-    let level = match err {
-        e if e.is_cancel() => return,
-        ShuttingDown => return,
-        _ if task_cancelled => Level::INFO,
-        Other(err) => {
-            let root_cause = err.root_cause();
-
-            let upload_queue = root_cause
-                .downcast_ref::<NotInitialized>()
-                .is_some_and(|e| e.is_stopping());
-            let timeline = root_cause
-                .downcast_ref::<PageReconstructError>()
-                .is_some_and(|e| e.is_cancel());
-            let buffered_writer_flush_task_canelled = root_cause
-                .downcast_ref::<FlushTaskError>()
-                .is_some_and(|e| e.is_cancel());
-            let write_blob_cancelled = root_cause
-                .downcast_ref::<WriteBlobError>()
-                .is_some_and(|e| e.is_cancel());
-            let gate_closed = root_cause
-                .downcast_ref::<GateError>()
-                .is_some_and(|e| e.is_cancel());
-            let is_stopping = upload_queue
-                || timeline
-                || buffered_writer_flush_task_canelled
-                || write_blob_cancelled
-                || gate_closed;
-
-            if is_stopping {
-                Level::INFO
-            } else {
-                Level::ERROR
-            }
-        }
+    let level = if is_cancel || task_cancelled {
+        Level::INFO
+    } else {
+        Level::ERROR
     };
 
     if let Some((error_count, sleep_duration)) = retry_info {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 6088f40669..0a026d288e 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1002,7 +1002,7 @@ impl From<WaitLsnError> for tonic::Status {
 impl From<CreateImageLayersError> for CompactionError {
     fn from(e: CreateImageLayersError) -> Self {
         match e {
-            CreateImageLayersError::Cancelled => CompactionError::ShuttingDown,
+            CreateImageLayersError::Cancelled => CompactionError::new_cancelled(),
             CreateImageLayersError::Other(e) => {
                 CompactionError::Other(e.context("create image layers"))
             }
@@ -2117,12 +2117,7 @@ impl Timeline {
         match &result {
             Ok(_) => self.compaction_failed.store(false, AtomicOrdering::Relaxed),
             Err(e) if e.is_cancel() => {}
-            Err(CompactionError::ShuttingDown) => {
-                // Covered by the `Err(e) if e.is_cancel()` branch.
-            }
-            Err(CompactionError::Other(_)) => {
-                self.compaction_failed.store(true, AtomicOrdering::Relaxed)
-            }
+            Err(_) => self.compaction_failed.store(true, AtomicOrdering::Relaxed),
         };
 
         result
@@ -6057,26 +6052,88 @@ impl Drop for Timeline {
     }
 }
 
-/// Top-level failure to compact.
-#[derive(Debug, thiserror::Error)]
-pub(crate) enum CompactionError {
-    #[error("The timeline or pageserver is shutting down")]
-    ShuttingDown,
-    #[error(transparent)]
-    Other(anyhow::Error),
-}
+pub(crate) use compaction_error::CompactionError;
+/// In a private mod to enforce that [`CompactionError::is_cancel`] is used
+/// instead of `match`ing on [`CompactionError::ShuttingDown`].
+mod compaction_error {
+    use utils::sync::gate::GateError;
 
-impl CompactionError {
-    /// Errors that can be ignored, i.e., cancel and shutdown.
-    pub fn is_cancel(&self) -> bool {
-        matches!(self, Self::ShuttingDown)
+    use crate::{
+        pgdatadir_mapping::CollectKeySpaceError,
+        tenant::{PageReconstructError, blob_io::WriteBlobError, upload_queue::NotInitialized},
+        virtual_file::owned_buffers_io::write::FlushTaskError,
+    };
+
+    /// Top-level failure to compact. Use [`Self::is_cancel`].
+    #[derive(Debug, thiserror::Error)]
+    pub(crate) enum CompactionError {
+        /// Use [`Self::is_cancel`] instead of checking for this variant.
+        #[error("The timeline or pageserver is shutting down")]
+        #[allow(private_interfaces)]
+        ShuttingDown(ForbidMatching), // private ForbidMatching enforces use of [`Self::is_cancel`].
+        #[error(transparent)]
+        Other(anyhow::Error),
     }
 
-    pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
-        if err.is_cancel() {
-            Self::ShuttingDown
-        } else {
-            Self::Other(err.into_anyhow())
+    #[derive(Debug)]
+    struct ForbidMatching;
+
+    impl CompactionError {
+        pub fn new_cancelled() -> Self {
+            Self::ShuttingDown(ForbidMatching)
+        }
+        /// Errors that can be ignored, i.e., cancel and shutdown.
+        pub fn is_cancel(&self) -> bool {
+            let other = match self {
+                CompactionError::ShuttingDown(_) => return true,
+                CompactionError::Other(other) => other,
+            };
+
+            // The write path of compaction in particular often lacks differentiated
+            // handling errors stemming from cancellation from other errors.
+            // So, if requested, we also check the ::Other variant by downcasting.
+            // The list below has been found empirically from flaky tests and production logs.
+            // The process is simple: on ::Other(), compaction will print the enclosed
+            // anyhow::Error in debug mode, i.e., with backtrace. That backtrace contains the
+            // line where the write path / compaction code does undifferentiated error handling
+            // from a non-anyhow type to an anyhow type. Add the type to the list of downcasts
+            // below, following the same is_cancel() pattern.
+
+            let root_cause = other.root_cause();
+
+            let upload_queue = root_cause
+                .downcast_ref::<NotInitialized>()
+                .is_some_and(|e| e.is_stopping());
+            let timeline = root_cause
+                .downcast_ref::<PageReconstructError>()
+                .is_some_and(|e| e.is_cancel());
+            let buffered_writer_flush_task_canelled = root_cause
+                .downcast_ref::<FlushTaskError>()
+                .is_some_and(|e| e.is_cancel());
+            let write_blob_cancelled = root_cause
+                .downcast_ref::<WriteBlobError>()
+                .is_some_and(|e| e.is_cancel());
+            let gate_closed = root_cause
+                .downcast_ref::<GateError>()
+                .is_some_and(|e| e.is_cancel());
+            upload_queue
+                || timeline
+                || buffered_writer_flush_task_canelled
+                || write_blob_cancelled
+                || gate_closed
+        }
+        pub fn into_anyhow(self) -> anyhow::Error {
+            match self {
+                CompactionError::ShuttingDown(ForbidMatching) => anyhow::Error::new(self),
+                CompactionError::Other(e) => e,
+            }
+        }
+        pub fn from_collect_keyspace(err: CollectKeySpaceError) -> Self {
+            if err.is_cancel() {
+                Self::new_cancelled()
+            } else {
+                Self::Other(err.into_anyhow())
+            }
         }
     }
 }
@@ -6088,7 +6145,7 @@ impl From<super::upload_queue::NotInitialized> for CompactionError {
                 CompactionError::Other(anyhow::anyhow!(value))
             }
             super::upload_queue::NotInitialized::ShuttingDown
-            | super::upload_queue::NotInitialized::Stopped => CompactionError::ShuttingDown,
+            | super::upload_queue::NotInitialized::Stopped => CompactionError::new_cancelled(),
         }
     }
 }
@@ -6098,7 +6155,7 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
         match e {
             super::storage_layer::layer::DownloadError::TimelineShutdown
             | super::storage_layer::layer::DownloadError::DownloadCancelled => {
-                CompactionError::ShuttingDown
+                CompactionError::new_cancelled()
             }
             super::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads
             | super::storage_layer::layer::DownloadError::DownloadRequired
@@ -6117,14 +6174,14 @@ impl From<super::storage_layer::layer::DownloadError> for CompactionError {
 
 impl From<layer_manager::Shutdown> for CompactionError {
     fn from(_: layer_manager::Shutdown) -> Self {
-        CompactionError::ShuttingDown
+        CompactionError::new_cancelled()
     }
 }
 
 impl From<super::storage_layer::errors::PutError> for CompactionError {
     fn from(e: super::storage_layer::errors::PutError) -> Self {
         if e.is_cancel() {
-            CompactionError::ShuttingDown
+            CompactionError::new_cancelled()
         } else {
             CompactionError::Other(e.into_anyhow())
         }
@@ -6223,7 +6280,7 @@ impl Timeline {
         let mut guard = tokio::select! {
             guard = self.layers.write(LayerManagerLockHolder::Compaction) => guard,
             _ = self.cancel.cancelled() => {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
         };
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index c263df1eb2..18a0ca852d 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -572,8 +572,8 @@ impl GcCompactionQueue {
         }
         match res {
             Ok(res) => Ok(res),
-            Err(CompactionError::ShuttingDown) => Err(CompactionError::ShuttingDown),
-            Err(CompactionError::Other(_)) => {
+            Err(e) if e.is_cancel() => Err(e),
+            Err(_) => {
                 // There are some cases where traditional gc might collect some layer
                 // files causing gc-compaction cannot read the full history of the key.
                 // This needs to be resolved in the long-term by improving the compaction
@@ -1260,7 +1260,7 @@ impl Timeline {
         // Is the timeline being deleted?
         if self.is_stopping() {
             trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
         }
 
         let target_file_size = self.get_checkpoint_distance();
@@ -1624,7 +1624,7 @@ impl Timeline {
 
         for (i, layer) in layers_to_rewrite.into_iter().enumerate() {
             if self.cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
 
             info!(layer=%layer, "rewriting layer after shard split: {}/{}", i, total);
@@ -1722,7 +1722,7 @@ impl Timeline {
                     Ok(()) => {},
                     Err(WaitCompletionError::NotInitialized(ni)) => return Err(CompactionError::from(ni)),
                     Err(WaitCompletionError::UploadQueueShutDownOrStopped) => {
-                        return Err(CompactionError::ShuttingDown);
+                        return Err(CompactionError::new_cancelled());
                     }
                 },
                 // Don't wait if there's L0 compaction to do. We don't need to update the outcome
@@ -1985,7 +1985,7 @@ impl Timeline {
             let mut all_keys = Vec::new();
             for l in deltas_to_compact.iter() {
                 if self.cancel.is_cancelled() {
-                    return Err(CompactionError::ShuttingDown);
+                    return Err(CompactionError::new_cancelled());
                 }
                 let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?;
                 let keys = delta
@@ -2078,7 +2078,7 @@ impl Timeline {
         stats.read_lock_held_compute_holes_micros = stats.read_lock_held_key_sort_micros.till_now();
 
         if self.cancel.is_cancelled() {
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
         }
 
         stats.read_lock_drop_micros = stats.read_lock_held_compute_holes_micros.till_now();
@@ -2186,7 +2186,7 @@ impl Timeline {
                 // avoid hitting the cancellation token on every key. in benches, we end up
                 // shuffling an order of million keys per layer, this means we'll check it
                 // around tens of times per layer.
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
 
             let same_key = prev_key == Some(key);
@@ -2271,7 +2271,7 @@ impl Timeline {
                 if writer.is_none() {
                     if self.cancel.is_cancelled() {
                         // to be somewhat responsive to cancellation, check for each new layer
-                        return Err(CompactionError::ShuttingDown);
+                        return Err(CompactionError::new_cancelled());
                     }
                     // Create writer if not initiaized yet
                     writer = Some(
@@ -2527,7 +2527,7 @@ impl Timeline {
         // Is the timeline being deleted?
         if self.is_stopping() {
             trace!("Dropping out of compaction on timeline shutdown");
-            return Err(CompactionError::ShuttingDown);
+            return Err(CompactionError::new_cancelled());
         }
 
         let (dense_ks, _sparse_ks) = self
@@ -3189,7 +3189,7 @@ impl Timeline {
         let gc_lock = async {
             tokio::select! {
                 guard = self.gc_lock.lock() => Ok(guard),
-                _ = cancel.cancelled() => Err(CompactionError::ShuttingDown),
+                _ = cancel.cancelled() => Err(CompactionError::new_cancelled()),
             }
         };
 
@@ -3462,7 +3462,7 @@ impl Timeline {
             }
             total_layer_size += layer.layer_desc().file_size;
             if cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
             let should_yield = yield_for_l0
                 && self
@@ -3609,7 +3609,7 @@ impl Timeline {
             }
 
             if cancel.is_cancelled() {
-                return Err(CompactionError::ShuttingDown);
+                return Err(CompactionError::new_cancelled());
             }
 
             let should_yield = yield_for_l0

From 13e38a58a14c60da94486904d60a8b9e8e391503 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 9 Jul 2025 16:35:39 -0500
Subject: [PATCH 055/163] Grant pg_signal_backend to neon_superuser (#12533)

Allow neon_superuser to cancel backends from non-neon_superusers,
excluding Postgres superusers.

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
Co-authored-by: Vikas Jain <vikas.jain@databricks.com>
---
 ...nt_pg_signal_backend_to_neon_superuser.sql |  1 +
 ...nt_pg_signal_backend_to_neon_superuser.sql | 23 +++++++++++++++++++
 compute_tools/src/spec.rs                     |  1 +
 3 files changed, 25 insertions(+)
 create mode 100644 compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
 create mode 100644 compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql

diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
new file mode 100644
index 0000000000..36e31544be
--- /dev/null
+++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1 @@
+GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
new file mode 100644
index 0000000000..e62b742d30
--- /dev/null
+++ b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
@@ -0,0 +1,23 @@
+DO $$
+DECLARE
+    signal_backend record;
+BEGIN
+    SELECT pg_has_role('neon_superuser', 'pg_signal_backend', 'member') AS member,
+            admin_option AS admin
+        INTO signal_backend
+        FROM pg_auth_members
+        WHERE roleid = 'pg_signal_backend'::regrole
+            AND member = 'neon_superuser'::regrole;
+
+    IF signal_backend IS NULL THEN
+        RAISE EXCEPTION 'no entry in pg_auth_members for neon_superuser and pg_signal_backend';
+    END IF;
+
+    IF signal_backend.member IS NULL OR NOT signal_backend.member THEN
+        RAISE EXCEPTION 'neon_superuser is not a member of pg_signal_backend';
+    END IF;
+
+    IF signal_backend.admin IS NULL OR NOT signal_backend.admin THEN
+        RAISE EXCEPTION 'neon_superuser cannot grant pg_signal_backend';
+    END IF;
+END $$;
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index 43cfbb48f7..b6382b2f56 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -197,6 +197,7 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
         include_str!(
             "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
         ),
+        include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
     ];
 
     MigrationRunner::new(client, &migrations)

From 1a45b2ec900b37c40608ca94a1d9d37cad12fce8 Mon Sep 17 00:00:00 2001
From: Dimitri Fontaine <dim@tapoueh.org>
Date: Thu, 10 Jul 2025 10:06:33 +0200
Subject: [PATCH 056/163] Review security model for executing Event Trigger
 code. (#12463)

When a function is owned by a superuser (bootstrap user or otherwise),
we consider it safe to run it. Only a superuser could have installed it,
typically from CREATE EXTENSION script: we trust the code to execute.

## Problem

This is intended to solve running pg_graphql Event Triggers
graphql_watch_ddl and graphql_watch_drop which are executing the secdef
function graphql.increment_schema_version().

## Summary of changes

Allow executing Event Trigger function owned by a superuser and with
SECURITY DEFINER properties. The Event Trigger code runs with superuser
privileges, and we consider that it's fine.

---------

Co-authored-by: Tristan Partin <tristan.partin@databricks.com>
---
 pgxn/neon/neon_ddl_handler.c                  |  33 +-----
 test_runner/fixtures/neon_fixtures.py         |  27 +++++
 .../test_event_trigger_extension--1.0.sql     |  32 ++++++
 .../test_event_trigger_extension.control      |   8 ++
 .../regress/test_download_extensions.py       |  22 ----
 .../regress/test_event_trigger_extension.py   | 102 ++++++++++++++++++
 6 files changed, 174 insertions(+), 50 deletions(-)
 create mode 100644 test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
 create mode 100644 test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
 create mode 100644 test_runner/regress/test_event_trigger_extension.py

diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c
index 2ce7b0086b..1f03e52c67 100644
--- a/pgxn/neon/neon_ddl_handler.c
+++ b/pgxn/neon/neon_ddl_handler.c
@@ -953,7 +953,9 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 
 	/*
 	 * Fire Event Trigger if both function owner and current user are
-	 * superuser, or none of them are.
+	 * superuser. Allow executing Event Trigger function that belongs to a
+	 * superuser when connected as a non-superuser, even when the function is
+	 * SECURITY DEFINER.
 	 */
     else if (event == FHET_START
 		/* still enable it to pass pg_regress tests */
@@ -976,32 +978,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		function_is_owned_by_super = superuser_arg(function_owner);
 
 		/*
-		 * 1. Refuse to run SECURITY DEFINER function that belongs to a
-		 * superuser when the current user is not a superuser itself.
-		 */
-		if (!role_is_super
-			&& function_is_owned_by_super
-			&& function_is_secdef)
-		{
-			char *func_name = get_func_name(flinfo->fn_oid);
-
-			ereport(WARNING,
-					(errmsg("Skipping Event Trigger"),
-					 errdetail("Event Trigger function \"%s\" is owned by \"%s\" "
-							   "and is SECURITY DEFINER",
-							   func_name,
-							   GetUserNameFromId(function_owner, false))));
-
-			/*
-			 * we can't skip execution directly inside the fmgr_hook so
-			 * instead we change the event trigger function to a noop
-			 * function.
-			 */
-			force_noop(flinfo);
-		}
-
-		/*
-		 * 2. Refuse to run functions that belongs to a non-superuser when the
+		 * Refuse to run functions that belongs to a non-superuser when the
 		 * current user is a superuser.
 		 *
 		 * We could run a SECURITY DEFINER user-function here and be safe with
@@ -1009,7 +986,7 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		 * infrastructure maintenance operations, where we prefer to skip
 		 * running user-defined code.
 		 */
-		else if (role_is_super && !function_is_owned_by_super)
+		if (role_is_super && !function_is_owned_by_super)
 		{
 			char *func_name = get_func_name(flinfo->fn_oid);
 
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index f54d5be635..42924f9b83 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1795,6 +1795,33 @@ def neon_env_builder(
         record_property("preserve_database_files", builder.preserve_database_files)
 
 
+@pytest.fixture(scope="function")
+def neon_env_builder_local(
+    neon_env_builder: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_distrib_dir: Path,
+) -> NeonEnvBuilder:
+    """
+    Fixture to create a Neon environment for test with its own pg_install copy.
+
+    This allows the test to edit the list of available extensions in the
+    local instance of Postgres used for the test, and install extensions via
+    downloading them when a remote extension is tested, for instance, or
+    copying files around for local extension testing.
+    """
+    test_local_pginstall = test_output_dir / "pg_install"
+    log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
+
+    # We can't copy only the version that we are currently testing because other
+    # binaries like the storage controller need specific Postgres versions.
+    shutil.copytree(pg_distrib_dir, test_local_pginstall)
+
+    neon_env_builder.pg_distrib_dir = test_local_pginstall
+    log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
+
+    return neon_env_builder
+
+
 @dataclass
 class PageserverPort:
     pg: int
diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
new file mode 100644
index 0000000000..2b82102802
--- /dev/null
+++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension--1.0.sql
@@ -0,0 +1,32 @@
+\echo Use "CREATE EXTENSION test_event_trigger_extension" to load this file. \quit
+
+CREATE SCHEMA event_trigger;
+
+create sequence if not exists event_trigger.seq_schema_version as int cycle;
+
+create or replace function event_trigger.increment_schema_version()
+    returns event_trigger
+    security definer
+    language plpgsql
+as $$
+begin
+    perform pg_catalog.nextval('event_trigger.seq_schema_version');
+end;
+$$;
+
+create or replace function event_trigger.get_schema_version()
+    returns int
+    security definer
+    language sql
+as $$
+    select last_value from event_trigger.seq_schema_version;
+$$;
+
+-- On DDL event, increment the schema version number
+create event trigger event_trigger_watch_ddl
+    on ddl_command_end
+    execute procedure event_trigger.increment_schema_version();
+
+create event trigger event_trigger_watch_drop
+    on sql_drop
+    execute procedure event_trigger.increment_schema_version();
diff --git a/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
new file mode 100644
index 0000000000..4fe8c3341b
--- /dev/null
+++ b/test_runner/regress/data/test_event_trigger_extension/test_event_trigger_extension.control
@@ -0,0 +1,8 @@
+default_version = '1.0'
+comment = 'Test extension with Event Trigger'
+
+# make sure the extension objects are owned by the bootstrap user
+# to check that the SECURITY DEFINER event trigger function is still
+# called during non-superuser DDL events.
+superuser = true
+trusted = true
diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py
index fe3b220c67..d7f78afac8 100644
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -2,7 +2,6 @@ from __future__ import annotations
 
 import os
 import platform
-import shutil
 import tarfile
 from enum import StrEnum
 from pathlib import Path
@@ -31,27 +30,6 @@ if TYPE_CHECKING:
     from werkzeug.wrappers.request import Request
 
 
-# use neon_env_builder_local fixture to override the default neon_env_builder fixture
-# and use a test-specific pg_install instead of shared one
-@pytest.fixture(scope="function")
-def neon_env_builder_local(
-    neon_env_builder: NeonEnvBuilder,
-    test_output_dir: Path,
-    pg_distrib_dir: Path,
-) -> NeonEnvBuilder:
-    test_local_pginstall = test_output_dir / "pg_install"
-    log.info(f"copy {pg_distrib_dir} to {test_local_pginstall}")
-
-    # We can't copy only the version that we are currently testing because other
-    # binaries like the storage controller need specific Postgres versions.
-    shutil.copytree(pg_distrib_dir, test_local_pginstall)
-
-    neon_env_builder.pg_distrib_dir = test_local_pginstall
-    log.info(f"local neon_env_builder.pg_distrib_dir: {neon_env_builder.pg_distrib_dir}")
-
-    return neon_env_builder
-
-
 @final
 class RemoteExtension(StrEnum):
     SQL_ONLY = "test_extension_sql_only"
diff --git a/test_runner/regress/test_event_trigger_extension.py b/test_runner/regress/test_event_trigger_extension.py
new file mode 100644
index 0000000000..ac4351dcd5
--- /dev/null
+++ b/test_runner/regress/test_event_trigger_extension.py
@@ -0,0 +1,102 @@
+from __future__ import annotations
+
+import shutil
+from pathlib import Path
+from typing import TYPE_CHECKING, cast
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.paths import BASE_DIR
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from fixtures.neon_fixtures import (
+        NeonEnvBuilder,
+    )
+    from fixtures.pg_version import PgVersion
+
+
+# use neon_env_builder_local fixture to override the default neon_env_builder fixture
+# and use a test-specific pg_install instead of shared one
+@pytest.fixture(scope="function")
+def neon_env_builder_event_trigger_extension(
+    neon_env_builder_local: NeonEnvBuilder,
+    test_output_dir: Path,
+    pg_version: PgVersion,
+) -> NeonEnvBuilder:
+    test_local_pginstall = test_output_dir / "pg_install"
+
+    # Now copy the SQL only extension test_event_trigger_extension in the local
+    # pginstall extension directory on-disk
+    test_event_trigger_extension_dir = (
+        BASE_DIR / "test_runner" / "regress" / "data" / "test_event_trigger_extension"
+    )
+
+    test_local_extension_dir = (
+        test_local_pginstall / f"v{pg_version}" / "share" / "postgresql" / "extension"
+    )
+
+    log.info(f"copy {test_event_trigger_extension_dir} to {test_local_extension_dir}")
+
+    for f in [
+        test_event_trigger_extension_dir / "test_event_trigger_extension.control",
+        test_event_trigger_extension_dir / "test_event_trigger_extension--1.0.sql",
+    ]:
+        shutil.copy(f, test_local_extension_dir)
+
+    return neon_env_builder_local
+
+
+def test_event_trigger_extension(neon_env_builder_event_trigger_extension: NeonEnvBuilder):
+    """
+    Test installing an extension that contains an Event Trigger.
+
+    The Event Trigger function is owned by the extension owner, which at
+    CREATE EXTENSION is going to be the Postgres bootstrap user, per the
+    extension control file where both superuser = true and trusted = true.
+
+    Also this function is SECURTY DEFINER, to allow for making changes to
+    the extension SQL objects, in our case a sequence.
+
+    This test makes sure that the event trigger function is fired correctly
+    by non-privileged user DDL actions such as CREATE TABLE.
+    """
+    env = neon_env_builder_event_trigger_extension.init_start()
+    env.create_branch("test_event_trigger_extension")
+
+    endpoint = env.endpoints.create_start("test_event_trigger_extension")
+    extension = "test_event_trigger_extension"
+    database = "test_event_trigger_extension"
+
+    endpoint.safe_psql(f"CREATE DATABASE {database}")
+    endpoint.safe_psql(f"CREATE EXTENSION {extension}", dbname=database)
+
+    # check that the extension is owned by the bootstrap superuser (cloud_admin)
+    pg_bootstrap_superuser_name = "cloud_admin"
+    with endpoint.connect(dbname=database) as pg_conn:
+        with pg_conn.cursor() as cur:
+            cur.execute(
+                f"select rolname from pg_roles r join pg_extension e on r.oid = e.extowner where extname = '{extension}'"
+            )
+            owner = cast("tuple[str]", cur.fetchone())[0]
+            assert owner == pg_bootstrap_superuser_name, (
+                f"extension {extension} is not owned by bootstrap user '{pg_bootstrap_superuser_name}'"
+            )
+
+    # test that the SQL-only Event Trigger (SECURITY DEFINER function) runs
+    # correctly now that the extension has been installed
+    #
+    # create table to trigger the event trigger, twice, check sequence count
+    with endpoint.connect(dbname=database) as pg_conn:
+        log.info("creating SQL objects (tables)")
+        with pg_conn.cursor() as cur:
+            cur.execute("CREATE TABLE foo1(id int primary key)")
+            cur.execute("CREATE TABLE foo2(id int)")
+
+            cur.execute("SELECT event_trigger.get_schema_version()")
+            res = cast("tuple[int]", cur.fetchone())
+            ver = res[0]
+
+            log.info(f"schema version is now {ver}")
+            assert ver == 2, "schema version is not 2"

From b5b1db29bb3b2c11665ae4f891b235fc9d5d5b31 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 12:25:15 +0300
Subject: [PATCH 057/163] Implement shard map live-update

---
 .../src/worker_process/main_loop.rs           | 38 +++++++-
 .../src/worker_process/worker_interface.rs    |  9 +-
 pgxn/neon/communicator_new.c                  | 35 +++++--
 pgxn/neon/libpagestore.c                      | 96 +++----------------
 pgxn/neon/pagestore_client.h                  |  8 +-
 5 files changed, 91 insertions(+), 95 deletions(-)

diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index 171bb8fbf4..3ae187ac16 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -29,17 +29,31 @@ use tracing::{error, info, info_span, trace};
 use utils::lsn::Lsn;
 
 pub struct CommunicatorWorkerProcessStruct<'a> {
-    neon_request_slots: &'a [NeonIOHandle],
+    /// Tokio runtime that the main loop and any other related tasks runs in.
+    runtime: tokio::runtime::Handle,
 
+    /// Client to communicate with the pageserver
     client: PageserverClient,
 
-    pub(crate) cache: IntegratedCacheWriteAccess<'a>,
+    /// Request slots that backends use to send IO requests to the communicator.
+    neon_request_slots: &'a [NeonIOHandle],
 
+    /// Notification pipe. Backends use this to notify the communicator that a request is waiting to
+    /// be processed in one of the request slots.
     submission_pipe_read_fd: OwnedFd,
 
+    /// Locking table for all in-progress IO requests.
     in_progress_table: RequestInProgressTable,
 
-    // Metrics
+    /// Local File Cache, relation size tracking, last-written LSN tracking
+    pub(crate) cache: IntegratedCacheWriteAccess<'a>,
+
+    /*** Static configuration ***/
+    /// Stripe size doesn't change after startup. (The shard map is not stored here, it's passed
+    /// directly to the client)
+    stripe_size: Option<ShardStripeSize>,
+
+    /*** Metrics ***/
     request_counters: IntCounterVec,
     request_rel_exists_counter: IntCounter,
     request_rel_size_counter: IntCounter,
@@ -146,6 +160,8 @@ pub(super) async fn init(
         request_nblocks_counters.with_label_values(&["rel_zero_extend"]);
 
     CommunicatorWorkerProcessStruct {
+        runtime: tokio::runtime::Handle::current(),
+        stripe_size,
         neon_request_slots: cis.neon_request_slots,
         client,
         cache,
@@ -179,6 +195,22 @@ pub(super) async fn init(
 }
 
 impl<'t> CommunicatorWorkerProcessStruct<'t> {
+    /// Update the configuration
+    pub(super) fn update_shard_map(
+        &self,
+        new_shard_map: HashMap<utils::shard::ShardIndex, String>,
+    ) {
+        let shard_spec =
+            ShardSpec::new(new_shard_map, self.stripe_size.clone()).expect("invalid shard spec");
+
+        {
+            let _in_runtime = self.runtime.enter();
+            if let Err(err) = self.client.update_shards(shard_spec) {
+                tracing::error!("could not update shard map: {err:?}");
+            }
+        }
+    }
+
     /// Main loop of the worker process. Receive requests from the backends and process them.
     pub(super) async fn run(&'static self) {
         let mut idxbuf: [u8; 4] = [0; 4];
diff --git a/pgxn/neon/communicator/src/worker_process/worker_interface.rs b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
index e873555daa..a7bd79fa83 100644
--- a/pgxn/neon/communicator/src/worker_process/worker_interface.rs
+++ b/pgxn/neon/communicator/src/worker_process/worker_interface.rs
@@ -51,7 +51,7 @@ pub extern "C" fn communicator_worker_process_launch(
             Some(PathBuf::from(c_str.to_str().unwrap()))
         }
     };
-    let shard_map = parse_shard_map(nshards, shard_map);
+    let shard_map = shard_map_to_hash(nshards, shard_map);
 
     // start main loop
     let runtime = tokio::runtime::Builder::new_multi_thread()
@@ -92,7 +92,7 @@ pub extern "C" fn communicator_worker_process_launch(
 }
 
 /// Convert the "shard map" from an array of C strings, indexed by shard no to a rust HashMap
-fn parse_shard_map(
+fn shard_map_to_hash(
     nshards: u32,
     shard_map: *mut *mut c_char,
 ) -> HashMap<utils::shard::ShardIndex, String> {
@@ -124,6 +124,11 @@ fn parse_shard_map(
 pub extern "C" fn communicator_worker_config_reload(
     proc_handle: &'static CommunicatorWorkerProcessStruct<'static>,
     file_cache_size: u64,
+    shard_map: *mut *mut c_char,
+    nshards: u32,
 ) {
     proc_handle.cache.resize_file_cache(file_cache_size as u32);
+
+    let shard_map = shard_map_to_hash(nshards, shard_map);
+    proc_handle.update_shard_map(shard_map);
 }
diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index f2cb23cd4e..cc0a1634a7 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -292,8 +292,8 @@ communicator_new_shmem_startup(void)
 void
 communicator_new_bgworker_main(Datum main_arg)
 {
-	char	  **connstrs;
-	shardno_t	num_shards;
+	char	  **connstrings;
+	ShardMap	shard_map;
 	struct LoggingState *logging;
 	char		errbuf[1000];
 	int			elevel;
@@ -325,7 +325,14 @@ communicator_new_bgworker_main(Datum main_arg)
 
 	BackgroundWorkerUnblockSignals();
 
-	get_shard_map(&connstrs, &num_shards);
+	if (!parse_shard_map(pageserver_grpc_urls, &shard_map))
+	{
+		/* shouldn't happen, as the GUC was verified already */
+		elog(FATAL, "could not parse neon.pageserver_grpcs_urls");
+	}
+	connstrings = palloc(shard_map.num_shards * sizeof(char *));
+	for (int i = 0; i < shard_map.num_shards; i++)
+		connstrings[i] = shard_map.connstring[i];
 
 	logging = configure_logging();
 
@@ -334,11 +341,12 @@ communicator_new_bgworker_main(Datum main_arg)
 									   neon_tenant,
 									   neon_timeline,
 									   neon_auth_token,
-									   connstrs,
-									   num_shards,
+									   connstrings,
+									   shard_map.num_shards,
 									   neon_stripe_size,
 									   lfc_path,
 									   file_cache_size);
+	pfree(connstrings);
 	cis = NULL;
 
 	elog(LOG, "communicator threads started");
@@ -357,7 +365,22 @@ communicator_new_bgworker_main(Datum main_arg)
 			file_cache_size = lfc_size_limit * (1024 * 1024 / BLCKSZ);
 			if (file_cache_size < 100)
 				file_cache_size = 100;
-			communicator_worker_config_reload(proc_handle, file_cache_size);
+
+			/* Reload pageserver URLs */
+			if (!parse_shard_map(pageserver_grpc_urls, &shard_map))
+			{
+				/* shouldn't happen, as the GUC was verified already */
+				elog(FATAL, "could not parse neon.pageserver_grpcs_urls");
+			}
+			connstrings = palloc(shard_map.num_shards * sizeof(char *));
+			for (int i = 0; i < shard_map.num_shards; i++)
+				connstrings[i] = shard_map.connstring[i];
+
+			communicator_worker_config_reload(proc_handle,
+											  file_cache_size,
+											  connstrings,
+											  shard_map.num_shards);
+			pfree(connstrings);
 		}
 
 		for (;;)
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index f64e6ee233..f99084633a 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -87,12 +87,6 @@ static int pageserver_response_log_timeout = 10000;
 /* 2.5 minutes. A bit higher than highest default TCP retransmission timeout */
 static int pageserver_response_disconnect_timeout = 150000;
 
-typedef struct
-{
-	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
-	size_t		num_shards;
-} ShardMap;
-
 /*
  * PagestoreShmemState is kept in shared memory. It contains the connection
  * strings for each shard.
@@ -193,8 +187,8 @@ PagestoreShmemIsValid(void)
  * not valid, returns false. The contents of *result are undefined in
  * that case, and must not be relied on.
  */
-static bool
-ParseShardMap(const char *connstr, ShardMap *result)
+bool
+parse_shard_map(const char *connstr, ShardMap *result)
 {
 	const char *p;
 	int			nshards = 0;
@@ -248,7 +242,7 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
 	char	   *p = *newval;
 
-	return ParseShardMap(p, NULL);
+	return parse_shard_map(p, NULL);
 }
 
 static void
@@ -257,11 +251,17 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	/*
 	 * 'neon.pageserver_connstring' is ignored if the new communicator is used.
 	 * In that case, the shard map is loaded from 'neon.pageserver_grpc_urls'
-	 * instead.
+	 * instead, and that happens in the communicator process only.
 	 */
 	if (neon_enable_new_communicator)
 		return;
 
+	/*
+	 * Only postmaster updates the copy in shared memory.
+	 */
+	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
+		return;
+
 	AssignShardMap(newval);
 }
 
@@ -272,36 +272,15 @@ CheckPageserverGrpcUrls(char **newval, void **extra, GucSource source)
 {
 	char	   *p = *newval;
 
-	return ParseShardMap(p, NULL);
+	return parse_shard_map(p, NULL);
 }
 
-static void
-AssignPageserverGrpcUrls(const char *newval, void *extra)
-{
-	/*
-	 * 'neon.pageserver_grpc-urls' is ignored if the new communicator is not
-	 * used.  In that case, the shard map is loaded from 'neon.pageserver_connstring'
-	  instead.
-	 */
-	if (!neon_enable_new_communicator)
-		return;
-
-	AssignShardMap(newval);
-}
-
-
 static void
 AssignShardMap(const char *newval)
 {
 	ShardMap	shard_map;
 
-	/*
-	 * Only postmaster updates the copy in shared memory.
-	 */
-	if (!PagestoreShmemIsValid() || IsUnderPostmaster)
-		return;
-
-	if (!ParseShardMap(newval, &shard_map))
+	if (!parse_shard_map(newval, &shard_map))
 	{
 		/*
 		 * shouldn't happen, because we already checked the value in
@@ -324,54 +303,6 @@ AssignShardMap(const char *newval)
 	}
 }
 
-/* Return a copy of the whole shard map from shared memory */
-void
-get_shard_map(char ***connstrs_p, shardno_t *num_shards_p)
-{
-	uint64		begin_update_counter;
-	uint64		end_update_counter;
-	ShardMap   *shard_map = &pagestore_shared->shard_map;
-	shardno_t	num_shards;
-	char	   *buf;
-	char	  **connstrs;
-
-	buf = palloc(MAX_SHARDS*MAX_PAGESERVER_CONNSTRING_SIZE);
-	connstrs = palloc(sizeof(char *) * MAX_SHARDS);
-
-	/*
-	 * Postmaster can update the shared memory values concurrently, in which
-	 * case we would copy a garbled mix of the old and new values. We will
-	 * detect it because the counter's won't match, and retry. But it's
-	 * important that we don't do anything within the retry-loop that would
-	 * depend on the string having valid contents.
-	 */
-	do
-	{
-		char		*p;
-
-		begin_update_counter = pg_atomic_read_u64(&pagestore_shared->begin_update_counter);
-		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
-
-		num_shards = shard_map->num_shards;
-
-		p = buf;
-		for (int i = 0; i < Min(num_shards, MAX_SHARDS); i++)
-		{
-			strlcpy(p, shard_map->connstring[i], MAX_PAGESERVER_CONNSTRING_SIZE);
-			connstrs[i] = p;
-			p += MAX_PAGESERVER_CONNSTRING_SIZE;
-		}
-
-		pg_memory_barrier();
-	}
-	while (begin_update_counter != end_update_counter
-		   || begin_update_counter != pg_atomic_read_u64(&pagestore_shared->begin_update_counter)
-		   || end_update_counter != pg_atomic_read_u64(&pagestore_shared->end_update_counter));
-
-	*connstrs_p = connstrs;
-	*num_shards_p = num_shards;
-}
-
 /*
  * Get the current number of shards, and/or the connection string for a
  * particular shard from the shard map in shared memory.
@@ -1396,7 +1327,6 @@ PagestoreShmemInit(void)
 		pg_atomic_init_u64(&pagestore_shared->end_update_counter, 0);
 		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(pageserver_connstring, NULL);
-		AssignPageserverGrpcUrls(pageserver_grpc_urls, NULL);
 	}
 
 	NeonPerfCountersShmemInit();
@@ -1462,7 +1392,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
-							   CheckPageserverGrpcUrls, AssignPageserverGrpcUrls, NULL);
+							   CheckPageserverGrpcUrls, NULL, NULL);
 
 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index b3c074c9ee..8ec8ce5408 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -246,7 +246,13 @@ extern int32 max_cluster_size;
 extern int  neon_protocol_version;
 extern int	neon_stripe_size;
 
-extern void get_shard_map(char ***connstrs_p, shardno_t *num_shards_p);
+typedef struct
+{
+	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
+	size_t		num_shards;
+} ShardMap;
+
+extern bool parse_shard_map(const char *connstr, ShardMap *result);
 extern shardno_t get_shard_number(BufferTag* tag);
 
 extern const f_smgr *smgr_neon(ProcNumber backend, NRelFileInfo rinfo);

From 08b19f001c77afbcd3fbde06a11e495d6222967a Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 10 Jul 2025 11:07:21 +0100
Subject: [PATCH 058/163] pageserver: optionally force image layer creation on
 timeout (#12529)

This PR introduces a `image_creation_timeout` to page servers so that we
can force the image creation after a certain period. This is set to 1
day on dev/staging for now, and will rollout to production 1/2 weeks
later.

Majority of the PR are boilerplate code to add the new knob. Specific
changes of the PR are:
1. During L0 compaction, check if we should force a compaction if
min(LSN) of all delta layers < force_image_creation LSN.
2. During image creation, check if we should force a compaction if the
image's LSN < force_image_creation LSN and there are newer deltas with
overlapping key ranges.
3. Also tweaked the check image creation interval to make sure we honor
image_creation_timeout.

Vlad's note: This should be a no-op. I added an extra PS config for the
large timeline
threshold to enable this.

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
---
 control_plane/src/pageserver.rs               |   6 ++
 libs/pageserver_api/src/config.rs             |   9 ++
 libs/pageserver_api/src/models.rs             |  18 ++++
 pageserver/src/config.rs                      |   6 ++
 pageserver/src/tenant.rs                      |   9 ++
 pageserver/src/tenant/timeline.rs             |  68 ++++++++++--
 pageserver/src/tenant/timeline/compaction.rs  | 100 ++++++++++++++++--
 .../regress/test_attach_tenant_config.py      |   1 +
 test_runner/regress/test_compaction.py        |  75 +++++++++++++
 9 files changed, 275 insertions(+), 17 deletions(-)

diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 3f66960edd..3673d1f4f2 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -452,6 +452,12 @@ impl PageServerNode {
                 .map(|x| x.parse::<usize>())
                 .transpose()
                 .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
+            // HADRON
+            image_layer_force_creation_period: settings
+                .remove("image_layer_force_creation_period")
+                .map(humantime::parse_duration)
+                .transpose()
+                .context("Failed to parse 'image_layer_force_creation_period' as duration")?,
             image_layer_creation_check_threshold: settings
                 .remove("image_layer_creation_check_threshold")
                 .map(|x| x.parse::<u8>())
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 22815955c1..9e9c7a4dcb 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -272,6 +272,8 @@ pub struct ConfigToml {
     pub timeline_import_config: TimelineImportConfig,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub basebackup_cache_config: Option<BasebackupCacheConfig>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -561,6 +563,11 @@ pub struct TenantConfigToml {
     pub gc_period: Duration,
     // Delta layer churn threshold to create L1 image layers.
     pub image_creation_threshold: usize,
+    // HADRON
+    // When the timeout is reached, PageServer will (1) force compact any remaining L0 deltas and
+    // (2) create image layers if there are any L1 deltas.
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
     // Determines how much history is retained, to allow
     // branching and read replicas at an older point in time.
     // The unit is time.
@@ -823,6 +830,7 @@ impl Default for ConfigToml {
             },
             basebackup_cache_config: None,
             posthog_config: None,
+            image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
         }
     }
 }
@@ -916,6 +924,7 @@ impl Default for TenantConfigToml {
             gc_period: humantime::parse_duration(DEFAULT_GC_PERIOD)
                 .expect("cannot parse default gc period"),
             image_creation_threshold: DEFAULT_IMAGE_CREATION_THRESHOLD,
+            image_layer_force_creation_period: None,
             pitr_interval: humantime::parse_duration(DEFAULT_PITR_INTERVAL)
                 .expect("cannot parse default PITR interval"),
             walreceiver_connect_timeout: humantime::parse_duration(
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 6735320484..56dd95eab3 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -597,6 +597,9 @@ pub struct TenantConfigPatch {
     pub gc_period: FieldPatch<String>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub image_creation_threshold: FieldPatch<usize>,
+    // HADRON
+    #[serde(skip_serializing_if = "FieldPatch::is_noop")]
+    pub image_layer_force_creation_period: FieldPatch<String>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
     pub pitr_interval: FieldPatch<String>,
     #[serde(skip_serializing_if = "FieldPatch::is_noop")]
@@ -700,6 +703,11 @@ pub struct TenantConfig {
     #[serde(skip_serializing_if = "Option::is_none")]
     pub image_creation_threshold: Option<usize>,
 
+    // HADRON
+    #[serde(skip_serializing_if = "Option::is_none")]
+    #[serde(with = "humantime_serde")]
+    pub image_layer_force_creation_period: Option<Duration>,
+
     #[serde(skip_serializing_if = "Option::is_none")]
     #[serde(with = "humantime_serde")]
     pub pitr_interval: Option<Duration>,
@@ -798,6 +806,7 @@ impl TenantConfig {
             mut gc_horizon,
             mut gc_period,
             mut image_creation_threshold,
+            mut image_layer_force_creation_period,
             mut pitr_interval,
             mut walreceiver_connect_timeout,
             mut lagging_wal_timeout,
@@ -861,6 +870,11 @@ impl TenantConfig {
         patch
             .image_creation_threshold
             .apply(&mut image_creation_threshold);
+        // HADRON
+        patch
+            .image_layer_force_creation_period
+            .map(|v| humantime::parse_duration(&v))?
+            .apply(&mut image_layer_force_creation_period);
         patch
             .pitr_interval
             .map(|v| humantime::parse_duration(&v))?
@@ -942,6 +956,7 @@ impl TenantConfig {
             gc_horizon,
             gc_period,
             image_creation_threshold,
+            image_layer_force_creation_period,
             pitr_interval,
             walreceiver_connect_timeout,
             lagging_wal_timeout,
@@ -1016,6 +1031,9 @@ impl TenantConfig {
             image_creation_threshold: self
                 .image_creation_threshold
                 .unwrap_or(global_conf.image_creation_threshold),
+            image_layer_force_creation_period: self
+                .image_layer_force_creation_period
+                .or(global_conf.image_layer_force_creation_period),
             pitr_interval: self.pitr_interval.unwrap_or(global_conf.pitr_interval),
             walreceiver_connect_timeout: self
                 .walreceiver_connect_timeout
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 15ec31b0a6..f64c5838ff 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -252,6 +252,10 @@ pub struct PageServerConf {
     pub timeline_import_config: pageserver_api::config::TimelineImportConfig,
 
     pub basebackup_cache_config: Option<pageserver_api::config::BasebackupCacheConfig>,
+
+    /// Defines what is a big tenant for the purpose of image layer generation.
+    /// See Timeline::should_check_if_image_layers_required
+    pub image_layer_generation_large_timeline_threshold: Option<u64>,
 }
 
 /// Token for authentication to safekeepers
@@ -432,6 +436,7 @@ impl PageServerConf {
             posthog_config,
             timeline_import_config,
             basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -490,6 +495,7 @@ impl PageServerConf {
             dev_mode,
             timeline_import_config,
             basebackup_cache_config,
+            image_layer_generation_large_timeline_threshold,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 240ba36236..7e2e6d96b8 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -4171,6 +4171,15 @@ impl TenantShard {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
+    // HADRON
+    pub fn get_image_creation_timeout(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
+        tenant_conf.image_layer_force_creation_period.or(self
+            .conf
+            .default_tenant_conf
+            .image_layer_force_creation_period)
+    }
+
     pub fn get_pitr_interval(&self) -> Duration {
         let tenant_conf = self.tenant_conf.load().tenant_conf.clone();
         tenant_conf
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 0a026d288e..a9bc0a060b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -351,6 +351,13 @@ pub struct Timeline {
     last_image_layer_creation_check_at: AtomicLsn,
     last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
 
+    // HADRON
+    /// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation
+    /// on this key range.
+    force_image_creation_lsn: AtomicLsn,
+    /// The last time instant when force_image_creation_lsn is computed.
+    force_image_creation_lsn_computed_at: std::sync::Mutex<Option<Instant>>,
+
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,
 
@@ -2846,6 +2853,18 @@ impl Timeline {
             .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
     }
 
+    // HADRON
+    fn get_image_creation_timeout(&self) -> Option<Duration> {
+        let tenant_conf = self.tenant_conf.load();
+        tenant_conf
+            .tenant_conf
+            .image_layer_force_creation_period
+            .or(self
+                .conf
+                .default_tenant_conf
+                .image_layer_force_creation_period)
+    }
+
     fn get_compaction_algorithm_settings(&self) -> CompactionAlgorithmSettings {
         let tenant_conf = &self.tenant_conf.load();
         tenant_conf
@@ -3115,7 +3134,9 @@ impl Timeline {
                 repartition_threshold: 0,
                 last_image_layer_creation_check_at: AtomicLsn::new(0),
                 last_image_layer_creation_check_instant: Mutex::new(None),
-
+                // HADRON
+                force_image_creation_lsn: AtomicLsn::new(0),
+                force_image_creation_lsn_computed_at: std::sync::Mutex::new(None),
                 last_received_wal: Mutex::new(None),
                 rel_size_latest_cache: RwLock::new(HashMap::new()),
                 rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
@@ -5036,6 +5057,7 @@ impl Timeline {
                 .create_image_layers(
                     &partitions,
                     self.initdb_lsn,
+                    None,
                     ImageLayerCreationMode::Initial,
                     ctx,
                     LastImageLayerCreationStatus::Initial,
@@ -5307,14 +5329,19 @@ impl Timeline {
     }
 
     // Is it time to create a new image layer for the given partition? True if we want to generate.
-    async fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> bool {
+    async fn time_for_new_image_layer(
+        &self,
+        partition: &KeySpace,
+        lsn: Lsn,
+        force_image_creation_lsn: Option<Lsn>,
+    ) -> bool {
         let threshold = self.get_image_creation_threshold();
 
         let guard = self.layers.read(LayerManagerLockHolder::Compaction).await;
         let Ok(layers) = guard.layer_map() else {
             return false;
         };
-
+        let mut min_image_lsn: Lsn = Lsn::MAX;
         let mut max_deltas = 0;
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn);
@@ -5349,9 +5376,22 @@ impl Timeline {
                         return true;
                     }
                 }
+                min_image_lsn = min(min_image_lsn, img_lsn);
             }
         }
 
+        // HADRON
+        if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
+            info!(
+                "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}",
+                partition.ranges[0].start,
+                partition.ranges[0].end,
+                min_image_lsn,
+                force_image_creation_lsn.unwrap()
+            );
+            return true;
+        }
+
         debug!(
             max_deltas,
             "none of the partitioned ranges had >= {threshold} deltas"
@@ -5577,7 +5617,7 @@ impl Timeline {
     ///        suffer from the lack of image layers
     ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
     fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
-        const LARGE_TENANT_THRESHOLD: u64 = 2 * 1024 * 1024 * 1024;
+        let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
 
         let last_checks_at = self.last_image_layer_creation_check_at.load();
         let distance = lsn
@@ -5591,12 +5631,12 @@ impl Timeline {
         let mut time_based_decision = false;
         let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
         if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
-            let check_required_after = if Into::<u64>::into(&logical_size) >= LARGE_TENANT_THRESHOLD
-            {
-                self.get_checkpoint_timeout()
-            } else {
-                Duration::from_secs(3600 * 48)
-            };
+            let check_required_after =
+                if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
+                    self.get_checkpoint_timeout()
+                } else {
+                    Duration::from_secs(3600 * 48)
+                };
 
             time_based_decision = match *last_check_instant {
                 Some(last_check) => {
@@ -5624,10 +5664,12 @@ impl Timeline {
     /// true = we have generate all image layers, false = we preempt the process for L0 compaction.
     ///
     /// `partition_mode` is only for logging purpose and is not used anywhere in this function.
+    #[allow(clippy::too_many_arguments)]
     async fn create_image_layers(
         self: &Arc<Timeline>,
         partitioning: &KeyPartitioning,
         lsn: Lsn,
+        force_image_creation_lsn: Option<Lsn>,
         mode: ImageLayerCreationMode,
         ctx: &RequestContext,
         last_status: LastImageLayerCreationStatus,
@@ -5731,7 +5773,11 @@ impl Timeline {
             } else if let ImageLayerCreationMode::Try = mode {
                 // check_for_image_layers = false -> skip
                 // check_for_image_layers = true -> check time_for_new_image_layer -> skip/generate
-                if !check_for_image_layers || !self.time_for_new_image_layer(partition, lsn).await {
+                if !check_for_image_layers
+                    || !self
+                        .time_for_new_image_layer(partition, lsn, force_image_creation_lsn)
+                        .await
+                {
                     start = img_range.end;
                     continue;
                 }
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 18a0ca852d..171f9d1284 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -4,10 +4,11 @@
 //!
 //! The old legacy algorithm is implemented directly in `timeline.rs`.
 
+use std::cmp::min;
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::{Duration, Instant, SystemTime};
 
 use super::layer_manager::LayerManagerLockHolder;
 use super::{
@@ -33,6 +34,7 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use pageserver_compaction::helpers::{fully_contains, overlaps_with};
 use pageserver_compaction::interface::*;
+use postgres_ffi::to_pg_timestamp;
 use serde::Serialize;
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio_util::sync::CancellationToken;
@@ -45,6 +47,7 @@ use wal_decoder::models::value::Value;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
+use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
@@ -1267,6 +1270,12 @@ impl Timeline {
 
         // Define partitioning schema if needed
 
+        // HADRON
+        let force_image_creation_lsn = self
+            .get_or_compute_force_image_creation_lsn(cancel, ctx)
+            .await
+            .map_err(CompactionError::Other)?;
+
         // 1. L0 Compact
         let l0_outcome = {
             let timer = self.metrics.compact_time_histo.start_timer();
@@ -1274,6 +1283,7 @@ impl Timeline {
                 .compact_level0(
                     target_file_size,
                     options.flags.contains(CompactFlags::ForceL0Compaction),
+                    force_image_creation_lsn,
                     ctx,
                 )
                 .await?;
@@ -1376,6 +1386,7 @@ impl Timeline {
                     .create_image_layers(
                         &partitioning,
                         lsn,
+                        force_image_creation_lsn,
                         mode,
                         &image_ctx,
                         self.last_image_layer_creation_status
@@ -1472,6 +1483,63 @@ impl Timeline {
         Ok(CompactionOutcome::Done)
     }
 
+    /* BEGIN_HADRON */
+    // Get the force image creation LSN. Compute it if the last computed LSN is too old.
+    async fn get_or_compute_force_image_creation_lsn(
+        self: &Arc<Self>,
+        cancel: &CancellationToken,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Option<Lsn>> {
+        const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
+        let image_layer_force_creation_period = self.get_image_creation_timeout();
+        if image_layer_force_creation_period.is_none() {
+            return Ok(None);
+        }
+
+        let image_layer_force_creation_period = image_layer_force_creation_period.unwrap();
+        let force_image_creation_lsn_computed_at =
+            *self.force_image_creation_lsn_computed_at.lock().unwrap();
+        if force_image_creation_lsn_computed_at.is_none()
+            || force_image_creation_lsn_computed_at.unwrap().elapsed()
+                > FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL
+        {
+            let now: SystemTime = SystemTime::now();
+            let timestamp = now
+                .checked_sub(image_layer_force_creation_period)
+                .ok_or_else(|| {
+                    anyhow::anyhow!(
+                        "image creation timeout is too large: {image_layer_force_creation_period:?}"
+                    )
+                })?;
+            let timestamp = to_pg_timestamp(timestamp);
+            let force_image_creation_lsn = match self
+                .find_lsn_for_timestamp(timestamp, cancel, ctx)
+                .await?
+            {
+                LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
+                _ => {
+                    let gc_lsn = *self.get_applied_gc_cutoff_lsn();
+                    tracing::info!(
+                        "no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
+                        gc_lsn
+                    );
+                    gc_lsn
+                }
+            };
+            self.force_image_creation_lsn
+                .store(force_image_creation_lsn);
+            *self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
+            tracing::info!(
+                "computed force image creation LSN: {}",
+                force_image_creation_lsn
+            );
+            Ok(Some(force_image_creation_lsn))
+        } else {
+            Ok(Some(self.force_image_creation_lsn.load()))
+        }
+    }
+    /* END_HADRON */
+
     /// Check for layers that are elegible to be rewritten:
     /// - Shard splitting: After a shard split, ancestor layers beyond pitr_interval, so that
     ///   we don't indefinitely retain keys in this shard that aren't needed.
@@ -1801,6 +1869,7 @@ impl Timeline {
         self: &Arc<Self>,
         target_file_size: u64,
         force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<CompactionOutcome, CompactionError> {
         let CompactLevel0Phase1Result {
@@ -1821,6 +1890,7 @@ impl Timeline {
                 stats,
                 target_file_size,
                 force_compaction_ignore_threshold,
+                force_compaction_lsn,
                 &ctx,
             )
             .instrument(phase1_span)
@@ -1843,6 +1913,7 @@ impl Timeline {
         mut stats: CompactLevel0Phase1StatsBuilder,
         target_file_size: u64,
         force_compaction_ignore_threshold: bool,
+        force_compaction_lsn: Option<Lsn>,
         ctx: &RequestContext,
     ) -> Result<CompactLevel0Phase1Result, CompactionError> {
         let begin = tokio::time::Instant::now();
@@ -1872,11 +1943,28 @@ impl Timeline {
                     return Ok(CompactLevel0Phase1Result::default());
                 }
             } else {
-                debug!(
-                    level0_deltas = level0_deltas.len(),
-                    threshold, "too few deltas to compact"
-                );
-                return Ok(CompactLevel0Phase1Result::default());
+                // HADRON
+                let min_lsn = level0_deltas
+                    .iter()
+                    .map(|a| a.get_lsn_range().start)
+                    .reduce(min);
+                if force_compaction_lsn.is_some()
+                    && min_lsn.is_some()
+                    && min_lsn.unwrap() < force_compaction_lsn.unwrap()
+                {
+                    info!(
+                        "forcing L0 compaction of {} L0 deltas. Min lsn: {}, force compaction lsn: {}",
+                        level0_deltas.len(),
+                        min_lsn.unwrap(),
+                        force_compaction_lsn.unwrap()
+                    );
+                } else {
+                    debug!(
+                        level0_deltas = level0_deltas.len(),
+                        threshold, "too few deltas to compact"
+                    );
+                    return Ok(CompactLevel0Phase1Result::default());
+                }
             }
         }
 
diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py
index 7788faceb4..eaaa3014a5 100644
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -165,6 +165,7 @@ def test_fully_custom_config(positive_env: NeonEnv):
         "gc_horizon": 23 * (1024 * 1024),
         "gc_period": "2h 13m",
         "image_creation_threshold": 7,
+        "image_layer_force_creation_period": "1m",
         "pitr_interval": "1m",
         "lagging_wal_timeout": "23m",
         "lazy_slru_download": True,
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 1570d40ae9..e67161c6b7 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -944,3 +944,78 @@ def test_image_layer_compression(neon_env_builder: NeonEnvBuilder, enabled: bool
                 f"SELECT count(*) FROM foo WHERE id={v} and val=repeat('abcde{v:0>3}', 500)"
             )
             assert res[0][0] == 1
+
+
+# BEGIN_HADRON
+def get_layer_map(env, tenant_shard_id, timeline_id, ps_id):
+    client = env.pageservers[ps_id].http_client()
+    layer_map = client.layer_map_info(tenant_shard_id, timeline_id)
+    image_layer_count = 0
+    delta_layer_count = 0
+    for layer in layer_map.historic_layers:
+        if layer.kind == "Image":
+            image_layer_count += 1
+        elif layer.kind == "Delta":
+            delta_layer_count += 1
+    return image_layer_count, delta_layer_count
+
+
+def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that page server can force creating new images if image creation timeout is enabled
+    """
+    # use large knobs to disable L0 compaction/image creation except for the force image creation
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        "checkpoint_distance": 10 * 1024,
+        "checkpoint_timeout": "1s",
+        "image_layer_force_creation_period": "1s",
+        # The lsn for forced image layer creations is calculated once every 10 minutes.
+        # Hence, drive compaction manually such that the test doesn't compute it at the
+        # wrong time.
+        "compaction_period": "0s",
+    }
+
+    # consider every tenant large to run the image layer generation check more eagerly
+    neon_env_builder.pageserver_config_override = (
+        "image_layer_generation_large_timeline_threshold=0"
+    )
+
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(initial_tenant_conf=tenant_conf)
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER PRIMARY KEY, val text)")
+    # Generate some rows.
+    for v in range(10):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    # Sleep a bit such that the inserts are considered when calculating the forced image layer creation LSN.
+    time.sleep(2)
+
+    def check_force_image_creation():
+        ps_http = env.pageserver.http_client()
+        ps_http.timeline_compact(tenant_id, timeline_id)
+        image, delta = get_layer_map(env, tenant_id, timeline_id, 0)
+        log.info(f"images: {image}, deltas: {delta}")
+        assert image > 0
+
+        env.pageserver.assert_log_contains("forcing L0 compaction of")
+        env.pageserver.assert_log_contains("forcing image creation for partitioned range")
+
+    wait_until(check_force_image_creation)
+
+    endpoint.stop_and_destroy()
+
+    env.pageserver.allowed_errors.append(
+        ".*created delta file of size.*larger than double of target.*"
+    )
+
+
+# END_HADRON

From f4b03ddd7b4cb858430b16d642f5f80f11e8b5b1 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 10 Jul 2025 12:18:37 +0200
Subject: [PATCH 059/163] pageserver/client_grpc: reap idle pool resources
 (#12476)

## Problem

The gRPC client pools don't reap idle resources.

Touches #11735.
Requires #12475.

## Summary of changes

Reap idle pool resources (channels/clients/streams) after 3 minutes of
inactivity.

Also restructure the `StreamPool` to use a mutex rather than atomics for
synchronization, for simplicity. This will be optimized later.
---
 Cargo.lock                         |   1 +
 pageserver/client_grpc/Cargo.toml  |   4 +
 pageserver/client_grpc/src/pool.rs | 241 ++++++++++++++++++++++++-----
 3 files changed, 207 insertions(+), 39 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index caed814d5f..4150944ad0 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4506,6 +4506,7 @@ dependencies = [
  "pageserver_page_api",
  "tokio",
  "tokio-stream",
+ "tokio-util",
  "tonic 0.13.1",
  "tracing",
  "utils",
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
index 84e27abb84..ca224900ac 100644
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -4,6 +4,9 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 
+[features]
+testing = ["pageserver_api/testing"]
+
 [dependencies]
 anyhow.workspace = true
 bytes.workspace = true
@@ -13,6 +16,7 @@ pageserver_api.workspace = true
 pageserver_page_api.workspace = true
 tokio.workspace = true
 tokio-stream.workspace = true
+tokio-util.workspace = true
 tonic.workspace = true
 tracing.workspace = true
 utils.workspace = true
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 5a50004fd1..89b3bd646f 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -34,10 +34,12 @@ use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
+use std::time::{Duration, Instant};
 
 use futures::StreamExt as _;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use tokio_util::sync::CancellationToken;
 use tonic::transport::{Channel, Endpoint};
 use tracing::{error, warn};
 
@@ -45,6 +47,25 @@ use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;
 
+/// Reap channels/clients/streams that have been idle for this long.
+///
+/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
+/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
+/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
+/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
+/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
+/// channels, and/or stream pool clients.
+const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(180),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};
+
+/// Reap idle resources with this interval.
+const REAP_IDLE_INTERVAL: Duration = match cfg!(any(test, feature = "testing")) {
+    false => Duration::from_secs(10),
+    true => Duration::from_secs(1), // exercise reaping in tests
+};
+
 /// A gRPC channel pool, for a single Pageserver. A channel is shared by many clients (via HTTP/2
 /// stream multiplexing), up to `clients_per_channel` -- a new channel will be spun up beyond this.
 /// The pool does not limit the number of channels, and instead relies on `ClientPool` or
@@ -52,7 +73,6 @@ use utils::shard::ShardIndex;
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
 ///
-/// TODO: reap idle channels.
 /// TODO: consider prewarming a set of channels, to avoid initial connection latency.
 /// TODO: consider adding a circuit breaker for errors and fail fast.
 pub struct ChannelPool {
@@ -62,6 +82,8 @@ pub struct ChannelPool {
     max_clients_per_channel: NonZero<usize>,
     /// Open channels.
     channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
+    /// Reaps idle channels.
+    idle_reaper: Reaper,
     /// Channel ID generator.
     next_channel_id: AtomicUsize,
 }
@@ -73,6 +95,9 @@ struct ChannelEntry {
     channel: Channel,
     /// Number of clients using this channel.
     clients: usize,
+    /// The channel has been idle (no clients) since this time. None if channel is in use.
+    /// INVARIANT: Some if clients == 0, otherwise None.
+    idle_since: Option<Instant>,
 }
 
 impl ChannelPool {
@@ -82,12 +107,15 @@ impl ChannelPool {
         E: TryInto<Endpoint> + Send + Sync + 'static,
         <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
     {
-        Ok(Arc::new(Self {
+        let pool = Arc::new(Self {
             endpoint: endpoint.try_into()?,
             max_clients_per_channel,
             channels: Mutex::default(),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
             next_channel_id: AtomicUsize::default(),
-        }))
+        });
+        pool.idle_reaper.spawn(&pool);
+        Ok(pool)
     }
 
     /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
@@ -116,8 +144,14 @@ impl ChannelPool {
                 entry.clients <= self.max_clients_per_channel.get(),
                 "channel overflow"
             );
+            assert_eq!(
+                entry.idle_since.is_some(),
+                entry.clients == 0,
+                "incorrect channel idle state"
+            );
             if entry.clients < self.max_clients_per_channel.get() {
                 entry.clients += 1;
+                entry.idle_since = None;
                 return ChannelGuard {
                     pool: Arc::downgrade(self),
                     id,
@@ -134,6 +168,7 @@ impl ChannelPool {
         let entry = ChannelEntry {
             channel: channel.clone(),
             clients: 1, // account for the guard below
+            idle_since: None,
         };
         channels.insert(id, entry);
 
@@ -145,6 +180,20 @@ impl ChannelPool {
     }
 }
 
+impl Reapable for ChannelPool {
+    /// Reaps channels that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.channels.lock().unwrap().retain(|_, entry| {
+            let Some(idle_since) = entry.idle_since else {
+                assert_ne!(entry.clients, 0, "empty channel not marked idle");
+                return true;
+            };
+            assert_eq!(entry.clients, 0, "idle channel has clients");
+            idle_since >= cutoff
+        })
+    }
+}
+
 /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
 /// since the gRPC client requires an owned `Channel`.
 pub struct ChannelGuard {
@@ -167,10 +216,15 @@ impl Drop for ChannelGuard {
         let Some(pool) = self.pool.upgrade() else {
             return; // pool was dropped
         };
+
         let mut channels = pool.channels.lock().unwrap();
         let entry = channels.get_mut(&self.id).expect("unknown channel");
+        assert!(entry.idle_since.is_none(), "active channel marked idle");
         assert!(entry.clients > 0, "channel underflow");
         entry.clients -= 1;
+        if entry.clients == 0 {
+            entry.idle_since = Some(Instant::now()); // mark channel as idle
+        }
     }
 }
 
@@ -179,8 +233,6 @@ impl Drop for ChannelGuard {
 /// number of concurrent clients to `max_clients` via semaphore.
 ///
 /// The pool is always wrapped in an outer `Arc`, to allow long-lived guards across tasks/threads.
-///
-/// TODO: reap idle clients.
 pub struct ClientPool {
     /// Tenant ID.
     tenant_id: TenantId,
@@ -201,6 +253,8 @@ pub struct ClientPool {
     /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
     /// clients are reaped.
     idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
+    /// Reaps idle clients.
+    idle_reaper: Reaper,
     /// Unique client ID generator.
     next_client_id: AtomicUsize,
 }
@@ -212,6 +266,9 @@ struct ClientEntry {
     client: page_api::Client,
     /// The channel guard for the channel used by the client.
     channel_guard: ChannelGuard,
+    /// The client has been idle since this time. All clients in `ClientPool::idle` are idle by
+    /// definition, so this is the time when it was added back to the pool.
+    idle_since: Instant,
 }
 
 impl ClientPool {
@@ -226,16 +283,19 @@ impl ClientPool {
         auth_token: Option<String>,
         max_clients: Option<NonZero<usize>>,
     ) -> Arc<Self> {
-        Arc::new(Self {
+        let pool = Arc::new(Self {
             tenant_id,
             timeline_id,
             shard_id,
             auth_token,
             channel_pool,
             idle: Mutex::default(),
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
             limiter: max_clients.map(|max| Arc::new(Semaphore::new(max.get()))),
             next_client_id: AtomicUsize::default(),
-        })
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
     }
 
     /// Gets a client from the pool, or creates a new one if necessary. Connections are established
@@ -287,6 +347,16 @@ impl ClientPool {
     }
 }
 
+impl Reapable for ClientPool {
+    /// Reaps clients that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff)
+    }
+}
+
 /// A client acquired from the pool. The inner client can be accessed via Deref. The client is
 /// returned to the pool when dropped.
 pub struct ClientGuard {
@@ -317,9 +387,11 @@ impl Drop for ClientGuard {
         let Some(pool) = self.pool.upgrade() else {
             return; // pool was dropped
         };
+
         let entry = ClientEntry {
             client: self.client.take().expect("dropped once"),
             channel_guard: self.channel_guard.take().expect("dropped once"),
+            idle_since: Instant::now(),
         };
         pool.idle.lock().unwrap().insert(self.id, entry);
 
@@ -334,7 +406,6 @@ impl Drop for ClientGuard {
 /// a single request and await the response. Internally, requests are multiplexed across streams and
 /// channels. This allows proper queue depth enforcement and response routing.
 ///
-/// TODO: reap idle streams.
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
     /// The client pool to acquire clients from. Must be unbounded.
@@ -344,7 +415,7 @@ pub struct StreamPool {
     /// Incoming requests will be sent over an existing stream with available capacity. If all
     /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
     /// stream has an associated Tokio task that processes requests and responses.
-    streams: Arc<Mutex<HashMap<StreamID, StreamEntry>>>,
+    streams: Mutex<HashMap<StreamID, StreamEntry>>,
     /// The max number of concurrent streams, or None if unbounded.
     max_streams: Option<NonZero<usize>>,
     /// The max number of concurrent requests per stream.
@@ -352,6 +423,8 @@ pub struct StreamPool {
     /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
     /// None if the pool is unbounded.
     limiter: Option<Arc<Semaphore>>,
+    /// Reaps idle streams.
+    idle_reaper: Reaper,
     /// Stream ID generator.
     next_stream_id: AtomicUsize,
 }
@@ -364,9 +437,11 @@ type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
 struct StreamEntry {
     /// Sends caller requests to the stream task. The stream task exits when this is dropped.
     sender: RequestSender,
-    /// Number of in-flight requests on this stream. This is an atomic to allow decrementing it on
-    /// completion without acquiring the `StreamPool::streams` lock.
-    queue_depth: Arc<AtomicUsize>,
+    /// Number of in-flight requests on this stream.
+    queue_depth: usize,
+    /// The time when this stream went idle (queue_depth == 0).
+    /// INVARIANT: Some if queue_depth == 0, otherwise None.
+    idle_since: Option<Instant>,
 }
 
 impl StreamPool {
@@ -383,16 +458,19 @@ impl StreamPool {
         max_queue_depth: NonZero<usize>,
     ) -> Arc<Self> {
         assert!(client_pool.limiter.is_none(), "bounded client pool");
-        Arc::new(Self {
+        let pool = Arc::new(Self {
             client_pool,
-            streams: Arc::default(),
+            streams: Mutex::default(),
             limiter: max_streams.map(|max_streams| {
                 Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
             }),
             max_streams,
             max_queue_depth,
+            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
             next_stream_id: AtomicUsize::default(),
-        })
+        });
+        pool.idle_reaper.spawn(&pool);
+        pool
     }
 
     /// Acquires an available stream from the pool, or spins up a new stream async if all streams
@@ -412,8 +490,8 @@ impl StreamPool {
     /// * Allow concurrent clients to join onto streams while they're spun up.
     /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
     ///
-    /// For now, we just do something simple and functional, but very inefficient (linear scan).
-    pub async fn get(&self) -> StreamGuard {
+    /// For now, we just do something simple but inefficient (linear scan under mutex).
+    pub async fn get(self: &Arc<Self>) -> StreamGuard {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
@@ -422,23 +500,23 @@ impl StreamPool {
         let mut streams = self.streams.lock().unwrap();
 
         // Look for a pooled stream with available capacity.
-        for entry in streams.values() {
+        for (&id, entry) in streams.iter_mut() {
             assert!(
-                entry.queue_depth.load(Ordering::Relaxed) <= self.max_queue_depth.get(),
+                entry.queue_depth <= self.max_queue_depth.get(),
                 "stream queue overflow"
             );
-            if entry
-                .queue_depth
-                .fetch_update(Ordering::SeqCst, Ordering::SeqCst, |queue_depth| {
-                    // Increment the queue depth via compare-and-swap.
-                    // TODO: review ordering.
-                    (queue_depth < self.max_queue_depth.get()).then_some(queue_depth + 1)
-                })
-                .is_ok()
-            {
+            assert_eq!(
+                entry.idle_since.is_some(),
+                entry.queue_depth == 0,
+                "incorrect stream idle state"
+            );
+            if entry.queue_depth < self.max_queue_depth.get() {
+                entry.queue_depth += 1;
+                entry.idle_since = None;
                 return StreamGuard {
+                    pool: Arc::downgrade(self),
+                    id,
                     sender: entry.sender.clone(),
-                    queue_depth: entry.queue_depth.clone(),
                     permit,
                 };
             }
@@ -448,11 +526,11 @@ impl StreamPool {
         // return the guard, while spinning up the stream task async. This allows other callers to
         // join onto this stream and also create additional streams concurrently if this fills up.
         let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
-        let queue_depth = Arc::new(AtomicUsize::new(1)); // reserve quota for this caller
         let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
         let entry = StreamEntry {
             sender: req_tx.clone(),
-            queue_depth: queue_depth.clone(),
+            queue_depth: 1, // reserve quota for this caller
+            idle_since: None,
         };
         streams.insert(id, entry);
 
@@ -461,20 +539,23 @@ impl StreamPool {
         };
 
         let client_pool = self.client_pool.clone();
-        let streams = self.streams.clone();
+        let pool = Arc::downgrade(self);
 
         tokio::spawn(async move {
             if let Err(err) = Self::run_stream(client_pool, req_rx).await {
                 error!("stream failed: {err}");
             }
-            // Remove stream from pool on exit.
-            let entry = streams.lock().unwrap().remove(&id);
-            assert!(entry.is_some(), "unknown stream ID: {id}");
+            // Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
+            if let Some(pool) = pool.upgrade() {
+                let entry = pool.streams.lock().unwrap().remove(&id);
+                assert!(entry.is_some(), "unknown stream ID: {id}");
+            }
         });
 
         StreamGuard {
+            pool: Arc::downgrade(self),
+            id,
             sender: req_tx,
-            queue_depth,
             permit,
         }
     }
@@ -552,11 +633,26 @@ impl StreamPool {
     }
 }
 
+impl Reapable for StreamPool {
+    /// Reaps streams that have been idle since before the cutoff.
+    fn reap_idle(&self, cutoff: Instant) {
+        self.streams.lock().unwrap().retain(|_, entry| {
+            let Some(idle_since) = entry.idle_since else {
+                assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
+                return true;
+            };
+            assert_eq!(entry.queue_depth, 0, "idle stream has requests");
+            idle_since >= cutoff
+        });
+    }
+}
+
 /// A pooled stream reference. Can be used to send a single request, to properly enforce queue
 /// depth. Queue depth is already reserved and will be returned on drop.
 pub struct StreamGuard {
+    pool: Weak<StreamPool>,
+    id: StreamID,
     sender: RequestSender,
-    queue_depth: Arc<AtomicUsize>,
     permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }
 
@@ -588,11 +684,78 @@ impl StreamGuard {
 
 impl Drop for StreamGuard {
     fn drop(&mut self) {
+        let Some(pool) = self.pool.upgrade() else {
+            return; // pool was dropped
+        };
+
         // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
         // before the response is received, but that's okay.
-        let prev_queue_depth = self.queue_depth.fetch_sub(1, Ordering::SeqCst);
-        assert!(prev_queue_depth > 0, "stream queue underflow");
+        let mut streams = pool.streams.lock().unwrap();
+        let entry = streams.get_mut(&self.id).expect("unknown stream");
+        assert!(entry.idle_since.is_none(), "active stream marked idle");
+        assert!(entry.queue_depth > 0, "stream queue underflow");
+        entry.queue_depth -= 1;
+        if entry.queue_depth == 0 {
+            entry.idle_since = Some(Instant::now()); // mark stream as idle
+        }
 
         _ = self.permit; // returned on drop, referenced for visibility
     }
 }
+
+/// Periodically reaps idle resources from a pool.
+struct Reaper {
+    /// The task check interval.
+    interval: Duration,
+    /// The threshold for reaping idle resources.
+    threshold: Duration,
+    /// Cancels the reaper task. Cancelled when the reaper is dropped.
+    cancel: CancellationToken,
+}
+
+impl Reaper {
+    /// Creates a new reaper.
+    pub fn new(threshold: Duration, interval: Duration) -> Self {
+        Self {
+            cancel: CancellationToken::new(),
+            threshold,
+            interval,
+        }
+    }
+
+    /// Spawns a task to periodically reap idle resources from the given task pool. The task is
+    /// cancelled when the reaper is dropped.
+    pub fn spawn(&self, pool: &Arc<impl Reapable>) {
+        // NB: hold a weak pool reference, otherwise the task will prevent dropping the pool.
+        let pool = Arc::downgrade(pool);
+        let cancel = self.cancel.clone();
+        let (interval, threshold) = (self.interval, self.threshold);
+
+        tokio::spawn(async move {
+            loop {
+                tokio::select! {
+                    _ = tokio::time::sleep(interval) => {
+                        let Some(pool) = pool.upgrade() else {
+                            return; // pool was dropped
+                        };
+                        pool.reap_idle(Instant::now() - threshold);
+                    }
+
+                    _ = cancel.cancelled() => return,
+                }
+            }
+        });
+    }
+}
+
+impl Drop for Reaper {
+    fn drop(&mut self) {
+        self.cancel.cancel(); // cancel reaper task
+    }
+}
+
+/// A reapable resource pool.
+trait Reapable: Send + Sync + 'static {
+    /// Reaps resources that have been idle since before the given cutoff.
+    fn reap_idle(&self, cutoff: Instant);
+}

From bdca5b500b078eb9afb528fd464f496e07c97024 Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Thu, 10 Jul 2025 12:11:53 +0100
Subject: [PATCH 060/163] Fix test_lfc_prewarm: reduce number of prewarms,
 sleep before LFC offloading (#12515)

Fixes:
- Sleep before LFC offloading in `test_lfc_prewarm[autoprewarm]` to
ensure offloaded LFC is the one exported after all writes finish
- Reduce number of prewarms and increase timeout in
`test_lfc_prewarm_under_workload` as debug builds were failing due to
timeout.

Additional changes:
- Remove `check_pinned_entries`:
https://github.com/neondatabase/neon/pull/12447#discussion_r2185946210
- Fix LFC error metrics description:
https://github.com/neondatabase/neon/pull/12486#discussion_r2190763107
---
 compute_tools/src/metrics.rs            |  4 +--
 test_runner/regress/test_lfc_prewarm.py | 44 +++++++++++--------------
 2 files changed, 21 insertions(+), 27 deletions(-)

diff --git a/compute_tools/src/metrics.rs b/compute_tools/src/metrics.rs
index 91dedbb42a..6e4df73c0f 100644
--- a/compute_tools/src/metrics.rs
+++ b/compute_tools/src/metrics.rs
@@ -108,7 +108,7 @@ pub(crate) static LFC_PREWARMS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_PREWARM_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "compute_ctl_lfc_prewarm_errors_total",
-        "Total number of LFC prewarms errors requested by compute_ctl or autoprewarm option",
+        "Total number of LFC prewarm errors",
     )
     .expect("failed to define a metric")
 });
@@ -124,7 +124,7 @@ pub(crate) static LFC_OFFLOADS: Lazy<IntCounter> = Lazy::new(|| {
 pub(crate) static LFC_OFFLOAD_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "compute_ctl_lfc_offload_errors_total",
-        "Total number of LFC offload errors requested by compute_ctl or lfc_offload_period_seconds option",
+        "Total number of LFC offload errors",
     )
     .expect("failed to define a metric")
 });
diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py
index ae36bbda79..22e5bf576f 100644
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -1,6 +1,7 @@
 import random
 import threading
 from enum import StrEnum
+from time import sleep
 from typing import Any
 
 import pytest
@@ -24,18 +25,7 @@ OFFLOAD_LABEL = "compute_ctl_lfc_offloads_total"
 OFFLOAD_ERR_LABEL = "compute_ctl_lfc_offload_errors_total"
 METHOD_VALUES = [e for e in PrewarmMethod]
 METHOD_IDS = [e.value for e in PrewarmMethod]
-
-
-def check_pinned_entries(cur: Cursor):
-    """
-    Wait till none of LFC buffers are pinned
-    """
-
-    def none_pinned():
-        cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_chunks_pinned'")
-        assert cur.fetchall()[0][0] == 0
-
-    wait_until(none_pinned)
+AUTOOFFLOAD_INTERVAL_SECS = 2
 
 
 def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
@@ -49,9 +39,18 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
 
 
 def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
+    if method == PrewarmMethod.POSTGRES:
+        cur.execute("select get_local_cache_state()")
+        return cur.fetchall()[0][0]
+
     if method == PrewarmMethod.AUTOPREWARM:
+        # With autoprewarm, we need to be sure LFC was offloaded after all writes
+        # finish, so we sleep. Otherwise we'll have less prewarmed pages than we want
+        sleep(AUTOOFFLOAD_INTERVAL_SECS)
         client.offload_lfc_wait()
-    elif method == PrewarmMethod.COMPUTE_CTL:
+        return
+
+    if method == PrewarmMethod.COMPUTE_CTL:
         status = client.prewarm_lfc_status()
         assert status["status"] == "not_prewarmed"
         assert "error" not in status
@@ -60,11 +59,9 @@ def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor)
         parsed = prom_parse(client)
         desired = {OFFLOAD_LABEL: 1, PREWARM_LABEL: 0, OFFLOAD_ERR_LABEL: 0, PREWARM_ERR_LABEL: 0}
         assert parsed == desired, f"{parsed=} != {desired=}"
-    elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select get_local_cache_state()")
-        return cur.fetchall()[0][0]
-    else:
-        raise AssertionError(f"{method} not in PrewarmMethod")
+        return
+
+    raise AssertionError(f"{method} not in PrewarmMethod")
 
 
 def prewarm_endpoint(
@@ -106,14 +103,13 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
         "neon.file_cache_size_limit=1GB",
         "neon.file_cache_prewarm_limit=1000",
     ]
-    offload_secs = 2
 
     if method == PrewarmMethod.AUTOPREWARM:
         endpoint = env.endpoints.create_start(
             branch_name="main",
             config_lines=cfg,
             autoprewarm=True,
-            offload_lfc_interval_seconds=offload_secs,
+            offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS,
         )
     else:
         endpoint = env.endpoints.create_start(branch_name="main", config_lines=cfg)
@@ -135,7 +131,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
 
     endpoint.stop()
     if method == PrewarmMethod.AUTOPREWARM:
-        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=offload_secs)
+        endpoint.start(autoprewarm=True, offload_lfc_interval_seconds=AUTOOFFLOAD_INTERVAL_SECS)
     else:
         endpoint.start()
 
@@ -162,7 +158,6 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
     lfc_cur.execute("select sum(pk) from t")
     assert lfc_cur.fetchall()[0][0] == n_records * (n_records + 1) / 2
 
-    check_pinned_entries(pg_cur)
     desired = {"status": "completed", "total": total, "prewarmed": prewarmed, "skipped": skipped}
     check_prewarmed(method, client, desired)
 
@@ -243,9 +238,9 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
     prewarm_thread.start()
 
     def prewarmed():
-        assert n_prewarms > 5
+        assert n_prewarms > 3
 
-    wait_until(prewarmed)
+    wait_until(prewarmed, timeout=40)  # debug builds don't finish in 20s
 
     running = False
     for t in workload_threads:
@@ -256,7 +251,6 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
     total_balance = lfc_cur.fetchall()[0][0]
     assert total_balance == 0
 
-    check_pinned_entries(pg_cur)
     if method == PrewarmMethod.POSTGRES:
         return
     desired = {

From b67e8f2edc49ea9dd3428970b04c77aede344c59 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 14:49:29 +0300
Subject: [PATCH 061/163] Move some code, just for more natural logical
 ordering

---
 pgxn/neon/communicator/src/backend_comms.rs | 58 ++++++++++-----------
 1 file changed, 29 insertions(+), 29 deletions(-)

diff --git a/pgxn/neon/communicator/src/backend_comms.rs b/pgxn/neon/communicator/src/backend_comms.rs
index 998e0daf71..0423b4486e 100644
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -111,35 +111,6 @@ pub enum NeonIOHandleState {
     Completed,
 }
 
-pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
-
-unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
-unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
-
-impl<'a> RequestProcessingGuard<'a> {
-    pub fn get_request(&self) -> &NeonIORequest {
-        unsafe { &*self.0.request.get() }
-    }
-
-    pub fn get_owner_procno(&self) -> i32 {
-        self.0.owner_procno.load(Ordering::Relaxed)
-    }
-
-    pub fn completed(self, result: NeonIOResult) {
-        unsafe {
-            *self.0.result.get() = result;
-        };
-
-        // Ok, we have completed the IO. Mark the request as completed. After that,
-        // we no longer have ownership of the slot, and must not modify it.
-        let old_state = self
-            .0
-            .state
-            .swap(NeonIOHandleState::Completed, Ordering::Release);
-        assert!(old_state == NeonIOHandleState::Processing);
-    }
-}
-
 impl NeonIOHandle {
     pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
         // Verify that the slot is in Idle state previously, and start filling it.
@@ -205,3 +176,32 @@ impl NeonIOHandle {
         Some(RequestProcessingGuard(self))
     }
 }
+
+pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
+
+unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
+unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
+
+impl<'a> RequestProcessingGuard<'a> {
+    pub fn get_request(&self) -> &NeonIORequest {
+        unsafe { &*self.0.request.get() }
+    }
+
+    pub fn get_owner_procno(&self) -> i32 {
+        self.0.owner_procno.load(Ordering::Relaxed)
+    }
+
+    pub fn completed(self, result: NeonIOResult) {
+        unsafe {
+            *self.0.result.get() = result;
+        };
+
+        // Ok, we have completed the IO. Mark the request as completed. After that,
+        // we no longer have ownership of the slot, and must not modify it.
+        let old_state = self
+            .0
+            .state
+            .swap(NeonIOHandleState::Completed, Ordering::Release);
+        assert!(old_state == NeonIOHandleState::Processing);
+    }
+}

From 47c099a0fb752125b12abd1efd3737607ffd6ad0 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 14:52:16 +0300
Subject: [PATCH 062/163] Rename NeonIOHandle to NeonIORequestSlot

All the code talks about "request slots", better to make the struct
name reflect that. The "Handle" term was borrowed from Postgres v18
AIO implementation, from the similar handles or slots used to submit
IO requests from backends to worker processes. But even though the
idea is similar, it's a completely separate implementation and there's
nothing else shared between them than the very high level
design.
---
 pgxn/neon/communicator/src/backend_comms.rs   | 42 +++++++++----------
 .../communicator/src/backend_interface.rs     | 16 +++----
 pgxn/neon/communicator/src/init.rs            | 12 +++---
 .../src/worker_process/main_loop.rs           |  4 +-
 4 files changed, 37 insertions(+), 37 deletions(-)

diff --git a/pgxn/neon/communicator/src/backend_comms.rs b/pgxn/neon/communicator/src/backend_comms.rs
index 0423b4486e..5851bd0d7a 100644
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -51,9 +51,9 @@ use atomic_enum::atomic_enum;
 /// has been submitted or when a response is ready. We only store the 'owner_procno'
 /// which can be used for waking up the backend on completion, but the wakeups are
 /// performed elsewhere.
-pub struct NeonIOHandle {
+pub struct NeonIORequestSlot {
     /// similar to PgAioHandleState
-    state: AtomicNeonIOHandleState,
+    state: AtomicNeonIORequestSlotState,
 
     /// The owning process's ProcNumber. The worker process uses this to set the process's
     /// latch on completion.
@@ -77,23 +77,23 @@ pub struct NeonIOHandle {
 
 // The protocol described in the "Lifecycle of a request" section above ensures
 // the safe access to the fields
-unsafe impl Send for NeonIOHandle {}
-unsafe impl Sync for NeonIOHandle {}
+unsafe impl Send for NeonIORequestSlot {}
+unsafe impl Sync for NeonIORequestSlot {}
 
-impl Default for NeonIOHandle {
-    fn default() -> NeonIOHandle {
-        NeonIOHandle {
+impl Default for NeonIORequestSlot {
+    fn default() -> NeonIORequestSlot {
+        NeonIORequestSlot {
             owner_procno: AtomicI32::new(-1),
             request: UnsafeCell::new(NeonIORequest::Empty),
             result: UnsafeCell::new(NeonIOResult::Empty),
-            state: AtomicNeonIOHandleState::new(NeonIOHandleState::Idle),
+            state: AtomicNeonIORequestSlotState::new(NeonIORequestSlotState::Idle),
         }
     }
 }
 
 #[atomic_enum]
 #[derive(Eq, PartialEq)]
-pub enum NeonIOHandleState {
+pub enum NeonIORequestSlotState {
     Idle,
 
     /// backend is filling in the request
@@ -111,7 +111,7 @@ pub enum NeonIOHandleState {
     Completed,
 }
 
-impl NeonIOHandle {
+impl NeonIORequestSlot {
     pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
         // Verify that the slot is in Idle state previously, and start filling it.
         //
@@ -119,8 +119,8 @@ impl NeonIOHandle {
         // and try to use a slot that's already in use, we could fill the slot and
         // switch it directly from Idle to Submitted state.
         if let Err(s) = self.state.compare_exchange(
-            NeonIOHandleState::Idle,
-            NeonIOHandleState::Filling,
+            NeonIORequestSlotState::Idle,
+            NeonIORequestSlotState::Filling,
             Ordering::Relaxed,
             Ordering::Relaxed,
         ) {
@@ -133,21 +133,21 @@ impl NeonIOHandle {
         self.owner_procno.store(proc_number, Ordering::Relaxed);
         unsafe { *self.request.get() = *request }
         self.state
-            .store(NeonIOHandleState::Submitted, Ordering::Release);
+            .store(NeonIORequestSlotState::Submitted, Ordering::Release);
     }
 
-    pub fn get_state(&self) -> NeonIOHandleState {
+    pub fn get_state(&self) -> NeonIORequestSlotState {
         self.state.load(Ordering::Relaxed)
     }
 
     pub fn try_get_result(&self) -> Option<NeonIOResult> {
         // FIXME: ordering?
         let state = self.state.load(Ordering::Relaxed);
-        if state == NeonIOHandleState::Completed {
+        if state == NeonIORequestSlotState::Completed {
             // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
             fence(Ordering::Acquire);
             let result = unsafe { *self.result.get() };
-            self.state.store(NeonIOHandleState::Idle, Ordering::Relaxed);
+            self.state.store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
             Some(result)
         } else {
             None
@@ -161,8 +161,8 @@ impl NeonIOHandle {
         // already processing. That could be a flag somewhere in communicator's private
         // memory, for example.
         if let Err(s) = self.state.compare_exchange(
-            NeonIOHandleState::Submitted,
-            NeonIOHandleState::Processing,
+            NeonIORequestSlotState::Submitted,
+            NeonIORequestSlotState::Processing,
             Ordering::Relaxed,
             Ordering::Relaxed,
         ) {
@@ -177,7 +177,7 @@ impl NeonIOHandle {
     }
 }
 
-pub struct RequestProcessingGuard<'a>(&'a NeonIOHandle);
+pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
 
 unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
 unsafe impl<'a> Sync for RequestProcessingGuard<'a> {}
@@ -201,7 +201,7 @@ impl<'a> RequestProcessingGuard<'a> {
         let old_state = self
             .0
             .state
-            .swap(NeonIOHandleState::Completed, Ordering::Release);
-        assert!(old_state == NeonIOHandleState::Processing);
+            .swap(NeonIORequestSlotState::Completed, Ordering::Release);
+        assert!(old_state == NeonIORequestSlotState::Processing);
     }
 }
diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs
index fd0081e837..91ecf3f4c4 100644
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -3,7 +3,7 @@
 
 use std::os::fd::OwnedFd;
 
-use crate::backend_comms::NeonIOHandle;
+use crate::backend_comms::NeonIORequestSlot;
 use crate::init::CommunicatorInitStruct;
 use crate::integrated_cache::{BackendCacheReadOp, IntegratedCacheReadAccess};
 use crate::neon_request::{CCachedGetPageVResult, COid};
@@ -12,7 +12,7 @@ use crate::neon_request::{NeonIORequest, NeonIOResult};
 pub struct CommunicatorBackendStruct<'t> {
     my_proc_number: i32,
 
-    neon_request_slots: &'t [NeonIOHandle],
+    neon_request_slots: &'t [NeonIORequestSlot],
 
     submission_pipe_write_fd: OwnedFd,
 
@@ -152,10 +152,10 @@ pub extern "C" fn bcomm_get_request_slot_status(
     bs: &mut CommunicatorBackendStruct,
     request_slot_idx: u32,
 ) -> bool {
-    use crate::backend_comms::NeonIOHandleState;
+    use crate::backend_comms::NeonIORequestSlotState;
     match bs.neon_request_slots[request_slot_idx as usize].get_state() {
-        NeonIOHandleState::Idle => false,
-        NeonIOHandleState::Filling => {
+        NeonIORequestSlotState::Idle => false,
+        NeonIORequestSlotState::Filling => {
             // 'false' would be the right result here. However, this
             // is a very transient state. The C code should never
             // leave a slot in this state, so if it sees that,
@@ -166,9 +166,9 @@ pub extern "C" fn bcomm_get_request_slot_status(
                 request_slot_idx
             );
         }
-        NeonIOHandleState::Submitted => true,
-        NeonIOHandleState::Processing => true,
-        NeonIOHandleState::Completed => true,
+        NeonIORequestSlotState::Submitted => true,
+        NeonIORequestSlotState::Processing => true,
+        NeonIORequestSlotState::Completed => true,
     }
 }
 
diff --git a/pgxn/neon/communicator/src/init.rs b/pgxn/neon/communicator/src/init.rs
index 20bb4923e8..f5af93cc97 100644
--- a/pgxn/neon/communicator/src/init.rs
+++ b/pgxn/neon/communicator/src/init.rs
@@ -23,7 +23,7 @@ use std::mem;
 use std::mem::MaybeUninit;
 use std::os::fd::OwnedFd;
 
-use crate::backend_comms::NeonIOHandle;
+use crate::backend_comms::NeonIORequestSlot;
 use crate::integrated_cache::IntegratedCacheInitStruct;
 
 /// This struct is created in the postmaster process, and inherited to
@@ -36,7 +36,7 @@ pub struct CommunicatorInitStruct {
     // Shared memory data structures
     pub num_neon_request_slots: u32,
 
-    pub neon_request_slots: &'static [NeonIOHandle],
+    pub neon_request_slots: &'static [NeonIORequestSlot],
 
     pub integrated_cache_init_struct: IntegratedCacheInitStruct<'static>,
 }
@@ -56,7 +56,7 @@ impl std::fmt::Debug for CommunicatorInitStruct {
 pub extern "C" fn rcommunicator_shmem_size(num_neon_request_slots: u32) -> u64 {
     let mut size = 0;
 
-    size += mem::size_of::<NeonIOHandle>() * num_neon_request_slots as usize;
+    size += mem::size_of::<NeonIORequestSlot>() * num_neon_request_slots as usize;
 
     // For integrated_cache's Allocator. TODO: make this adjustable
     size += IntegratedCacheInitStruct::shmem_size();
@@ -80,16 +80,16 @@ pub extern "C" fn rcommunicator_shmem_init(
         unsafe { std::slice::from_raw_parts_mut(shmem_area_ptr, shmem_area_len as usize) };
 
     let (neon_request_slots, remaining_area) =
-        alloc_array_from_slice::<NeonIOHandle>(shmem_area, num_neon_request_slots as usize);
+        alloc_array_from_slice::<NeonIORequestSlot>(shmem_area, num_neon_request_slots as usize);
 
     for slot in neon_request_slots.iter_mut() {
-        slot.write(NeonIOHandle::default());
+        slot.write(NeonIORequestSlot::default());
     }
 
     // 'neon_request_slots' is initialized now. (MaybeUninit::slice_assume_init_mut() is nightly-only
     // as of this writing.)
     let neon_request_slots = unsafe {
-        std::mem::transmute::<&mut [MaybeUninit<NeonIOHandle>], &mut [NeonIOHandle]>(
+        std::mem::transmute::<&mut [MaybeUninit<NeonIORequestSlot>], &mut [NeonIORequestSlot]>(
             neon_request_slots,
         )
     };
diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index 3ae187ac16..43145f7f22 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -4,7 +4,7 @@ use std::os::fd::OwnedFd;
 use std::path::PathBuf;
 use std::str::FromStr as _;
 
-use crate::backend_comms::NeonIOHandle;
+use crate::backend_comms::NeonIORequestSlot;
 use crate::file_cache::FileCache;
 use crate::global_allocator::MyAllocatorCollector;
 use crate::init::CommunicatorInitStruct;
@@ -36,7 +36,7 @@ pub struct CommunicatorWorkerProcessStruct<'a> {
     client: PageserverClient,
 
     /// Request slots that backends use to send IO requests to the communicator.
-    neon_request_slots: &'a [NeonIOHandle],
+    neon_request_slots: &'a [NeonIORequestSlot],
 
     /// Notification pipe. Backends use this to notify the communicator that a request is waiting to
     /// be processed in one of the request slots.

From ffeede085e3008616872372ac98edcef573c8677 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 10 Jul 2025 12:58:22 +0100
Subject: [PATCH 063/163] libs: move metric collection for pageserver and
 safekeeper in a background task (#12525)

## Problem

Safekeeper and pageserver metrics collection might time out. We've seen
this in both hadron and neon.

## Summary of changes

This PR moves metrics collection in PS/SK to the background so that we
will always get some metrics, despite there may be some delays. Will
leave it to the future work to reduce metrics collection time.

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
---
 libs/http-utils/src/endpoint.rs               | 37 ++++++++-
 libs/pageserver_api/src/config.rs             |  2 +
 libs/utils/src/lib.rs                         |  2 +
 libs/utils/src/metrics_collector.rs           | 75 +++++++++++++++++++
 pageserver/src/bin/pageserver.rs              | 41 +++++++++-
 pageserver/src/config.rs                      |  6 ++
 pageserver/src/http/routes.rs                 |  7 +-
 pageserver/src/lib.rs                         | 12 +++
 safekeeper/src/bin/safekeeper.rs              | 27 +++++++
 safekeeper/src/http/routes.rs                 |  9 ++-
 safekeeper/src/lib.rs                         |  2 +
 safekeeper/src/wal_backup.rs                  |  2 +-
 .../tests/walproposer_sim/safekeeper.rs       |  1 +
 test_runner/fixtures/pageserver/http.py       |  2 +-
 test_runner/fixtures/safekeeper/http.py       |  2 +-
 15 files changed, 217 insertions(+), 10 deletions(-)
 create mode 100644 libs/utils/src/metrics_collector.rs

diff --git a/libs/http-utils/src/endpoint.rs b/libs/http-utils/src/endpoint.rs
index f32ced1180..a61bf8e08a 100644
--- a/libs/http-utils/src/endpoint.rs
+++ b/libs/http-utils/src/endpoint.rs
@@ -20,6 +20,7 @@ use tokio_stream::wrappers::ReceiverStream;
 use tokio_util::io::ReaderStream;
 use tracing::{Instrument, debug, info, info_span, warn};
 use utils::auth::{AuthError, Claims, SwappableJwtAuth};
+use utils::metrics_collector::{METRICS_COLLECTOR, METRICS_STALE_MILLIS};
 
 use crate::error::{ApiError, api_error_handler, route_error_handler};
 use crate::request::{get_query_param, parse_query_param};
@@ -250,9 +251,28 @@ impl std::io::Write for ChannelWriter {
     }
 }
 
-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn prometheus_metrics_handler(
+    req: Request<Body>,
+    force_metric_collection_on_scrape: bool,
+) -> Result<Response<Body>, ApiError> {
     SERVE_METRICS_COUNT.inc();
 
+    // HADRON
+    let requested_use_latest = parse_query_param(&req, "use_latest")?;
+
+    let use_latest = match requested_use_latest {
+        None => force_metric_collection_on_scrape,
+        Some(true) => true,
+        Some(false) => {
+            if force_metric_collection_on_scrape {
+                // We don't cache in this case
+                true
+            } else {
+                false
+            }
+        }
+    };
+
     let started_at = std::time::Instant::now();
 
     let (tx, rx) = mpsc::channel(1);
@@ -277,12 +297,18 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
 
         let _span = span.entered();
 
-        let metrics = metrics::gather();
+        // HADRON
+        let collected = if use_latest {
+            // Skip caching the results if we always force metric collection on scrape.
+            METRICS_COLLECTOR.run_once(!force_metric_collection_on_scrape)
+        } else {
+            METRICS_COLLECTOR.last_collected()
+        };
 
         let gathered_at = std::time::Instant::now();
 
         let res = encoder
-            .encode(&metrics, &mut writer)
+            .encode(&collected.metrics, &mut writer)
             .and_then(|_| writer.flush().map_err(|e| e.into()));
 
         // this instant is not when we finally got the full response sent, sending is done by hyper
@@ -295,6 +321,10 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
         let encoded_in = encoded_at - gathered_at - writer.wait_time();
         let total = encoded_at - started_at;
 
+        // HADRON
+        let staleness_ms = (encoded_at - collected.collected_at).as_millis();
+        METRICS_STALE_MILLIS.set(staleness_ms as i64);
+
         match res {
             Ok(()) => {
                 tracing::info!(
@@ -303,6 +333,7 @@ pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<
                     spawning_ms = spawned_in.as_millis(),
                     collection_ms = collected_in.as_millis(),
                     encoding_ms = encoded_in.as_millis(),
+                    stalenss_ms = staleness_ms,
                     "responded /metrics"
                 );
             }
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
index 9e9c7a4dcb..f01c65d1bd 100644
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -274,6 +274,7 @@ pub struct ConfigToml {
     pub basebackup_cache_config: Option<BasebackupCacheConfig>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub image_layer_generation_large_timeline_threshold: Option<u64>,
+    pub force_metric_collection_on_scrape: bool,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -831,6 +832,7 @@ impl Default for ConfigToml {
             basebackup_cache_config: None,
             posthog_config: None,
             image_layer_generation_large_timeline_threshold: Some(2 * 1024 * 1024 * 1024),
+            force_metric_collection_on_scrape: true,
         }
     }
 }
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 11f787562c..2b81da017d 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -99,6 +99,8 @@ pub mod elapsed_accum;
 #[cfg(target_os = "linux")]
 pub mod linux_socket_ioctl;
 
+pub mod metrics_collector;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
 #[doc(hidden)]
 pub use git_version;
diff --git a/libs/utils/src/metrics_collector.rs b/libs/utils/src/metrics_collector.rs
new file mode 100644
index 0000000000..9e57fcd643
--- /dev/null
+++ b/libs/utils/src/metrics_collector.rs
@@ -0,0 +1,75 @@
+use std::{
+    sync::{Arc, RwLock},
+    time::{Duration, Instant},
+};
+
+use metrics::{IntGauge, proto::MetricFamily, register_int_gauge};
+use once_cell::sync::Lazy;
+
+pub static METRICS_STALE_MILLIS: Lazy<IntGauge> = Lazy::new(|| {
+    register_int_gauge!(
+        "metrics_metrics_stale_milliseconds",
+        "The current metrics stale time in milliseconds"
+    )
+    .expect("failed to define a metric")
+});
+
+#[derive(Debug)]
+pub struct CollectedMetrics {
+    pub metrics: Vec<MetricFamily>,
+    pub collected_at: Instant,
+}
+
+impl CollectedMetrics {
+    fn new(metrics: Vec<MetricFamily>) -> Self {
+        Self {
+            metrics,
+            collected_at: Instant::now(),
+        }
+    }
+}
+
+#[derive(Debug)]
+pub struct MetricsCollector {
+    last_collected: RwLock<Arc<CollectedMetrics>>,
+}
+
+impl MetricsCollector {
+    pub fn new() -> Self {
+        Self {
+            last_collected: RwLock::new(Arc::new(CollectedMetrics::new(vec![]))),
+        }
+    }
+
+    #[tracing::instrument(name = "metrics_collector", skip_all)]
+    pub fn run_once(&self, cache_metrics: bool) -> Arc<CollectedMetrics> {
+        let started = Instant::now();
+        let metrics = metrics::gather();
+        let collected = Arc::new(CollectedMetrics::new(metrics));
+        if cache_metrics {
+            let mut guard = self.last_collected.write().unwrap();
+            *guard = collected.clone();
+        }
+        tracing::info!(
+            "Collected {} metric families in {} ms",
+            collected.metrics.len(),
+            started.elapsed().as_millis()
+        );
+        collected
+    }
+
+    pub fn last_collected(&self) -> Arc<CollectedMetrics> {
+        self.last_collected.read().unwrap().clone()
+    }
+}
+
+impl Default for MetricsCollector {
+    fn default() -> Self {
+        Self::new()
+    }
+}
+
+// Interval for metrics collection. Currently hard-coded to be the same as the metrics scape interval from the obs agent
+pub static METRICS_COLLECTION_INTERVAL: Duration = Duration::from_secs(30);
+
+pub static METRICS_COLLECTOR: Lazy<MetricsCollector> = Lazy::new(MetricsCollector::default);
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 78aba25d2e..299fe7e159 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -29,8 +29,8 @@ use pageserver::task_mgr::{
 };
 use pageserver::tenant::{TenantSharedResources, mgr, secondary};
 use pageserver::{
-    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener, http,
-    page_cache, page_service, task_mgr, virtual_file,
+    CancellableTask, ConsumptionMetricsTasks, HttpEndpointListener, HttpsEndpointListener,
+    MetricsCollectionTask, http, page_cache, page_service, task_mgr, virtual_file,
 };
 use postgres_backend::AuthType;
 use remote_storage::GenericRemoteStorage;
@@ -41,6 +41,7 @@ use tracing_utils::OtelGuard;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::crashsafe::syncfs;
 use utils::logging::TracingErrorLayerEnablement;
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{failpoint_support, logging, project_build_tag, project_git_version, tcp_listener};
 
@@ -763,6 +764,41 @@ fn start_pageserver(
         (http_task, https_task)
     };
 
+    /* BEGIN_HADRON */
+    let metrics_collection_task = {
+        let cancel = shutdown_pageserver.child_token();
+        let task = crate::BACKGROUND_RUNTIME.spawn({
+            let cancel = cancel.clone();
+            let background_jobs_barrier = background_jobs_barrier.clone();
+            async move {
+                if conf.force_metric_collection_on_scrape {
+                    return;
+                }
+
+                // first wait until background jobs are cleared to launch.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return; },
+                    _ = background_jobs_barrier.wait() => {}
+                };
+                let mut interval = tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    tokio::select! {
+                        _ = cancel.cancelled() => {
+                            tracing::info!("cancelled metrics collection task, exiting...");
+                             break;
+                        },
+                        _ = interval.tick() => {}
+                    }
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            }
+        });
+        MetricsCollectionTask(CancellableTask { task, cancel })
+    };
+    /* END_HADRON */
+
     let consumption_metrics_tasks = {
         let cancel = shutdown_pageserver.child_token();
         let task = crate::BACKGROUND_RUNTIME.spawn({
@@ -844,6 +880,7 @@ fn start_pageserver(
             https_endpoint_listener,
             page_service,
             page_service_grpc,
+            metrics_collection_task,
             consumption_metrics_tasks,
             disk_usage_eviction_task,
             &tenant_manager,
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index f64c5838ff..bb73ae1dd5 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -256,6 +256,10 @@ pub struct PageServerConf {
     /// Defines what is a big tenant for the purpose of image layer generation.
     /// See Timeline::should_check_if_image_layers_required
     pub image_layer_generation_large_timeline_threshold: Option<u64>,
+
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    pub force_metric_collection_on_scrape: bool,
 }
 
 /// Token for authentication to safekeepers
@@ -437,6 +441,7 @@ impl PageServerConf {
             timeline_import_config,
             basebackup_cache_config,
             image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,
         } = config_toml;
 
         let mut conf = PageServerConf {
@@ -496,6 +501,7 @@ impl PageServerConf {
             timeline_import_config,
             basebackup_cache_config,
             image_layer_generation_large_timeline_threshold,
+            force_metric_collection_on_scrape,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 767bba49e2..ed0a5440cb 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -3938,9 +3938,14 @@ pub fn make_router(
         .expect("construct launch timestamp header middleware"),
     );
 
+    let force_metric_collection_on_scrape = state.conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
     Ok(router
         .data(state)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| request_span(r, prometheus_metrics_handler_wrapper))
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
         .get("/profile/heap", |r| request_span(r, profile_heap_handler))
         .get("/v1/status", |r| api_handler(r, status_handler))
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 0dd3c465e0..0864026f6b 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -73,6 +73,9 @@ pub struct HttpEndpointListener(pub CancellableTask);
 pub struct HttpsEndpointListener(pub CancellableTask);
 pub struct ConsumptionMetricsTasks(pub CancellableTask);
 pub struct DiskUsageEvictionTask(pub CancellableTask);
+// HADRON
+pub struct MetricsCollectionTask(pub CancellableTask);
+
 impl CancellableTask {
     pub async fn shutdown(self) {
         self.cancel.cancel();
@@ -87,6 +90,7 @@ pub async fn shutdown_pageserver(
     https_listener: Option<HttpsEndpointListener>,
     page_service: page_service::Listener,
     grpc_task: Option<CancellableTask>,
+    metrics_collection_task: MetricsCollectionTask,
     consumption_metrics_worker: ConsumptionMetricsTasks,
     disk_usage_eviction_task: Option<DiskUsageEvictionTask>,
     tenant_manager: &TenantManager,
@@ -211,6 +215,14 @@ pub async fn shutdown_pageserver(
     // Best effort to persist any outstanding deletions, to avoid leaking objects
     deletion_queue.shutdown(Duration::from_secs(5)).await;
 
+    // HADRON
+    timed(
+        metrics_collection_task.0.shutdown(),
+        "shutdown metrics collections metrics",
+        Duration::from_secs(1),
+    )
+    .await;
+
     timed(
         consumption_metrics_worker.0.shutdown(),
         "shutdown consumption metrics",
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 8fda625817..b2d5976ef4 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -37,6 +37,7 @@ use tracing::*;
 use utils::auth::{JwtAuth, Scope, SwappableJwtAuth};
 use utils::id::NodeId;
 use utils::logging::{self, LogFormat, SecretString};
+use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{pid_file, project_build_tag, project_git_version, tcp_listener};
 
@@ -243,6 +244,11 @@ struct Args {
     #[arg(long)]
     enable_tls_wal_service_api: bool,
 
+    /// Controls whether to collect all metrics on each scrape or to return potentially stale
+    /// results.
+    #[arg(long, default_value_t = true)]
+    force_metric_collection_on_scrape: bool,
+
     /// Run in development mode (disables security checks)
     #[arg(long, help = "Run in development mode (disables security checks)")]
     dev: bool,
@@ -428,6 +434,7 @@ async fn main() -> anyhow::Result<()> {
         ssl_ca_certs,
         use_https_safekeeper_api: args.use_https_safekeeper_api,
         enable_tls_wal_service_api: args.enable_tls_wal_service_api,
+        force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
     });
 
     // initialize sentry if SENTRY_DSN is provided
@@ -640,6 +647,26 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
         .map(|res| ("broker main".to_owned(), res));
     tasks_handles.push(Box::pin(broker_task_handle));
 
+    /* BEGIN_HADRON */
+    if conf.force_metric_collection_on_scrape {
+        let metrics_handle = current_thread_rt
+            .as_ref()
+            .unwrap_or_else(|| BACKGROUND_RUNTIME.handle())
+            .spawn(async move {
+                let mut interval: tokio::time::Interval =
+                    tokio::time::interval(METRICS_COLLECTION_INTERVAL);
+                loop {
+                    interval.tick().await;
+                    tokio::task::spawn_blocking(|| {
+                        METRICS_COLLECTOR.run_once(true);
+                    });
+                }
+            })
+            .map(|res| ("broker main".to_owned(), res));
+        tasks_handles.push(Box::pin(metrics_handle));
+    }
+    /* END_HADRON */
+
     set_build_info_metric(GIT_VERSION, BUILD_TAG);
 
     // TODO: update tokio-stream, convert to real async Stream with
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 384c582678..4b061c65d9 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -699,6 +699,11 @@ pub fn make_router(
         }))
     }
 
+    let force_metric_collection_on_scrape = conf.force_metric_collection_on_scrape;
+
+    let prometheus_metrics_handler_wrapper =
+        move |req| prometheus_metrics_handler(req, force_metric_collection_on_scrape);
+
     // NB: on any changes do not forget to update the OpenAPI spec
     // located nearby (/safekeeper/src/http/openapi_spec.yaml).
     let auth = conf.http_auth.clone();
@@ -706,7 +711,9 @@ pub fn make_router(
         .data(conf)
         .data(global_timelines)
         .data(auth)
-        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
+        .get("/metrics", move |r| {
+            request_span(r, prometheus_metrics_handler_wrapper)
+        })
         .get("/profile/cpu", |r| request_span(r, profile_cpu_handler))
         .get("/profile/heap", |r| request_span(r, profile_heap_handler))
         .get("/v1/status", |r| request_span(r, status_handler))
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index c461c071da..c0b5403ebf 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -134,6 +134,7 @@ pub struct SafeKeeperConf {
     pub ssl_ca_certs: Vec<Pem>,
     pub use_https_safekeeper_api: bool,
     pub enable_tls_wal_service_api: bool,
+    pub force_metric_collection_on_scrape: bool,
 }
 
 impl SafeKeeperConf {
@@ -183,6 +184,7 @@ impl SafeKeeperConf {
             ssl_ca_certs: Vec::new(),
             use_https_safekeeper_api: false,
             enable_tls_wal_service_api: false,
+            force_metric_collection_on_scrape: true,
         }
     }
 }
diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs
index 7e10847a1b..0e8dfd64c3 100644
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -166,7 +166,7 @@ fn hadron_determine_offloader(mgr: &Manager, state: &StateSnapshot) -> (Option<N
 
     let backup_lag = state.commit_lsn.checked_sub(state.backup_lsn);
     if backup_lag.is_none() {
-        info!("Backup lag is None. Skipping re-election.");
+        debug!("Backup lag is None. Skipping re-election.");
         return (offloader, election_dbg_str);
     }
 
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 1f6990c682..280cd790a4 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -190,6 +190,7 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         ssl_ca_certs: Vec::new(),
         use_https_safekeeper_api: false,
         enable_tls_wal_service_api: false,
+        force_metric_collection_on_scrape: true,
     };
 
     let mut global = GlobalMap::new(disk, conf.clone())?;
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 79cfba8da6..8e7d957b22 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -1002,7 +1002,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
 
     def get_metrics_str(self) -> str:
         """You probably want to use get_metrics() instead."""
-        res = self.get(f"http://localhost:{self.port}/metrics")
+        res = self.get(f"http://localhost:{self.port}/metrics?use_latest=true")
         self.verbose_error(res)
         return res.text
 
diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py
index 942b620be6..ceb00c0f90 100644
--- a/test_runner/fixtures/safekeeper/http.py
+++ b/test_runner/fixtures/safekeeper/http.py
@@ -143,7 +143,7 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
 
     def get_metrics_str(self) -> str:
         """You probably want to use get_metrics() instead."""
-        request_result = self.get(f"http://localhost:{self.port}/metrics")
+        request_result = self.get(f"http://localhost:{self.port}/metrics?use_latest=true")
         request_result.raise_for_status()
         return request_result.text
 

From f30c59bec9772d84f93184eeeebd40a79efc7175 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 15:02:00 +0300
Subject: [PATCH 064/163] Improve comments on request slots

---
 pgxn/neon/communicator/src/backend_comms.rs   | 84 +++++++++++--------
 .../communicator/src/backend_interface.rs     |  2 +-
 2 files changed, 51 insertions(+), 35 deletions(-)

diff --git a/pgxn/neon/communicator/src/backend_comms.rs b/pgxn/neon/communicator/src/backend_comms.rs
index 5851bd0d7a..d2f65a8fd4 100644
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -1,9 +1,10 @@
-//! This module implements a request/response "slot" for submitting requests from backends
-//! to the communicator process.
+//! This module implements a request/response "slot" for submitting
+//! requests from backends to the communicator process.
 //!
 //! NB: The "backend" side of this code runs in Postgres backend processes,
 //! which means that it is not safe to use the 'tracing' crate for logging, nor
-//! to launch threads or use tokio tasks.
+//! to launch threads or use tokio tasks!
+
 use std::cell::UnsafeCell;
 use std::sync::atomic::fence;
 use std::sync::atomic::{AtomicI32, Ordering};
@@ -12,7 +13,8 @@ use crate::neon_request::{NeonIORequest, NeonIOResult};
 
 use atomic_enum::atomic_enum;
 
-/// One request/response slot. Each backend has its own set of slots that it uses.
+/// One request/response slot. Each backend has its own set of slots that it
+/// uses.
 ///
 /// This is the moral equivalent of PgAioHandle for Postgres AIO requests
 /// Like PgAioHandle, try to keep this small.
@@ -21,7 +23,7 @@ use atomic_enum::atomic_enum;
 ///
 /// ## Lifecycle of a request
 ///
-/// The slot is always owned by either the backend process or the communicator
+/// A slot is always owned by either the backend process or the communicator
 /// process, depending on the 'state'. Only the owning process is allowed to
 /// read or modify the slot, except for reading the 'state' itself to check who
 /// owns it.
@@ -39,39 +41,41 @@ use atomic_enum::atomic_enum;
 /// slot for a new request.
 ///
 /// For correctness of the above protocol, we really only need two states:
-/// "owned by backend" and "owned by communicator process. But to help with
-/// debugging, there are a few more states. When the backend starts to fill in
-/// the request details in the slot, it first sets the state from Idle to
-/// Filling, and when it's done with that, from Filling to Submitted. In the
-/// Filling state, the slot is still owned by the backend. Similarly, when the
-/// communicator process starts to process a request, it sets it to Processing
-/// state first, but the slot is still owned by the communicator process.
+/// "owned by backend" and "owned by communicator process". But to help with
+/// debugging and better assertions, there are a few more states. When the
+/// backend starts to fill in the request details in the slot, it first sets the
+/// state from Idle to Filling, and when it's done with that, from Filling to
+/// Submitted. In the Filling state, the slot is still owned by the
+/// backend. Similarly, when the communicator process starts to process a
+/// request, it sets it to Processing state first, but the slot is still owned
+/// by the communicator process.
 ///
 /// This struct doesn't handle waking up the communicator process when a request
-/// has been submitted or when a response is ready. We only store the 'owner_procno'
-/// which can be used for waking up the backend on completion, but the wakeups are
-/// performed elsewhere.
+/// has been submitted or when a response is ready. The 'owner_procno' is used
+/// for waking up the backend on completion, but that happens elsewhere.
 pub struct NeonIORequestSlot {
     /// similar to PgAioHandleState
     state: AtomicNeonIORequestSlotState,
 
-    /// The owning process's ProcNumber. The worker process uses this to set the process's
-    /// latch on completion.
+    /// The owning process's ProcNumber. The worker process uses this to set the
+    /// process's latch on completion.
     ///
-    /// (This could be calculated from num_neon_request_slots_per_backend and the index of
-    /// this slot in the overall 'neon_requst_slots array')
+    /// (This could be calculated from num_neon_request_slots_per_backend and
+    /// the index of this slot in the overall 'neon_requst_slots array'. But we
+    /// prefer the communicator process to not know how the request slots are
+    /// divided between the backends.)
     owner_procno: AtomicI32,
 
-    /// SAFETY: This is modified by fill_request(), after it has established ownership
-    /// of the slot by setting state from Idle to Filling
+    /// SAFETY: This is modified by submit_request(), after it has established
+    /// ownership of the slot by setting state from Idle to Filling
     request: UnsafeCell<NeonIORequest>,
 
-    /// valid when state is Completed
+    /// Valid when state is Completed
     ///
-    /// SAFETY: This is modified by RequestProcessingGuard::complete(). There can be
-    /// only one RequestProcessingGuard outstanding for a slot at a time, because
-    /// it is returned by start_processing_request() which checks the state, so
-    /// RequestProcessingGuard has exclusive access to the slot.
+    /// SAFETY: This is modified by RequestProcessingGuard::complete(). There
+    /// can be only one RequestProcessingGuard outstanding for a slot at a time,
+    /// because it is returned by start_processing_request() which checks the
+    /// state, so RequestProcessingGuard has exclusive access to the slot.
     result: UnsafeCell<NeonIOResult>,
 }
 
@@ -96,7 +100,7 @@ impl Default for NeonIORequestSlot {
 pub enum NeonIORequestSlotState {
     Idle,
 
-    /// backend is filling in the request
+    /// Backend is filling in the request
     Filling,
 
     /// Backend has submitted the request to the communicator, but the
@@ -112,12 +116,17 @@ pub enum NeonIORequestSlotState {
 }
 
 impl NeonIORequestSlot {
-    pub fn fill_request(&self, request: &NeonIORequest, proc_number: i32) {
-        // Verify that the slot is in Idle state previously, and start filling it.
+    /// Write a request to the slot, and mark it as Submitted.
+    ///
+    /// Note: This does not wake up the worker process to actually process
+    /// the request. It's the caller's responsibility to do that.
+    pub fn submit_request(&self, request: &NeonIORequest, proc_number: i32) {
+        // Verify that the slot is in Idle state previously, and put it in
+        // Filling state.
         //
-        // XXX: This step isn't strictly necessary. Assuming the caller didn't screw up
-        // and try to use a slot that's already in use, we could fill the slot and
-        // switch it directly from Idle to Submitted state.
+        // XXX: This step isn't strictly necessary. Assuming the caller didn't
+        // screw up and try to use a slot that's already in use, we could fill
+        // the slot and switch it directly from Idle to Submitted state.
         if let Err(s) = self.state.compare_exchange(
             NeonIORequestSlotState::Idle,
             NeonIORequestSlotState::Filling,
@@ -177,6 +186,12 @@ impl NeonIORequestSlot {
     }
 }
 
+/// [`NeonIORequestSlot::start_processing_request`] returns this guard object to
+/// indicate that the the caller now "owns" the slot, until it calls
+/// [`RequestProcessingGuard::completed`].
+///
+/// TODO: implement Drop on this, to mark the request as Aborted or Errored
+/// if [`RequestProcessingGuard::completed`] is not called.
 pub struct RequestProcessingGuard<'a>(&'a NeonIORequestSlot);
 
 unsafe impl<'a> Send for RequestProcessingGuard<'a> {}
@@ -192,12 +207,13 @@ impl<'a> RequestProcessingGuard<'a> {
     }
 
     pub fn completed(self, result: NeonIOResult) {
+        // Store the result to the slot.
         unsafe {
             *self.0.result.get() = result;
         };
 
-        // Ok, we have completed the IO. Mark the request as completed. After that,
-        // we no longer have ownership of the slot, and must not modify it.
+        // Mark the request as completed. After that, we no longer have
+        // ownership of the slot, and must not modify it.
         let old_state = self
             .0
             .state
diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs
index 91ecf3f4c4..a45583d5e3 100644
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -232,6 +232,6 @@ impl<'t> CommunicatorBackendStruct<'t> {
     pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
         let my_proc_number = self.my_proc_number;
 
-        self.neon_request_slots[request_slot_idx as usize].fill_request(request, my_proc_number);
+        self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
     }
 }

From 5da06d412930096fab1f5d03f9cd39051a0afb70 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 15:03:23 +0300
Subject: [PATCH 065/163] Make start_neon_io_request() wakeup the communicator
 process

All the callers did that previously. So rather than document that the
caller needs to do it, just do it in start_neon_io_request() straight
away. (We might want to revisit this if we get codepaths where the C
code submits multiple IO requests as a batch. In that case, it would
be more efficient to fill all the request slots first and only send
one notification to the pipe for all of them)
---
 .../communicator/src/backend_interface.rs     | 29 ++++++++-----------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs
index a45583d5e3..45715abee5 100644
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -76,9 +76,6 @@ pub extern "C" fn bcomm_start_io_request(
     // Create neon request and submit it
     bs.start_neon_io_request(slot_idx, request);
 
-    // Tell the communicator about it
-    bs.submit_request(slot_idx);
-
     slot_idx
 }
 
@@ -118,9 +115,6 @@ pub extern "C" fn bcomm_start_get_page_v_request(
     // Create neon request and submit it
     bs.start_neon_io_request(slot_idx, request);
 
-    // Tell the communicator about it
-    bs.submit_request(slot_idx);
-
     slot_idx
 }
 
@@ -208,10 +202,21 @@ pub extern "C" fn bcomm_cache_contains(
 }
 
 impl<'t> CommunicatorBackendStruct<'t> {
+    /// The slot must be free, or this panics.
+    pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
+        let my_proc_number = self.my_proc_number;
+
+        self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
+
+        // Tell the communicator about it
+        self.notify_about_request(request_slot_idx);
+    }
+
     /// Send a wakeup to the communicator process
-    fn submit_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
+    fn notify_about_request(self: &CommunicatorBackendStruct<'t>, request_slot_idx: i32) {
         // wake up communicator by writing the idx to the submission pipe
         //
+
         // This can block, if the pipe is full. That should be very rare,
         // because the communicator tries hard to drain the pipe to prevent
         // that. Also, there's a natural upper bound on how many wakeups can be
@@ -224,14 +229,4 @@ impl<'t> CommunicatorBackendStruct<'t> {
         let _res = nix::unistd::write(&self.submission_pipe_write_fd, &idxbuf);
         // FIXME: check result, return any errors
     }
-
-    /// Note: there's no guarantee on when the communicator might pick it up. You should ring
-    /// the doorbell. But it might pick it up immediately.
-    ///
-    /// The slot must be free, or this panics.
-    pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
-        let my_proc_number = self.my_proc_number;
-
-        self.neon_request_slots[request_slot_idx as usize].submit_request(request, my_proc_number);
-    }
 }

From c14cf15b525a3ca9f6b7ed8f7aa92fdfed8aefaa Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 15:06:59 +0300
Subject: [PATCH 066/163] Tidy up the memory ordering instructions on request
 slot code

I believe the explicit memory fence instructions are
unnecessary. Performing a store with Release ordering makes all the
previous non-atomic writes visible too. Per rust docs for Ordering::Release
( https://doc.rust-lang.org/std/sync/atomic/enum.Ordering.html#variant.Release):

> When coupled with a store, all previous operations become ordered
> before any load of this value with Acquire (or stronger)
> ordering. In particular, all previous writes become visible to all
> threads that perform an Acquire (or stronger) load of this value.
>
> ...
>
> Corresponds to memory_order_release in C++20.

The "all previous writes" means non-atomic writes too. It's not very
clear from that text, but the C++20 docs that it links to is more
explicit about it:

> All memory writes (including non-atomic and relaxed atomic) that
> happened-before the atomic store from the point of view of thread A,
> become visible side-effects in thread B. That is, once the atomic
> load is completed, thread B is guaranteed to see everything thread A
> wrote to memory.

In addition to removing the fence instructions, fix the comments on
each atomic Acquire operation to point to the correct Release
counterpart. We had such comments but they had gone out-of-date as
code has moved.
---
 pgxn/neon/communicator/src/backend_comms.rs | 31 +++++++++++----------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/pgxn/neon/communicator/src/backend_comms.rs b/pgxn/neon/communicator/src/backend_comms.rs
index d2f65a8fd4..704b7269b1 100644
--- a/pgxn/neon/communicator/src/backend_comms.rs
+++ b/pgxn/neon/communicator/src/backend_comms.rs
@@ -6,7 +6,6 @@
 //! to launch threads or use tokio tasks!
 
 use std::cell::UnsafeCell;
-use std::sync::atomic::fence;
 use std::sync::atomic::{AtomicI32, Ordering};
 
 use crate::neon_request::{NeonIORequest, NeonIOResult};
@@ -136,11 +135,13 @@ impl NeonIORequestSlot {
             panic!("unexpected state in request slot: {s:?}");
         }
 
-        // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
-        fence(Ordering::Acquire);
-
+        // Fill in the request details
         self.owner_procno.store(proc_number, Ordering::Relaxed);
         unsafe { *self.request.get() = *request }
+
+        // This synchronizes-with store/swap in [`start_processing_request`].
+        // Note that this ensures that the previous non-atomic writes visible
+        // to other threads too.
         self.state
             .store(NeonIORequestSlotState::Submitted, Ordering::Release);
     }
@@ -150,13 +151,12 @@ impl NeonIORequestSlot {
     }
 
     pub fn try_get_result(&self) -> Option<NeonIOResult> {
-        // FIXME: ordering?
-        let state = self.state.load(Ordering::Relaxed);
+        // This synchronizes-with the store/swap in [`RequestProcessingGuard::completed`]
+        let state = self.state.load(Ordering::Acquire);
         if state == NeonIORequestSlotState::Completed {
-            // This fence synchronizes-with store/swap in `communicator_process_main_loop`.
-            fence(Ordering::Acquire);
             let result = unsafe { *self.result.get() };
-            self.state.store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
+            self.state
+                .store(NeonIORequestSlotState::Idle, Ordering::Relaxed);
             Some(result)
         } else {
             None
@@ -165,14 +165,16 @@ impl NeonIORequestSlot {
 
     /// Read the IO request from the slot indicated in the wakeup
     pub fn start_processing_request<'a>(&'a self) -> Option<RequestProcessingGuard<'a>> {
-        // XXX: using compare_exchange for this is not strictly necessary, as long as
-        // the communicator process has _some_ means of tracking which requests it's
-        // already processing. That could be a flag somewhere in communicator's private
-        // memory, for example.
+        // XXX: using atomic load rather than compare_exchange would be
+        // sufficient here, as long as the communicator process has _some_ means
+        // of tracking which requests it's already processing. That could be a
+        // flag somewhere in communicator's private memory, for example.
+        //
+        // This synchronizes-with the store in [`submit_request`].
         if let Err(s) = self.state.compare_exchange(
             NeonIORequestSlotState::Submitted,
             NeonIORequestSlotState::Processing,
-            Ordering::Relaxed,
+            Ordering::Acquire,
             Ordering::Relaxed,
         ) {
             // FIXME surprising state. This is unexpected at the moment, but if we
@@ -180,7 +182,6 @@ impl NeonIORequestSlot {
             // read from the pipe, then this could happen
             panic!("unexpected state in request slot: {s:?}");
         }
-        fence(Ordering::Acquire);
 
         Some(RequestProcessingGuard(self))
     }

From dcf8e0565f620d4977fcd971fd7614b30fbf6eeb Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 14:47:36 +0300
Subject: [PATCH 067/163] Improve communicator README

---
 pgxn/neon/communicator/README.md  | 39 +++++++++++++++++++++----------
 pgxn/neon/communicator/src/lib.rs |  2 --
 2 files changed, 27 insertions(+), 14 deletions(-)

diff --git a/pgxn/neon/communicator/README.md b/pgxn/neon/communicator/README.md
index 8887a01cbc..a18f64c9f6 100644
--- a/pgxn/neon/communicator/README.md
+++ b/pgxn/neon/communicator/README.md
@@ -49,21 +49,36 @@ slots are statically allocated for each backend, and must not be
 accessed by other backends. The worker process reads requests from the
 shared memory slots, and writes responses back to the slots.
 
-To submit an IO request, first pick one of your backend's free slots,
-and write the details of the IO request in the slot. Finally, update
-the 'state' field of the slot to Submitted. That informs the worker
-process that it can start processing the request. Once the state has
-been set to Submitted, the backend *must not* access the slot anymore,
-until the worker process sets its state to 'Completed'. In other
-words, each slot is owned by either the backend or the worker process
-at all times, and the 'state' field indicates who has ownership at the
-moment.
+Here's an example snapshot of the system, when two requests from two
+different backends are in progress:
+
+```
+Backends           Request slots          Communicator process
+---------          -------------          --------------------
+
+Backend 1          1: Idle
+                   2: Idle
+                   3: Processing          tokio task handling request 3
+
+Backend 2          4: Completed
+                   5: Processing          tokio task handling request 5
+                   6: Idle
+
+...                ...
+```
+
+To submit an IO request, the backend first picks one of its Idle
+slots, writes the IO request in the slot, and updates it to
+'Submitted' state. That transfers the ownership of the slot to the
+worker process, until the worker process marks the request as
+Completed. The worker process spawns a separate Tokio task for each
+request.
 
 To inform the worker process that a request slot has a pending IO
 request, there's a pipe shared by the worker process and all backend
-processes. After you have changed the slot's state to Submitted, write
-the index of the request slot to the pipe. This wakes up the worker
-process.
+processes. The backend writes the index of the request slot to the
+pipe after changing the slot's state to Submitted. This wakes up the
+worker process.
 
 (Note that the pipe is just used for wakeups, but the worker process
 is free to pick up Submitted IO requests even without receiving the
diff --git a/pgxn/neon/communicator/src/lib.rs b/pgxn/neon/communicator/src/lib.rs
index fbe582df78..734e89a89a 100644
--- a/pgxn/neon/communicator/src/lib.rs
+++ b/pgxn/neon/communicator/src/lib.rs
@@ -1,9 +1,7 @@
-//!
 //! Three main parts:
 //! - async tokio communicator core, which receives requests and processes them.
 //! - Main loop and requests queues, which routes requests from backends to the core
 //! - the per-backend glue code, which submits requests
-//!
 
 mod backend_comms;
 

From bceafc6c328fb797b8d450ab52571b198ded0d87 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 10 Jul 2025 16:36:30 +0300
Subject: [PATCH 068/163] Update LFC cache hit/miss counters

Fixes EXPLAIN (FILECACHE) option
---
 pgxn/neon/communicator_new.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index cc0a1634a7..bdd5a75d62 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -889,6 +889,9 @@ retry:
 			elog(DEBUG1, "read from local cache file was superseded by concurrent update");
 			goto retry;
 		}
+
+		pgBufferUsage.file_cache.hits += nblocks;
+
 		return;
 	}
 	Assert(request_idx == my_next_slot_idx);
@@ -898,6 +901,12 @@ retry:
 	inflight_requests[num_inflight_requests] = request_idx;
 	num_inflight_requests++;
 
+	/*
+	 * XXX: If some blocks were in cache but not others, we count all blocks
+	 * as a cache miss.
+	 */
+	pgBufferUsage.file_cache.misses += nblocks;
+
 	wait_request_completion(request_idx, &result);
 	Assert(num_inflight_requests == 1);
 	Assert(inflight_requests[0] == request_idx);

From d33b3c7457e1bbe15f1961ebec749249c6f77f5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Thu, 10 Jul 2025 16:03:20 +0200
Subject: [PATCH 069/163] Print viability via custom printing impl (#12544)

As per
https://github.com/neondatabase/neon/pull/12485#issuecomment-3056525882
,

we don't want to print the viability error via a debug impl as it prints
the backtrace. SafekeeperInfo doesn't have a display impl, so fall back
to `Debug` for the `Ok` case. It gives single line output so it's okay
to use `Debug` for it.

Follow up of https://github.com/neondatabase/neon/pull/12485
---
 storage_controller/src/service.rs | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index d2f7287be9..3844570b47 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -1984,11 +1984,14 @@ impl Service {
         });
 
         // Check that there is enough safekeepers configured that we can create new timelines
-        let test_sk_res = this.safekeepers_for_new_timeline().await;
+        let test_sk_res_str = match this.safekeepers_for_new_timeline().await {
+            Ok(v) => format!("Ok({v:?})"),
+            Err(v) => format!("Err({v:})"),
+        };
         tracing::info!(
             timeline_safekeeper_count = config.timeline_safekeeper_count,
             timelines_onto_safekeepers = config.timelines_onto_safekeepers,
-            "viability test result (test timeline creation on safekeepers): {test_sk_res:?}",
+            "viability test result (test timeline creation on safekeepers): {test_sk_res_str}",
         );
 
         Ok(this)

From be5bbaecadda71478638608c469c184aaf124bf5 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 10 Jul 2025 10:28:58 -0400
Subject: [PATCH 070/163] fix(storcon): correctly handle 404 error in lsn lease
 (#12537)

## Problem

close LKB-253

## Summary of changes

404 for timeline requests could happen when the tenant is intended to be
on a pageserver but not attached yet. This patch adds handling for the
lease request. In the future, we should extend this handling to more
operations.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_controller/src/service.rs | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 3844570b47..9c1b81d261 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -4761,6 +4761,7 @@ impl Service {
         )
         .await;
 
+        let mut retry_if_not_attached = false;
         let targets = {
             let locked = self.inner.read().unwrap();
             let mut targets = Vec::new();
@@ -4777,6 +4778,24 @@ impl Service {
                         .expect("Pageservers may not be deleted while referenced");
 
                     targets.push((*tenant_shard_id, node.clone()));
+
+                    if let Some(location) = shard.observed.locations.get(node_id) {
+                        if let Some(ref conf) = location.conf {
+                            if conf.mode != LocationConfigMode::AttachedSingle
+                                && conf.mode != LocationConfigMode::AttachedMulti
+                            {
+                                // If the shard is attached as secondary, we need to retry if 404.
+                                retry_if_not_attached = true;
+                            }
+                            // If the shard is attached as primary, we should succeed.
+                        } else {
+                            // Location conf is not available yet, retry if 404.
+                            retry_if_not_attached = true;
+                        }
+                    } else {
+                        // The shard is not attached to the intended pageserver yet, retry if 404.
+                        retry_if_not_attached = true;
+                    }
                 }
             }
             targets
@@ -4807,6 +4826,18 @@ impl Service {
                         valid_until = Some(lease.valid_until);
                     }
                 }
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _))
+                    if retry_if_not_attached =>
+                {
+                    // This is expected if the attach is not finished yet. Return 503 so that the client can retry.
+                    return Err(ApiError::ResourceUnavailable(
+                        format!(
+                            "Timeline is not attached to the pageserver {} yet, please retry",
+                            node.get_id()
+                        )
+                        .into(),
+                    ));
+                }
                 Err(e) => {
                     return Err(passthrough_api_error(&node, e));
                 }

From 2c6b327be6619e66038427e5cbfe42159498a949 Mon Sep 17 00:00:00 2001
From: HaoyuHuang <haoyu.huang.68@gmail.com>
Date: Thu, 10 Jul 2025 07:39:38 -0700
Subject: [PATCH 071/163] A few PS changes (#12540)

# TLDR
All changes are no-op except some metrics.

## Summary of changes I
### Pageserver
Added a new global counter metric
`pageserver_pagestream_handler_results_total` that categorizes
pagestream request results according to their outcomes:
1. Success
2. Internal errors
3. Other errors

Internal errors include:
1. Page reconstruction error: This probably indicates a pageserver
bug/corruption
2. LSN timeout error: Could indicate overload or bugs with PS's ability
to reach other components
3. Misrouted request error: Indicates bugs in the Storage Controller/HCC

Other errors include transient errors that are expected during normal
operation or errors indicating bugs with other parts of the system
(e.g., malformed requests, errors due to cancelled operations during PS
shutdown, etc.)


## Summary of changes II
This PR adds a pageserver endpoint and its counterpart in storage
controller to list visible size of all tenant shards. This will be a
prerequisite of the tenant rebalance command.


## Problem III
We need a way to download WAL
segments/layerfiles from S3 and replay WAL records. We cannot access
production S3 from our laptops directly, and we also can't transfer any
user data out of production systems for GDPR compliance, so we need
solutions.

## Summary of changes III

This PR adds a couple of tools to support the debugging
workflow in production:
1. A new `pagectl download-remote-object` command that can be used to
download remote storage objects assuming the correct access is set up.

## Summary of changes IV
This PR adds a command to list all visible delta and image layers from
index_part. This is useful to debug compaction issues as index_part
often contain a lot of covered layers due to PITR.

---------

Co-authored-by: William Huang <william.huang@databricks.com>
Co-authored-by: Chen Luo <chen.luo@databricks.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 Cargo.lock                                    |   1 +
 pageserver/client/src/mgmt_api.rs             |  66 ++++++++++-
 pageserver/ctl/Cargo.toml                     |   1 +
 pageserver/ctl/src/download_remote_object.rs  |  85 ++++++++++++++
 pageserver/ctl/src/index_part.rs              | 110 +++++++++++++++---
 pageserver/ctl/src/main.rs                    |   6 +
 pageserver/src/http/routes.rs                 |  29 ++++-
 pageserver/src/metrics.rs                     |  18 +++
 pageserver/src/page_service.rs                |  53 ++++++++-
 pageserver/src/tenant.rs                      |  10 ++
 .../src/tenant/storage_layer/layer_name.rs    |   2 +-
 safekeeper/src/metrics.rs                     |   9 ++
 safekeeper/src/safekeeper.rs                  |  19 ++-
 test_runner/fixtures/pageserver/http.py       |   7 ++
 test_runner/regress/test_pageserver_api.py    |  13 +++
 15 files changed, 404 insertions(+), 25 deletions(-)
 create mode 100644 pageserver/ctl/src/download_remote_object.rs

diff --git a/Cargo.lock b/Cargo.lock
index 4150944ad0..85080f8473 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4323,6 +4323,7 @@ dependencies = [
  "pageserver_api",
  "postgres_ffi",
  "remote_storage",
+ "serde",
  "serde_json",
  "svg_fmt",
  "thiserror 1.0.69",
diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index af4be23b9b..fe1ddc2e7d 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BTreeMap, HashMap};
 use std::error::Error as _;
 use std::time::Duration;
 
@@ -251,6 +251,70 @@ impl Client {
         Ok(())
     }
 
+    pub async fn tenant_timeline_compact(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        force_image_layer_creation: bool,
+        must_force_image_layer_creation: bool,
+        scheduled: bool,
+        wait_until_done: bool,
+    ) -> Result<()> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+
+        if force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("force_image_layer_creation", "true");
+        }
+
+        if must_force_image_layer_creation {
+            path.query_pairs_mut()
+                .append_pair("must_force_image_layer_creation", "true");
+        }
+
+        if scheduled {
+            path.query_pairs_mut().append_pair("scheduled", "true");
+        }
+        if wait_until_done {
+            path.query_pairs_mut()
+                .append_pair("wait_until_scheduled_compaction_done", "true");
+            path.query_pairs_mut()
+                .append_pair("wait_until_uploaded", "true");
+        }
+        self.request(Method::PUT, path, ()).await?;
+        Ok(())
+    }
+
+    /* BEGIN_HADRON */
+    pub async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        let mut path = reqwest::Url::parse(&format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}",
+            self.mgmt_api_endpoint
+        ))
+        .expect("Cannot build URL");
+        path.query_pairs_mut()
+            .append_pair("include-image-consistent-lsn", "true");
+
+        let response: reqwest::Response = self.request(Method::GET, path, ()).await?;
+        let body = response.json().await.map_err(Error::ReceiveBody)?;
+        Ok(body)
+    }
+
+    pub async fn list_tenant_visible_size(&self) -> Result<BTreeMap<TenantShardId, u64>> {
+        let uri = format!("{}/v1/list_tenant_visible_size", self.mgmt_api_endpoint);
+        let resp = self.get(&uri).await?;
+        resp.json().await.map_err(Error::ReceiveBody)
+    }
+    /* END_HADRON */
+
     pub async fn tenant_scan_remote_storage(
         &self,
         tenant_id: TenantId,
diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml
index 7b70f0dc87..ba34fa1f69 100644
--- a/pageserver/ctl/Cargo.toml
+++ b/pageserver/ctl/Cargo.toml
@@ -17,6 +17,7 @@ pageserver = { path = ".." }
 pageserver_api.workspace = true
 remote_storage = { path = "../../libs/remote_storage" }
 postgres_ffi.workspace = true
+serde.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
diff --git a/pageserver/ctl/src/download_remote_object.rs b/pageserver/ctl/src/download_remote_object.rs
new file mode 100644
index 0000000000..aa09774701
--- /dev/null
+++ b/pageserver/ctl/src/download_remote_object.rs
@@ -0,0 +1,85 @@
+use camino::Utf8PathBuf;
+use clap::Parser;
+use tokio_util::sync::CancellationToken;
+
+/// Download a specific object from remote storage to a local file.
+///
+/// The remote storage configuration is supplied via the `REMOTE_STORAGE_CONFIG` environment
+/// variable, in the same TOML format that the pageserver itself understands. This allows the
+/// command to work with any cloud supported by the `remote_storage` crate (currently AWS S3,
+/// Azure Blob Storage and local files), as long as the credentials are available via the
+/// standard environment variables expected by the underlying SDKs.
+///
+/// Examples for setting the environment variable:
+///
+/// ```bash
+/// # AWS S3 (region can also be provided via AWS_REGION)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { bucket_name = "my-bucket", bucket_region = "us-east-2" }'
+///
+/// # Azure Blob Storage (account key picked up from AZURE_STORAGE_ACCOUNT_KEY)
+/// export REMOTE_STORAGE_CONFIG='remote_storage = { container = "my-container", account = "my-account" }'
+/// ```
+#[derive(Parser)]
+pub(crate) struct DownloadRemoteObjectCmd {
+    /// Key / path of the object to download (relative to the remote storage prefix).
+    ///
+    /// Examples:
+    ///   "wal/3aa8f.../00000001000000000000000A"
+    ///   "pageserver/v1/tenants/<tenant_id>/timelines/<timeline_id>/layer_12345"
+    pub remote_path: String,
+
+    /// Path of the local file to create. Existing file will be overwritten.
+    ///
+    /// Examples:
+    ///   "./segment"
+    ///   "/tmp/layer_12345.parquet"
+    pub output_file: Utf8PathBuf,
+}
+
+pub(crate) async fn main(cmd: &DownloadRemoteObjectCmd) -> anyhow::Result<()> {
+    use remote_storage::{DownloadOpts, GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+
+    // Fetch remote storage configuration from the environment
+    let config_str = std::env::var("REMOTE_STORAGE_CONFIG").map_err(|_| {
+        anyhow::anyhow!(
+            "'REMOTE_STORAGE_CONFIG' environment variable must be set to a valid remote storage TOML config"
+        )
+    })?;
+
+    let config = RemoteStorageConfig::from_toml_str(&config_str)?;
+
+    // Initialise remote storage client
+    let storage = GenericRemoteStorage::from_config(&config).await?;
+
+    // RemotePath must be relative – leading slashes confuse the parser.
+    let remote_path_str = cmd.remote_path.trim_start_matches('/');
+    let remote_path = RemotePath::from_string(remote_path_str)?;
+
+    let cancel = CancellationToken::new();
+
+    println!(
+        "Downloading '{remote_path}' from remote storage bucket {:?} ...",
+        config.storage.bucket_name()
+    );
+
+    // Start the actual download
+    let download = storage
+        .download(&remote_path, &DownloadOpts::default(), &cancel)
+        .await?;
+
+    // Stream to file
+    let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+    let tmp_path = cmd.output_file.with_extension("tmp");
+    let mut file = tokio::fs::File::create(&tmp_path).await?;
+    tokio::io::copy(&mut reader, &mut file).await?;
+    file.sync_all().await?;
+    // Atomically move into place
+    tokio::fs::rename(&tmp_path, &cmd.output_file).await?;
+
+    println!(
+        "Downloaded to '{}'. Last modified: {:?}, etag: {}",
+        cmd.output_file, download.last_modified, download.etag
+    );
+
+    Ok(())
+}
diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs
index 838d00e490..9801f3c9dc 100644
--- a/pageserver/ctl/src/index_part.rs
+++ b/pageserver/ctl/src/index_part.rs
@@ -1,14 +1,16 @@
 use std::str::FromStr;
 
-use anyhow::Context;
+use anyhow::{Context, Ok};
 use camino::Utf8PathBuf;
 use pageserver::tenant::{
     IndexPart,
     layer_map::{LayerMap, SearchResult},
-    remote_timeline_client::remote_layer_path,
-    storage_layer::{PersistentLayerDesc, ReadableLayerWeak},
+    remote_timeline_client::{index::LayerFileMetadata, remote_layer_path},
+    storage_layer::{LayerName, LayerVisibilityHint, PersistentLayerDesc, ReadableLayerWeak},
 };
 use pageserver_api::key::Key;
+use serde::Serialize;
+use std::collections::BTreeMap;
 use utils::{
     id::{TenantId, TimelineId},
     lsn::Lsn,
@@ -33,6 +35,31 @@ pub(crate) enum IndexPartCmd {
         #[arg(long)]
         lsn: String,
     },
+    /// List all visible delta and image layers at the latest LSN.
+    ListVisibleLayers {
+        #[arg(long)]
+        path: Utf8PathBuf,
+    },
+}
+
+fn create_layer_map_from_index_part(
+    index_part: &IndexPart,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
+) -> LayerMap {
+    let mut layer_map = LayerMap::default();
+    {
+        let mut updates = layer_map.batch_update();
+        for (key, value) in index_part.layer_metadata.iter() {
+            updates.insert_historic(PersistentLayerDesc::from_filename(
+                tenant_shard_id,
+                timeline_id,
+                key.clone(),
+                value.file_size,
+            ));
+        }
+    }
+    layer_map
 }
 
 async fn search_layers(
@@ -49,18 +76,7 @@ async fn search_layers(
         let bytes = tokio::fs::read(path).await?;
         IndexPart::from_json_bytes(&bytes).unwrap()
     };
-    let mut layer_map = LayerMap::default();
-    {
-        let mut updates = layer_map.batch_update();
-        for (key, value) in index_json.layer_metadata.iter() {
-            updates.insert_historic(PersistentLayerDesc::from_filename(
-                tenant_shard_id,
-                timeline_id,
-                key.clone(),
-                value.file_size,
-            ));
-        }
-    }
+    let layer_map = create_layer_map_from_index_part(&index_json, tenant_shard_id, timeline_id);
     let key = Key::from_hex(key)?;
 
     let lsn = Lsn::from_str(lsn).unwrap();
@@ -98,6 +114,69 @@ async fn search_layers(
     Ok(())
 }
 
+#[derive(Debug, Clone, Serialize)]
+struct VisibleLayers {
+    pub total_images: u64,
+    pub total_image_bytes: u64,
+    pub total_deltas: u64,
+    pub total_delta_bytes: u64,
+    pub layer_metadata: BTreeMap<LayerName, LayerFileMetadata>,
+}
+
+impl VisibleLayers {
+    pub fn new() -> Self {
+        Self {
+            layer_metadata: BTreeMap::new(),
+            total_images: 0,
+            total_image_bytes: 0,
+            total_deltas: 0,
+            total_delta_bytes: 0,
+        }
+    }
+
+    pub fn add_layer(&mut self, name: LayerName, layer: LayerFileMetadata) {
+        match name {
+            LayerName::Image(_) => {
+                self.total_images += 1;
+                self.total_image_bytes += layer.file_size;
+            }
+            LayerName::Delta(_) => {
+                self.total_deltas += 1;
+                self.total_delta_bytes += layer.file_size;
+            }
+        }
+        self.layer_metadata.insert(name, layer);
+    }
+}
+
+async fn list_visible_layers(path: &Utf8PathBuf) -> anyhow::Result<()> {
+    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let timeline_id = TimelineId::generate();
+
+    let bytes = tokio::fs::read(path).await.context("read file")?;
+    let index_part = IndexPart::from_json_bytes(&bytes).context("deserialize")?;
+    let layer_map = create_layer_map_from_index_part(&index_part, tenant_shard_id, timeline_id);
+    let mut visible_layers = VisibleLayers::new();
+    let (layers, _key_space) = layer_map.get_visibility(Vec::new());
+    for (layer, visibility) in layers {
+        if visibility == LayerVisibilityHint::Visible {
+            visible_layers.add_layer(
+                layer.layer_name(),
+                index_part
+                    .layer_metadata
+                    .get(&layer.layer_name())
+                    .unwrap()
+                    .clone(),
+            );
+        }
+    }
+    let output = serde_json::to_string_pretty(&visible_layers).context("serialize output")?;
+    println!("{output}");
+
+    Ok(())
+}
+
 pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
     match cmd {
         IndexPartCmd::Dump { path } => {
@@ -114,5 +193,6 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
             key,
             lsn,
         } => search_layers(tenant_id, timeline_id, path, key, lsn).await,
+        IndexPartCmd::ListVisibleLayers { path } => list_visible_layers(path).await,
     }
 }
diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs
index 3cd4faaf2e..e84ad2c87f 100644
--- a/pageserver/ctl/src/main.rs
+++ b/pageserver/ctl/src/main.rs
@@ -4,6 +4,7 @@
 //!
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
 
+mod download_remote_object;
 mod draw_timeline_dir;
 mod index_part;
 mod key;
@@ -16,6 +17,7 @@ use std::time::{Duration, SystemTime};
 
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use download_remote_object::DownloadRemoteObjectCmd;
 use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use page_trace::PageTraceCmd;
@@ -63,6 +65,7 @@ enum Commands {
     /// Debug print a hex key found from logs
     Key(key::DescribeKeyCommand),
     PageTrace(PageTraceCmd),
+    DownloadRemoteObject(DownloadRemoteObjectCmd),
 }
 
 /// Read and update pageserver metadata file
@@ -185,6 +188,9 @@ async fn main() -> anyhow::Result<()> {
         }
         Commands::Key(dkc) => dkc.execute(),
         Commands::PageTrace(cmd) => page_trace::main(&cmd)?,
+        Commands::DownloadRemoteObject(cmd) => {
+            download_remote_object::main(&cmd).await?;
+        }
     };
     Ok(())
 }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index ed0a5440cb..7030ac368d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2,7 +2,9 @@
 //! Management HTTP API
 //!
 use std::cmp::Reverse;
-use std::collections::{BinaryHeap, HashMap};
+use std::collections::BTreeMap;
+use std::collections::BinaryHeap;
+use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
 use std::time::Duration;
@@ -3214,6 +3216,30 @@ async fn get_utilization(
         .map_err(ApiError::InternalServerError)
 }
 
+/// HADRON
+async fn list_tenant_visible_size_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let state = get_state(&request);
+
+    let mut map = BTreeMap::new();
+    for (tenant_shard_id, slot) in state.tenant_manager.list() {
+        match slot {
+            TenantSlot::Attached(tenant) => {
+                let visible_size = tenant.get_visible_size();
+                map.insert(tenant_shard_id, visible_size);
+            }
+            TenantSlot::Secondary(_) | TenantSlot::InProgress(_) => {
+                continue;
+            }
+        }
+    }
+
+    json_response(StatusCode::OK, map)
+}
+
 async fn list_aux_files(
     mut request: Request<Body>,
     _cancel: CancellationToken,
@@ -4151,6 +4177,7 @@ pub fn make_router(
         .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
         .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler))
         .get("/v1/utilization", |r| api_handler(r, get_utilization))
+        .get("/v1/list_tenant_visible_size", |r| api_handler(r, list_tenant_visible_size_handler))
         .post(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files",
             |r| testing_api_handler("ingest_aux_files", r, ingest_aux_files),
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index eb89e166b2..1b783326a0 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2847,6 +2847,24 @@ pub(crate) static MISROUTED_PAGESTREAM_REQUESTS: Lazy<IntCounter> = Lazy::new(||
     .expect("failed to define a metric")
 });
 
+// Global counter for PageStream request results by outcome. Outcomes are divided into 3 categories:
+// - success
+// - internal_error: errors that indicate bugs in the storage cluster (e.g. page reconstruction errors, misrouted requests, LSN timeout errors)
+// - other_error: transient error conditions that are expected in normal operation or indicate bugs with other parts of the system (e.g. error due to pageserver shutdown, malformed requests etc.)
+pub(crate) static PAGESTREAM_HANDLER_RESULTS_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_pagestream_handler_results_total",
+        "Number of pageserver pagestream handler results by outcome (success, internal_error, other_error)",
+        &["outcome"]
+    )
+    .expect("failed to define a metric")
+});
+
+// Constants for pageserver_pagestream_handler_results_total's outcome labels
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_SUCCESS: &str = "success";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR: &str = "internal_error";
+pub(crate) const PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR: &str = "other_error";
+
 // Metrics collected on WAL redo operations
 //
 // We collect the time spent in actual WAL redo ('redo'), and time waiting
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 70fdb2e789..ebb1addcdb 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -70,7 +70,7 @@ use crate::context::{
 };
 use crate::metrics::{
     self, COMPUTE_COMMANDS_COUNTERS, ComputeCommandKind, GetPageBatchBreakReason, LIVE_CONNECTIONS,
-    MISROUTED_PAGESTREAM_REQUESTS, SmgrOpTimer, TimelineMetrics,
+    MISROUTED_PAGESTREAM_REQUESTS, PAGESTREAM_HANDLER_RESULTS_TOTAL, SmgrOpTimer, TimelineMetrics,
 };
 use crate::pgdatadir_mapping::{LsnRange, Version};
 use crate::span::{
@@ -1441,20 +1441,57 @@ impl PageServerHandler {
             let (response_msg, ctx) = match handler_result {
                 Err(e) => match &e.err {
                     PageStreamError::Shutdown => {
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR])
+                            .inc();
+                        // END HADRON
+
                         // If we fail to fulfil a request during shutdown, which may be _because_ of
                         // shutdown, then do not send the error to the client.  Instead just drop the
                         // connection.
                         span.in_scope(|| info!("dropping connection due to shutdown"));
                         return Err(QueryError::Shutdown);
                     }
-                    PageStreamError::Reconnect(reason) => {
-                        span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    PageStreamError::Reconnect(_reason) => {
+                        span.in_scope(|| {
+                            // BEGIN HADRON
+                            // We can get here because the compute node is pointing at the wrong PS. We
+                            // already have a metric to keep track of this so suppressing this log to
+                            // reduce log spam. The information in this log message is not going to be that
+                            // helpful given the volume of logs that can be generated.
+                            // info!("handler requested reconnect: {reason}")
+                            // END HADRON
+                        });
+                        // BEGIN HADRON
+                        PAGESTREAM_HANDLER_RESULTS_TOTAL
+                            .with_label_values(&[
+                                metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                            ])
+                            .inc();
+                        // END HADRON
                         return Err(QueryError::Reconnect);
                     }
                     PageStreamError::Read(_)
                     | PageStreamError::LsnTimeout(_)
                     | PageStreamError::NotFound(_)
                     | PageStreamError::BadRequest(_) => {
+                        // BEGIN HADRON
+                        if let PageStreamError::Read(_) | PageStreamError::LsnTimeout(_) = &e.err {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_INTERNAL_ERROR,
+                                ])
+                                .inc();
+                        } else {
+                            PAGESTREAM_HANDLER_RESULTS_TOTAL
+                                .with_label_values(&[
+                                    metrics::PAGESTREAM_HANDLER_OUTCOME_OTHER_ERROR,
+                                ])
+                                .inc();
+                        }
+                        // END HADRON
+
                         // print the all details to the log with {:#}, but for the client the
                         // error message is enough.  Do not log if shutting down, as the anyhow::Error
                         // here includes cancellation which is not an error.
@@ -1472,7 +1509,15 @@ impl PageServerHandler {
                         )
                     }
                 },
-                Ok((response_msg, _op_timer_already_observed, ctx)) => (response_msg, Some(ctx)),
+                Ok((response_msg, _op_timer_already_observed, ctx)) => {
+                    // BEGIN HADRON
+                    PAGESTREAM_HANDLER_RESULTS_TOTAL
+                        .with_label_values(&[metrics::PAGESTREAM_HANDLER_OUTCOME_SUCCESS])
+                        .inc();
+                    // END HADRON
+
+                    (response_msg, Some(ctx))
+                }
             };
 
             let ctx = ctx.map(|req_ctx| {
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7e2e6d96b8..f67269851a 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -5719,6 +5719,16 @@ impl TenantShard {
             .unwrap_or(0)
     }
 
+    /// HADRON
+    /// Return the visible size of all timelines in this tenant.
+    pub(crate) fn get_visible_size(&self) -> u64 {
+        let timelines = self.timelines.lock().unwrap();
+        timelines
+            .values()
+            .map(|t| t.metrics.visible_physical_size_gauge.get())
+            .sum()
+    }
+
     /// Builds a new tenant manifest, and uploads it if it differs from the last-known tenant
     /// manifest in `Self::remote_tenant_manifest`.
     ///
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index 0f7995f87b..973852defc 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -225,7 +225,7 @@ impl fmt::Display for ImageLayerName {
 /// storage and object names in remote storage consist of the LayerName plus some extra qualifiers
 /// that uniquely identify the physical incarnation of a layer (see [crate::tenant::remote_timeline_client::remote_layer_path])
 /// and [`crate::tenant::storage_layer::layer::local_layer_path`])
-#[derive(Debug, PartialEq, Eq, Hash, Clone)]
+#[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)]
 pub enum LayerName {
     Image(ImageLayerName),
     Delta(DeltaLayerName),
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 9baa80f73a..1f98651e71 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -59,6 +59,15 @@ pub static FLUSH_WAL_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     .expect("Failed to register safekeeper_flush_wal_seconds histogram")
 });
 /* BEGIN_HADRON */
+// Counter of all ProposerAcceptorMessage requests received
+pub static PROPOSER_ACCEPTOR_MESSAGES_TOTAL: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "safekeeper_proposer_acceptor_messages_total",
+        "Total number of ProposerAcceptorMessage requests received by the Safekeeper.",
+        &["outcome"]
+    )
+    .expect("Failed to register safekeeper_proposer_acceptor_messages_total counter")
+});
 pub static WAL_DISK_IO_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     register_int_counter!(
         "safekeeper_wal_disk_io_errors",
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 4d15fc9de3..09ca041e22 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -24,7 +24,7 @@ use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use crate::metrics::MISC_OPERATION_SECONDS;
+use crate::metrics::{MISC_OPERATION_SECONDS, PROPOSER_ACCEPTOR_MESSAGES_TOTAL};
 use crate::state::TimelineState;
 use crate::{control_file, wal_storage};
 
@@ -938,7 +938,7 @@ where
         &mut self,
         msg: &ProposerAcceptorMessage,
     ) -> Result<Option<AcceptorProposerMessage>> {
-        match msg {
+        let res = match msg {
             ProposerAcceptorMessage::Greeting(msg) => self.handle_greeting(msg).await,
             ProposerAcceptorMessage::VoteRequest(msg) => self.handle_vote_request(msg).await,
             ProposerAcceptorMessage::Elected(msg) => self.handle_elected(msg).await,
@@ -949,7 +949,20 @@ where
                 self.handle_append_request(msg, false).await
             }
             ProposerAcceptorMessage::FlushWAL => self.handle_flush().await,
-        }
+        };
+
+        // BEGIN HADRON
+        match &res {
+            Ok(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["success"])
+                .inc(),
+            Err(_) => PROPOSER_ACCEPTOR_MESSAGES_TOTAL
+                .with_label_values(&["error"])
+                .inc(),
+        };
+
+        res
+        // END HADRON
     }
 
     /// Handle initial message from proposer: check its sanity and send my
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 8e7d957b22..23b9d1c8c9 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -333,6 +333,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(f"http://localhost:{self.port}/v1/reload_auth_validation_keys")
         self.verbose_error(res)
 
+    def list_tenant_visible_size(self) -> dict[TenantShardId, int]:
+        res = self.get(f"http://localhost:{self.port}/v1/list_tenant_visible_size")
+        self.verbose_error(res)
+        res_json = res.json()
+        assert isinstance(res_json, dict)
+        return res_json
+
     def tenant_list(self) -> list[dict[Any, Any]]:
         res = self.get(f"http://localhost:{self.port}/v1/tenant")
         self.verbose_error(res)
diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py
index 7f9207047e..92889e5de3 100644
--- a/test_runner/regress/test_pageserver_api.py
+++ b/test_runner/regress/test_pageserver_api.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING
 
 from fixtures.common_types import Lsn, TenantId, TimelineId
+from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     DEFAULT_BRANCH_NAME,
     NeonEnv,
@@ -164,3 +165,15 @@ def test_pageserver_http_index_part_force_patch(neon_env_builder: NeonEnvBuilder
             {"rel_size_migration": "legacy"},
         )
         assert client.timeline_detail(tenant_id, timeline_id)["rel_size_migration"] == "legacy"
+
+
+def test_pageserver_get_tenant_visible_size(neon_env_builder: NeonEnvBuilder):
+    neon_env_builder.num_pageservers = 1
+    env = neon_env_builder.init_start()
+    env.create_tenant(shard_count=4)
+    env.create_tenant(shard_count=2)
+
+    json = env.pageserver.http_client().list_tenant_visible_size()
+    log.info(f"{json}")
+    # initial tennat + 2 newly created tenants
+    assert len(json) == 7

From 2fc77c836b3c0e88946254fb9235ded1db60dd75 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 10 Jul 2025 17:46:39 +0200
Subject: [PATCH 072/163] pageserver/client_grpc: add shard map updates
 (#12480)

## Problem

The communicator gRPC client must support changing the shard map on
splits.

Touches #11735.
Requires #12476.

## Summary of changes

* Wrap the shard set in a `ArcSwap` to allow swapping it out.
* Add a new `ShardSpec` parameter struct to pass validated shard info to
the client.
* Add `update_shards()` to change the shard set. In-flight requests are
allowed to complete using the old shards.
* Restructure `get_page` to use a stable view of the shard map, and
retry errors at the top (pre-split) level to pick up shard map changes.
* Also marks `tonic::Status::Internal` as non-retryable, so that we can
use it for client-side invariant checks without continually retrying
these.
---
 Cargo.lock                           |   1 +
 pageserver/client_grpc/Cargo.toml    |   1 +
 pageserver/client_grpc/src/client.rs | 257 ++++++++++++++++++---------
 pageserver/client_grpc/src/lib.rs    |   2 +-
 pageserver/client_grpc/src/retry.rs  |   5 +-
 pageserver/client_grpc/src/split.rs  |  31 ++--
 6 files changed, 199 insertions(+), 98 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 85080f8473..1d68b8f862 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4500,6 +4500,7 @@ name = "pageserver_client_grpc"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "arc-swap",
  "bytes",
  "compute_api",
  "futures",
diff --git a/pageserver/client_grpc/Cargo.toml b/pageserver/client_grpc/Cargo.toml
index ca224900ac..e2741ad839 100644
--- a/pageserver/client_grpc/Cargo.toml
+++ b/pageserver/client_grpc/Cargo.toml
@@ -9,6 +9,7 @@ testing = ["pageserver_api/testing"]
 
 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 bytes.workspace = true
 compute_api.workspace = true
 futures.workspace = true
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 63852868c3..ee09c1f13c 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -3,6 +3,7 @@ use std::num::NonZero;
 use std::sync::Arc;
 
 use anyhow::anyhow;
+use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
 use tracing::instrument;
@@ -55,28 +56,74 @@ const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
 /// TODO: this client does not support base backups or LSN leases, as these are only used by
 /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
 pub struct PageserverClient {
-    // TODO: support swapping out the shard map, e.g. via an ArcSwap.
-    shards: Shards,
+    /// The tenant ID.
+    tenant_id: TenantId,
+    /// The timeline ID.
+    timeline_id: TimelineId,
+    /// The JWT auth token for this tenant, if any.
+    auth_token: Option<String>,
+    /// The shards for this tenant.
+    shards: ArcSwap<Shards>,
+    /// The retry configuration.
     retry: Retry,
 }
 
 impl PageserverClient {
     /// Creates a new Pageserver client for a given tenant and timeline. Uses the Pageservers given
-    /// in the shard map, which must be complete and must use gRPC URLs.
+    /// in the shard spec, which must be complete and must use gRPC URLs.
     pub fn new(
         tenant_id: TenantId,
         timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
+        shard_spec: ShardSpec,
         auth_token: Option<String>,
     ) -> anyhow::Result<Self> {
-        let shards = Shards::new(tenant_id, timeline_id, shard_map, stripe_size, auth_token)?;
+        let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?;
         Ok(Self {
-            shards,
+            tenant_id,
+            timeline_id,
+            auth_token,
+            shards: ArcSwap::new(Arc::new(shards)),
             retry: Retry,
         })
     }
 
+    /// Updates the shards from the given shard spec. In-flight requests will complete using the
+    /// existing shards, but may retry with the new shards if they fail.
+    ///
+    /// TODO: verify that in-flight requests are allowed to complete, and that the old pools are
+    /// properly spun down and dropped afterwards.
+    pub fn update_shards(&self, shard_spec: ShardSpec) -> anyhow::Result<()> {
+        // Validate the shard spec. We should really use `ArcSwap::rcu` for this, to avoid races
+        // with concurrent updates, but that involves creating a new `Shards` on every attempt,
+        // which spins up a bunch of Tokio tasks and such. These should already be checked elsewhere
+        // in the stack, and if they're violated then we already have problems elsewhere, so a
+        // best-effort but possibly-racy check is okay here.
+        let old = self.shards.load_full();
+        if shard_spec.count < old.count {
+            return Err(anyhow!(
+                "can't reduce shard count from {} to {}",
+                old.count,
+                shard_spec.count
+            ));
+        }
+        if !old.count.is_unsharded() && shard_spec.stripe_size != old.stripe_size {
+            return Err(anyhow!(
+                "can't change stripe size from {} to {}",
+                old.stripe_size,
+                shard_spec.stripe_size
+            ));
+        }
+
+        let shards = Shards::new(
+            self.tenant_id,
+            self.timeline_id,
+            shard_spec,
+            self.auth_token.clone(),
+        )?;
+        self.shards.store(Arc::new(shards));
+        Ok(())
+    }
+
     /// Returns whether a relation exists.
     #[instrument(skip_all, fields(rel=%req.rel, lsn=%req.read_lsn))]
     pub async fn check_rel_exists(
@@ -86,7 +133,7 @@ impl PageserverClient {
         self.retry
             .with(async || {
                 // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                 client.check_rel_exists(req).await
             })
             .await
@@ -101,7 +148,7 @@ impl PageserverClient {
         self.retry
             .with(async || {
                 // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                 client.get_db_size(req).await
             })
             .await
@@ -129,28 +176,42 @@ impl PageserverClient {
             return Err(tonic::Status::invalid_argument("no block number"));
         }
 
+        // The shards may change while we're fetching pages. We execute the request using a stable
+        // view of the shards (especially important for requests that span shards), but retry the
+        // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
+        // retries and re-splits in some cases where requests span shards, but these are expected to
+        // be rare.
+        //
+        // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
+        // once we figure out how to handle these.
+        self.retry
+            .with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await)
+            .await
+    }
+
+    /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
+    /// concurrent shard updates. Does not retry internally, but is retried by `get_page()`.
+    async fn get_page_with_shards(
+        req: page_api::GetPageRequest,
+        shards: &Shards,
+    ) -> tonic::Result<page_api::GetPageResponse> {
         // Fast path: request is for a single shard.
         if let Some(shard_id) =
-            GetPageSplitter::is_single_shard(&req, self.shards.count, self.shards.stripe_size)
+            GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size)
         {
-            return self.get_page_for_shard(shard_id, req).await;
+            return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
         }
 
         // Request spans multiple shards. Split it, dispatch concurrent per-shard requests, and
         // reassemble the responses.
-        //
-        // TODO: when we support shard map updates, we need to detect when it changes and re-split
-        // the request on errors.
-        let mut splitter = GetPageSplitter::split(req, self.shards.count, self.shards.stripe_size);
+        let mut splitter = GetPageSplitter::split(req, shards.count, shards.stripe_size);
 
-        let mut shard_requests: FuturesUnordered<_> = splitter
-            .drain_requests()
-            .map(|(shard_id, shard_req)| {
-                // NB: each request will retry internally.
-                self.get_page_for_shard(shard_id, shard_req)
-                    .map(move |result| result.map(|resp| (shard_id, resp)))
-            })
-            .collect();
+        let mut shard_requests = FuturesUnordered::new();
+        for (shard_id, shard_req) in splitter.drain_requests() {
+            let future = Self::get_page_with_shard(shard_req, shards.get(shard_id)?)
+                .map(move |result| result.map(|resp| (shard_id, resp)));
+            shard_requests.push(future);
+        }
 
         while let Some((shard_id, shard_response)) = shard_requests.next().await.transpose()? {
             splitter.add_response(shard_id, shard_response)?;
@@ -159,41 +220,28 @@ impl PageserverClient {
         splitter.assemble_response()
     }
 
-    /// Fetches pages that belong to the given shard.
-    #[instrument(skip_all, fields(shard = %shard_id))]
-    async fn get_page_for_shard(
-        &self,
-        shard_id: ShardIndex,
+    /// Fetches pages on the given shard. Does not retry internally.
+    async fn get_page_with_shard(
         req: page_api::GetPageRequest,
+        shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let resp = self
-            .retry
-            .with(async || {
-                let stream = self
-                    .shards
-                    .get(shard_id)?
-                    .stream(req.request_class.is_bulk())
-                    .await;
-                let resp = stream.send(req.clone()).await?;
+        let expected = req.block_numbers.len();
+        let stream = shard.stream(req.request_class.is_bulk()).await;
+        let resp = stream.send(req).await?;
 
-                // Convert per-request errors into a tonic::Status.
-                if resp.status_code != page_api::GetPageStatusCode::Ok {
-                    return Err(tonic::Status::new(
-                        resp.status_code.into(),
-                        resp.reason.unwrap_or_else(|| String::from("unknown error")),
-                    ));
-                }
+        // Convert per-request errors into a tonic::Status.
+        if resp.status_code != page_api::GetPageStatusCode::Ok {
+            return Err(tonic::Status::new(
+                resp.status_code.into(),
+                resp.reason.unwrap_or_else(|| String::from("unknown error")),
+            ));
+        }
 
-                Ok(resp)
-            })
-            .await?;
-
-        // Make sure we got the right number of pages.
-        // NB: check outside of the retry loop, since we don't want to retry this.
-        let (expected, actual) = (req.block_numbers.len(), resp.page_images.len());
+        // Check that we received the expected number of pages.
+        let actual = resp.page_images.len();
         if expected != actual {
             return Err(tonic::Status::internal(format!(
-                "expected {expected} pages for shard {shard_id}, got {actual}",
+                "expected {expected} pages, got {actual}",
             )));
         }
 
@@ -209,7 +257,7 @@ impl PageserverClient {
         self.retry
             .with(async || {
                 // Relation metadata is only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                 client.get_rel_size(req).await
             })
             .await
@@ -224,48 +272,51 @@ impl PageserverClient {
         self.retry
             .with(async || {
                 // SLRU segments are only available on shard 0.
-                let mut client = self.shards.get_zero().client().await?;
+                let mut client = self.shards.load_full().get_zero().client().await?;
                 client.get_slru_segment(req).await
             })
             .await
     }
 }
 
-/// Tracks the tenant's shards.
-struct Shards {
+/// Shard specification for a PageserverClient.
+pub struct ShardSpec {
+    /// Maps shard indices to gRPC URLs.
+    ///
+    /// INVARIANT: every shard 0..count is present, and shard 0 is always present.
+    /// INVARIANT: every URL is valid and uses grpc:// scheme.
+    urls: HashMap<ShardIndex, String>,
     /// The shard count.
     ///
     /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
     count: ShardCount,
-    /// The stripe size. Only used for sharded tenants.
+    /// The stripe size for these shards.
     stripe_size: ShardStripeSize,
-    /// Shards by shard index.
-    ///
-    /// NB: unsharded tenants use count 0, like `ShardIndex::unsharded()`.
-    ///
-    /// INVARIANT: every shard 0..count is present.
-    /// INVARIANT: shard 0 is always present.
-    map: HashMap<ShardIndex, Shard>,
 }
 
-impl Shards {
-    /// Creates a new set of shards based on a shard map.
-    fn new(
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        shard_map: HashMap<ShardIndex, String>,
-        stripe_size: ShardStripeSize,
-        auth_token: Option<String>,
+impl ShardSpec {
+    /// Creates a new shard spec with the given URLs and stripe size. All shards must be given.
+    /// The stripe size may be omitted for unsharded tenants.
+    pub fn new(
+        urls: HashMap<ShardIndex, String>,
+        stripe_size: Option<ShardStripeSize>,
     ) -> anyhow::Result<Self> {
-        let count = match shard_map.len() {
+        // Compute the shard count.
+        let count = match urls.len() {
             0 => return Err(anyhow!("no shards provided")),
             1 => ShardCount::new(0), // NB: unsharded tenants use 0, like `ShardIndex::unsharded()`
             n if n > u8::MAX as usize => return Err(anyhow!("too many shards: {n}")),
             n => ShardCount::new(n as u8),
         };
 
-        let mut map = HashMap::new();
-        for (shard_id, url) in shard_map {
+        // Determine the stripe size. It doesn't matter for unsharded tenants.
+        if stripe_size.is_none() && !count.is_unsharded() {
+            return Err(anyhow!("stripe size must be given for sharded tenants"));
+        }
+        let stripe_size = stripe_size.unwrap_or_default();
+
+        // Validate the shard spec.
+        for (shard_id, url) in &urls {
             // The shard index must match the computed shard count, even for unsharded tenants.
             if shard_id.shard_count != count {
                 return Err(anyhow!("invalid shard index {shard_id}, expected {count}"));
@@ -276,21 +327,64 @@ impl Shards {
             }
             // The above conditions guarantee that we have all shards 0..count: len() matches count,
             // shard number < count, and numbers are unique (via hashmap).
-            let shard = Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?;
-            map.insert(shard_id, shard);
+
+            // Validate the URL.
+            if PageserverProtocol::from_connstring(url)? != PageserverProtocol::Grpc {
+                return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
+            }
         }
 
         Ok(Self {
+            urls,
             count,
             stripe_size,
-            map,
+        })
+    }
+}
+
+/// Tracks the tenant's shards.
+struct Shards {
+    /// Shards by shard index.
+    ///
+    /// INVARIANT: every shard 0..count is present.
+    /// INVARIANT: shard 0 is always present.
+    by_index: HashMap<ShardIndex, Shard>,
+    /// The shard count.
+    ///
+    /// NB: this is 0 for unsharded tenants, following `ShardIndex::unsharded()` convention.
+    count: ShardCount,
+    /// The stripe size. Only used for sharded tenants.
+    stripe_size: ShardStripeSize,
+}
+
+impl Shards {
+    /// Creates a new set of shards based on a shard spec.
+    fn new(
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        shard_spec: ShardSpec,
+        auth_token: Option<String>,
+    ) -> anyhow::Result<Self> {
+        // NB: the shard spec has already been validated when constructed.
+        let mut shards = HashMap::with_capacity(shard_spec.urls.len());
+        for (shard_id, url) in shard_spec.urls {
+            shards.insert(
+                shard_id,
+                Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?,
+            );
+        }
+
+        Ok(Self {
+            by_index: shards,
+            count: shard_spec.count,
+            stripe_size: shard_spec.stripe_size,
         })
     }
 
     /// Looks up the given shard.
     #[allow(clippy::result_large_err)] // TODO: check perf impact
     fn get(&self, shard_id: ShardIndex) -> tonic::Result<&Shard> {
-        self.map
+        self.by_index
             .get(&shard_id)
             .ok_or_else(|| tonic::Status::not_found(format!("unknown shard {shard_id}")))
     }
@@ -329,11 +423,6 @@ impl Shard {
         shard_id: ShardIndex,
         auth_token: Option<String>,
     ) -> anyhow::Result<Self> {
-        // Sanity-check that the URL uses gRPC.
-        if PageserverProtocol::from_connstring(&url)? != PageserverProtocol::Grpc {
-            return Err(anyhow!("invalid shard URL {url}: must use gRPC"));
-        }
-
         // Common channel pool for unary and stream requests. Bounded by client/stream pools.
         let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
 
diff --git a/pageserver/client_grpc/src/lib.rs b/pageserver/client_grpc/src/lib.rs
index 3fc7178be2..14fb3fbd5a 100644
--- a/pageserver/client_grpc/src/lib.rs
+++ b/pageserver/client_grpc/src/lib.rs
@@ -3,4 +3,4 @@ mod pool;
 mod retry;
 mod split;
 
-pub use client::PageserverClient;
+pub use client::{PageserverClient, ShardSpec};
diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs
index b0473204d7..a4d4b19870 100644
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -131,7 +131,6 @@ impl Retry {
             tonic::Code::Aborted => true,
             tonic::Code::Cancelled => true,
             tonic::Code::DeadlineExceeded => true, // maybe transient slowness
-            tonic::Code::Internal => true,         // maybe transient failure?
             tonic::Code::ResourceExhausted => true,
             tonic::Code::Unavailable => true,
 
@@ -139,6 +138,10 @@ impl Retry {
             tonic::Code::AlreadyExists => false,
             tonic::Code::DataLoss => false,
             tonic::Code::FailedPrecondition => false,
+            // NB: don't retry Internal. It is intended for serious errors such as invariant
+            // violations, and is also used for client-side invariant checks that would otherwise
+            // result in retry loops.
+            tonic::Code::Internal => false,
             tonic::Code::InvalidArgument => false,
             tonic::Code::NotFound => false,
             tonic::Code::OutOfRange => false,
diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs
index 5bbcaab393..57c9299b96 100644
--- a/pageserver/client_grpc/src/split.rs
+++ b/pageserver/client_grpc/src/split.rs
@@ -97,7 +97,8 @@ impl GetPageSplitter {
         self.requests.drain()
     }
 
-    /// Adds a response from the given shard.
+    /// Adds a response from the given shard. The response must match the request ID and have an OK
+    /// status code. A response must not already exist for the given shard ID.
     #[allow(clippy::result_large_err)]
     pub fn add_response(
         &mut self,
@@ -105,24 +106,30 @@ impl GetPageSplitter {
         response: page_api::GetPageResponse,
     ) -> tonic::Result<()> {
         // The caller should already have converted status codes into tonic::Status.
-        assert_eq!(response.status_code, page_api::GetPageStatusCode::Ok);
+        if response.status_code != page_api::GetPageStatusCode::Ok {
+            return Err(tonic::Status::internal(format!(
+                "unexpected non-OK response for shard {shard_id}: {:?}",
+                response.status_code
+            )));
+        }
 
-        // Make sure the response matches the request ID.
+        // The stream pool ensures the response matches the request ID.
         if response.request_id != self.request_id {
             return Err(tonic::Status::internal(format!(
-                "response ID {} does not match request ID {}",
-                response.request_id, self.request_id
+                "response ID mismatch for shard {shard_id}: expected {}, got {}",
+                self.request_id, response.request_id
+            )));
+        }
+
+        // We only dispatch one request per shard.
+        if self.responses.contains_key(&shard_id) {
+            return Err(tonic::Status::internal(format!(
+                "duplicate response for shard {shard_id}"
             )));
         }
 
         // Add the response data to the map.
-        let old = self.responses.insert(shard_id, response.page_images);
-
-        if old.is_some() {
-            return Err(tonic::Status::internal(format!(
-                "duplicate response for shard {shard_id}",
-            )));
-        }
+        self.responses.insert(shard_id, response.page_images);
 
         Ok(())
     }

From 8630d37f5e52a851a48b5936acd409cac5044bb0 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 10 Jul 2025 16:53:38 +0100
Subject: [PATCH 073/163] test_runner: manually reuse ports in PortDistributor
 (#12423)

## Problem

Sometimes we run out of free ports in `PortDistributor`. This affects
particularly failed tests that we rerun automatically up to 3 times
(which makes it use up to 3x more ports)

## Summary of changes
- Cycle over the range of ports to reuse freed ports from previous tests

Ref: LKB-62
---
 test_runner/fixtures/port_distributor.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py
index 6a829a9399..e51d08e16e 100644
--- a/test_runner/fixtures/port_distributor.py
+++ b/test_runner/fixtures/port_distributor.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import re
 import socket
 from contextlib import closing
+from itertools import cycle
 
 from fixtures.log_helper import log
 
@@ -34,15 +35,23 @@ def can_bind(host: str, port: int) -> bool:
 
 class PortDistributor:
     def __init__(self, base_port: int, port_number: int):
-        self.iterator = iter(range(base_port, base_port + port_number))
+        self.base_port = base_port
+        self.port_number = port_number
+        self.cycle = cycle(range(base_port, base_port + port_number))
         self.port_map: dict[int, int] = {}
 
     def get_port(self) -> int:
-        for port in self.iterator:
+        checked = 0
+        for port in self.cycle:
             if can_bind("localhost", port):
                 return port
+            elif checked < self.port_number:
+                checked += 1
+            else:
+                break
+
         raise RuntimeError(
-            "port range configured for test is exhausted, consider enlarging the range"
+            f"port range ({self.base_port}..{self.base_port + self.port_number}) configured for test is exhausted, consider enlarging the range"
         )
 
     def replace_with_new_port(self, value: int | str) -> int | str:

From dcdfe80bf015e93b991c0aa86ffbbffbcd18c198 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 10 Jul 2025 19:30:09 +0200
Subject: [PATCH 074/163] pagebench: add support for rich gRPC client (#12477)

## Problem

We need to benchmark the rich gRPC client
`client_grpc::PageserverClient` against the basic, no-frills
`page_api::Client` to determine how much overhead it adds.

Touches #11735.
Requires #12476.

## Summary of changes

Add a `pagebench --rich-client` parameter to use
`client_grpc::PageserverClient`. Also adds a compression parameter to
the client.
---
 Cargo.lock                                    |  1 +
 Cargo.toml                                    |  1 +
 pageserver/client_grpc/src/client.rs          | 28 ++++++-
 pageserver/client_grpc/src/pool.rs            |  7 +-
 pageserver/pagebench/Cargo.toml               |  3 +-
 .../pagebench/src/cmd/getpage_latest_lsn.rs   | 84 +++++++++++++++++++
 6 files changed, 120 insertions(+), 4 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 1d68b8f862..c528354053 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4294,6 +4294,7 @@ dependencies = [
  "humantime-serde",
  "pageserver_api",
  "pageserver_client",
+ "pageserver_client_grpc",
  "pageserver_page_api",
  "rand 0.8.5",
  "reqwest",
diff --git a/Cargo.toml b/Cargo.toml
index 14f2cfcb56..0d521ee4d9 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -262,6 +262,7 @@ neon-shmem = { version = "0.1", path = "./libs/neon-shmem/" }
 pageserver = { path = "./pageserver" }
 pageserver_api = { version = "0.1", path = "./libs/pageserver_api/" }
 pageserver_client = { path = "./pageserver/client" }
+pageserver_client_grpc = { path = "./pageserver/client_grpc" }
 pageserver_compaction = { version = "0.1", path = "./pageserver/compaction/" }
 pageserver_page_api = { path = "./pageserver/page_api" }
 postgres_backend = { version = "0.1", path = "./libs/postgres_backend/" }
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index ee09c1f13c..e790f4018e 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -6,6 +6,7 @@ use anyhow::anyhow;
 use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
+use tonic::codec::CompressionEncoding;
 use tracing::instrument;
 
 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
@@ -62,6 +63,8 @@ pub struct PageserverClient {
     timeline_id: TimelineId,
     /// The JWT auth token for this tenant, if any.
     auth_token: Option<String>,
+    /// The compression to use, if any.
+    compression: Option<CompressionEncoding>,
     /// The shards for this tenant.
     shards: ArcSwap<Shards>,
     /// The retry configuration.
@@ -76,12 +79,20 @@ impl PageserverClient {
         timeline_id: TimelineId,
         shard_spec: ShardSpec,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
-        let shards = Shards::new(tenant_id, timeline_id, shard_spec, auth_token.clone())?;
+        let shards = Shards::new(
+            tenant_id,
+            timeline_id,
+            shard_spec,
+            auth_token.clone(),
+            compression,
+        )?;
         Ok(Self {
             tenant_id,
             timeline_id,
             auth_token,
+            compression,
             shards: ArcSwap::new(Arc::new(shards)),
             retry: Retry,
         })
@@ -119,6 +130,7 @@ impl PageserverClient {
             self.timeline_id,
             shard_spec,
             self.auth_token.clone(),
+            self.compression,
         )?;
         self.shards.store(Arc::new(shards));
         Ok(())
@@ -364,13 +376,21 @@ impl Shards {
         timeline_id: TimelineId,
         shard_spec: ShardSpec,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
         // NB: the shard spec has already been validated when constructed.
         let mut shards = HashMap::with_capacity(shard_spec.urls.len());
         for (shard_id, url) in shard_spec.urls {
             shards.insert(
                 shard_id,
-                Shard::new(url, tenant_id, timeline_id, shard_id, auth_token.clone())?,
+                Shard::new(
+                    url,
+                    tenant_id,
+                    timeline_id,
+                    shard_id,
+                    auth_token.clone(),
+                    compression,
+                )?,
             );
         }
 
@@ -422,6 +442,7 @@ impl Shard {
         timeline_id: TimelineId,
         shard_id: ShardIndex,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
         // Common channel pool for unary and stream requests. Bounded by client/stream pools.
         let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
@@ -433,6 +454,7 @@ impl Shard {
             timeline_id,
             shard_id,
             auth_token.clone(),
+            compression,
             Some(MAX_UNARY_CLIENTS),
         );
 
@@ -445,6 +467,7 @@ impl Shard {
                 timeline_id,
                 shard_id,
                 auth_token.clone(),
+                compression,
                 None, // unbounded, limited by stream pool
             ),
             Some(MAX_STREAMS),
@@ -460,6 +483,7 @@ impl Shard {
                 timeline_id,
                 shard_id,
                 auth_token,
+                compression,
                 None, // unbounded, limited by stream pool
             ),
             Some(MAX_BULK_STREAMS),
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 89b3bd646f..2dde40b5b4 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -40,6 +40,7 @@ use futures::StreamExt as _;
 use tokio::sync::mpsc::{Receiver, Sender};
 use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
 use tokio_util::sync::CancellationToken;
+use tonic::codec::CompressionEncoding;
 use tonic::transport::{Channel, Endpoint};
 use tracing::{error, warn};
 
@@ -242,6 +243,8 @@ pub struct ClientPool {
     shard_id: ShardIndex,
     /// Authentication token, if any.
     auth_token: Option<String>,
+    /// Compression to use.
+    compression: Option<CompressionEncoding>,
     /// Channel pool to acquire channels from.
     channel_pool: Arc<ChannelPool>,
     /// Limits the max number of concurrent clients for this pool. None if the pool is unbounded.
@@ -281,6 +284,7 @@ impl ClientPool {
         timeline_id: TimelineId,
         shard_id: ShardIndex,
         auth_token: Option<String>,
+        compression: Option<CompressionEncoding>,
         max_clients: Option<NonZero<usize>>,
     ) -> Arc<Self> {
         let pool = Arc::new(Self {
@@ -288,6 +292,7 @@ impl ClientPool {
             timeline_id,
             shard_id,
             auth_token,
+            compression,
             channel_pool,
             idle: Mutex::default(),
             idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
@@ -331,7 +336,7 @@ impl ClientPool {
             self.timeline_id,
             self.shard_id,
             self.auth_token.clone(),
-            None,
+            self.compression,
         )?;
 
         Ok(ClientGuard {
diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml
index f5dfc0db25..4086213830 100644
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -27,8 +27,9 @@ tokio-util.workspace = true
 tonic.workspace = true
 url.workspace = true
 
-pageserver_client.workspace = true
 pageserver_api.workspace = true
+pageserver_client.workspace = true
+pageserver_client_grpc.workspace = true
 pageserver_page_api.workspace = true
 utils = { path = "../../libs/utils/" }
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index f14caf548c..42c7e40489 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -10,12 +10,14 @@ use anyhow::Context;
 use async_trait::async_trait;
 use bytes::Bytes;
 use camino::Utf8PathBuf;
+use futures::stream::FuturesUnordered;
 use futures::{Stream, StreamExt as _};
 use pageserver_api::key::Key;
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::pagestream_api::{PagestreamGetPageRequest, PagestreamRequest};
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::TenantShardId;
+use pageserver_client_grpc::{self as client_grpc, ShardSpec};
 use pageserver_page_api as page_api;
 use rand::prelude::*;
 use tokio::task::JoinSet;
@@ -37,6 +39,10 @@ pub(crate) struct Args {
     /// Pageserver connection string. Supports postgresql:// and grpc:// protocols.
     #[clap(long, default_value = "postgres://postgres@localhost:64000")]
     page_service_connstring: String,
+    /// Use the rich gRPC Pageserver client `client_grpc::PageserverClient`, rather than the basic
+    /// no-frills `page_api::Client`. Only valid with grpc:// connstrings.
+    #[clap(long)]
+    rich_client: bool,
     #[clap(long)]
     pageserver_jwt: Option<String>,
     #[clap(long, default_value = "1")]
@@ -332,6 +338,7 @@ async fn main_impl(
             let client: Box<dyn Client> = match scheme.as_str() {
                 "postgresql" | "postgres" => {
                     assert!(!args.compression, "libpq does not support compression");
+                    assert!(!args.rich_client, "rich client requires grpc://");
                     Box::new(
                         LibpqClient::new(&args.page_service_connstring, worker_id.timeline)
                             .await
@@ -339,6 +346,16 @@ async fn main_impl(
                     )
                 }
 
+                "grpc" if args.rich_client => Box::new(
+                    RichGrpcClient::new(
+                        &args.page_service_connstring,
+                        worker_id.timeline,
+                        args.compression,
+                    )
+                    .await
+                    .unwrap(),
+                ),
+
                 "grpc" => Box::new(
                     GrpcClient::new(
                         &args.page_service_connstring,
@@ -680,3 +697,70 @@ impl Client for GrpcClient {
         Ok((resp.request_id, resp.page_images))
     }
 }
+
+/// A rich gRPC Pageserver client.
+struct RichGrpcClient {
+    inner: Arc<client_grpc::PageserverClient>,
+    requests: FuturesUnordered<
+        Pin<Box<dyn Future<Output = anyhow::Result<page_api::GetPageResponse>> + Send>>,
+    >,
+}
+
+impl RichGrpcClient {
+    async fn new(
+        connstring: &str,
+        ttid: TenantTimelineId,
+        compression: bool,
+    ) -> anyhow::Result<Self> {
+        let inner = Arc::new(client_grpc::PageserverClient::new(
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardSpec::new(
+                [(ShardIndex::unsharded(), connstring.to_string())].into(),
+                None,
+            )?,
+            None,
+            compression.then_some(tonic::codec::CompressionEncoding::Zstd),
+        )?);
+        Ok(Self {
+            inner,
+            requests: FuturesUnordered::new(),
+        })
+    }
+}
+
+#[async_trait]
+impl Client for RichGrpcClient {
+    async fn send_get_page(
+        &mut self,
+        req_id: u64,
+        req_lsn: Lsn,
+        mod_lsn: Lsn,
+        rel: RelTag,
+        blks: Vec<u32>,
+    ) -> anyhow::Result<()> {
+        let req = page_api::GetPageRequest {
+            request_id: req_id,
+            request_class: page_api::GetPageClass::Normal,
+            read_lsn: page_api::ReadLsn {
+                request_lsn: req_lsn,
+                not_modified_since_lsn: Some(mod_lsn),
+            },
+            rel,
+            block_numbers: blks,
+        };
+        let inner = self.inner.clone();
+        self.requests.push(Box::pin(async move {
+            inner
+                .get_page(req)
+                .await
+                .map_err(|err| anyhow::anyhow!("{err}"))
+        }));
+        Ok(())
+    }
+
+    async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
+        let resp = self.requests.next().await.unwrap()?;
+        Ok((resp.request_id, resp.page_images))
+    }
+}

From 13b5e7b26fe009c711b3e57436433cf8e4d140d6 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 10 Jul 2025 14:02:54 -0400
Subject: [PATCH 075/163] fix(compute_ctl): reload config before applying spec
 (#12551)

## Problem

If we have catalog update AND a pageserver migration batched in a single
spec, we will not be able to apply the spec (running the SQL) because
the compute is not attached to the right pageserver and we are not able
to read anything if we don't pick up the latest pageserver connstring.
This is not a case for now because cplane always schedules shard split /
pageserver migrations with `skip_pg_catalog_updates` (I suppose).

Context:
https://databricks.slack.com/archives/C09254R641L/p1752163559259399?thread_ts=1752160163.141149&cid=C09254R641L

With this fix, backpressure will likely not be able to affect
reconfigurations.

## Summary of changes

Do `pg_reload_conf` before we apply specs in SQL.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 compute_tools/src/compute.rs | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 0496d38e67..4a29c232ac 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1805,6 +1805,8 @@ impl ComputeNode {
             tls_config,
         )?;
 
+        self.pg_reload_conf()?;
+
         if !spec.skip_pg_catalog_updates {
             let max_concurrent_connections = spec.reconfigure_concurrency;
             // Temporarily reset max_cluster_size in config
@@ -1824,10 +1826,9 @@ impl ComputeNode {
 
                 Ok(())
             })?;
+            self.pg_reload_conf()?;
         }
 
-        self.pg_reload_conf()?;
-
         let unknown_op = "unknown".to_string();
         let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
         info!(

From c5aaf1ae21df31233a4bc81eef88d56e95b2a33e Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Thu, 10 Jul 2025 19:37:54 +0100
Subject: [PATCH 076/163] Qualify call to neon extension in compute_ctl's
 prewarming (#12554)

https://github.com/neondatabase/cloud/issues/19011
Calls without `neon.` failed on staging.
Also fix local tests to work with qualified calls
---
 compute_tools/src/compute_prewarm.rs         |  6 +++---
 test_runner/regress/test_lfc_prewarm.py      | 14 ++++++++------
 test_runner/regress/test_replica_promotes.py |  4 ++--
 3 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs
index 3f6f9a7ecc..d014a5bb72 100644
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -70,7 +70,7 @@ impl ComputeNode {
             }
         };
         let row = match client
-            .query_one("select * from get_prewarm_info()", &[])
+            .query_one("select * from neon.get_prewarm_info()", &[])
             .await
         {
             Ok(row) => row,
@@ -146,7 +146,7 @@ impl ComputeNode {
         ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
             .await
             .context("connecting to postgres")?
-            .query_one("select prewarm_local_cache($1)", &[&uncompressed])
+            .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
             .await
             .context("loading LFC state into postgres")
             .map(|_| ())
@@ -196,7 +196,7 @@ impl ComputeNode {
         ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
             .await
             .context("connecting to postgres")?
-            .query_one("select get_local_cache_state()", &[])
+            .query_one("select neon.get_local_cache_state()", &[])
             .await
             .context("querying LFC state")?
             .try_get::<usize, &[u8]>(0)
diff --git a/test_runner/regress/test_lfc_prewarm.py b/test_runner/regress/test_lfc_prewarm.py
index 22e5bf576f..0f0cf4cc6d 100644
--- a/test_runner/regress/test_lfc_prewarm.py
+++ b/test_runner/regress/test_lfc_prewarm.py
@@ -40,7 +40,7 @@ def prom_parse(client: EndpointHttpClient) -> dict[str, float]:
 
 def offload_lfc(method: PrewarmMethod, client: EndpointHttpClient, cur: Cursor) -> Any:
     if method == PrewarmMethod.POSTGRES:
-        cur.execute("select get_local_cache_state()")
+        cur.execute("select neon.get_local_cache_state()")
         return cur.fetchall()[0][0]
 
     if method == PrewarmMethod.AUTOPREWARM:
@@ -72,7 +72,7 @@ def prewarm_endpoint(
     elif method == PrewarmMethod.COMPUTE_CTL:
         client.prewarm_lfc()
     elif method == PrewarmMethod.POSTGRES:
-        cur.execute("select prewarm_local_cache(%s)", (lfc_state,))
+        cur.execute("select neon.prewarm_local_cache(%s)", (lfc_state,))
 
 
 def check_prewarmed(
@@ -116,7 +116,7 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
 
     pg_conn = endpoint.connect()
     pg_cur = pg_conn.cursor()
-    pg_cur.execute("create extension neon")
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
     pg_cur.execute("create database lfc")
 
     lfc_conn = endpoint.connect(dbname="lfc")
@@ -142,10 +142,12 @@ def test_lfc_prewarm(neon_simple_env: NeonEnv, method: PrewarmMethod):
     lfc_cur = lfc_conn.cursor()
     prewarm_endpoint(method, client, pg_cur, lfc_state)
 
-    pg_cur.execute("select lfc_value from neon_lfc_stats where lfc_key='file_cache_used_pages'")
+    pg_cur.execute(
+        "select lfc_value from neon.neon_lfc_stats where lfc_key='file_cache_used_pages'"
+    )
     lfc_used_pages = pg_cur.fetchall()[0][0]
     log.info(f"Used LFC size: {lfc_used_pages}")
-    pg_cur.execute("select * from get_prewarm_info()")
+    pg_cur.execute("select * from neon.get_prewarm_info()")
     total, prewarmed, skipped, _ = pg_cur.fetchall()[0]
     log.info(f"Prewarm info: {total=} {prewarmed=} {skipped=}")
     progress = (prewarmed + skipped) * 100 // total
@@ -186,7 +188,7 @@ def test_lfc_prewarm_under_workload(neon_simple_env: NeonEnv, method: PrewarmMet
 
     pg_conn = endpoint.connect()
     pg_cur = pg_conn.cursor()
-    pg_cur.execute("create extension neon")
+    pg_cur.execute("create schema neon; create extension neon with schema neon")
     pg_cur.execute("CREATE DATABASE lfc")
 
     lfc_conn = endpoint.connect(dbname="lfc")
diff --git a/test_runner/regress/test_replica_promotes.py b/test_runner/regress/test_replica_promotes.py
index 1f26269f40..8d39ac123a 100644
--- a/test_runner/regress/test_replica_promotes.py
+++ b/test_runner/regress/test_replica_promotes.py
@@ -60,7 +60,7 @@ def test_replica_promote(neon_simple_env: NeonEnv, method: PromoteMethod):
 
     with primary.connect() as primary_conn:
         primary_cur = primary_conn.cursor()
-        primary_cur.execute("create extension neon")
+        primary_cur.execute("create schema neon;create extension neon with schema neon")
         primary_cur.execute(
             "create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)"
         )
@@ -172,7 +172,7 @@ def test_replica_promote_handler_disconnects(neon_simple_env: NeonEnv):
     secondary: Endpoint = env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary")
 
     with primary.connect() as conn, conn.cursor() as cur:
-        cur.execute("create extension neon")
+        cur.execute("create schema neon;create extension neon with schema neon")
         cur.execute("create table t(pk bigint GENERATED ALWAYS AS IDENTITY, payload integer)")
         cur.execute("INSERT INTO t(payload) SELECT generate_series(1, 100)")
         cur.execute("show neon.safekeepers")

From 3593fe195a55441b76874e64bb168acf71f6b4c4 Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Thu, 10 Jul 2025 20:28:10 +0100
Subject: [PATCH 077/163] split TerminationPending into two values, keeping
 ComputeStatus stateless (#12506)

After https://github.com/neondatabase/neon/pull/12240 we observed
issues in our go code as `ComputeStatus` is not stateless, thus doesn't
deserialize as string.

```
could not check compute activity: json: cannot unmarshal object into Go struct field
ComputeState.status of type computeclient.ComputeStatus
```

- Fix this by splitting this status into two.
- Update compute OpenApi spec to reflect changes to `/terminate` in
previous PR
---
 compute_tools/README.md                    |  9 +++--
 compute_tools/src/compute.rs               | 17 +++++++---
 compute_tools/src/http/openapi_spec.yaml   | 39 ++++++++++++++++++++--
 compute_tools/src/http/routes/terminate.rs | 14 ++++----
 compute_tools/src/monitor.rs               |  3 +-
 control_plane/src/endpoint.rs              |  3 +-
 libs/compute_api/src/responses.rs          | 18 ++++++++--
 7 files changed, 81 insertions(+), 22 deletions(-)

diff --git a/compute_tools/README.md b/compute_tools/README.md
index 8d84031efc..49f1368f0e 100644
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -46,11 +46,14 @@ stateDiagram-v2
   Configuration --> Failed : Failed to configure the compute
   Configuration --> Running : Compute has been configured
   Empty --> Init : Compute spec is immediately available
-  Empty --> TerminationPending : Requested termination
+  Empty --> TerminationPendingFast : Requested termination
+  Empty --> TerminationPendingImmediate : Requested termination
   Init --> Failed : Failed to start Postgres
   Init --> Running : Started Postgres
-  Running --> TerminationPending : Requested termination
-  TerminationPending --> Terminated : Terminated compute
+  Running --> TerminationPendingFast : Requested termination
+  Running --> TerminationPendingImmediate : Requested termination
+  TerminationPendingFast --> Terminated compute with 30s delay for cplane to inspect status
+  TerminationPendingImmediate --> Terminated : Terminated compute immediately
   Failed --> [*] : Compute exited
   Terminated --> [*] : Compute exited
 ```
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 4a29c232ac..c05cc229a2 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -956,14 +956,20 @@ impl ComputeNode {
             None
         };
 
-        let mut delay_exit = false;
         let mut state = self.state.lock().unwrap();
         state.terminate_flush_lsn = lsn;
-        if let ComputeStatus::TerminationPending { mode } = state.status {
+
+        let delay_exit = state.status == ComputeStatus::TerminationPendingFast;
+        if state.status == ComputeStatus::TerminationPendingFast
+            || state.status == ComputeStatus::TerminationPendingImmediate
+        {
+            info!(
+                "Changing compute status from {} to {}",
+                state.status,
+                ComputeStatus::Terminated
+            );
             state.status = ComputeStatus::Terminated;
             self.state_changed.notify_all();
-            // we were asked to terminate gracefully, don't exit to avoid restart
-            delay_exit = mode == compute_api::responses::TerminateMode::Fast
         }
         drop(state);
 
@@ -1901,7 +1907,8 @@ impl ComputeNode {
 
                             // exit loop
                             ComputeStatus::Failed
-                            | ComputeStatus::TerminationPending { .. }
+                            | ComputeStatus::TerminationPendingFast
+                            | ComputeStatus::TerminationPendingImmediate
                             | ComputeStatus::Terminated => break 'cert_update,
 
                             // wait
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 3c58b284b3..93a357e160 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -371,9 +371,28 @@ paths:
       summary: Terminate Postgres and wait for it to exit
       description: ""
       operationId: terminate
+      parameters:
+        - name: mode
+          in: query
+          description: "Terminate mode: fast (wait 30s before returning) and immediate"
+          required: false
+          schema:
+            type: string
+            enum: ["fast", "immediate"]
+            default: fast
       responses:
         200:
           description: Result
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
+        201:
+          description: Result if compute is already terminated
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/TerminateResponse"
         412:
           description: "wrong state"
           content:
@@ -530,11 +549,14 @@ components:
       type: string
       enum:
         - empty
-        - init
-        - failed
-        - running
         - configuration_pending
+        - init
+        - running
         - configuration
+        - failed
+        - termination_pending_fast
+        - termination_pending_immediate
+        - terminated
       example: running
 
     ExtensionInstallRequest:
@@ -660,6 +682,17 @@ components:
           description: Role name.
           example: "neon"
 
+    TerminateResponse:
+      type: object
+      required:
+        - lsn
+      properties:
+        lsn:
+          type: string
+          nullable: true
+          description: "last WAL flush LSN"
+          example: "0/028F10D8"
+
     SetRoleGrantsResponse:
       type: object
       required:
diff --git a/compute_tools/src/http/routes/terminate.rs b/compute_tools/src/http/routes/terminate.rs
index 32d90a5990..5b30b020c8 100644
--- a/compute_tools/src/http/routes/terminate.rs
+++ b/compute_tools/src/http/routes/terminate.rs
@@ -3,7 +3,7 @@ use crate::http::JsonResponse;
 use axum::extract::State;
 use axum::response::Response;
 use axum_extra::extract::OptionalQuery;
-use compute_api::responses::{ComputeStatus, TerminateResponse};
+use compute_api::responses::{ComputeStatus, TerminateMode, TerminateResponse};
 use http::StatusCode;
 use serde::Deserialize;
 use std::sync::Arc;
@@ -12,7 +12,7 @@ use tracing::info;
 
 #[derive(Deserialize, Default)]
 pub struct TerminateQuery {
-    mode: compute_api::responses::TerminateMode,
+    mode: TerminateMode,
 }
 
 /// Terminate the compute.
@@ -24,16 +24,16 @@ pub(in crate::http) async fn terminate(
     {
         let mut state = compute.state.lock().unwrap();
         if state.status == ComputeStatus::Terminated {
-            return JsonResponse::success(StatusCode::CREATED, state.terminate_flush_lsn);
+            let response = TerminateResponse {
+                lsn: state.terminate_flush_lsn,
+            };
+            return JsonResponse::success(StatusCode::CREATED, response);
         }
 
         if !matches!(state.status, ComputeStatus::Empty | ComputeStatus::Running) {
             return JsonResponse::invalid_status(state.status);
         }
-        state.set_status(
-            ComputeStatus::TerminationPending { mode },
-            &compute.state_changed,
-        );
+        state.set_status(mode.into(), &compute.state_changed);
     }
 
     forward_termination_signal(false);
diff --git a/compute_tools/src/monitor.rs b/compute_tools/src/monitor.rs
index 8a2f6addad..fa01545856 100644
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -84,7 +84,8 @@ impl ComputeMonitor {
         if matches!(
             compute_status,
             ComputeStatus::Terminated
-                | ComputeStatus::TerminationPending { .. }
+                | ComputeStatus::TerminationPendingFast
+                | ComputeStatus::TerminationPendingImmediate
                 | ComputeStatus::Failed
         ) {
             info!(
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 74ab15dc97..ad2067e0f2 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -922,7 +922,8 @@ impl Endpoint {
                         ComputeStatus::Empty
                         | ComputeStatus::ConfigurationPending
                         | ComputeStatus::Configuration
-                        | ComputeStatus::TerminationPending { .. }
+                        | ComputeStatus::TerminationPendingFast
+                        | ComputeStatus::TerminationPendingImmediate
                         | ComputeStatus::Terminated => {
                             bail!("unexpected compute status: {:?}", state.status)
                         }
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index e10c381fb4..2fe233214a 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -121,6 +121,15 @@ pub enum TerminateMode {
     Immediate,
 }
 
+impl From<TerminateMode> for ComputeStatus {
+    fn from(mode: TerminateMode) -> Self {
+        match mode {
+            TerminateMode::Fast => ComputeStatus::TerminationPendingFast,
+            TerminateMode::Immediate => ComputeStatus::TerminationPendingImmediate,
+        }
+    }
+}
+
 #[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
 #[serde(rename_all = "snake_case")]
 pub enum ComputeStatus {
@@ -141,7 +150,9 @@ pub enum ComputeStatus {
     // control-plane to terminate it.
     Failed,
     // Termination requested
-    TerminationPending { mode: TerminateMode },
+    TerminationPendingFast,
+    // Termination requested, without waiting 30s before returning from /terminate
+    TerminationPendingImmediate,
     // Terminated Postgres
     Terminated,
 }
@@ -160,7 +171,10 @@ impl Display for ComputeStatus {
             ComputeStatus::Running => f.write_str("running"),
             ComputeStatus::Configuration => f.write_str("configuration"),
             ComputeStatus::Failed => f.write_str("failed"),
-            ComputeStatus::TerminationPending { .. } => f.write_str("termination-pending"),
+            ComputeStatus::TerminationPendingFast => f.write_str("termination-pending-fast"),
+            ComputeStatus::TerminationPendingImmediate => {
+                f.write_str("termination-pending-immediate")
+            }
             ComputeStatus::Terminated => f.write_str("terminated"),
         }
     }

From 1b7339b53e2483ab9d8af553007b1af038440c6e Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 10 Jul 2025 15:34:11 -0500
Subject: [PATCH 078/163] PG: add max_wal_rate (#12470)

## Problem
One PG tenant may write too fast and overwhelm the PS. The other tenants
sharing the same PSs will get very little bandwidth.

We had one experiment that two tenants sharing the same PSs. One tenant
runs a large ingestion that delivers hundreds of MB/s while the other
only get < 10 MB/s.

## Summary of changes
Rate limit how fast PG can generate WALs. The default is -1. We may
scale the default value with the CPU count. Need to run some experiments
to verify.

## How is this tested?
CI.

PGBench. No limit first. Then set to 1 MB/s and you can see the tps
drop. Then reverted the change and tps increased again.

pgbench -i -s 10 -p 55432 -h 127.0.0.1 -U cloud_admin -d postgres
pgbench postgres -c 10 -j 10 -T 6000000 -P 1 -b tpcb-like -h 127.0.0.1
-U cloud_admin -p 55432
progress: 33.0 s, 986.0 tps, lat 10.142 ms stddev 3.856 progress: 34.0
s, 973.0 tps, lat 10.299 ms stddev 3.857 progress: 35.0 s, 1004.0 tps,
lat 9.939 ms stddev 3.604 progress: 36.0 s, 984.0 tps, lat 10.183 ms
stddev 3.713 progress: 37.0 s, 998.0 tps, lat 10.004 ms stddev 3.668
progress: 38.0 s, 648.9 tps, lat 12.947 ms stddev 24.970 progress: 39.0
s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 40.0 s, 0.0 tps, lat
0.000 ms stddev 0.000 progress: 41.0 s, 0.0 tps, lat 0.000 ms stddev
0.000 progress: 42.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress:
43.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 44.0 s, 0.0 tps,
lat 0.000 ms stddev 0.000 progress: 45.0 s, 0.0 tps, lat 0.000 ms stddev
0.000 progress: 46.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress:
47.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 48.0 s, 0.0 tps,
lat 0.000 ms stddev 0.000 progress: 49.0 s, 347.3 tps, lat 321.560 ms
stddev 1805.633 progress: 50.0 s, 346.8 tps, lat 9.898 ms stddev 3.809
progress: 51.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 52.0 s,
0.0 tps, lat 0.000 ms stddev 0.000 progress: 53.0 s, 0.0 tps, lat 0.000
ms stddev 0.000 progress: 54.0 s, 0.0 tps, lat 0.000 ms stddev 0.000
progress: 55.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 56.0 s,
0.0 tps, lat 0.000 ms stddev 0.000 progress: 57.0 s, 0.0 tps, lat 0.000
ms stddev 0.000 progress: 58.0 s, 0.0 tps, lat 0.000 ms stddev 0.000
progress: 59.0 s, 0.0 tps, lat 0.000 ms stddev 0.000 progress: 60.0 s,
0.0 tps, lat 0.000 ms stddev 0.000 progress: 61.0 s, 0.0 tps, lat 0.000
ms stddev 0.000 progress: 62.0 s, 0.0 tps, lat 0.000 ms stddev 0.000
progress: 63.0 s, 494.5 tps, lat 276.504 ms stddev 1853.689 progress:
64.0 s, 488.0 tps, lat 20.530 ms stddev 71.981 progress: 65.0 s, 407.8
tps, lat 9.502 ms stddev 3.329 progress: 66.0 s, 0.0 tps, lat 0.000 ms
stddev 0.000 progress: 67.0 s, 0.0 tps, lat 0.000 ms stddev 0.000
progress: 68.0 s, 504.5 tps, lat 71.627 ms stddev 397.733 progress: 69.0
s, 371.0 tps, lat 24.898 ms stddev 29.007 progress: 70.0 s, 541.0 tps,
lat 19.684 ms stddev 24.094 progress: 71.0 s, 342.0 tps, lat 29.542 ms
stddev 54.935

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
---
 libs/walproposer/src/api_bindings.rs |  7 ++++
 pgxn/neon/walproposer.h              | 17 ++++++++
 pgxn/neon/walproposer_pg.c           | 62 +++++++++++++++++++++++++++-
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 7c6abf252e..5f856a44d4 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -428,6 +428,12 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
         shard_number: 0,
     };
 
+    let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
+        should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
+        sent_bytes: 0,
+        last_recorded_time_us: 0,
+    };
+
     crate::bindings::WalproposerShmemState {
         propEpochStartLsn: crate::bindings::pg_atomic_uint64 { value: 0 },
         donor_name: [0; 64],
@@ -441,6 +447,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
         num_shards: 0,
         replica_promote: false,
         min_ps_feedback: empty_feedback,
+        wal_rate_limiter: empty_wal_rate_limiter,
     }
 }
 
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index 4b223b6b18..e3a4022664 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -376,6 +376,18 @@ typedef struct PageserverFeedback
 	uint32		shard_number;
 } PageserverFeedback;
 
+/* BEGIN_HADRON */
+typedef struct WalRateLimiter
+{
+	/* If the value is 1, PG backends will hit backpressure. */
+	pg_atomic_uint32 should_limit;
+	/* The number of bytes sent in the current second. */
+	uint64		sent_bytes;
+	/* The last recorded time in microsecond. */
+	TimestampTz last_recorded_time_us;
+} WalRateLimiter;
+/* END_HADRON */
+
 typedef struct WalproposerShmemState
 {
 	pg_atomic_uint64 propEpochStartLsn;
@@ -395,6 +407,11 @@ typedef struct WalproposerShmemState
 
 	/* aggregated feedback with min LSNs across shards */
 	PageserverFeedback min_ps_feedback;
+
+	/* BEGIN_HADRON */
+	/* The WAL rate limiter */
+	WalRateLimiter wal_rate_limiter;
+	/* END_HADRON */
 } WalproposerShmemState;
 
 /*
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 185fc83ace..aaf8f43eeb 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -66,6 +66,9 @@ int			wal_acceptor_reconnect_timeout = 1000;
 int			wal_acceptor_connection_timeout = 10000;
 int			safekeeper_proto_version = 3;
 char	   *safekeeper_conninfo_options = "";
+/* BEGIN_HADRON */
+int         databricks_max_wal_mb_per_second = -1;
+/* END_HADRON */
 
 /* Set to true in the walproposer bgw. */
 static bool am_walproposer;
@@ -252,6 +255,18 @@ nwp_register_gucs(void)
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
+
+    /* BEGIN_HADRON */
+    DefineCustomIntVariable(
+                            "databricks.max_wal_mb_per_second",
+                            "The maximum WAL MB per second allowed. If breached, sending WAL hit the backpressure. Setting to -1 disables the limit.",
+                            NULL,
+                            &databricks_max_wal_mb_per_second,
+                            -1, -1, INT_MAX,
+                            PGC_SUSET,
+                            GUC_UNIT_MB,
+                            NULL, NULL, NULL);
+    /* END_HADRON */
 }
 
 
@@ -393,6 +408,7 @@ assign_neon_safekeepers(const char *newval, void *extra)
 static uint64
 backpressure_lag_impl(void)
 {
+	struct WalproposerShmemState* state = NULL;
 	if (max_replication_apply_lag > 0 || max_replication_flush_lag > 0 || max_replication_write_lag > 0)
 	{
 		XLogRecPtr	writePtr;
@@ -426,6 +442,18 @@ backpressure_lag_impl(void)
 			return (myFlushLsn - applyPtr - max_replication_apply_lag * MB);
 		}
 	}
+
+	/* BEGIN_HADRON */
+	if (databricks_max_wal_mb_per_second == -1) {
+		return 0;
+	}
+
+	state = GetWalpropShmemState();
+	if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1)
+	{
+		return 1;
+	}
+	/* END_HADRON */
 	return 0;
 }
 
@@ -472,6 +500,9 @@ WalproposerShmemInit(void)
 		pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
 		pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
 		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
+		/* BEGIN_HADRON */
+		pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+		/* END_HADRON */
 	}
 	LWLockRelease(AddinShmemInitLock);
 
@@ -487,6 +518,9 @@ WalproposerShmemInit_SyncSafekeeper(void)
 	pg_atomic_init_u64(&walprop_shared->propEpochStartLsn, 0);
 	pg_atomic_init_u64(&walprop_shared->mineLastElectedTerm, 0);
 	pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
+	/* BEGIN_HADRON */
+	pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+	/* END_HADRON */
 }
 
 #define BACK_PRESSURE_DELAY 10000L // 0.01 sec
@@ -521,7 +555,6 @@ backpressure_throttling_impl(void)
 	if (lag == 0)
 		return retry;
 
-
 	old_status = get_ps_display(&len);
 	new_status = (char *) palloc(len + 64 + 1);
 	memcpy(new_status, old_status, len);
@@ -1458,6 +1491,8 @@ XLogBroadcastWalProposer(WalProposer *wp)
 {
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
+	struct WalproposerShmemState *state = NULL;
+	TimestampTz now = 0;
 
 	/* Start from the last sent position */
 	startptr = sentPtr;
@@ -1502,13 +1537,36 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
+	now = GetCurrentTimestamp();
+	LagTrackerWrite(endptr, now);
 
 	/* Do we have any work to do? */
 	Assert(startptr <= endptr);
 	if (endptr <= startptr)
 		return;
 
+	/* BEGIN_HADRON */
+	state = GetWalpropShmemState();
+	if (databricks_max_wal_mb_per_second != -1 && state != NULL)
+	{
+		uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024;
+		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
+
+		if (now - limiter->last_recorded_time_us > USECS_PER_SEC)
+		{
+			/* Reset the rate limiter */
+			limiter->last_recorded_time_us = now;
+			limiter->sent_bytes = 0;
+			pg_atomic_exchange_u32(&limiter->should_limit, 0);
+		}
+		limiter->sent_bytes += (endptr - startptr);
+		if (limiter->sent_bytes > max_wal_bytes)
+		{
+			pg_atomic_exchange_u32(&limiter->should_limit, 1);
+		}
+	}
+	/* END_HADRON */
+
 	WalProposerBroadcast(wp, startptr, endptr);
 	sentPtr = endptr;
 

From 44ea17b7b24240937fc214a1bc5453da0d840ece Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 10 Jul 2025 22:39:42 +0200
Subject: [PATCH 079/163] pageserver/page_api: add attempt to GetPage request
 ID (#12536)

## Problem

`GetPageRequest::request_id` is supposed to be a unique ID for a
request. It's not, because we may retry the request using the same ID.
This causes assertion failures and confusion.

Touches #11735.
Requires #12480.

## Summary of changes

Extend the request ID with a retry attempt, and handle it in the gRPC
client and server.
---
 pageserver/client_grpc/src/client.rs          | 23 +++++---
 pageserver/client_grpc/src/pool.rs            | 13 +++++
 pageserver/client_grpc/src/retry.rs           |  8 +--
 pageserver/page_api/proto/page_service.proto  | 12 +++-
 pageserver/page_api/src/model.rs              | 58 +++++++++++++++++--
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |  8 +--
 pageserver/src/page_service.rs                | 19 +++---
 7 files changed, 110 insertions(+), 31 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index e790f4018e..393f89819a 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -143,7 +143,7 @@ impl PageserverClient {
         req: page_api::CheckRelExistsRequest,
     ) -> tonic::Result<page_api::CheckRelExistsResponse> {
         self.retry
-            .with(async || {
+            .with(async |_| {
                 // Relation metadata is only available on shard 0.
                 let mut client = self.shards.load_full().get_zero().client().await?;
                 client.check_rel_exists(req).await
@@ -158,7 +158,7 @@ impl PageserverClient {
         req: page_api::GetDbSizeRequest,
     ) -> tonic::Result<page_api::GetDbSizeResponse> {
         self.retry
-            .with(async || {
+            .with(async |_| {
                 // Relation metadata is only available on shard 0.
                 let mut client = self.shards.load_full().get_zero().client().await?;
                 client.get_db_size(req).await
@@ -166,8 +166,9 @@ impl PageserverClient {
             .await
     }
 
-    /// Fetches pages. The `request_id` must be unique across all in-flight requests. Automatically
-    /// splits requests that straddle shard boundaries, and assembles the responses.
+    /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
+    /// `attempt` must be 0 (incremented on retry). Automatically splits requests that straddle
+    /// shard boundaries, and assembles the responses.
     ///
     /// Unlike `page_api::Client`, this automatically converts `status_code` into `tonic::Status`
     /// errors. All responses will have `GetPageStatusCode::Ok`.
@@ -187,6 +188,10 @@ impl PageserverClient {
         if req.block_numbers.is_empty() {
             return Err(tonic::Status::invalid_argument("no block number"));
         }
+        // The request attempt must be 0. The client will increment it internally.
+        if req.request_id.attempt != 0 {
+            return Err(tonic::Status::invalid_argument("request attempt must be 0"));
+        }
 
         // The shards may change while we're fetching pages. We execute the request using a stable
         // view of the shards (especially important for requests that span shards), but retry the
@@ -197,7 +202,11 @@ impl PageserverClient {
         // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
         // once we figure out how to handle these.
         self.retry
-            .with(async || Self::get_page_with_shards(req.clone(), &self.shards.load_full()).await)
+            .with(async |attempt| {
+                let mut req = req.clone();
+                req.request_id.attempt = attempt as u32;
+                Self::get_page_with_shards(req, &self.shards.load_full()).await
+            })
             .await
     }
 
@@ -267,7 +276,7 @@ impl PageserverClient {
         req: page_api::GetRelSizeRequest,
     ) -> tonic::Result<page_api::GetRelSizeResponse> {
         self.retry
-            .with(async || {
+            .with(async |_| {
                 // Relation metadata is only available on shard 0.
                 let mut client = self.shards.load_full().get_zero().client().await?;
                 client.get_rel_size(req).await
@@ -282,7 +291,7 @@ impl PageserverClient {
         req: page_api::GetSlruSegmentRequest,
     ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
         self.retry
-            .with(async || {
+            .with(async |_| {
                 // SLRU segments are only available on shard 0.
                 let mut client = self.shards.load_full().get_zero().client().await?;
                 client.get_slru_segment(req).await
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 2dde40b5b4..906872e091 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -591,6 +591,10 @@ impl StreamPool {
 
         // Track caller response channels by request ID. If the task returns early, these response
         // channels will be dropped and the waiting callers will receive an error.
+        //
+        // NB: this will leak entries if the server doesn't respond to a request (by request ID).
+        // It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
+        // block further use. But we could consider reaping closed channels after some time.
         let mut callers = HashMap::new();
 
         // Process requests and responses.
@@ -695,6 +699,15 @@ impl Drop for StreamGuard {
 
         // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
         // before the response is received, but that's okay.
+        //
+        // TODO: actually, it's probably not okay. Queue depth release should be moved into the
+        // stream task, such that it continues to account for the queue depth slot until the server
+        // responds. Otherwise, if a slow request times out and keeps blocking the stream, the
+        // server will keep waiting on it and we can pile on subsequent requests (including the
+        // timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
+        // requests on e.g. LSN waits and layer downloads, instead returning early to free up the
+        // stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
+        // blocking. TBD.
         let mut streams = pool.streams.lock().unwrap();
         let entry = streams.get_mut(&self.id).expect("unknown stream");
         assert!(entry.idle_since.is_none(), "active stream marked idle");
diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs
index a4d4b19870..a1e0b8636f 100644
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -23,14 +23,14 @@ impl Retry {
     /// If true, log successful requests. For debugging.
     const LOG_SUCCESS: bool = false;
 
-    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
-    /// using the current tracing span for context.
+    /// Runs the given async closure with timeouts and retries (exponential backoff), passing the
+    /// attempt number starting at 0. Logs errors, using the current tracing span for context.
     ///
     /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
     /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
     pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
     where
-        F: FnMut() -> O,
+        F: FnMut(usize) -> O, // takes attempt number, starting at 0
         O: Future<Output = tonic::Result<T>>,
     {
         let started = Instant::now();
@@ -47,7 +47,7 @@ impl Retry {
                 }
 
                 let request_started = Instant::now();
-                tokio::time::timeout(Self::REQUEST_TIMEOUT, f())
+                tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
                     .await
                     .map_err(|_| {
                         tonic::Status::deadline_exceeded(format!(
diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto
index 1d6c230916..b1f266d910 100644
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -153,7 +153,7 @@ message GetDbSizeResponse {
 message GetPageRequest {
   // A request ID. Will be included in the response. Should be unique for
   // in-flight requests on the stream.
-  uint64 request_id = 1;
+  RequestID request_id = 1;
   // The request class.
   GetPageClass request_class = 2;
   // The LSN to read at.
@@ -177,6 +177,14 @@ message GetPageRequest {
   repeated uint32 block_number = 5;
 }
 
+// A Request ID. Should be unique for in-flight requests on a stream. Included in the response.
+message RequestID {
+  // The base request ID.
+  uint64 id = 1;
+  // The request attempt. Starts at 0, incremented on each retry.
+  uint32 attempt = 2;
+}
+
 // A GetPageRequest class. Primarily intended for observability, but may also be
 // used for prioritization in the future.
 enum GetPageClass {
@@ -199,7 +207,7 @@ enum GetPageClass {
 // the entire batch is ready, so no one can make use of the individual pages.
 message GetPageResponse {
   // The original request's ID.
-  uint64 request_id = 1;
+  RequestID request_id = 1;
   // The response status code.
   GetPageStatusCode status_code = 2;
   // A string describing the status, if any.
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index d0d3517d41..4db8237ad8 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -356,7 +356,10 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
             return Err(ProtocolError::Missing("block_number"));
         }
         Ok(Self {
-            request_id: pb.request_id,
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
             request_class: pb.request_class.into(),
             read_lsn: pb
                 .read_lsn
@@ -371,7 +374,7 @@ impl TryFrom<proto::GetPageRequest> for GetPageRequest {
 impl From<GetPageRequest> for proto::GetPageRequest {
     fn from(request: GetPageRequest) -> Self {
         Self {
-            request_id: request.request_id,
+            request_id: Some(request.request_id.into()),
             request_class: request.request_class.into(),
             read_lsn: Some(request.read_lsn.into()),
             rel: Some(request.rel.into()),
@@ -380,8 +383,51 @@ impl From<GetPageRequest> for proto::GetPageRequest {
     }
 }
 
-/// A GetPage request ID.
-pub type RequestID = u64;
+/// A GetPage request ID and retry attempt. Should be unique for in-flight requests on a stream.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq, Hash, PartialOrd, Ord)]
+pub struct RequestID {
+    /// The base request ID.
+    pub id: u64,
+    // The request attempt. Starts at 0, incremented on each retry.
+    pub attempt: u32,
+}
+
+impl RequestID {
+    /// Creates a new RequestID with the given ID and an initial attempt of 0.
+    pub fn new(id: u64) -> Self {
+        Self { id, attempt: 0 }
+    }
+}
+
+impl Display for RequestID {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}.{}", self.id, self.attempt)
+    }
+}
+
+impl From<proto::RequestId> for RequestID {
+    fn from(pb: proto::RequestId) -> Self {
+        Self {
+            id: pb.id,
+            attempt: pb.attempt,
+        }
+    }
+}
+
+impl From<u64> for RequestID {
+    fn from(id: u64) -> Self {
+        Self::new(id)
+    }
+}
+
+impl From<RequestID> for proto::RequestId {
+    fn from(request_id: RequestID) -> Self {
+        Self {
+            id: request_id.id,
+            attempt: request_id.attempt,
+        }
+    }
+}
 
 /// A GetPage request class.
 #[derive(Clone, Copy, Debug, strum_macros::Display)]
@@ -467,7 +513,7 @@ pub struct GetPageResponse {
 impl From<proto::GetPageResponse> for GetPageResponse {
     fn from(pb: proto::GetPageResponse) -> Self {
         Self {
-            request_id: pb.request_id,
+            request_id: pb.request_id.unwrap_or_default().into(),
             status_code: pb.status_code.into(),
             reason: Some(pb.reason).filter(|r| !r.is_empty()),
             page_images: pb.page_image,
@@ -478,7 +524,7 @@ impl From<proto::GetPageResponse> for GetPageResponse {
 impl From<GetPageResponse> for proto::GetPageResponse {
     fn from(response: GetPageResponse) -> Self {
         Self {
-            request_id: response.request_id,
+            request_id: Some(response.request_id.into()),
             status_code: response.status_code.into(),
             reason: response.reason.unwrap_or_default(),
             page_image: response.page_images,
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index 42c7e40489..b5c191e29a 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -674,7 +674,7 @@ impl Client for GrpcClient {
         blks: Vec<u32>,
     ) -> anyhow::Result<()> {
         let req = page_api::GetPageRequest {
-            request_id: req_id,
+            request_id: req_id.into(),
             request_class: page_api::GetPageClass::Normal,
             read_lsn: page_api::ReadLsn {
                 request_lsn: req_lsn,
@@ -694,7 +694,7 @@ impl Client for GrpcClient {
             "unexpected status code: {}",
             resp.status_code,
         );
-        Ok((resp.request_id, resp.page_images))
+        Ok((resp.request_id.id, resp.page_images))
     }
 }
 
@@ -740,7 +740,7 @@ impl Client for RichGrpcClient {
         blks: Vec<u32>,
     ) -> anyhow::Result<()> {
         let req = page_api::GetPageRequest {
-            request_id: req_id,
+            request_id: req_id.into(),
             request_class: page_api::GetPageClass::Normal,
             read_lsn: page_api::ReadLsn {
                 request_lsn: req_lsn,
@@ -761,6 +761,6 @@ impl Client for RichGrpcClient {
 
     async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
         let resp = self.requests.next().await.unwrap()?;
-        Ok((resp.request_id, resp.page_images))
+        Ok((resp.request_id.id, resp.page_images))
     }
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index ebb1addcdb..b2f6cd465d 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3338,9 +3338,12 @@ impl GrpcPageServiceHandler {
     }
 
     /// Generates a PagestreamRequest header from a ReadLsn and request ID.
-    fn make_hdr(read_lsn: page_api::ReadLsn, req_id: u64) -> PagestreamRequest {
+    fn make_hdr(
+        read_lsn: page_api::ReadLsn,
+        req_id: Option<page_api::RequestID>,
+    ) -> PagestreamRequest {
         PagestreamRequest {
-            reqid: req_id,
+            reqid: req_id.map(|r| r.id).unwrap_or_default(),
             request_lsn: read_lsn.request_lsn,
             not_modified_since: read_lsn
                 .not_modified_since_lsn
@@ -3450,7 +3453,7 @@ impl GrpcPageServiceHandler {
 
             batch.push(BatchedGetPageRequest {
                 req: PagestreamGetPageRequest {
-                    hdr: Self::make_hdr(req.read_lsn, req.request_id),
+                    hdr: Self::make_hdr(req.read_lsn, Some(req.request_id)),
                     rel: req.rel,
                     blkno,
                 },
@@ -3528,7 +3531,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(rel=%req.rel, lsn=%req.read_lsn);
 
         let req = PagestreamExistsRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             rel: req.rel,
         };
 
@@ -3678,7 +3681,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(db_oid=%req.db_oid, lsn=%req.read_lsn);
 
         let req = PagestreamDbSizeRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             dbnode: req.db_oid,
         };
 
@@ -3728,7 +3731,7 @@ impl proto::PageService for GrpcPageServiceHandler {
                 .await?
                 .downgrade();
             while let Some(req) = reqs.message().await? {
-                let req_id = req.request_id;
+                let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
                 let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                     .instrument(span.clone()) // propagate request span
                     .await;
@@ -3767,7 +3770,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(rel=%req.rel, lsn=%req.read_lsn);
 
         let req = PagestreamNblocksRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             rel: req.rel,
         };
 
@@ -3800,7 +3803,7 @@ impl proto::PageService for GrpcPageServiceHandler {
         span_record!(kind=%req.kind, segno=%req.segno, lsn=%req.read_lsn);
 
         let req = PagestreamGetSlruSegmentRequest {
-            hdr: Self::make_hdr(req.read_lsn, 0),
+            hdr: Self::make_hdr(req.read_lsn, None),
             kind: req.kind as u8,
             segno: req.segno,
         };

From b91f821e8bae4ed8635f2c9380f304fc575eed91 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 10 Jul 2025 17:49:52 -0400
Subject: [PATCH 080/163] fix(libpagestore): update the default stripe size
 (#12557)

## Problem

Part of LKB-379

The pageserver connstrings are updated in the postmaster and then
there's a hook to propagate it to the shared memory of all backends.
However, the shard stripe doesn't. This would cause problems during
shard splits:

* the compute has active reads/writes
* shard split happens and the cplane applies the new config (pageserver
connstring + stripe size)
* pageserver connstring will be updated immediately once the postmaster
receives the SIGHUP, and it will be copied over the the shared memory of
all other backends.
* stripe size is a normal GUC and we don't have special handling around
that, so if any active backend has ongoing txns the value won't be
applied.
* now it's possible for backends to issue requests based on the wrong
stripe size; what's worse, if a request gets cached in the prefetch
buffer, it will get stuck forever.

## Summary of changes

To make sure it aligns with the current default in storcon.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pgxn/neon/libpagestore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 3b6c4247c3..05ba6da663 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -1410,7 +1410,7 @@ pg_init_libpagestore(void)
 							"sharding stripe size",
 							NULL,
 							&stripe_size,
-							32768, 1, INT_MAX,
+							2048, 1, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_BLOCKS,
 							NULL, NULL, NULL);

From 8aa9540a05cfab2cf870b309665d78c837310acb Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 11 Jul 2025 00:35:14 +0200
Subject: [PATCH 081/163] pageserver/page_api: include block number and rel in
 gRPC `GetPageResponse` (#12542)

## Problem

With gRPC `GetPageRequest` batches, we'll have non-trivial
fragmentation/reassembly logic in several places of the stack
(concurrent reads, shard splits, LFC hits, etc). If we included the
block numbers with the pages in `GetPageResponse` we could have better
verification and observability that the final responses are correct.

Touches #11735.
Requires #12480.

## Summary of changes

Add a `Page` struct with`block_number` for `GetPageResponse`, along with
the `RelTag` for completeness, and verify them in the rich gRPC client.
---
 pageserver/client_grpc/src/client.rs          |  34 +++-
 pageserver/client_grpc/src/split.rs           | 160 +++++++++++-------
 pageserver/page_api/proto/page_service.proto  |  19 ++-
 pageserver/page_api/src/client.rs             |   3 +-
 pageserver/page_api/src/model.rs              |  59 +++++--
 .../pagebench/src/cmd/getpage_latest_lsn.rs   |  10 +-
 pageserver/src/page_service.rs                |   8 +-
 7 files changed, 201 insertions(+), 92 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 393f89819a..7049fbdb96 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -218,7 +218,7 @@ impl PageserverClient {
     ) -> tonic::Result<page_api::GetPageResponse> {
         // Fast path: request is for a single shard.
         if let Some(shard_id) =
-            GetPageSplitter::is_single_shard(&req, shards.count, shards.stripe_size)
+            GetPageSplitter::for_single_shard(&req, shards.count, shards.stripe_size)
         {
             return Self::get_page_with_shard(req, shards.get(shard_id)?).await;
         }
@@ -238,7 +238,7 @@ impl PageserverClient {
             splitter.add_response(shard_id, shard_response)?;
         }
 
-        splitter.assemble_response()
+        splitter.get_response()
     }
 
     /// Fetches pages on the given shard. Does not retry internally.
@@ -246,9 +246,8 @@ impl PageserverClient {
         req: page_api::GetPageRequest,
         shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let expected = req.block_numbers.len();
         let stream = shard.stream(req.request_class.is_bulk()).await;
-        let resp = stream.send(req).await?;
+        let resp = stream.send(req.clone()).await?;
 
         // Convert per-request errors into a tonic::Status.
         if resp.status_code != page_api::GetPageStatusCode::Ok {
@@ -258,11 +257,27 @@ impl PageserverClient {
             ));
         }
 
-        // Check that we received the expected number of pages.
-        let actual = resp.page_images.len();
-        if expected != actual {
+        // Check that we received the expected pages.
+        if req.rel != resp.rel {
             return Err(tonic::Status::internal(format!(
-                "expected {expected} pages, got {actual}",
+                "shard {} returned wrong relation, expected {} got {}",
+                shard.id, req.rel, resp.rel
+            )));
+        }
+        if !req
+            .block_numbers
+            .iter()
+            .copied()
+            .eq(resp.pages.iter().map(|p| p.block_number))
+        {
+            return Err(tonic::Status::internal(format!(
+                "shard {} returned wrong pages, expected {:?} got {:?}",
+                shard.id,
+                req.block_numbers,
+                resp.pages
+                    .iter()
+                    .map(|page| page.block_number)
+                    .collect::<Vec<_>>()
             )));
         }
 
@@ -435,6 +450,8 @@ impl Shards {
 ///   * Bulk client pool: unbounded.
 ///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
 struct Shard {
+    /// The shard ID.
+    id: ShardIndex,
     /// Unary gRPC client pool.
     client_pool: Arc<ClientPool>,
     /// GetPage stream pool.
@@ -500,6 +517,7 @@ impl Shard {
         );
 
         Ok(Self {
+            id: shard_id,
             client_pool,
             stream_pool,
             bulk_stream_pool,
diff --git a/pageserver/client_grpc/src/split.rs b/pageserver/client_grpc/src/split.rs
index 57c9299b96..b7539b900c 100644
--- a/pageserver/client_grpc/src/split.rs
+++ b/pageserver/client_grpc/src/split.rs
@@ -5,27 +5,24 @@ use bytes::Bytes;
 use pageserver_api::key::rel_block_to_key;
 use pageserver_api::shard::{ShardStripeSize, key_to_shard_number};
 use pageserver_page_api as page_api;
-use utils::shard::{ShardCount, ShardIndex};
+use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 
 /// Splits GetPageRequests that straddle shard boundaries and assembles the responses.
 /// TODO: add tests for this.
 pub struct GetPageSplitter {
-    /// The original request ID. Used for all shard requests.
-    request_id: page_api::RequestID,
     /// Split requests by shard index.
     requests: HashMap<ShardIndex, page_api::GetPageRequest>,
-    /// Maps the offset in `GetPageRequest::block_numbers` to the owning shard. Used to assemble
-    /// the response pages in the same order as the original request.
+    /// The response being assembled. Preallocated with empty pages, to be filled in.
+    response: page_api::GetPageResponse,
+    /// Maps the offset in `request.block_numbers` and `response.pages` to the owning shard. Used
+    /// to assemble the response pages in the same order as the original request.
     block_shards: Vec<ShardIndex>,
-    /// Page responses by shard index. Will be assembled into a single response.
-    responses: HashMap<ShardIndex, Vec<Bytes>>,
 }
 
 impl GetPageSplitter {
     /// Checks if the given request only touches a single shard, and returns the shard ID. This is
     /// the common case, so we check first in order to avoid unnecessary allocations and overhead.
-    /// The caller must ensure that the request has at least one block number, or this will panic.
-    pub fn is_single_shard(
+    pub fn for_single_shard(
         req: &page_api::GetPageRequest,
         count: ShardCount,
         stripe_size: ShardStripeSize,
@@ -35,8 +32,12 @@ impl GetPageSplitter {
             return Some(ShardIndex::unsharded());
         }
 
-        // Find the base shard index for the first page, and compare with the rest.
-        let key = rel_block_to_key(req.rel, *req.block_numbers.first().expect("no pages"));
+        // Find the first page's shard, for comparison. If there are no pages, just return the first
+        // shard (caller likely checked already, otherwise the server will reject it).
+        let Some(&first_page) = req.block_numbers.first() else {
+            return Some(ShardIndex::new(ShardNumber(0), count));
+        };
+        let key = rel_block_to_key(req.rel, first_page);
         let shard_number = key_to_shard_number(count, stripe_size, &key);
 
         req.block_numbers
@@ -57,19 +58,19 @@ impl GetPageSplitter {
     ) -> Self {
         // The caller should make sure we don't split requests unnecessarily.
         debug_assert!(
-            Self::is_single_shard(&req, count, stripe_size).is_none(),
+            Self::for_single_shard(&req, count, stripe_size).is_none(),
             "unnecessary request split"
         );
 
         // Split the requests by shard index.
         let mut requests = HashMap::with_capacity(2); // common case
         let mut block_shards = Vec::with_capacity(req.block_numbers.len());
-        for blkno in req.block_numbers {
+        for &blkno in &req.block_numbers {
             let key = rel_block_to_key(req.rel, blkno);
             let shard_number = key_to_shard_number(count, stripe_size, &key);
             let shard_id = ShardIndex::new(shard_number, count);
 
-            let shard_req = requests
+            requests
                 .entry(shard_id)
                 .or_insert_with(|| page_api::GetPageRequest {
                     request_id: req.request_id,
@@ -77,20 +78,39 @@ impl GetPageSplitter {
                     rel: req.rel,
                     read_lsn: req.read_lsn,
                     block_numbers: Vec::new(),
-                });
-            shard_req.block_numbers.push(blkno);
+                })
+                .block_numbers
+                .push(blkno);
             block_shards.push(shard_id);
         }
 
-        Self {
+        // Construct a response to be populated by shard responses. Preallocate empty page slots
+        // with the expected block numbers.
+        let response = page_api::GetPageResponse {
             request_id: req.request_id,
-            responses: HashMap::with_capacity(requests.len()),
+            status_code: page_api::GetPageStatusCode::Ok,
+            reason: None,
+            rel: req.rel,
+            pages: req
+                .block_numbers
+                .into_iter()
+                .map(|block_number| {
+                    page_api::Page {
+                        block_number,
+                        image: Bytes::new(), // empty page slot to be filled in
+                    }
+                })
+                .collect(),
+        };
+
+        Self {
             requests,
+            response,
             block_shards,
         }
     }
 
-    /// Drains the per-shard requests, moving them out of the hashmap to avoid extra allocations.
+    /// Drains the per-shard requests, moving them out of the splitter to avoid extra allocations.
     pub fn drain_requests(
         &mut self,
     ) -> impl Iterator<Item = (ShardIndex, page_api::GetPageRequest)> {
@@ -108,72 +128,82 @@ impl GetPageSplitter {
         // The caller should already have converted status codes into tonic::Status.
         if response.status_code != page_api::GetPageStatusCode::Ok {
             return Err(tonic::Status::internal(format!(
-                "unexpected non-OK response for shard {shard_id}: {:?}",
-                response.status_code
+                "unexpected non-OK response for shard {shard_id}: {} {}",
+                response.status_code,
+                response.reason.unwrap_or_default()
             )));
         }
 
-        // The stream pool ensures the response matches the request ID.
-        if response.request_id != self.request_id {
+        if response.request_id != self.response.request_id {
             return Err(tonic::Status::internal(format!(
                 "response ID mismatch for shard {shard_id}: expected {}, got {}",
-                self.request_id, response.request_id
+                self.response.request_id, response.request_id
             )));
         }
 
-        // We only dispatch one request per shard.
-        if self.responses.contains_key(&shard_id) {
+        // Place the shard response pages into the assembled response, in request order.
+        let mut pages = response.pages.into_iter();
+
+        for (i, &s) in self.block_shards.iter().enumerate() {
+            if shard_id != s {
+                continue;
+            }
+
+            let Some(slot) = self.response.pages.get_mut(i) else {
+                return Err(tonic::Status::internal(format!(
+                    "no block_shards slot {i} for shard {shard_id}"
+                )));
+            };
+            let Some(page) = pages.next() else {
+                return Err(tonic::Status::internal(format!(
+                    "missing page {} in shard {shard_id} response",
+                    slot.block_number
+                )));
+            };
+            if page.block_number != slot.block_number {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned wrong page at index {i}, expected {} got {}",
+                    slot.block_number, page.block_number
+                )));
+            }
+            if !slot.image.is_empty() {
+                return Err(tonic::Status::internal(format!(
+                    "shard {shard_id} returned duplicate page {} at index {i}",
+                    slot.block_number
+                )));
+            }
+
+            *slot = page;
+        }
+
+        // Make sure we've consumed all pages from the shard response.
+        if let Some(extra_page) = pages.next() {
             return Err(tonic::Status::internal(format!(
-                "duplicate response for shard {shard_id}"
+                "shard {shard_id} returned extra page: {}",
+                extra_page.block_number
             )));
         }
 
-        // Add the response data to the map.
-        self.responses.insert(shard_id, response.page_images);
-
         Ok(())
     }
 
-    /// Assembles the shard responses into a single response. Responses must be present for all
-    /// relevant shards, and the total number of pages must match the original request.
+    /// Fetches the final, assembled response.
     #[allow(clippy::result_large_err)]
-    pub fn assemble_response(self) -> tonic::Result<page_api::GetPageResponse> {
-        let mut response = page_api::GetPageResponse {
-            request_id: self.request_id,
-            status_code: page_api::GetPageStatusCode::Ok,
-            reason: None,
-            page_images: Vec::with_capacity(self.block_shards.len()),
-        };
-
-        // Set up per-shard page iterators we can pull from.
-        let mut shard_responses = HashMap::with_capacity(self.responses.len());
-        for (shard_id, responses) in self.responses {
-            shard_responses.insert(shard_id, responses.into_iter());
-        }
-
-        // Reassemble the responses in the same order as the original request.
-        for shard_id in &self.block_shards {
-            let page = shard_responses
-                .get_mut(shard_id)
-                .ok_or_else(|| {
-                    tonic::Status::internal(format!("missing response for shard {shard_id}"))
-                })?
-                .next()
-                .ok_or_else(|| {
-                    tonic::Status::internal(format!("missing page from shard {shard_id}"))
-                })?;
-            response.page_images.push(page);
-        }
-
-        // Make sure there are no additional pages.
-        for (shard_id, mut pages) in shard_responses {
-            if pages.next().is_some() {
+    pub fn get_response(self) -> tonic::Result<page_api::GetPageResponse> {
+        // Check that the response is complete.
+        for (i, page) in self.response.pages.iter().enumerate() {
+            if page.image.is_empty() {
                 return Err(tonic::Status::internal(format!(
-                    "extra pages returned from shard {shard_id}"
+                    "missing page {} for shard {}",
+                    page.block_number,
+                    self.block_shards
+                        .get(i)
+                        .map(|s| s.to_string())
+                        .unwrap_or_else(|| "?".to_string())
                 )));
             }
         }
 
-        Ok(response)
+        Ok(self.response)
     }
 }
diff --git a/pageserver/page_api/proto/page_service.proto b/pageserver/page_api/proto/page_service.proto
index b1f266d910..d113a04a42 100644
--- a/pageserver/page_api/proto/page_service.proto
+++ b/pageserver/page_api/proto/page_service.proto
@@ -208,12 +208,25 @@ enum GetPageClass {
 message GetPageResponse {
   // The original request's ID.
   RequestID request_id = 1;
-  // The response status code.
+  // The response status code. If not OK, the rel and page fields will be empty.
   GetPageStatusCode status_code = 2;
   // A string describing the status, if any.
   string reason = 3;
-  // The 8KB page images, in the same order as the request. Empty if status_code != OK.
-  repeated bytes page_image = 4;
+  // The relation that the pages belong to.
+  RelTag rel = 4;
+  // The page(s), in the same order as the request.
+  repeated Page page = 5;
+}
+
+// A page.
+//
+// TODO: it would be slightly more efficient (but less convenient) to have separate arrays of block
+// numbers and images, but given the 8KB page size it's probably negligible. Benchmark it anyway.
+message Page {
+  // The page number.
+  uint32 block_number = 1;
+  // The materialized page image, as an 8KB byte vector.
+  bytes image = 2;
 }
 
 // A GetPageResponse status code.
diff --git a/pageserver/page_api/src/client.rs b/pageserver/page_api/src/client.rs
index 6523d00d3d..f70d0e7b28 100644
--- a/pageserver/page_api/src/client.rs
+++ b/pageserver/page_api/src/client.rs
@@ -1,4 +1,5 @@
 use anyhow::Context as _;
+use futures::future::ready;
 use futures::{Stream, StreamExt as _, TryStreamExt as _};
 use tokio::io::AsyncRead;
 use tokio_util::io::StreamReader;
@@ -110,7 +111,7 @@ impl Client {
     ) -> tonic::Result<impl Stream<Item = tonic::Result<GetPageResponse>> + Send + 'static> {
         let reqs = reqs.map(proto::GetPageRequest::from);
         let resps = self.inner.get_pages(reqs).await?.into_inner();
-        Ok(resps.map_ok(GetPageResponse::from))
+        Ok(resps.and_then(|resp| ready(GetPageResponse::try_from(resp).map_err(|err| err.into()))))
     }
 
     /// Returns the size of a relation, as # of blocks.
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index 4db8237ad8..a9dd154285 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -502,22 +502,30 @@ impl From<GetPageClass> for i32 {
 pub struct GetPageResponse {
     /// The original request's ID.
     pub request_id: RequestID,
-    /// The response status code.
+    /// The response status code. If not OK, the `rel` and `pages` fields will be empty.
     pub status_code: GetPageStatusCode,
     /// A string describing the status, if any.
     pub reason: Option<String>,
-    /// The 8KB page images, in the same order as the request. Empty if status != OK.
-    pub page_images: Vec<Bytes>,
+    /// The relation that the pages belong to.
+    pub rel: RelTag,
+    // The page(s), in the same order as the request.
+    pub pages: Vec<Page>,
 }
 
-impl From<proto::GetPageResponse> for GetPageResponse {
-    fn from(pb: proto::GetPageResponse) -> Self {
-        Self {
-            request_id: pb.request_id.unwrap_or_default().into(),
+impl TryFrom<proto::GetPageResponse> for GetPageResponse {
+    type Error = ProtocolError;
+
+    fn try_from(pb: proto::GetPageResponse) -> Result<Self, ProtocolError> {
+        Ok(Self {
+            request_id: pb
+                .request_id
+                .ok_or(ProtocolError::Missing("request_id"))?
+                .into(),
             status_code: pb.status_code.into(),
             reason: Some(pb.reason).filter(|r| !r.is_empty()),
-            page_images: pb.page_image,
-        }
+            rel: pb.rel.ok_or(ProtocolError::Missing("rel"))?.try_into()?,
+            pages: pb.page.into_iter().map(Page::from).collect(),
+        })
     }
 }
 
@@ -527,7 +535,8 @@ impl From<GetPageResponse> for proto::GetPageResponse {
             request_id: Some(response.request_id.into()),
             status_code: response.status_code.into(),
             reason: response.reason.unwrap_or_default(),
-            page_image: response.page_images,
+            rel: Some(response.rel.into()),
+            page: response.pages.into_iter().map(proto::Page::from).collect(),
         }
     }
 }
@@ -560,11 +569,39 @@ impl GetPageResponse {
             request_id,
             status_code,
             reason: Some(status.message().to_string()),
-            page_images: Vec::new(),
+            rel: RelTag::default(),
+            pages: Vec::new(),
         })
     }
 }
 
+// A page.
+#[derive(Clone, Debug)]
+pub struct Page {
+    /// The page number.
+    pub block_number: u32,
+    /// The materialized page image, as an 8KB byte vector.
+    pub image: Bytes,
+}
+
+impl From<proto::Page> for Page {
+    fn from(pb: proto::Page) -> Self {
+        Self {
+            block_number: pb.block_number,
+            image: pb.image,
+        }
+    }
+}
+
+impl From<Page> for proto::Page {
+    fn from(page: Page) -> Self {
+        Self {
+            block_number: page.block_number,
+            image: page.image,
+        }
+    }
+}
+
 /// A GetPage response status code.
 ///
 /// These are effectively equivalent to gRPC statuses. However, we use a bidirectional stream
diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
index b5c191e29a..30b30d36f6 100644
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -694,7 +694,10 @@ impl Client for GrpcClient {
             "unexpected status code: {}",
             resp.status_code,
         );
-        Ok((resp.request_id.id, resp.page_images))
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
     }
 }
 
@@ -761,6 +764,9 @@ impl Client for RichGrpcClient {
 
     async fn recv_get_page(&mut self) -> anyhow::Result<(u64, Vec<Bytes>)> {
         let resp = self.requests.next().await.unwrap()?;
-        Ok((resp.request_id.id, resp.page_images))
+        Ok((
+            resp.request_id.id,
+            resp.pages.into_iter().map(|p| p.image).collect(),
+        ))
     }
 }
diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index b2f6cd465d..1fc7e4eac7 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3483,12 +3483,16 @@ impl GrpcPageServiceHandler {
             request_id: req.request_id,
             status_code: page_api::GetPageStatusCode::Ok,
             reason: None,
-            page_images: Vec::with_capacity(results.len()),
+            rel: req.rel,
+            pages: Vec::with_capacity(results.len()),
         };
 
         for result in results {
             match result {
-                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.page_images.push(r.page),
+                Ok((PagestreamBeMessage::GetPage(r), _, _)) => resp.pages.push(page_api::Page {
+                    block_number: r.req.blkno,
+                    image: r.page,
+                }),
                 Ok((resp, _, _)) => {
                     return Err(tonic::Status::internal(format!(
                         "unexpected response: {resp:?}"

From cec0543b5141f24d928d891a49d8832c70161311 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Thu, 10 Jul 2025 17:58:54 -0500
Subject: [PATCH 082/163] Add background to compute migration
 0002-alter_roles.sql (#11708)

On December 8th, 2023, an engineering escalation (INC-110) was opened
after it was found that BYPASSRLS was being applied to all roles.

PR that introduced the issue:
https://github.com/neondatabase/neon/pull/5657
Subsequent commit on main:
https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072

NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but
because it isn't easy to know if a Postgres cluster is affected by the
issue, we need to keep the migration around for a long time, if not
indefinitely, so any cluster can be fixed.

Branching is the gift that keeps on giving...

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
---
 compute_tools/src/migrations/0002-alter_roles.sql | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql
index 6cb49f873f..8fc371eb8f 100644
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
@@ -1,3 +1,16 @@
+-- On December 8th, 2023, an engineering escalation (INC-110) was opened after
+-- it was found that BYPASSRLS was being applied to all roles.
+--
+-- PR that introduced the issue: https://github.com/neondatabase/neon/pull/5657
+-- Subsequent commit on main: https://github.com/neondatabase/neon/commit/ad99fa5f0393e2679e5323df653c508ffa0ac072
+--
+-- NOBYPASSRLS and INHERIT are the defaults for a Postgres role, but because it
+-- isn't easy to know if a Postgres cluster is affected by the issue, we need to
+-- keep the migration around for a long time, if not indefinitely, so any
+-- cluster can be fixed.
+--
+-- Branching is the gift that keeps on giving...
+
 DO $$
 DECLARE
     role_name text;

From 1637fbce25db18ae6b86dd400788b6f32e608bf3 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Fri, 11 Jul 2025 10:50:19 +0200
Subject: [PATCH 083/163] Merge fix

---
 pgxn/neon/libpagestore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 453149a4fc..7dfc08e54a 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -1441,7 +1441,7 @@ pg_init_libpagestore(void)
 							"sharding stripe size",
 							NULL,
 							&neon_stripe_size,
-							2048 1, INT_MAX,
+							2048, 1, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_BLOCKS,
 							NULL, NULL, NULL);

From c34d36d8a270b9a4910d4d26210e7c608288f079 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Fri, 11 Jul 2025 14:49:37 +0400
Subject: [PATCH 084/163] storcon_cli: timeline-safekeeper-migrate and
 timeline-locate subcommands (#12548)

## Problem
We have a `safekeeper_migrate` handler, but no subcommand in
`storcon_cli`. Same for `/:timeline_id/locate` for identifying current
set of safekeepers.

- Closes: https://github.com/neondatabase/neon/issues/12395

## Summary of changes
- Add `timeline-safekeeper-migrate` and `timeline-locate` subcommands to
`storcon_cli`
---
 Cargo.lock                                    |  1 +
 control_plane/src/broker.rs                   |  2 +-
 control_plane/src/pageserver.rs               |  2 +-
 control_plane/src/safekeeper.rs               |  2 +-
 control_plane/src/storage_controller.rs       |  2 +-
 control_plane/storcon_cli/Cargo.toml          |  1 +
 control_plane/storcon_cli/src/main.rs         | 57 ++++++++++++++++++-
 libs/safekeeper_api/src/models.rs             | 11 +++-
 .../src/service/safekeeper_service.rs         | 10 +---
 9 files changed, 73 insertions(+), 15 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c528354053..025f4e4116 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6991,6 +6991,7 @@ dependencies = [
  "pageserver_api",
  "pageserver_client",
  "reqwest",
+ "safekeeper_api",
  "serde_json",
  "storage_controller_client",
  "tokio",
diff --git a/control_plane/src/broker.rs b/control_plane/src/broker.rs
index f43f459636..988b08e875 100644
--- a/control_plane/src/broker.rs
+++ b/control_plane/src/broker.rs
@@ -36,7 +36,7 @@ impl StorageBroker {
     pub async fn start(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
         let broker = &self.env.broker;
 
-        print!("Starting neon broker at {}", broker.client_url());
+        println!("Starting neon broker at {}", broker.client_url());
 
         let mut args = Vec::new();
 
diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs
index 3673d1f4f2..843ead807d 100644
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -303,7 +303,7 @@ impl PageServerNode {
     async fn start_node(&self, retry_timeout: &Duration) -> anyhow::Result<()> {
         // TODO: using a thread here because start_process() is not async but we need to call check_status()
         let datadir = self.repo_path();
-        print!(
+        println!(
             "Starting pageserver node {} at '{}' in {:?}, retrying for {:?}",
             self.conf.id,
             self.pg_connection_config.raw_address(),
diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs
index da9dafd8e9..2ba2f3ebe4 100644
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -127,7 +127,7 @@ impl SafekeeperNode {
         extra_opts: &[String],
         retry_timeout: &Duration,
     ) -> anyhow::Result<()> {
-        print!(
+        println!(
             "Starting safekeeper at '{}' in '{}', retrying for {:?}",
             self.pg_connection_config.raw_address(),
             self.datadir_path().display(),
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index bb83a6319c..dc6c82f504 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -660,7 +660,7 @@ impl StorageController {
             ));
         }
 
-        println!("Starting storage controller");
+        println!("Starting storage controller at {scheme}://{host}:{listen_port}");
 
         background_process::start_process(
             COMMAND,
diff --git a/control_plane/storcon_cli/Cargo.toml b/control_plane/storcon_cli/Cargo.toml
index ce89116691..61d48b2469 100644
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -14,6 +14,7 @@ humantime.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
+safekeeper_api.workspace=true
 serde_json = { workspace = true, features = ["raw_value"] }
 storage_controller_client.workspace = true
 tokio.workspace = true
diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 701c4b3b2e..24fd34a87a 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -11,7 +11,7 @@ use pageserver_api::controller_api::{
     PlacementPolicy, SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest,
     ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
     SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TimelineSafekeeperMigrateRequest,
 };
 use pageserver_api::models::{
     EvictionPolicy, EvictionPolicyLayerAccessThreshold, ShardParameters, TenantConfig,
@@ -21,6 +21,7 @@ use pageserver_api::models::{
 use pageserver_api::shard::{ShardStripeSize, TenantShardId};
 use pageserver_client::mgmt_api::{self};
 use reqwest::{Certificate, Method, StatusCode, Url};
+use safekeeper_api::models::TimelineLocateResponse;
 use storage_controller_client::control_api::Client;
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -279,6 +280,23 @@ enum Command {
         #[arg(long)]
         concurrency: Option<usize>,
     },
+    /// Locate safekeepers for a timeline from the storcon DB.
+    TimelineLocate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+    },
+    /// Migrate a timeline to a new set of safekeepers
+    TimelineSafekeeperMigrate {
+        #[arg(long)]
+        tenant_id: TenantId,
+        #[arg(long)]
+        timeline_id: TimelineId,
+        /// Example: --new-sk-set 1,2,3
+        #[arg(long, required = true, value_delimiter = ',')]
+        new_sk_set: Vec<NodeId>,
+    },
 }
 
 #[derive(Parser)]
@@ -1324,7 +1342,7 @@ async fn main() -> anyhow::Result<()> {
             concurrency,
         } => {
             let mut path = format!(
-                "/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
+                "v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/download_heatmap_layers",
             );
 
             if let Some(c) = concurrency {
@@ -1335,6 +1353,41 @@ async fn main() -> anyhow::Result<()> {
                 .dispatch::<(), ()>(Method::POST, path, None)
                 .await?;
         }
+        Command::TimelineLocate {
+            tenant_id,
+            timeline_id,
+        } => {
+            let path = format!("debug/v1/tenant/{tenant_id}/timeline/{timeline_id}/locate");
+
+            let resp = storcon_client
+                .dispatch::<(), TimelineLocateResponse>(Method::GET, path, None)
+                .await?;
+
+            let sk_set = resp.sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+            let new_sk_set = resp
+                .new_sk_set
+                .as_ref()
+                .map(|ids| ids.iter().map(|id| id.0 as i64).collect::<Vec<_>>());
+
+            println!("generation = {}", resp.generation);
+            println!("sk_set = {sk_set:?}");
+            println!("new_sk_set = {new_sk_set:?}");
+        }
+        Command::TimelineSafekeeperMigrate {
+            tenant_id,
+            timeline_id,
+            new_sk_set,
+        } => {
+            let path = format!("v1/tenant/{tenant_id}/timeline/{timeline_id}/safekeeper_migrate");
+
+            storcon_client
+                .dispatch::<_, ()>(
+                    Method::POST,
+                    path,
+                    Some(TimelineSafekeeperMigrateRequest { new_sk_set }),
+                )
+                .await?;
+        }
     }
 
     Ok(())
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index e87232474b..59e112654b 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -11,7 +11,7 @@ use utils::id::{NodeId, TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 use utils::pageserver_feedback::PageserverFeedback;
 
-use crate::membership::Configuration;
+use crate::membership::{Configuration, SafekeeperGeneration};
 use crate::{ServerInfo, Term};
 
 #[derive(Debug, Serialize, Deserialize)]
@@ -311,3 +311,12 @@ pub struct PullTimelineResponse {
     pub safekeeper_host: Option<String>,
     // TODO: add more fields?
 }
+
+/// Response to a timeline locate request.
+/// Storcon-only API.
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineLocateResponse {
+    pub generation: SafekeeperGeneration,
+    pub sk_set: Vec<NodeId>,
+    pub new_sk_set: Option<Vec<NodeId>>,
+}
diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index d7179372b2..42ddf81e3e 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -25,7 +25,8 @@ use pageserver_api::models::{SafekeeperInfo, SafekeepersInfo, TimelineInfo};
 use safekeeper_api::PgVersionId;
 use safekeeper_api::membership::{self, MemberSet, SafekeeperGeneration};
 use safekeeper_api::models::{
-    PullTimelineRequest, TimelineMembershipSwitchRequest, TimelineMembershipSwitchResponse,
+    PullTimelineRequest, TimelineLocateResponse, TimelineMembershipSwitchRequest,
+    TimelineMembershipSwitchResponse,
 };
 use safekeeper_api::{INITIAL_TERM, Term};
 use safekeeper_client::mgmt_api;
@@ -37,13 +38,6 @@ use utils::lsn::Lsn;
 
 use super::Service;
 
-#[derive(serde::Serialize, serde::Deserialize, Clone)]
-pub struct TimelineLocateResponse {
-    pub generation: SafekeeperGeneration,
-    pub sk_set: Vec<NodeId>,
-    pub new_sk_set: Option<Vec<NodeId>>,
-}
-
 impl Service {
     fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
         let members = safekeepers

From 15f633922aaa62e333ba3b92cd97d646ce56e5ef Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 11 Jul 2025 12:39:51 +0100
Subject: [PATCH 085/163] pageserver: use image consistent LSN for force image
 layer creation (#12547)

This is a no-op for the neon deployment

* Introduce the concept image consistent lsn: of the largest LSN below
which all pages have been redone successfully
* Use the image consistent LSN for forced image layer creations
* Optionally expose the image consistent LSN via the timeline describe
HTTP endpoint
* Add a sharded timeline describe endpoint to storcon

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
---
 libs/pageserver_api/src/controller_api.rs     |   9 +-
 libs/pageserver_api/src/models.rs             |   3 +
 pageserver/src/http/routes.rs                 |  19 ++
 pageserver/src/tenant.rs                      |  34 +++
 pageserver/src/tenant/layer_map.rs            | 232 +++++++++++++++++-
 pageserver/src/tenant/timeline.rs             |  32 ++-
 pageserver/src/tenant/timeline/compaction.rs  |  89 +++----
 .../src/tenant/timeline/layer_manager.rs      |   1 +
 storage_controller/src/http.rs                |  32 +++
 storage_controller/src/pageserver_client.rs   |  17 ++
 storage_controller/src/service.rs             |  88 ++++++-
 test_runner/fixtures/neon_fixtures.py         |  14 ++
 test_runner/regress/test_compaction.py        |  77 +++++-
 13 files changed, 567 insertions(+), 80 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index a8c7083b17..b02c6a613a 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -10,7 +10,7 @@ use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 
-use crate::models::{PageserverUtilization, ShardParameters, TenantConfig};
+use crate::models::{PageserverUtilization, ShardParameters, TenantConfig, TimelineInfo};
 use crate::shard::{ShardStripeSize, TenantShardId};
 
 #[derive(Serialize, Deserialize, Debug)]
@@ -126,6 +126,13 @@ pub struct TenantDescribeResponse {
     pub config: TenantConfig,
 }
 
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantTimelineDescribeResponse {
+    pub shards: Vec<TimelineInfo>,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 pub struct NodeShardResponse {
     pub node_id: NodeId,
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 56dd95eab3..11e02a8550 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -1622,6 +1622,9 @@ pub struct TimelineInfo {
 
     /// Whether the timeline is invisible in synthetic size calculations.
     pub is_invisible: Option<bool>,
+    // HADRON: the largest LSN below which all page updates have been included in the image layers.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub image_consistent_lsn: Option<Lsn>,
 }
 
 #[derive(Debug, Clone, Serialize, Deserialize)]
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7030ac368d..d839bac557 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -397,6 +397,7 @@ async fn build_timeline_info(
     timeline: &Arc<Timeline>,
     include_non_incremental_logical_size: bool,
     force_await_initial_logical_size: bool,
+    include_image_consistent_lsn: bool,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
     crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -421,6 +422,10 @@ async fn build_timeline_info(
                 .await?,
         );
     }
+    // HADRON
+    if include_image_consistent_lsn {
+        info.image_consistent_lsn = Some(timeline.compute_image_consistent_lsn().await?);
+    }
     Ok(info)
 }
 
@@ -510,6 +515,8 @@ async fn build_timeline_info_common(
         is_invisible: Some(is_invisible),
 
         walreceiver_status,
+        // HADRON
+        image_consistent_lsn: None,
     };
     Ok(info)
 }
@@ -712,6 +719,8 @@ async fn timeline_list_handler(
         parse_query_param(&request, "include-non-incremental-logical-size")?;
     let force_await_initial_logical_size: Option<bool> =
         parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let state = get_state(&request);
@@ -732,6 +741,7 @@ async fn timeline_list_handler(
                 &timeline,
                 include_non_incremental_logical_size.unwrap_or(false),
                 force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                 &ctx,
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -760,6 +770,9 @@ async fn timeline_and_offloaded_list_handler(
         parse_query_param(&request, "include-non-incremental-logical-size")?;
     let force_await_initial_logical_size: Option<bool> =
         parse_query_param(&request, "force-await-initial-logical-size")?;
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
+
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     let state = get_state(&request);
@@ -780,6 +793,7 @@ async fn timeline_and_offloaded_list_handler(
                 &timeline,
                 include_non_incremental_logical_size.unwrap_or(false),
                 force_await_initial_logical_size.unwrap_or(false),
+                include_image_consistent_lsn.unwrap_or(false),
                 &ctx,
             )
             .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -964,6 +978,9 @@ async fn timeline_detail_handler(
         parse_query_param(&request, "include-non-incremental-logical-size")?;
     let force_await_initial_logical_size: Option<bool> =
         parse_query_param(&request, "force-await-initial-logical-size")?;
+    // HADRON
+    let include_image_consistent_lsn: Option<bool> =
+        parse_query_param(&request, "include-image-consistent-lsn")?;
     check_permission(&request, Some(tenant_shard_id.tenant_id))?;
 
     // Logical size calculation needs downloading.
@@ -984,6 +1001,7 @@ async fn timeline_detail_handler(
             &timeline,
             include_non_incremental_logical_size.unwrap_or(false),
             force_await_initial_logical_size.unwrap_or(false),
+            include_image_consistent_lsn.unwrap_or(false),
             ctx,
         )
         .await
@@ -3643,6 +3661,7 @@ async fn activate_post_import_handler(
         let timeline_info = build_timeline_info(
             &timeline, false, // include_non_incremental_logical_size,
             false, // force_await_initial_logical_size
+            false, // include_image_consistent_lsn
             &ctx,
         )
         .await
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f67269851a..f75a03a508 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12816,6 +12816,40 @@ mod tests {
                 },
             ]
         );
+
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn test_get_force_image_creation_lsn() -> anyhow::Result<()> {
+        let tenant_conf = pageserver_api::models::TenantConfig {
+            pitr_interval: Some(Duration::from_secs(7 * 3600)),
+            image_layer_force_creation_period: Some(Duration::from_secs(3600)),
+            ..Default::default()
+        };
+
+        let tenant_id = TenantId::generate();
+
+        let harness = TenantHarness::create_custom(
+            "test_get_force_image_creation_lsn",
+            tenant_conf,
+            tenant_id,
+            ShardIdentity::unsharded(),
+            Generation::new(1),
+        )
+        .await?;
+        let (tenant, ctx) = harness.load().await;
+        let timeline = tenant
+            .create_test_timeline(TIMELINE_ID, Lsn(0x10), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        timeline.gc_info.write().unwrap().cutoffs.time = Some(Lsn(100));
+        {
+            let writer = timeline.writer().await;
+            writer.finish_write(Lsn(5000));
+        }
+
+        let image_creation_lsn = timeline.get_force_image_creation_lsn().unwrap();
+        assert_eq!(image_creation_lsn, Lsn(4300));
         Ok(())
     }
 }
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 23052ccee7..ba02602cfe 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -46,10 +46,11 @@
 mod historic_layer_coverage;
 mod layer_coverage;
 
-use std::collections::{HashMap, VecDeque};
+use std::collections::{BTreeMap, HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
 use std::sync::Arc;
+use std::time::Instant;
 
 use anyhow::Result;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
@@ -904,6 +905,103 @@ impl LayerMap {
         max_stacked_deltas
     }
 
+    /* BEGIN_HADRON */
+    /**
+     * Compute the image consistent LSN, the largest LSN below which all pages have been redone successfully.
+     * It works by first finding the latest image layers and store them into a map. Then for each delta layer,
+     * find all overlapping image layers in order to potentially increase the image LSN in case there are gaps
+     * (e.g., if an image is created at LSN 100 but the delta layer spans LSN [150, 200], then we can increase
+     * image LSN to 150 because there is no WAL record in between).
+     * Finally, the image consistent LSN is computed by taking the minimum of all image layers.
+     */
+    pub fn compute_image_consistent_lsn(&self, disk_consistent_lsn: Lsn) -> Lsn {
+        struct ImageLayerInfo {
+            // creation LSN of the image layer
+            image_lsn: Lsn,
+            // the current minimum LSN of newer delta layers with overlapping key ranges
+            min_delta_lsn: Lsn,
+        }
+        let started_at = Instant::now();
+
+        let min_l0_deltas_lsn = {
+            let l0_deltas = self.level0_deltas();
+            l0_deltas
+                .iter()
+                .map(|layer| layer.get_lsn_range().start)
+                .min()
+                .unwrap_or(disk_consistent_lsn)
+        };
+        let global_key_range = Key::MIN..Key::MAX;
+
+        // step 1: collect all most recent image layers into a map
+        // map: end key to image_layer_info
+        let mut image_map: BTreeMap<Key, ImageLayerInfo> = BTreeMap::new();
+        for (img_range, img) in self.image_coverage(&global_key_range, disk_consistent_lsn) {
+            let img_lsn = img.map(|layer| layer.get_lsn_range().end).unwrap_or(Lsn(0));
+            image_map.insert(
+                img_range.end,
+                ImageLayerInfo {
+                    image_lsn: img_lsn,
+                    min_delta_lsn: min_l0_deltas_lsn,
+                },
+            );
+        }
+
+        // step 2: go through all delta layers, and update the image layer info with overlapping
+        // key ranges
+        for layer in self.historic.iter() {
+            if !layer.is_delta {
+                continue;
+            }
+            let delta_key_range = layer.get_key_range();
+            let delta_lsn_range = layer.get_lsn_range();
+            for (img_end_key, img_info) in image_map.range_mut(delta_key_range.start..Key::MAX) {
+                debug_assert!(img_end_key >= &delta_key_range.start);
+                if delta_lsn_range.end > img_info.image_lsn {
+                    // the delta layer includes WAL records after the image
+                    // it's possibel that the delta layer's start LSN < image LSN, which will be simply ignored by step 3
+                    img_info.min_delta_lsn =
+                        std::cmp::min(img_info.min_delta_lsn, delta_lsn_range.start);
+                }
+                if img_end_key >= &delta_key_range.end {
+                    // we have fully processed all overlapping image layers
+                    break;
+                }
+            }
+        }
+
+        // step 3, go through all image layers and find the image consistent LSN
+        let mut img_consistent_lsn = min_l0_deltas_lsn.checked_sub(Lsn(1)).unwrap();
+        let mut prev_key = Key::MIN;
+        for (img_key, img_info) in image_map {
+            tracing::debug!(
+                "Image layer {:?}:{} has min delta lsn {}",
+                Range {
+                    start: prev_key,
+                    end: img_key,
+                },
+                img_info.image_lsn,
+                img_info.min_delta_lsn,
+            );
+            let image_lsn = std::cmp::max(
+                img_info.image_lsn,
+                img_info.min_delta_lsn.checked_sub(Lsn(1)).unwrap_or(Lsn(0)),
+            );
+            img_consistent_lsn = std::cmp::min(img_consistent_lsn, image_lsn);
+            prev_key = img_key;
+        }
+        tracing::info!(
+            "computed image_consistent_lsn {} for disk_consistent_lsn {} in {}ms. Processed {} layrs in total.",
+            img_consistent_lsn,
+            disk_consistent_lsn,
+            started_at.elapsed().as_millis(),
+            self.historic.len()
+        );
+        img_consistent_lsn
+    }
+
+    /* END_HADRON */
+
     /// Return all L0 delta layers
     pub fn level0_deltas(&self) -> &Vec<Arc<PersistentLayerDesc>> {
         &self.l0_delta_layers
@@ -1579,6 +1677,138 @@ mod tests {
             LayerVisibilityHint::Visible
         ));
     }
+
+    /* BEGIN_HADRON */
+    #[test]
+    fn test_compute_image_consistent_lsn() {
+        let mut layer_map = LayerMap::default();
+
+        let disk_consistent_lsn = Lsn(1000);
+        // case 1: empty layer map
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(
+            disk_consistent_lsn.checked_sub(Lsn(1)).unwrap(),
+            image_consistent_lsn
+        );
+
+        // case 2: only L0 delta layer
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(100),
+                Lsn(900)..Lsn(990),
+                true,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(100),
+                Lsn(850)..Lsn(899),
+                true,
+            ));
+        }
+
+        // should use min L0 delta LSN - 1 as image consistent LSN
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(849), image_consistent_lsn);
+
+        // case 3: 3 images, no L1 delta
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(40),
+                Lsn(100)..Lsn(100),
+                false,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(40)..Key::from_i128(70),
+                Lsn(200)..Lsn(200),
+                false,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(70)..Key::from_i128(100),
+                Lsn(150)..Lsn(150),
+                false,
+            ));
+        }
+        // should use min L0 delta LSN - 1 as image consistent LSN
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(849), image_consistent_lsn);
+
+        // case 4: 3 images with 1 L1 delta
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(50),
+                Lsn(300)..Lsn(350),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(299), image_consistent_lsn);
+
+        // case 5: 3 images with 1 more L1 delta with smaller LSN
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(50)..Key::from_i128(72),
+                Lsn(200)..Lsn(300),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 6: 3 images with more newer L1 deltas (no impact on final results)
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(30),
+                Lsn(400)..Lsn(500),
+                true,
+            ));
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(35)..Key::from_i128(100),
+                Lsn(450)..Lsn(600),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 7: 3 images with more older L1 deltas (no impact on final results)
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(40),
+                Lsn(0)..Lsn(50),
+                true,
+            ));
+
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(50)..Key::from_i128(100),
+                Lsn(10)..Lsn(60),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(199), image_consistent_lsn);
+
+        // case 8: 3 images with one more L1 delta with overlapping LSN range
+        {
+            let mut updates = layer_map.batch_update();
+            updates.insert_historic(PersistentLayerDesc::new_test(
+                Key::from_i128(0)..Key::from_i128(50),
+                Lsn(50)..Lsn(250),
+                true,
+            ));
+        }
+        let image_consistent_lsn = layer_map.compute_image_consistent_lsn(disk_consistent_lsn);
+        assert_eq!(Lsn(100), image_consistent_lsn);
+    }
+
+    /* END_HADRON */
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index a9bc0a060b..718ea925b7 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -351,13 +351,6 @@ pub struct Timeline {
     last_image_layer_creation_check_at: AtomicLsn,
     last_image_layer_creation_check_instant: std::sync::Mutex<Option<Instant>>,
 
-    // HADRON
-    /// If a key range has writes with LSN > force_image_creation_lsn, then we should force image layer creation
-    /// on this key range.
-    force_image_creation_lsn: AtomicLsn,
-    /// The last time instant when force_image_creation_lsn is computed.
-    force_image_creation_lsn_computed_at: std::sync::Mutex<Option<Instant>>,
-
     /// Current logical size of the "datadir", at the last LSN.
     current_logical_size: LogicalSize,
 
@@ -2854,7 +2847,7 @@ impl Timeline {
     }
 
     // HADRON
-    fn get_image_creation_timeout(&self) -> Option<Duration> {
+    fn get_image_layer_force_creation_period(&self) -> Option<Duration> {
         let tenant_conf = self.tenant_conf.load();
         tenant_conf
             .tenant_conf
@@ -3134,9 +3127,6 @@ impl Timeline {
                 repartition_threshold: 0,
                 last_image_layer_creation_check_at: AtomicLsn::new(0),
                 last_image_layer_creation_check_instant: Mutex::new(None),
-                // HADRON
-                force_image_creation_lsn: AtomicLsn::new(0),
-                force_image_creation_lsn_computed_at: std::sync::Mutex::new(None),
                 last_received_wal: Mutex::new(None),
                 rel_size_latest_cache: RwLock::new(HashMap::new()),
                 rel_size_snapshot_cache: Mutex::new(LruCache::new(relsize_snapshot_cache_capacity)),
@@ -5381,13 +5371,16 @@ impl Timeline {
         }
 
         // HADRON
+        // for child timelines, we consider all pages up to ancestor_LSN are redone successfully by the parent timeline
+        min_image_lsn = min_image_lsn.max(self.get_ancestor_lsn());
         if min_image_lsn < force_image_creation_lsn.unwrap_or(Lsn(0)) && max_deltas > 0 {
             info!(
-                "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}",
+                "forcing image creation for partitioned range {}-{}. Min image LSN: {}, force image creation LSN: {}, num deltas: {}",
                 partition.ranges[0].start,
                 partition.ranges[0].end,
                 min_image_lsn,
-                force_image_creation_lsn.unwrap()
+                force_image_creation_lsn.unwrap(),
+                max_deltas
             );
             return true;
         }
@@ -7153,6 +7146,19 @@ impl Timeline {
             .unwrap()
             .clone()
     }
+
+    /* BEGIN_HADRON */
+    pub(crate) async fn compute_image_consistent_lsn(&self) -> anyhow::Result<Lsn> {
+        let guard = self
+            .layers
+            .read(LayerManagerLockHolder::ComputeImageConsistentLsn)
+            .await;
+        let layer_map = guard.layer_map()?;
+        let disk_consistent_lsn = self.get_disk_consistent_lsn();
+
+        Ok(layer_map.compute_image_consistent_lsn(disk_consistent_lsn))
+    }
+    /* END_HADRON */
 }
 
 impl Timeline {
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 171f9d1284..aa1aa937b6 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -8,7 +8,7 @@ use std::cmp::min;
 use std::collections::{BinaryHeap, HashMap, HashSet, VecDeque};
 use std::ops::{Deref, Range};
 use std::sync::Arc;
-use std::time::{Duration, Instant, SystemTime};
+use std::time::{Duration, Instant};
 
 use super::layer_manager::LayerManagerLockHolder;
 use super::{
@@ -34,7 +34,6 @@ use pageserver_api::models::{CompactInfoResponse, CompactKeyRange};
 use pageserver_api::shard::{ShardCount, ShardIdentity, TenantShardId};
 use pageserver_compaction::helpers::{fully_contains, overlaps_with};
 use pageserver_compaction::interface::*;
-use postgres_ffi::to_pg_timestamp;
 use serde::Serialize;
 use tokio::sync::{OwnedSemaphorePermit, Semaphore};
 use tokio_util::sync::CancellationToken;
@@ -47,7 +46,6 @@ use wal_decoder::models::value::Value;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::statvfs::Statvfs;
 use crate::tenant::checks::check_valid_layermap;
 use crate::tenant::gc_block::GcBlock;
@@ -1271,10 +1269,7 @@ impl Timeline {
         // Define partitioning schema if needed
 
         // HADRON
-        let force_image_creation_lsn = self
-            .get_or_compute_force_image_creation_lsn(cancel, ctx)
-            .await
-            .map_err(CompactionError::Other)?;
+        let force_image_creation_lsn = self.get_force_image_creation_lsn();
 
         // 1. L0 Compact
         let l0_outcome = {
@@ -1484,59 +1479,37 @@ impl Timeline {
     }
 
     /* BEGIN_HADRON */
-    // Get the force image creation LSN. Compute it if the last computed LSN is too old.
-    async fn get_or_compute_force_image_creation_lsn(
-        self: &Arc<Self>,
-        cancel: &CancellationToken,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Option<Lsn>> {
-        const FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL: Duration = Duration::from_secs(10 * 60); // 10 minutes
-        let image_layer_force_creation_period = self.get_image_creation_timeout();
-        if image_layer_force_creation_period.is_none() {
-            return Ok(None);
+    // Get the force image creation LSN based on gc_cutoff_lsn.
+    // Note that this is an estimation and the workload rate may suddenly change. When that happens,
+    // the force image creation may be too early or too late, but eventually it should be able to catch up.
+    pub(crate) fn get_force_image_creation_lsn(self: &Arc<Self>) -> Option<Lsn> {
+        let image_creation_period = self.get_image_layer_force_creation_period()?;
+        let current_lsn = self.get_last_record_lsn();
+        let pitr_lsn = self.gc_info.read().unwrap().cutoffs.time?;
+        let pitr_interval = self.get_pitr_interval();
+        if pitr_lsn == Lsn::INVALID || pitr_interval.is_zero() {
+            tracing::warn!(
+                "pitr LSN/interval not found, skipping force image creation LSN calculation"
+            );
+            return None;
         }
 
-        let image_layer_force_creation_period = image_layer_force_creation_period.unwrap();
-        let force_image_creation_lsn_computed_at =
-            *self.force_image_creation_lsn_computed_at.lock().unwrap();
-        if force_image_creation_lsn_computed_at.is_none()
-            || force_image_creation_lsn_computed_at.unwrap().elapsed()
-                > FORCE_IMAGE_CREATION_LSN_COMPUTE_INTERVAL
-        {
-            let now: SystemTime = SystemTime::now();
-            let timestamp = now
-                .checked_sub(image_layer_force_creation_period)
-                .ok_or_else(|| {
-                    anyhow::anyhow!(
-                        "image creation timeout is too large: {image_layer_force_creation_period:?}"
-                    )
-                })?;
-            let timestamp = to_pg_timestamp(timestamp);
-            let force_image_creation_lsn = match self
-                .find_lsn_for_timestamp(timestamp, cancel, ctx)
-                .await?
-            {
-                LsnForTimestamp::Present(lsn) | LsnForTimestamp::Future(lsn) => lsn,
-                _ => {
-                    let gc_lsn = *self.get_applied_gc_cutoff_lsn();
-                    tracing::info!(
-                        "no LSN found for timestamp {timestamp:?}, using latest GC cutoff LSN {}",
-                        gc_lsn
-                    );
-                    gc_lsn
-                }
-            };
-            self.force_image_creation_lsn
-                .store(force_image_creation_lsn);
-            *self.force_image_creation_lsn_computed_at.lock().unwrap() = Some(Instant::now());
-            tracing::info!(
-                "computed force image creation LSN: {}",
-                force_image_creation_lsn
-            );
-            Ok(Some(force_image_creation_lsn))
-        } else {
-            Ok(Some(self.force_image_creation_lsn.load()))
-        }
+        let delta_lsn = current_lsn.checked_sub(pitr_lsn).unwrap().0
+            * image_creation_period.as_secs()
+            / pitr_interval.as_secs();
+        let force_image_creation_lsn = current_lsn.checked_sub(delta_lsn).unwrap_or(Lsn(0));
+
+        tracing::info!(
+            "Tenant shard {} computed force_image_creation_lsn: {}. Current lsn: {}, image_layer_force_creation_period: {:?}, GC cutoff: {}, PITR interval: {:?}",
+            self.tenant_shard_id,
+            force_image_creation_lsn,
+            current_lsn,
+            image_creation_period,
+            pitr_lsn,
+            pitr_interval
+        );
+
+        Some(force_image_creation_lsn)
     }
     /* END_HADRON */
 
diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs
index 2eccf48579..d8d81a6c91 100644
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -47,6 +47,7 @@ pub(crate) enum LayerManagerLockHolder {
     ImportPgData,
     DetachAncestor,
     Eviction,
+    ComputeImageConsistentLsn,
     #[cfg(test)]
     Testing,
 }
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index e5a3a969d4..62fc212e12 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -850,6 +850,31 @@ async fn handle_tenant_describe(
     json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 
+/* BEGIN_HADRON */
+async fn handle_tenant_timeline_describe(
+    service: Arc<Service>,
+    req: Request<Body>,
+) -> Result<Response<Body>, ApiError> {
+    check_permissions(&req, Scope::Scrubber)?;
+
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;
+    match maybe_forward(req).await {
+        ForwardOutcome::Forwarded(res) => {
+            return res;
+        }
+        ForwardOutcome::NotForwarded(_req) => {}
+    };
+
+    json_response(
+        StatusCode::OK,
+        service
+            .tenant_timeline_describe(tenant_id, timeline_id)
+            .await?,
+    )
+}
+/* END_HADRON */
+
 async fn handle_tenant_list(
     service: Arc<Service>,
     req: Request<Body>,
@@ -2480,6 +2505,13 @@ pub fn make_router(
             )
         })
         // Timeline operations
+        .get("/control/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
+            tenant_service_handler(
+                r,
+                handle_tenant_timeline_describe,
+                RequestName("v1_tenant_timeline_describe"),
+            )
+        })
         .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
             tenant_service_handler(
                 r,
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index d6fe173eb3..da0687895a 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -86,6 +86,23 @@ impl PageserverClient {
         )
     }
 
+    /* BEGIN_HADRON */
+    pub(crate) async fn tenant_timeline_describe(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Result<TimelineInfo> {
+        measured_request!(
+            "tenant_timeline_describe",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner
+                .tenant_timeline_describe(tenant_shard_id, timeline_id,)
+                .await
+        )
+    }
+    /* END_HADRON */
+
     pub(crate) async fn tenant_scan_remote_storage(
         &self,
         tenant_id: TenantId,
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 9c1b81d261..31d149c5ac 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -32,7 +32,7 @@ use pageserver_api::controller_api::{
     ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse,
     SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard,
     TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest,
-    TenantShardMigrateRequest, TenantShardMigrateResponse,
+    TenantShardMigrateRequest, TenantShardMigrateResponse, TenantTimelineDescribeResponse,
 };
 use pageserver_api::models::{
     self, DetachBehavior, LocationConfig, LocationConfigListResponse, LocationConfigMode, LsnLease,
@@ -5486,6 +5486,92 @@ impl Service {
         .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into()))
     }
 
+    /* BEGIN_HADRON */
+    pub(crate) async fn tenant_timeline_describe(
+        &self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<TenantTimelineDescribeResponse, ApiError> {
+        self.tenant_remote_mutation(tenant_id, |locations| async move {
+            if locations.0.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
+            };
+
+            let locations: Vec<(TenantShardId, Node)> = locations
+                .0
+                .iter()
+                .map(|t| (*t.0, t.1.latest.node.clone()))
+                .collect();
+            let mut futs = FuturesUnordered::new();
+
+            for (shard_id, node) in locations {
+                futs.push({
+                    async move {
+                        let result = node
+                            .with_client_retries(
+                                |client| async move {
+                                    client
+                                        .tenant_timeline_describe(&shard_id, &timeline_id)
+                                        .await
+                                },
+                                &self.http_client,
+                                &self.config.pageserver_jwt_token,
+                                3,
+                                3,
+                                Duration::from_secs(30),
+                                &self.cancel,
+                            )
+                            .await;
+                        (result, shard_id, node.get_id())
+                    }
+                });
+            }
+
+            let mut results: Vec<TimelineInfo> = Vec::new();
+            while let Some((result, tenant_shard_id, node_id)) = futs.next().await {
+                match result {
+                    Some(Ok(timeline_info)) => results.push(timeline_info),
+                    Some(Err(e)) => {
+                        tracing::warn!(
+                            "Failed to describe tenant {} timeline {} for pageserver {}: {e}",
+                            tenant_shard_id,
+                            timeline_id,
+                            node_id,
+                        );
+                        return Err(ApiError::ResourceUnavailable(format!("{e}").into()));
+                    }
+                    None => return Err(ApiError::Cancelled),
+                }
+            }
+            let mut image_consistent_lsn: Option<Lsn> = Some(Lsn::MAX);
+            for timeline_info in &results {
+                if let Some(tline_image_consistent_lsn) = timeline_info.image_consistent_lsn {
+                    image_consistent_lsn = Some(std::cmp::min(
+                        image_consistent_lsn.unwrap(),
+                        tline_image_consistent_lsn,
+                    ));
+                } else {
+                    tracing::warn!(
+                        "Timeline {} on shard {} does not have image consistent lsn",
+                        timeline_info.timeline_id,
+                        timeline_info.tenant_id
+                    );
+                    image_consistent_lsn = None;
+                    break;
+                }
+            }
+
+            Ok(TenantTimelineDescribeResponse {
+                shards: results,
+                image_consistent_lsn,
+            })
+        })
+        .await?
+    }
+    /* END_HADRON */
+
     /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not
     /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory
     /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 42924f9b83..a7b7f0e74d 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2342,6 +2342,20 @@ class NeonStorageController(MetricsGetter, LogUtils):
         response.raise_for_status()
         return response.json()
 
+    # HADRON
+    def tenant_timeline_describe(
+        self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ):
+        response = self.request(
+            "GET",
+            f"{self.api}/control/v1/tenant/{tenant_id}/timeline/{timeline_id}",
+            headers=self.headers(TokenScope.ADMIN),
+        )
+        response.raise_for_status()
+        return response.json()
+
     def nodes(self):
         """
         :return: list of {"id": ""}
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index e67161c6b7..ab02314288 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -960,9 +960,9 @@ def get_layer_map(env, tenant_shard_id, timeline_id, ps_id):
     return image_layer_count, delta_layer_count
 
 
-def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder):
+def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder):
     """
-    Tests that page server can force creating new images if image creation timeout is enabled
+    Tests that page server can force creating new images if image_layer_force_creation_period is enabled
     """
     # use large knobs to disable L0 compaction/image creation except for the force image creation
     tenant_conf = {
@@ -972,10 +972,10 @@ def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder):
         "checkpoint_distance": 10 * 1024,
         "checkpoint_timeout": "1s",
         "image_layer_force_creation_period": "1s",
-        # The lsn for forced image layer creations is calculated once every 10 minutes.
-        # Hence, drive compaction manually such that the test doesn't compute it at the
-        # wrong time.
-        "compaction_period": "0s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
     }
 
     # consider every tenant large to run the image layer generation check more eagerly
@@ -1018,4 +1018,69 @@ def test_image_creation_timeout(neon_env_builder: NeonEnvBuilder):
     )
 
 
+def test_image_consistent_lsn(neon_env_builder: NeonEnvBuilder):
+    """
+    Test the /v1/tenant/<tenant_id>/timeline/<timeline_id> endpoint and the computation of image_consistent_lsn
+    """
+    # use large knobs to disable L0 compaction/image creation except for the force image creation
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        "checkpoint_distance": 10 * 1024,
+        "checkpoint_timeout": "1s",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf,
+        initial_tenant_shard_count=4,
+        initial_tenant_shard_stripe_size=1,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)")
+    for v in range(10):
+        endpoint.safe_psql(
+            f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False
+        )
+
+    response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id)
+    shards = response["shards"]
+    for shard in shards:
+        assert shard["image_consistent_lsn"] is not None
+    image_consistent_lsn = response["image_consistent_lsn"]
+    assert image_consistent_lsn is not None
+
+    # do more writes and wait for image_consistent_lsn to advance
+    for v in range(100):
+        endpoint.safe_psql(
+            f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))", log_query=False
+        )
+
+    def check_image_consistent_lsn_advanced():
+        response = env.storage_controller.tenant_timeline_describe(tenant_id, timeline_id)
+        new_image_consistent_lsn = response["image_consistent_lsn"]
+        shards = response["shards"]
+        for shard in shards:
+            print(f"shard {shard['tenant_id']} image_consistent_lsn{shard['image_consistent_lsn']}")
+        assert new_image_consistent_lsn != image_consistent_lsn
+
+    wait_until(check_image_consistent_lsn_advanced)
+
+    endpoint.stop_and_destroy()
+
+    for ps in env.pageservers:
+        ps.allowed_errors.append(".*created delta file of size.*larger than double of target.*")
+
+
 # END_HADRON

From 154f6dc59cc91ebde58d1a0b4a8b43aa68d1c3a5 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Fri, 11 Jul 2025 14:25:25 +0100
Subject: [PATCH 086/163] pageserver: log only on final shard resolution
 failure (#12565)

This log is too noisy. Instead of warning on every retry, let's log only
on the final failure.
---
 pageserver/src/tenant/timeline/handle.rs          | 10 +++++-----
 test_runner/fixtures/pageserver/allowed_errors.py |  3 +--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 33c97287c0..7bca66190f 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -359,14 +359,14 @@ impl<T: Types> Cache<T> {
                 Err(e) => {
                     // Retry on tenant manager error to handle tenant split more gracefully
                     if attempt < GET_MAX_RETRIES {
-                        tracing::warn!(
-                            "Fail to resolve tenant shard in attempt {}: {:?}. Retrying...",
-                            attempt,
-                            e
-                        );
                         tokio::time::sleep(RETRY_BACKOFF).await;
                         continue;
                     } else {
+                        tracing::warn!(
+                            "Failed to resolve tenant shard after {} attempts: {:?}",
+                            GET_MAX_RETRIES,
+                            e
+                        );
                         return Err(e);
                     }
                 }
diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py
index 0e4dd571c0..59249f31ad 100755
--- a/test_runner/fixtures/pageserver/allowed_errors.py
+++ b/test_runner/fixtures/pageserver/allowed_errors.py
@@ -115,8 +115,7 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
     ".*Local data loss suspected.*",
     # Too many frozen layers error is normal during intensive benchmarks
     ".*too many frozen layers.*",
-    # Transient errors when resolving tenant shards by page service
-    ".*Fail to resolve tenant shard in attempt.*",
+    ".*Failed to resolve tenant shard after.*",
     # Expected warnings when pageserver has not refreshed GC info yet
     ".*pitr LSN/interval not found, skipping force image creation LSN calculation.*",
     ".*No broker updates received for a while.*",

From a8db7ebffb7e9b2a1bc9cc950a03a244d26d34d4 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 11 Jul 2025 17:17:44 +0300
Subject: [PATCH 087/163] Minor refactor of the SQL functions to get working
 set size estimate (#12550)

Split the functions into two: one internal function to calculate the
estimate, and another (two functions) to expose it as SQL functions.

This is in preparation of adding new communicator implementation. With
that, the SQL functions will dispatch the call to the old or new
implementation depending on which is being used.
---
 pgxn/neon/file_cache.c | 47 +++++++++++++++---------------------------
 pgxn/neon/file_cache.h |  3 ++-
 pgxn/neon/neon.c       | 30 +++++++++++++++++++++++++++
 3 files changed, 49 insertions(+), 31 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 8cfa09bc87..0e316abd1d 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -205,6 +205,8 @@ bool AmPrewarmWorker;
 
 #define LFC_ENABLED() (lfc_ctl->limit != 0)
 
+PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
+
 /*
  * Close LFC file if opened.
  * All backends should close their LFC files once LFC is disabled.
@@ -2135,40 +2137,25 @@ local_cache_pages(PG_FUNCTION_ARGS)
 		SRF_RETURN_DONE(funcctx);
 }
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
 
-Datum
-approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+/*
+ * Internal implementation of the approximate_working_set_size_seconds()
+ * function.
+ */
+int32
+lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 {
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		time_t duration = PG_ARGISNULL(0) ? (time_t)-1 : PG_GETARG_INT32(0);
-		LWLockAcquire(lfc_lock, LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
-}
+	int32		dc;
 
-PG_FUNCTION_INFO_V1(approximate_working_set_size);
+	if (lfc_size_limit == 0)
+		return -1;
 
-Datum
-approximate_working_set_size(PG_FUNCTION_ARGS)
-{
-	if (lfc_size_limit != 0)
-	{
-		int32 dc;
-		bool reset = PG_GETARG_BOOL(0);
-		LWLockAcquire(lfc_lock, reset ? LW_EXCLUSIVE : LW_SHARED);
-		dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, (time_t)-1);
-		if (reset)
-			memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
-		LWLockRelease(lfc_lock);
-		PG_RETURN_INT32(dc);
-	}
-	PG_RETURN_NULL();
+	LWLockAcquire(lfc_lock, LW_SHARED);
+	dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
+	if (reset)
+		memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
+	LWLockRelease(lfc_lock);
+	return dc;
 }
 
 PG_FUNCTION_INFO_V1(get_local_cache_state);
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
index d5ac55d5ba..14e5d4f753 100644
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -47,7 +47,8 @@ extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blk
 extern FileCacheState* lfc_get_state(size_t max_entries);
 extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
 
-PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
+extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
+
 
 static inline bool
 lfc_read(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 9e0ca16fed..7b749f1080 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -561,6 +561,8 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
+PG_FUNCTION_INFO_V1(approximate_working_set_size_seconds);
+PG_FUNCTION_INFO_V1(approximate_working_set_size);
 
 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -607,6 +609,34 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
 
+Datum
+approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
+{
+	time_t		duration;
+	int32		dc;
+
+	duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
+
+	dc = lfc_approximate_working_set_size_seconds(duration, false);
+	if (dc < 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_INT32(dc);
+}
+
+Datum
+approximate_working_set_size(PG_FUNCTION_ARGS)
+{
+	bool		reset = PG_GETARG_BOOL(0);
+	int32		dc;
+
+	dc = lfc_approximate_working_set_size_seconds(-1, reset);
+	if (dc < 0)
+		PG_RETURN_NULL();
+	else
+		PG_RETURN_INT32(dc);
+}
+
 #if PG_MAJORVERSION_NUM >= 16
 static void
 neon_shmem_startup_hook(void)

From f4245403b36925c3ad0ef39c344ca30b1701b74f Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 11 Jul 2025 16:13:36 +0100
Subject: [PATCH 088/163] [proxy] allow testing query cancellation locally
 (#12568)

## Problem

Canceelation requires redis, redis required control-plane.

## Summary of changes

Make redis for cancellation not require control plane.
Add instructions for setting up redis locally.
---
 proxy/README.md           | 10 +++++++++-
 proxy/src/binary/proxy.rs | 20 +++++++++++---------
 2 files changed, 20 insertions(+), 10 deletions(-)

diff --git a/proxy/README.md b/proxy/README.md
index e10ff3d710..ff48f9f323 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -123,6 +123,11 @@ docker exec -it proxy-postgres psql -U postgres -c "CREATE TABLE neon_control_pl
 docker exec -it proxy-postgres psql -U postgres -c "CREATE ROLE proxy WITH SUPERUSER LOGIN PASSWORD 'password';"
 ```
 
+If you want to test query cancellation, redis is also required:
+```sh
+docker run --detach --name proxy-redis --publish 6379:6379 redis:7.0
+```
+
 Let's create self-signed certificate by running:
 ```sh
 openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key -subj "/CN=*.local.neon.build"
@@ -130,7 +135,10 @@ openssl req -new -x509 -days 365 -nodes -text -out server.crt -keyout server.key
 
 Then we need to build proxy with 'testing' feature and run, e.g.:
 ```sh
-RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' -c server.crt -k server.key
+RUST_LOG=proxy LOGFMT=text cargo run -p proxy --bin proxy --features testing -- \
+  --auth-backend postgres --auth-endpoint 'postgresql://postgres:proxy-postgres@127.0.0.1:5432/postgres' \
+  --redis-auth-type="plain" --redis-plain="redis://127.0.0.1:6379" \
+  -c server.crt -k server.key
 ```
 
 Now from client you can start a new session:
diff --git a/proxy/src/binary/proxy.rs b/proxy/src/binary/proxy.rs
index 691709ce2a..16a7dc7b67 100644
--- a/proxy/src/binary/proxy.rs
+++ b/proxy/src/binary/proxy.rs
@@ -522,15 +522,7 @@ pub async fn run() -> anyhow::Result<()> {
         maintenance_tasks.spawn(usage_metrics::task_main(metrics_config));
     }
 
-    if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
-        && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
-        && let Some(client) = redis_client
-    {
-        // project info cache and invalidation of that cache.
-        let cache = api.caches.project_info.clone();
-        maintenance_tasks.spawn(notifications::task_main(client.clone(), cache.clone()));
-        maintenance_tasks.spawn(async move { cache.clone().gc_worker().await });
-
+    if let Some(client) = redis_client {
         // Try to connect to Redis 3 times with 1 + (0..0.1) second interval.
         // This prevents immediate exit and pod restart,
         // which can cause hammering of the redis in case of connection issues.
@@ -560,6 +552,16 @@ pub async fn run() -> anyhow::Result<()> {
                 }
             }
         }
+
+        #[allow(irrefutable_let_patterns)]
+        if let Either::Left(auth::Backend::ControlPlane(api, ())) = &auth_backend
+            && let crate::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api
+        {
+            // project info cache and invalidation of that cache.
+            let cache = api.caches.project_info.clone();
+            maintenance_tasks.spawn(notifications::task_main(client, cache.clone()));
+            maintenance_tasks.spawn(async move { cache.gc_worker().await });
+        }
     }
 
     let maintenance = loop {

From a0a7733b5aa657553a5b91bb0a3d4f6e3847e38b Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Fri, 11 Jul 2025 10:57:50 -0500
Subject: [PATCH 089/163] Use relative paths in submodule URL references
 (#12559)

This is a nifty trick from the hadron repo that seems to help with SSH
key dance.

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
---
 .gitmodules | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index d1330bf28c..e381fb079e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,16 +1,16 @@
 [submodule "vendor/postgres-v14"]
 	path = vendor/postgres-v14
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_14_STABLE_neon
 [submodule "vendor/postgres-v15"]
 	path = vendor/postgres-v15
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_15_STABLE_neon
 [submodule "vendor/postgres-v16"]
 	path = vendor/postgres-v16
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_16_STABLE_neon
 [submodule "vendor/postgres-v17"]
 	path = vendor/postgres-v17
-	url = https://github.com/neondatabase/postgres.git
+	url = ../postgres.git
 	branch = REL_17_STABLE_neon

From 3300207523008ab3dd922780c4d164bd4376a007 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 11 Jul 2025 19:05:22 +0300
Subject: [PATCH 090/163] Update working set size estimate without lock
 (#12570)

Update the WSS estimate before acquring the lock, so that we don't need
to hold the lock for so long. That seems safe to me, see added comment.

I was planning to do this with the new rust-based communicator
implementation anyway, but it might help a little with the current C
implementation too. And more importantly, having this as a separate PR
gives us a chance to review this aspect independently.
---
 pgxn/neon/file_cache.c | 77 +++++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 28 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 0e316abd1d..2c87f139af 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -162,8 +162,34 @@ typedef struct FileCacheControl
 	dlist_head	lru;			/* double linked list for LRU replacement
 								 * algorithm */
 	dlist_head  holes;          /* double linked list of punched holes */
-	HyperLogLogState wss_estimation; /* estimation of working set size */
+
 	ConditionVariable cv[N_COND_VARS]; /* turnstile of condition variables */
+
+	/*
+	 * Estimation of working set size.
+	 *
+	 * This is not guarded by the lock. No locking is needed because all the
+	 * writes to the "registers" are simple 64-bit stores, to update a
+	 * timestamp. We assume that:
+	 *
+	 * - 64-bit stores are atomic. We could enforce that by using
+	 *   pg_atomic_uint64 instead of TimestampTz as the datatype in hll.h, but
+	 *   for now we just rely on it implicitly.
+	 *
+	 * - Even if they're not, and there is a race between two stores, it
+	 *   doesn't matter much which one wins because they're both updating the
+	 *   register with the current timestamp. Or you have a race between
+	 *   resetting the register and updating it, in which case it also doesn't
+	 *   matter much which one wins.
+	 *
+	 * - If they're not atomic, you might get an occasional "torn write" if
+	 *   you're really unlucky, but we tolerate that too. It just means that
+	 *   the estimate will be a little off, until the register is updated
+	 *   again.
+	 */
+	HyperLogLogState wss_estimation;
+
+	/* Prewarmer state */
 	PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
 	size_t n_prewarm_workers;
 	size_t n_prewarm_entries;
@@ -1144,6 +1170,13 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 
 	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 
+	/* Update working set size estimate for the blocks */
+	for (int i = 0; i < nblocks; i++)
+	{
+		tag.blockNum = blkno + i;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
+
 	/*
 	 * For every chunk that has blocks we're interested in, we
 	 * 1. get the chunk header
@@ -1222,14 +1255,6 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		}
 
 		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_FIND, NULL);
-
-		/* Approximate working set for the blocks assumed in this entry */
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			tag.blockNum = blkno + i;
-			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-		}
-
 		if (entry == NULL)
 		{
 			/* Pages are not cached */
@@ -1506,9 +1531,15 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 		return false;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	tag.forkNum = forknum;
 
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+	/* Update working set size estimate for the blocks */
+	if (lfc_prewarm_update_ws_estimation)
+	{
+		tag.blockNum = blkno;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
 
 	tag.blockNum = blkno - chunk_offs;
 	hash = get_hash_value(lfc_hash, &tag);
@@ -1526,19 +1557,13 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 
 	if (lwlsn > lsn)
 	{
-		elog(DEBUG1, "Skip LFC write for %d because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
+		elog(DEBUG1, "Skip LFC write for %u because LwLSN=%X/%X is greater than not_nodified_since LSN %X/%X",
 			 blkno, LSN_FORMAT_ARGS(lwlsn), LSN_FORMAT_ARGS(lsn));
 		LWLockRelease(lfc_lock);
 		return false;
 	}
 
 	entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-	if (lfc_prewarm_update_ws_estimation)
-	{
-		tag.blockNum = blkno;
-		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-	}
 	if (found)
 	{
 		state = GET_STATE(entry, chunk_offs);
@@ -1651,9 +1676,15 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		return;
 
 	CopyNRelFileInfoToBufTag(tag, rinfo);
+	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
 	tag.forkNum = forkNum;
 
-	CriticalAssert(BufTagGetRelNumber(&tag) != InvalidRelFileNumber);
+	/* Update working set size estimate for the blocks */
+	for (int i = 0; i < nblocks; i++)
+	{
+		tag.blockNum = blkno + i;
+		addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
+	}
 
 	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
 
@@ -1694,14 +1725,6 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 		cv = &lfc_ctl->cv[hash % N_COND_VARS];
 
 		entry = hash_search_with_hash_value(lfc_hash, &tag, hash, HASH_ENTER, &found);
-
-		/* Approximate working set for the blocks assumed in this entry */
-		for (int i = 0; i < blocks_in_chunk; i++)
-		{
-			tag.blockNum = blkno + i;
-			addSHLL(&lfc_ctl->wss_estimation, hash_bytes((uint8_t const*)&tag, sizeof(tag)));
-		}
-
 		if (found)
 		{
 			/*
@@ -2150,11 +2173,9 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 	if (lfc_size_limit == 0)
 		return -1;
 
-	LWLockAcquire(lfc_lock, LW_SHARED);
 	dc = (int32) estimateSHLL(&lfc_ctl->wss_estimation, duration);
 	if (reset)
 		memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
-	LWLockRelease(lfc_lock);
 	return dc;
 }
 

From 379259bdd75edae91fad0d180fa513bff3e1f92b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 11 Jul 2025 19:07:14 +0200
Subject: [PATCH 091/163] storcon: don't error log on timeline delete if tenant
 migration is in progress (#12523)

Fixes [LKB-61](https://databricks.atlassian.net/browse/LKB-61):
`test_timeline_archival_chaos` being flaky with storcon error `Requested
tenant is missing`.

When a tenant migration is ongoing, and the attach request has been sent
to the new location, but the attach hasn't finished yet, it is possible
for the pageserver to return a 412 precondition failed HTTP error on
timeline deletion, because it is being sent to the new location already.
That one we would previously log via sth like:

```
ERROR request{method=DELETE path=/v1/tenant/1f544a11c90d1afd7af9b26e48985a4e/timeline/32818fb3ebf07cb7f06805429d7dee38 request_id=c493c04b-7f33-46d2-8a65-aac8a5516055}: Error processing HTTP request: InternalServerError(Error deleting timeline 32
818fb3ebf07cb7f06805429d7dee38 on 1f544a11c90d1afd7af9b26e48985a4e on node 2 (localhost): pageserver API: Precondition failed: Requested tenant is missing
```

This patch changes that and makes us return a more reasonable resource
unavailable error. Not sure how scalable this is with tenants with a
large number of shards, but that's a different discussion (we'd probably
need a limited amount of per-storcon retries).

example
[link](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-12398/15981821532/index.html#/testresult/e7785dfb1238d92f).
---
 storage_controller/src/service.rs | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 31d149c5ac..0907907edc 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5206,6 +5206,9 @@ impl Service {
                 match res {
                     Ok(ok) => Ok(ok),
                     Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT),
+                    Err(mgmt_api::Error::ApiError(StatusCode::PRECONDITION_FAILED, msg)) if msg.contains("Requested tenant is missing") => {
+                        Err(ApiError::ResourceUnavailable("Tenant migration in progress".into()))
+                    },
                     Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())),
                     Err(e) => {
                         Err(

From 63ca084696f4dd226bfea1abae66dcb3234d1051 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 11 Jul 2025 14:37:55 -0400
Subject: [PATCH 092/163] fix(pageserver): downgrade wal apply error during
 gc-compaction (#12518)

## Problem

close LKB-162

close https://github.com/neondatabase/cloud/issues/30665, related to
https://github.com/neondatabase/cloud/issues/29434

We see a lot of errors like:

```
2025-05-22T23:06:14.928959Z ERROR compaction_loop{tenant_id=? shard_id=0304}:run:gc_compact_timeline{timeline_id=?}: error applying 4 WAL records 35/DC0DF0B8..3B/E43188C0 (8119 bytes) to key 000000067F0000400500006027000000B9D0, from base image with LSN 0/0 to reconstruct page image at LSN 61/150B9B20 n_attempts=0: apply_wal_records

Caused by:
    0: read walredo stdout
    1: early eof
```

which is an acceptable form of error and we should downgrade it to
warning.

## Summary of changes

walredo error during gc-compaction is expected when the data below the
gc horizon does not contain a full key history. This is possible in some
rare cases of gc that is only able to remove data in the middle of the
history but not all earlier history when a full keyspace gets deleted.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/walredo.rs | 46 +++++++++++++++++++++++++++++----------
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index c6d3cafe9a..f053c9ed37 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -147,6 +147,16 @@ pub enum RedoAttemptType {
     GcCompaction,
 }
 
+impl std::fmt::Display for RedoAttemptType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            RedoAttemptType::ReadPage => write!(f, "read page"),
+            RedoAttemptType::LegacyCompaction => write!(f, "legacy compaction"),
+            RedoAttemptType::GcCompaction => write!(f, "gc compaction"),
+        }
+    }
+}
+
 ///
 /// Public interface of WAL redo manager
 ///
@@ -199,6 +209,7 @@ impl PostgresRedoManager {
                         self.conf.wal_redo_timeout,
                         pg_version,
                         max_retry_attempts,
+                        redo_attempt_type,
                     )
                     .await
                 };
@@ -221,6 +232,7 @@ impl PostgresRedoManager {
                 self.conf.wal_redo_timeout,
                 pg_version,
                 max_retry_attempts,
+                redo_attempt_type,
             )
             .await
         }
@@ -445,6 +457,7 @@ impl PostgresRedoManager {
         wal_redo_timeout: Duration,
         pg_version: PgMajorVersion,
         max_retry_attempts: u32,
+        redo_attempt_type: RedoAttemptType,
     ) -> Result<Bytes, Error> {
         *(self.last_redo_at.lock().unwrap()) = Some(Instant::now());
 
@@ -485,17 +498,28 @@ impl PostgresRedoManager {
                 );
 
                 if let Err(e) = result.as_ref() {
-                    error!(
-                        "error applying {} WAL records {}..{} ({} bytes) to key {key}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
-                        records.len(),
-                        records.first().map(|p| p.0).unwrap_or(Lsn(0)),
-                        records.last().map(|p| p.0).unwrap_or(Lsn(0)),
-                        nbytes,
-                        base_img_lsn,
-                        lsn,
-                        n_attempts,
-                        e,
-                    );
+                    macro_rules! message {
+                        ($level:tt) => {
+                            $level!(
+                                "error applying {} WAL records {}..{} ({} bytes) to key {} during {}, from base image with LSN {} to reconstruct page image at LSN {} n_attempts={}: {:?}",
+                                records.len(),
+                                records.first().map(|p| p.0).unwrap_or(Lsn(0)),
+                                records.last().map(|p| p.0).unwrap_or(Lsn(0)),
+                                nbytes,
+                                key,
+                                redo_attempt_type,
+                                base_img_lsn,
+                                lsn,
+                                n_attempts,
+                                e,
+                            )
+                        }
+                    }
+                    match redo_attempt_type {
+                        RedoAttemptType::ReadPage => message!(error),
+                        RedoAttemptType::LegacyCompaction => message!(error),
+                        RedoAttemptType::GcCompaction => message!(warn),
+                    }
                 }
 
                 result.map_err(Error::Other)

From 4566b12a22876f1110b77da9e7b75615c9963b38 Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Fri, 11 Jul 2025 20:56:39 +0200
Subject: [PATCH 093/163] NEON: Finish Zenith->Neon rename (#12566)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Even though we're now part of Databricks, let's at least make this part
consistent.

## Summary of changes

- PG14: https://github.com/neondatabase/postgres/pull/669
- PG15: https://github.com/neondatabase/postgres/pull/670
- PG16: https://github.com/neondatabase/postgres/pull/671
- PG17: https://github.com/neondatabase/postgres/pull/672

---------

Co-authored-by: Arpad Müller <arpad-m@users.noreply.github.com>
---
 compute_tools/src/compute.rs                  | 23 +++++++++++++
 control_plane/src/endpoint.rs                 |  3 +-
 docs/core_changes.md                          |  7 ++--
 pageserver/src/basebackup.rs                  | 33 +++++++++++--------
 pageserver/src/import_datadir.rs              | 14 ++++----
 pgxn/neon_test_utils/neontest.c               | 10 +++---
 pgxn/typedefs.list                            | 22 ++++++-------
 test_runner/fixtures/neon_fixtures.py         |  1 +
 .../regress/test_timeline_detach_ancestor.py  |  8 ++---
 vendor/postgres-v14                           |  2 +-
 vendor/postgres-v15                           |  2 +-
 vendor/postgres-v16                           |  2 +-
 vendor/postgres-v17                           |  2 +-
 vendor/revisions.json                         |  8 ++---
 14 files changed, 84 insertions(+), 53 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index c05cc229a2..2e0b7d7b2e 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1040,6 +1040,8 @@ impl ComputeNode {
             PageserverProtocol::Grpc => self.try_get_basebackup_grpc(spec, lsn)?,
         };
 
+        self.fix_zenith_signal_neon_signal()?;
+
         let mut state = self.state.lock().unwrap();
         state.metrics.pageserver_connect_micros =
             connected.duration_since(started).as_micros() as u64;
@@ -1049,6 +1051,27 @@ impl ComputeNode {
         Ok(())
     }
 
+    /// Move the Zenith signal file to Neon signal file location.
+    /// This makes Compute compatible with older PageServers that don't yet
+    /// know about the Zenith->Neon rename.
+    fn fix_zenith_signal_neon_signal(&self) -> Result<()> {
+        let datadir = Path::new(&self.params.pgdata);
+
+        let neonsig = datadir.join("neon.signal");
+
+        if neonsig.is_file() {
+            return Ok(());
+        }
+
+        let zenithsig = datadir.join("zenith.signal");
+
+        if zenithsig.is_file() {
+            fs::copy(zenithsig, neonsig)?;
+        }
+
+        Ok(())
+    }
+
     /// Fetches a basebackup via gRPC. The connstring must use grpc://. Returns the timestamp when
     /// the connection was established, and the (compressed) size of the basebackup.
     fn try_get_basebackup_grpc(&self, spec: &ParsedSpec, lsn: Lsn) -> Result<(Instant, usize)> {
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index ad2067e0f2..91a62b0ca4 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -32,7 +32,8 @@
 //!     config.json                 - passed to `compute_ctl`
 //!     pgdata/
 //!         postgresql.conf       - copy of postgresql.conf created by `compute_ctl`
-//!         zenith.signal
+//!         neon.signal
+//!         zenith.signal         - copy of neon.signal, for backward compatibility
 //!         <other PostgreSQL files>
 //! ```
 //!
diff --git a/docs/core_changes.md b/docs/core_changes.md
index 1388317728..abfd20af26 100644
--- a/docs/core_changes.md
+++ b/docs/core_changes.md
@@ -129,9 +129,10 @@ segment to bootstrap the WAL writing, but it doesn't contain the checkpoint reco
 changes in xlog.c, to allow starting the compute node without reading the last checkpoint record
 from WAL.
 
-This includes code to read the `zenith.signal` file, which tells the startup code the LSN to start
-at. When the `zenith.signal` file is present, the startup uses that LSN instead of the last
-checkpoint's LSN. The system is known to be consistent at that LSN, without any WAL redo.
+This includes code to read the `neon.signal` (also `zenith.signal`) file, which tells the startup 
+code the LSN to start at. When the `neon.signal` file is present, the startup uses that LSN
+instead of the last checkpoint's LSN. The system is known to be consistent at that LSN, without 
+any WAL redo.
 
 
 ### How to get rid of the patch
diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs
index 36dada1e89..1a44c80e2d 100644
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -114,7 +114,7 @@ where
     // Compute postgres doesn't have any previous WAL files, but the first
     // record that it's going to write needs to include the LSN of the
     // previous record (xl_prev). We include prev_record_lsn in the
-    // "zenith.signal" file, so that postgres can read it during startup.
+    // "neon.signal" file, so that postgres can read it during startup.
     //
     // We don't keep full history of record boundaries in the page server,
     // however, only the predecessor of the latest record on each
@@ -751,34 +751,39 @@ where
 
     //
     // Add generated pg_control file and bootstrap WAL segment.
-    // Also send zenith.signal file with extra bootstrap data.
+    // Also send neon.signal and zenith.signal file with extra bootstrap data.
     //
     async fn add_pgcontrol_file(
         &mut self,
         pg_control_bytes: Bytes,
         system_identifier: u64,
     ) -> Result<(), BasebackupError> {
-        // add zenith.signal file
-        let mut zenith_signal = String::new();
+        // add neon.signal file
+        let mut neon_signal = String::new();
         if self.prev_record_lsn == Lsn(0) {
             if self.timeline.is_ancestor_lsn(self.lsn) {
-                write!(zenith_signal, "PREV LSN: none")
+                write!(neon_signal, "PREV LSN: none")
                     .map_err(|e| BasebackupError::Server(e.into()))?;
             } else {
-                write!(zenith_signal, "PREV LSN: invalid")
+                write!(neon_signal, "PREV LSN: invalid")
                     .map_err(|e| BasebackupError::Server(e.into()))?;
             }
         } else {
-            write!(zenith_signal, "PREV LSN: {}", self.prev_record_lsn)
+            write!(neon_signal, "PREV LSN: {}", self.prev_record_lsn)
                 .map_err(|e| BasebackupError::Server(e.into()))?;
         }
-        self.ar
-            .append(
-                &new_tar_header("zenith.signal", zenith_signal.len() as u64)?,
-                zenith_signal.as_bytes(),
-            )
-            .await
-            .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,zenith.signal"))?;
+
+        // TODO: Remove zenith.signal once all historical computes have been replaced
+        // ... and thus support the neon.signal file.
+        for signalfilename in ["neon.signal", "zenith.signal"] {
+            self.ar
+                .append(
+                    &new_tar_header(signalfilename, neon_signal.len() as u64)?,
+                    neon_signal.as_bytes(),
+                )
+                .await
+                .map_err(|e| BasebackupError::Client(e, "add_pgcontrol_file,neon.signal"))?;
+        }
 
         //send pg_control
         let header = new_tar_header("global/pg_control", pg_control_bytes.len() as u64)?;
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 96fe0c1078..409cc2e3c5 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -610,13 +610,13 @@ async fn import_file(
         debug!("imported twophase file");
     } else if file_path.starts_with("pg_wal") {
         debug!("found wal file in base section. ignore it");
-    } else if file_path.starts_with("zenith.signal") {
+    } else if file_path.starts_with("zenith.signal") || file_path.starts_with("neon.signal") {
         // Parse zenith signal file to set correct previous LSN
         let bytes = read_all_bytes(reader).await?;
-        // zenith.signal format is "PREV LSN: prev_lsn"
+        // neon.signal format is "PREV LSN: prev_lsn"
         // TODO write serialization and deserialization in the same place.
-        let zenith_signal = std::str::from_utf8(&bytes)?.trim();
-        let prev_lsn = match zenith_signal {
+        let neon_signal = std::str::from_utf8(&bytes)?.trim();
+        let prev_lsn = match neon_signal {
             "PREV LSN: none" => Lsn(0),
             "PREV LSN: invalid" => Lsn(0),
             other => {
@@ -624,17 +624,17 @@ async fn import_file(
                 split[1]
                     .trim()
                     .parse::<Lsn>()
-                    .context("can't parse zenith.signal")?
+                    .context("can't parse neon.signal")?
             }
         };
 
-        // zenith.signal is not necessarily the last file, that we handle
+        // neon.signal is not necessarily the last file, that we handle
         // but it is ok to call `finish_write()`, because final `modification.commit()`
         // will update lsn once more to the final one.
         let writer = modification.tline.writer().await;
         writer.finish_write(prev_lsn);
 
-        debug!("imported zenith signal {}", prev_lsn);
+        debug!("imported neon signal {}", prev_lsn);
     } else if file_path.starts_with("pg_tblspc") {
         // TODO Backups exported from neon won't have pg_tblspc, but we will need
         // this to import arbitrary postgres databases.
diff --git a/pgxn/neon_test_utils/neontest.c b/pgxn/neon_test_utils/neontest.c
index d37412f674..5f880dfd23 100644
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -236,13 +236,13 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 	bool		save_neon_test_evict;
 
 	/*
-	 * Temporarily set the zenith_test_evict GUC, so that when we pin and
+	 * Temporarily set the neon_test_evict GUC, so that when we pin and
 	 * unpin a buffer, the buffer is evicted. We use that hack to evict all
 	 * buffers, as there is no explicit "evict this buffer" function in the
 	 * buffer manager.
 	 */
-	save_neon_test_evict = zenith_test_evict;
-	zenith_test_evict = true;
+	save_neon_test_evict = neon_test_evict;
+	neon_test_evict = true;
 	PG_TRY();
 	{
 		/* Scan through all the buffers */
@@ -273,7 +273,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 
 			/*
 			 * Pin the buffer, and release it again. Because we have
-			 * zenith_test_evict==true, this will evict the page from the
+			 * neon_test_evict==true, this will evict the page from the
 			 * buffer cache if no one else is holding a pin on it.
 			 */
 			if (isvalid)
@@ -286,7 +286,7 @@ clear_buffer_cache(PG_FUNCTION_ARGS)
 	PG_FINALLY();
 	{
 		/* restore the GUC */
-		zenith_test_evict = save_neon_test_evict;
+		neon_test_evict = save_neon_test_evict;
 	}
 	PG_END_TRY();
 
diff --git a/pgxn/typedefs.list b/pgxn/typedefs.list
index 760f384212..3ea8b3b091 100644
--- a/pgxn/typedefs.list
+++ b/pgxn/typedefs.list
@@ -2953,17 +2953,17 @@ XmlTableBuilderData
 YYLTYPE
 YYSTYPE
 YY_BUFFER_STATE
-ZenithErrorResponse
-ZenithExistsRequest
-ZenithExistsResponse
-ZenithGetPageRequest
-ZenithGetPageResponse
-ZenithMessage
-ZenithMessageTag
-ZenithNblocksRequest
-ZenithNblocksResponse
-ZenithRequest
-ZenithResponse
+NeonErrorResponse
+NeonExistsRequest
+NeonExistsResponse
+NeonGetPageRequest
+NeonGetPageResponse
+NeonMessage
+NeonMessageTag
+NeonNblocksRequest
+NeonNblocksResponse
+NeonRequest
+NeonResponse
 _SPI_connection
 _SPI_plan
 __AssignProcessToJobObject
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index a7b7f0e74d..b9fff05c6c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -5409,6 +5409,7 @@ SKIP_FILES = frozenset(
     (
         "pg_internal.init",
         "pg.log",
+        "neon.signal",
         "zenith.signal",
         "pg_hba.conf",
         "postgresql.conf",
diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py
index c0f163db32..45b7af719e 100644
--- a/test_runner/regress/test_timeline_detach_ancestor.py
+++ b/test_runner/regress/test_timeline_detach_ancestor.py
@@ -209,9 +209,9 @@ def test_ancestor_detach_branched_from(
     client.timeline_delete(env.initial_tenant, env.initial_timeline)
     wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline)
 
-    # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different
-    # as there is always "PREV_LSN: invalid" for "before"
-    skip_files = {"zenith.signal"}
+    # because we do the fullbackup from ancestor at the branch_lsn, the neon.signal and/or zenith.signal is always
+    # different as there is always "PREV_LSN: invalid" for "before"
+    skip_files = {"zenith.signal", "neon.signal"}
 
     assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, skip_files)
 
@@ -767,7 +767,7 @@ def test_compaction_induced_by_detaches_in_history(
         env.pageserver, env.initial_tenant, branch_timeline_id, branch_lsn, fullbackup_after
     )
 
-    # we don't need to skip any files, because zenith.signal will be identical
+    # we don't need to skip any files, because neon.signal will be identical
     assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set())
 
 
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 9085654ee8..8ce1f52303 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 9085654ee8022d5cc4ca719380a1dc53e5e3246f
+Subproject commit 8ce1f52303aec29e098309347b57c01a1962e221
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 8c3249f36c..afd46987f3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9
+Subproject commit afd46987f3da50c9146a8aa59380052df0862c06
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 7a4c0eacae..e08c8d5f15 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 7a4c0eacaeb9b97416542fa19103061c166460b1
+Subproject commit e08c8d5f1576ca0487d14d154510499c5f12adfb
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index db424d42d7..353c725b0c 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit db424d42d748f8ad91ac00e28db2c7f2efa42f7f
+Subproject commit 353c725b0c76cc82b15af21d8360d03391dc6814
diff --git a/vendor/revisions.json b/vendor/revisions.json
index b260698c86..992aa405b1 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "db424d42d748f8ad91ac00e28db2c7f2efa42f7f"
+    "353c725b0c76cc82b15af21d8360d03391dc6814"
   ],
   "v16": [
     "16.9",
-    "7a4c0eacaeb9b97416542fa19103061c166460b1"
+    "e08c8d5f1576ca0487d14d154510499c5f12adfb"
   ],
   "v15": [
     "15.13",
-    "8c3249f36c7df6ac0efb8ee9f1baf4aa1b83e5c9"
+    "afd46987f3da50c9146a8aa59380052df0862c06"
   ],
   "v14": [
     "14.18",
-    "9085654ee8022d5cc4ca719380a1dc53e5e3246f"
+    "8ce1f52303aec29e098309347b57c01a1962e221"
   ]
 }

From cb991fba421999e390c9debfc39fb39a636fe1e9 Mon Sep 17 00:00:00 2001
From: HaoyuHuang <haoyu.huang.68@gmail.com>
Date: Fri, 11 Jul 2025 12:27:55 -0700
Subject: [PATCH 094/163] A few more PS changes (#12552)

# TLDR
Problem-I is a bug fix. The rest are no-ops.

## Problem I
Page server checks image layer creation based on the elapsed time but
this check depends on the current logical size, which is only computed
on shard 0. Thus, for non-0 shards, the check will be ineffective and
image creation will never be done for idle tenants.

## Summary of changes I
This PR fixes the problem by simply removing the dependency on current
logical size.

## Summary of changes II
This PR adds a timeout when calling page server to split shard to make
sure SC does not wait for the API call forever. Currently the PR doesn't
adds any retry logic because it's not clear whether page server shard
split can be safely retried if the existing operation is still ongoing
or left the storage in a bad state. Thus it's better to abort the whole
operation and restart.

## Problem III
`test_remote_failures` requires PS to be compiled in the testing mode.
For PS in dev/staging, they are compiled without this mode.

## Summary of changes III
Remove the restriction and also increase the number of total failures
allowed.

## Summary of changes IV
remove test on PS getpage http route.

---------

Co-authored-by: Chen Luo <chen.luo@databricks.com>
Co-authored-by: Yecheng Yang <carlton.yang@databricks.com>
Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 control_plane/src/local_env.rs               |  4 +
 control_plane/src/storage_controller.rs      |  7 ++
 libs/remote_storage/src/simulate_failures.rs |  1 +
 libs/utils/src/env.rs                        |  3 +-
 pageserver/src/bin/pageserver.rs             |  5 --
 pageserver/src/http/routes.rs                |  2 +-
 pageserver/src/tenant/mgr.rs                 |  2 +
 pageserver/src/tenant/timeline.rs            | 48 ++++++-----
 storage_controller/src/main.rs               |  7 ++
 storage_controller/src/service.rs            | 27 ++++++-
 test_runner/regress/test_compaction.py       | 62 ++++++++++++++
 test_runner/regress/test_sharding.py         | 85 ++++++++++++++++++++
 12 files changed, 226 insertions(+), 27 deletions(-)

diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs
index d0611113e8..d34dd39f61 100644
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -217,6 +217,9 @@ pub struct NeonStorageControllerConf {
     pub posthog_config: Option<PostHogConfig>,
 
     pub kick_secondary_downloads: Option<bool>,
+
+    #[serde(with = "humantime_serde")]
+    pub shard_split_request_timeout: Option<Duration>,
 }
 
 impl NeonStorageControllerConf {
@@ -250,6 +253,7 @@ impl Default for NeonStorageControllerConf {
             timeline_safekeeper_count: None,
             posthog_config: None,
             kick_secondary_downloads: None,
+            shard_split_request_timeout: None,
         }
     }
 }
diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs
index dc6c82f504..f996f39967 100644
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -648,6 +648,13 @@ impl StorageController {
             args.push(format!("--timeline-safekeeper-count={sk_cnt}"));
         }
 
+        if let Some(duration) = self.config.shard_split_request_timeout {
+            args.push(format!(
+                "--shard-split-request-timeout={}",
+                humantime::Duration::from(duration)
+            ));
+        }
+
         let mut envs = vec![
             ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
             ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()),
diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs
index 30d116f57c..e895380192 100644
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -31,6 +31,7 @@ pub struct UnreliableWrapper {
     /* BEGIN_HADRON */
     // This the probability of failure for each operation, ranged from [0, 100].
     // The probability is default to 100, which means that all operations will fail.
+    // Storage will fail by probability up to attempts_to_fail times.
     attempt_failure_probability: u64,
     /* END_HADRON */
 }
diff --git a/libs/utils/src/env.rs b/libs/utils/src/env.rs
index cc1cbf8009..0b3b5e6c4f 100644
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -47,6 +47,7 @@ where
 
 /* BEGIN_HADRON */
 pub enum DeploymentMode {
+    Local,
     Dev,
     Staging,
     Prod,
@@ -64,7 +65,7 @@ pub fn get_deployment_mode() -> Option<DeploymentMode> {
             }
         },
         Err(_) => {
-            tracing::error!("DEPLOYMENT_MODE not set");
+            // tracing::error!("DEPLOYMENT_MODE not set");
             None
         }
     }
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 299fe7e159..dfb8b437c3 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -917,11 +917,6 @@ async fn create_remote_storage_client(
     // If `test_remote_failures` is non-zero, wrap the client with a
     // wrapper that simulates failures.
     if conf.test_remote_failures > 0 {
-        if !cfg!(feature = "testing") {
-            anyhow::bail!(
-                "test_remote_failures option is not available because pageserver was compiled without the 'testing' feature"
-            );
-        }
         info!(
             "Simulating remote failures for first {} attempts of each op",
             conf.test_remote_failures
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index d839bac557..0d40c5ecf7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -4183,7 +4183,7 @@ pub fn make_router(
         })
         .get(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
-            |r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
+            |r|  testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
         )
         .get(
             "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/touchpage",
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 15853d3614..52f67abde5 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -1678,6 +1678,8 @@ impl TenantManager {
         // Phase 6: Release the InProgress on the parent shard
         drop(parent_slot_guard);
 
+        utils::pausable_failpoint!("shard-split-post-finish-pause");
+
         Ok(child_shards)
     }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 718ea925b7..fe622713e9 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5604,10 +5604,11 @@ impl Timeline {
     /// Predicate function which indicates whether we should check if new image layers
     /// are required. Since checking if new image layers are required is expensive in
     /// terms of CPU, we only do it in the following cases:
-    /// 1. If the timeline has ingested sufficient WAL to justify the cost
+    /// 1. If the timeline has ingested sufficient WAL to justify the cost or ...
     /// 2. If enough time has passed since the last check:
     ///     1. For large tenants, we wish to perform the check more often since they
-    ///        suffer from the lack of image layers
+    ///        suffer from the lack of image layers. Note that we assume sharded tenants
+    ///        to be large since non-zero shards do not track the logical size.
     ///     2. For small tenants (that can mostly fit in RAM), we use a much longer interval
     fn should_check_if_image_layers_required(self: &Arc<Timeline>, lsn: Lsn) -> bool {
         let large_timeline_threshold = self.conf.image_layer_generation_large_timeline_threshold;
@@ -5621,30 +5622,39 @@ impl Timeline {
 
         let distance_based_decision = distance.0 >= min_distance;
 
-        let mut time_based_decision = false;
         let mut last_check_instant = self.last_image_layer_creation_check_instant.lock().unwrap();
-        if let CurrentLogicalSize::Exact(logical_size) = self.current_logical_size.current_size() {
-            let check_required_after =
-                if Some(Into::<u64>::into(&logical_size)) >= large_timeline_threshold {
-                    self.get_checkpoint_timeout()
-                } else {
-                    Duration::from_secs(3600 * 48)
-                };
-
-            time_based_decision = match *last_check_instant {
-                Some(last_check) => {
-                    let elapsed = last_check.elapsed();
-                    elapsed >= check_required_after
+        let check_required_after = (|| {
+            if self.shard_identity.is_unsharded() {
+                if let CurrentLogicalSize::Exact(logical_size) =
+                    self.current_logical_size.current_size()
+                {
+                    if Some(Into::<u64>::into(&logical_size)) < large_timeline_threshold {
+                        return Duration::from_secs(3600 * 48);
+                    }
                 }
-                None => true,
-            };
-        }
+            }
+
+            self.get_checkpoint_timeout()
+        })();
+
+        let time_based_decision = match *last_check_instant {
+            Some(last_check) => {
+                let elapsed = last_check.elapsed();
+                elapsed >= check_required_after
+            }
+            None => true,
+        };
 
         // Do the expensive delta layer counting only if this timeline has ingested sufficient
         // WAL since the last check or a checkpoint timeout interval has elapsed since the last
         // check.
         let decision = distance_based_decision || time_based_decision;
-
+        tracing::info!(
+            "Decided to check image layers: {}. Distance-based decision: {}, time-based decision: {}",
+            decision,
+            distance_based_decision,
+            time_based_decision
+        );
         if decision {
             self.last_image_layer_creation_check_at.store(lsn);
             *last_check_instant = Some(Instant::now());
diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs
index 2a851dc25b..5d21feeb10 100644
--- a/storage_controller/src/main.rs
+++ b/storage_controller/src/main.rs
@@ -222,6 +222,9 @@ struct Cli {
     /// Primarily useful for testing to reduce test execution time.
     #[arg(long, default_value = "false", action=ArgAction::Set)]
     kick_secondary_downloads: bool,
+
+    #[arg(long)]
+    shard_split_request_timeout: Option<humantime::Duration>,
 }
 
 enum StrictMode {
@@ -470,6 +473,10 @@ async fn async_main() -> anyhow::Result<()> {
         timeline_safekeeper_count: args.timeline_safekeeper_count,
         posthog_config: posthog_config.clone(),
         kick_secondary_downloads: args.kick_secondary_downloads,
+        shard_split_request_timeout: args
+            .shard_split_request_timeout
+            .map(humantime::Duration::into)
+            .unwrap_or(Duration::MAX),
     };
 
     // Validate that we can connect to the database
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 0907907edc..638cb410fa 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -60,6 +60,7 @@ use tokio::sync::mpsc::error::TrySendError;
 use tokio_util::sync::CancellationToken;
 use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 use utils::completion::Barrier;
+use utils::env;
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
@@ -483,6 +484,9 @@ pub struct Config {
 
     /// When set, actively checks and initiates heatmap downloads/uploads.
     pub kick_secondary_downloads: bool,
+
+    /// Timeout used for HTTP client of split requests. [`Duration::MAX`] if None.
+    pub shard_split_request_timeout: Duration,
 }
 
 impl From<DatabaseError> for ApiError {
@@ -6406,18 +6410,39 @@ impl Service {
         // TODO: issue split calls concurrently (this only matters once we're splitting
         // N>1 shards into M shards -- initially we're usually splitting 1 shard into N).
 
+        // HADRON: set a timeout for splitting individual shards on page servers.
+        // Currently we do not perform any retry because it's not clear if page server can handle
+        // partially split shards correctly.
+        let shard_split_timeout =
+            if let Some(env::DeploymentMode::Local) = env::get_deployment_mode() {
+                Duration::from_secs(30)
+            } else {
+                self.config.shard_split_request_timeout
+            };
+        let mut http_client_builder = reqwest::ClientBuilder::new()
+            .pool_max_idle_per_host(0)
+            .timeout(shard_split_timeout);
+
+        for ssl_ca_cert in &self.config.ssl_ca_certs {
+            http_client_builder = http_client_builder.add_root_certificate(ssl_ca_cert.clone());
+        }
+        let http_client = http_client_builder
+            .build()
+            .expect("Failed to construct HTTP client");
         for target in &targets {
             let ShardSplitTarget {
                 parent_id,
                 node,
                 child_ids,
             } = target;
+
             let client = PageserverClient::new(
                 node.get_id(),
-                self.http_client.clone(),
+                http_client.clone(),
                 node.base_url(),
                 self.config.pageserver_jwt_token.as_deref(),
             );
+
             let response = client
                 .tenant_shard_split(
                     *parent_id,
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index ab02314288..963a19d640 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -7,6 +7,7 @@ import time
 from enum import StrEnum
 
 import pytest
+from fixtures.common_types import TenantShardId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
@@ -960,6 +961,67 @@ def get_layer_map(env, tenant_shard_id, timeline_id, ps_id):
     return image_layer_count, delta_layer_count
 
 
+def test_image_layer_creation_time_threshold(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that image layers can be created when the time threshold is reached on non-0 shards.
+    """
+    tenant_conf = {
+        "compaction_threshold": "100",
+        "image_creation_threshold": "100",
+        "image_layer_creation_check_threshold": "1",
+        # disable distance based image layer creation check
+        "checkpoint_distance": 10 * 1024 * 1024 * 1024,
+        "checkpoint_timeout": "100ms",
+        "image_layer_force_creation_period": "1s",
+        "pitr_interval": "10s",
+        "gc_period": "1s",
+        "compaction_period": "1s",
+        "lsn_lease_length": "1s",
+    }
+
+    # consider every tenant large to run the image layer generation check more eagerly
+    neon_env_builder.pageserver_config_override = (
+        "image_layer_generation_large_timeline_threshold=0"
+    )
+
+    neon_env_builder.num_pageservers = 1
+    neon_env_builder.num_safekeepers = 1
+    env = neon_env_builder.init_start(
+        initial_tenant_conf=tenant_conf,
+        initial_tenant_shard_count=2,
+        initial_tenant_shard_stripe_size=1,
+    )
+
+    tenant_id = env.initial_tenant
+    timeline_id = env.initial_timeline
+    endpoint = env.endpoints.create_start("main")
+    endpoint.safe_psql("CREATE TABLE foo (id INTEGER, val text)")
+
+    for v in range(10):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    tenant_shard_id = TenantShardId(tenant_id, 1, 2)
+
+    # Generate some rows.
+    for v in range(20):
+        endpoint.safe_psql(f"INSERT INTO foo (id, val) VALUES ({v}, repeat('abcde{v:0>3}', 500))")
+
+    # restart page server so that logical size on non-0 shards is missing
+    env.pageserver.restart()
+
+    (old_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0)
+    log.info(f"old images: {old_images}, old deltas: {old_deltas}")
+
+    def check_image_creation():
+        (new_images, old_deltas) = get_layer_map(env, tenant_shard_id, timeline_id, 0)
+        log.info(f"images: {new_images}, deltas: {old_deltas}")
+        assert new_images > old_images
+
+    wait_until(check_image_creation)
+
+    endpoint.stop_and_destroy()
+
+
 def test_image_layer_force_creation_period(neon_env_builder: NeonEnvBuilder):
     """
     Tests that page server can force creating new images if image_layer_force_creation_period is enabled
diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py
index 8ff767eca4..5549105188 100644
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -1673,6 +1673,91 @@ def test_shard_resolve_during_split_abort(neon_env_builder: NeonEnvBuilder):
 # END_HADRON
 
 
+# HADRON
+@pytest.mark.skip(reason="The backpressure change has not been merged yet.")
+def test_back_pressure_per_shard(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests back pressure knobs are enforced on the per shard basis instead of at the tenant level.
+    """
+    init_shard_count = 4
+    neon_env_builder.num_pageservers = init_shard_count
+    stripe_size = 1
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count,
+        initial_tenant_shard_stripe_size=stripe_size,
+        initial_tenant_conf={
+            # disable auto-flush of shards and set max_replication_flush_lag as 15MB.
+            # The backpressure parameters must be enforced at the shard level to avoid stalling PG.
+            "checkpoint_distance": 1 * 1024 * 1024 * 1024,
+            "checkpoint_timeout": "1h",
+        },
+    )
+
+    endpoint = env.endpoints.create(
+        "main",
+        config_lines=[
+            "max_replication_write_lag = 0",
+            "max_replication_apply_lag = 0",
+            "max_replication_flush_lag = 15MB",
+            "neon.max_cluster_size = 10GB",
+        ],
+    )
+    endpoint.respec(skip_pg_catalog_updates=False)  # Needed for databricks_system to get created.
+    endpoint.start()
+
+    # generate 20MB of data
+    endpoint.safe_psql(
+        "CREATE TABLE usertable AS SELECT s AS KEY, repeat('a', 1000) as VALUE from generate_series(1, 20000) s;"
+    )
+    res = endpoint.safe_psql(
+        "SELECT neon.backpressure_throttling_time() as throttling_time", dbname="databricks_system"
+    )[0]
+    assert res[0] == 0, f"throttling_time should be 0, but got {res[0]}"
+
+    endpoint.stop()
+
+
+# HADRON
+def test_shard_split_page_server_timeout(neon_env_builder: NeonEnvBuilder):
+    """
+    Tests that shard split can correctly handle page server timeouts and abort the split
+    """
+    init_shard_count = 2
+    neon_env_builder.num_pageservers = 1
+    stripe_size = 1
+
+    if neon_env_builder.storage_controller_config is None:
+        neon_env_builder.storage_controller_config = {"shard_split_request_timeout": "5s"}
+    else:
+        neon_env_builder.storage_controller_config["shard_split_request_timeout"] = "5s"
+
+    env = neon_env_builder.init_start(
+        initial_tenant_shard_count=init_shard_count,
+        initial_tenant_shard_stripe_size=stripe_size,
+    )
+
+    env.storage_controller.allowed_errors.extend(
+        [
+            ".*Enqueuing background abort.*",
+            ".*failpoint.*",
+            ".*Failed to abort.*",
+            ".*Exclusive lock by ShardSplit was held.*",
+        ]
+    )
+    env.pageserver.allowed_errors.extend([".*request was dropped before completing.*"])
+
+    endpoint1 = env.endpoints.create_start(branch_name="main")
+
+    env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "pause"))
+
+    with pytest.raises(StorageControllerApiException):
+        env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=4)
+
+    env.pageserver.http_client().configure_failpoints(("shard-split-post-finish-pause", "off"))
+    endpoint1.stop_and_destroy()
+
+
 def test_sharding_backpressure(neon_env_builder: NeonEnvBuilder):
     """
     Check a scenario when one of the shards is much slower than others.

From 380d167b7ca2c8312fafffef30ae8cbdea7fd8a0 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Fri, 11 Jul 2025 21:35:42 +0200
Subject: [PATCH 095/163] proxy: For cancellation data replace HSET+EXPIRE/HGET
 with SET..EX/GET (#12553)

## Problem

To store cancellation data we send two commands to redis because the
redis server version doesn't support HSET with EX. Also, HSET is not
really needed.

## Summary of changes

* Replace the HSET + EXPIRE command pair with one SET .. EX command.
* Replace HGET with GET.
* Leave a workaround for old keys set with HSET.
* Replace some anyhow errors with specific errors to surface the
WRONGTYPE error from redis.
---
 Cargo.lock                                    |   1 +
 proxy/Cargo.toml                              |   3 +-
 proxy/src/batch.rs                            |  68 +++++++----
 proxy/src/cancellation.rs                     | 111 ++++++++++++------
 proxy/src/metrics.rs                          |   6 +-
 .../connection_with_credentials_provider.rs   |  24 +++-
 proxy/src/redis/elasticache.rs                |  20 +++-
 proxy/src/redis/kv_ops.rs                     |  16 ++-
 8 files changed, 175 insertions(+), 74 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 025f4e4116..4323254f0a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5289,6 +5289,7 @@ dependencies = [
  "async-trait",
  "atomic-take",
  "aws-config",
+ "aws-credential-types",
  "aws-sdk-iam",
  "aws-sigv4",
  "base64 0.22.1",
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index ce8610be24..0a406d1ca8 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -16,6 +16,7 @@ async-compression.workspace = true
 async-trait.workspace = true
 atomic-take.workspace = true
 aws-config.workspace = true
+aws-credential-types.workspace = true
 aws-sdk-iam.workspace = true
 aws-sigv4.workspace = true
 base64.workspace = true
@@ -127,4 +128,4 @@ rstest.workspace = true
 walkdir.workspace = true
 rand_distr = "0.4"
 tokio-postgres.workspace = true
-tracing-test = "0.2"
\ No newline at end of file
+tracing-test = "0.2"
diff --git a/proxy/src/batch.rs b/proxy/src/batch.rs
index 33e08797f2..cf866ab9a3 100644
--- a/proxy/src/batch.rs
+++ b/proxy/src/batch.rs
@@ -7,13 +7,17 @@ use std::pin::pin;
 use std::sync::Mutex;
 
 use scopeguard::ScopeGuard;
+use tokio::sync::oneshot;
 use tokio::sync::oneshot::error::TryRecvError;
 
 use crate::ext::LockExt;
 
+type ProcResult<P> = Result<<P as QueueProcessing>::Res, <P as QueueProcessing>::Err>;
+
 pub trait QueueProcessing: Send + 'static {
     type Req: Send + 'static;
     type Res: Send;
+    type Err: Send + Clone;
 
     /// Get the desired batch size.
     fn batch_size(&self, queue_size: usize) -> usize;
@@ -24,7 +28,18 @@ pub trait QueueProcessing: Send + 'static {
     /// If this apply can error, it's expected that errors be forwarded to each Self::Res.
     ///
     /// Batching does not need to happen atomically.
-    fn apply(&mut self, req: Vec<Self::Req>) -> impl Future<Output = Vec<Self::Res>> + Send;
+    fn apply(
+        &mut self,
+        req: Vec<Self::Req>,
+    ) -> impl Future<Output = Result<Vec<Self::Res>, Self::Err>> + Send;
+}
+
+#[derive(thiserror::Error)]
+pub enum BatchQueueError<E: Clone, C> {
+    #[error(transparent)]
+    Result(E),
+    #[error(transparent)]
+    Cancelled(C),
 }
 
 pub struct BatchQueue<P: QueueProcessing> {
@@ -34,7 +49,7 @@ pub struct BatchQueue<P: QueueProcessing> {
 
 struct BatchJob<P: QueueProcessing> {
     req: P::Req,
-    res: tokio::sync::oneshot::Sender<P::Res>,
+    res: tokio::sync::oneshot::Sender<Result<P::Res, P::Err>>,
 }
 
 impl<P: QueueProcessing> BatchQueue<P> {
@@ -55,11 +70,11 @@ impl<P: QueueProcessing> BatchQueue<P> {
         &self,
         req: P::Req,
         cancelled: impl Future<Output = R>,
-    ) -> Result<P::Res, R> {
+    ) -> Result<P::Res, BatchQueueError<P::Err, R>> {
         let (id, mut rx) = self.inner.lock_propagate_poison().register_job(req);
 
         let mut cancelled = pin!(cancelled);
-        let resp = loop {
+        let resp: Option<Result<P::Res, P::Err>> = loop {
             // try become the leader, or try wait for success.
             let mut processor = tokio::select! {
                 // try become leader.
@@ -72,7 +87,7 @@ impl<P: QueueProcessing> BatchQueue<P> {
                     if inner.queue.remove(&id).is_some() {
                         tracing::warn!("batched task cancelled before completion");
                     }
-                    return Err(cancel);
+                    return Err(BatchQueueError::Cancelled(cancel));
                 },
             };
 
@@ -96,18 +111,30 @@ impl<P: QueueProcessing> BatchQueue<P> {
             // good: we didn't get cancelled.
             ScopeGuard::into_inner(cancel_safety);
 
-            if values.len() != resps.len() {
-                tracing::error!(
-                    "batch: invalid response size, expected={}, got={}",
-                    resps.len(),
-                    values.len()
-                );
-            }
+            match values {
+                Ok(values) => {
+                    if values.len() != resps.len() {
+                        tracing::error!(
+                            "batch: invalid response size, expected={}, got={}",
+                            resps.len(),
+                            values.len()
+                        );
+                    }
 
-            // send response values.
-            for (tx, value) in std::iter::zip(resps, values) {
-                if tx.send(value).is_err() {
-                    // receiver hung up but that's fine.
+                    // send response values.
+                    for (tx, value) in std::iter::zip(resps, values) {
+                        if tx.send(Ok(value)).is_err() {
+                            // receiver hung up but that's fine.
+                        }
+                    }
+                }
+
+                Err(err) => {
+                    for tx in resps {
+                        if tx.send(Err(err.clone())).is_err() {
+                            // receiver hung up but that's fine.
+                        }
+                    }
                 }
             }
 
@@ -129,7 +156,8 @@ impl<P: QueueProcessing> BatchQueue<P> {
 
         tracing::debug!(id, "batch: job completed");
 
-        Ok(resp.expect("no response found. batch processer should not panic"))
+        resp.expect("no response found. batch processer should not panic")
+            .map_err(BatchQueueError::Result)
     }
 }
 
@@ -139,8 +167,8 @@ struct BatchQueueInner<P: QueueProcessing> {
 }
 
 impl<P: QueueProcessing> BatchQueueInner<P> {
-    fn register_job(&mut self, req: P::Req) -> (u64, tokio::sync::oneshot::Receiver<P::Res>) {
-        let (tx, rx) = tokio::sync::oneshot::channel();
+    fn register_job(&mut self, req: P::Req) -> (u64, oneshot::Receiver<ProcResult<P>>) {
+        let (tx, rx) = oneshot::channel();
 
         let id = self.version;
 
@@ -158,7 +186,7 @@ impl<P: QueueProcessing> BatchQueueInner<P> {
         (id, rx)
     }
 
-    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<tokio::sync::oneshot::Sender<P::Res>>) {
+    fn get_batch(&mut self, p: &P) -> (Vec<P::Req>, Vec<oneshot::Sender<ProcResult<P>>>) {
         let batch_size = p.batch_size(self.queue.len());
         let mut reqs = Vec::with_capacity(batch_size);
         let mut resps = Vec::with_capacity(batch_size);
diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 74413f1a7d..4ea4c4ea54 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -4,12 +4,11 @@ use std::pin::pin;
 use std::sync::{Arc, OnceLock};
 use std::time::Duration;
 
-use anyhow::anyhow;
 use futures::FutureExt;
 use ipnet::{IpNet, Ipv4Net, Ipv6Net};
 use postgres_client::RawCancelToken;
 use postgres_client::tls::MakeTlsConnect;
-use redis::{Cmd, FromRedisValue, Value};
+use redis::{Cmd, FromRedisValue, SetExpiry, SetOptions, Value};
 use serde::{Deserialize, Serialize};
 use thiserror::Error;
 use tokio::net::TcpStream;
@@ -18,7 +17,7 @@ use tracing::{debug, error, info};
 
 use crate::auth::AuthError;
 use crate::auth::backend::ComputeUserInfo;
-use crate::batch::{BatchQueue, QueueProcessing};
+use crate::batch::{BatchQueue, BatchQueueError, QueueProcessing};
 use crate::config::ComputeConfig;
 use crate::context::RequestContext;
 use crate::control_plane::ControlPlaneApi;
@@ -28,7 +27,7 @@ use crate::metrics::{CancelChannelSizeGuard, CancellationRequest, Metrics, Redis
 use crate::pqproto::CancelKeyData;
 use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::keys::KeyPrefix;
-use crate::redis::kv_ops::RedisKVClient;
+use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError};
 
 type IpSubnetKey = IpNet;
 
@@ -45,6 +44,17 @@ pub enum CancelKeyOp {
     GetCancelData {
         key: CancelKeyData,
     },
+    GetCancelDataOld {
+        key: CancelKeyData,
+    },
+}
+
+#[derive(thiserror::Error, Debug, Clone)]
+pub enum PipelineError {
+    #[error("could not send cmd to redis: {0}")]
+    RedisKVClient(Arc<RedisKVClientError>),
+    #[error("incorrect number of responses from redis")]
+    IncorrectNumberOfResponses,
 }
 
 pub struct Pipeline {
@@ -60,7 +70,7 @@ impl Pipeline {
         }
     }
 
-    async fn execute(self, client: &mut RedisKVClient) -> Vec<anyhow::Result<Value>> {
+    async fn execute(self, client: &mut RedisKVClient) -> Result<Vec<Value>, PipelineError> {
         let responses = self.replies;
         let batch_size = self.inner.len();
 
@@ -78,30 +88,20 @@ impl Pipeline {
                     batch_size,
                     responses, "successfully completed cancellation jobs",
                 );
-                values.into_iter().map(Ok).collect()
+                Ok(values.into_iter().collect())
             }
             Ok(value) => {
                 error!(batch_size, ?value, "unexpected redis return value");
-                std::iter::repeat_with(|| Err(anyhow!("incorrect response type from redis")))
-                    .take(responses)
-                    .collect()
-            }
-            Err(err) => {
-                std::iter::repeat_with(|| Err(anyhow!("could not send cmd to redis: {err}")))
-                    .take(responses)
-                    .collect()
+                Err(PipelineError::IncorrectNumberOfResponses)
             }
+            Err(err) => Err(PipelineError::RedisKVClient(Arc::new(err))),
         }
     }
 
-    fn add_command_with_reply(&mut self, cmd: Cmd) {
+    fn add_command(&mut self, cmd: Cmd) {
         self.inner.add_command(cmd);
         self.replies += 1;
     }
-
-    fn add_command_no_reply(&mut self, cmd: Cmd) {
-        self.inner.add_command(cmd).ignore();
-    }
 }
 
 impl CancelKeyOp {
@@ -109,12 +109,19 @@ impl CancelKeyOp {
         match self {
             CancelKeyOp::StoreCancelKey { key, value, expire } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hset(&key, "data", &**value));
-                pipe.add_command_no_reply(Cmd::expire(&key, expire.as_secs() as i64));
+                pipe.add_command(Cmd::set_options(
+                    &key,
+                    &**value,
+                    SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())),
+                ));
+            }
+            CancelKeyOp::GetCancelDataOld { key } => {
+                let key = KeyPrefix::Cancel(*key).build_redis_key();
+                pipe.add_command(Cmd::hget(key, "data"));
             }
             CancelKeyOp::GetCancelData { key } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
-                pipe.add_command_with_reply(Cmd::hget(key, "data"));
+                pipe.add_command(Cmd::get(key));
             }
         }
     }
@@ -127,13 +134,14 @@ pub struct CancellationProcessor {
 
 impl QueueProcessing for CancellationProcessor {
     type Req = (CancelChannelSizeGuard<'static>, CancelKeyOp);
-    type Res = anyhow::Result<redis::Value>;
+    type Res = redis::Value;
+    type Err = PipelineError;
 
     fn batch_size(&self, _queue_size: usize) -> usize {
         self.batch_size
     }
 
-    async fn apply(&mut self, batch: Vec<Self::Req>) -> Vec<Self::Res> {
+    async fn apply(&mut self, batch: Vec<Self::Req>) -> Result<Vec<Self::Res>, Self::Err> {
         if !self.client.credentials_refreshed() {
             // this will cause a timeout for cancellation operations
             tracing::debug!(
@@ -244,18 +252,18 @@ impl CancellationHandler {
         &self,
         key: CancelKeyData,
     ) -> Result<Option<CancelClosure>, CancelError> {
-        let guard = Metrics::get()
-            .proxy
-            .cancel_channel_size
-            .guard(RedisMsgKind::HGet);
-        let op = CancelKeyOp::GetCancelData { key };
+        const TIMEOUT: Duration = Duration::from_secs(5);
 
         let Some(tx) = self.tx.get() else {
             tracing::warn!("cancellation handler is not available");
             return Err(CancelError::InternalError);
         };
 
-        const TIMEOUT: Duration = Duration::from_secs(5);
+        let guard = Metrics::get()
+            .proxy
+            .cancel_channel_size
+            .guard(RedisMsgKind::Get);
+        let op = CancelKeyOp::GetCancelData { key };
         let result = timeout(
             TIMEOUT,
             tx.call((guard, op), std::future::pending::<Infallible>()),
@@ -264,10 +272,37 @@ impl CancellationHandler {
         .map_err(|_| {
             tracing::warn!("timed out waiting to receive GetCancelData response");
             CancelError::RateLimit
-        })?
-        // cannot be cancelled
-        .unwrap_or_else(|x| match x {})
-        .map_err(|e| {
+        })?;
+
+        // We may still have cancel keys set with HSET <key> "data".
+        // Check error type and retry with HGET.
+        // TODO: remove code after HSET is not used anymore.
+        let result = if let Err(err) = result.as_ref()
+            && let BatchQueueError::Result(err) = err
+            && let PipelineError::RedisKVClient(err) = err
+            && let RedisKVClientError::Redis(err) = &**err
+            && let Some(errcode) = err.code()
+            && errcode == "WRONGTYPE"
+        {
+            let guard = Metrics::get()
+                .proxy
+                .cancel_channel_size
+                .guard(RedisMsgKind::HGet);
+            let op = CancelKeyOp::GetCancelDataOld { key };
+            timeout(
+                TIMEOUT,
+                tx.call((guard, op), std::future::pending::<Infallible>()),
+            )
+            .await
+            .map_err(|_| {
+                tracing::warn!("timed out waiting to receive GetCancelData response");
+                CancelError::RateLimit
+            })?
+        } else {
+            result
+        };
+
+        let result = result.map_err(|e| {
             tracing::warn!("failed to receive GetCancelData response: {e}");
             CancelError::InternalError
         })?;
@@ -442,7 +477,7 @@ impl Session {
             let guard = Metrics::get()
                 .proxy
                 .cancel_channel_size
-                .guard(RedisMsgKind::HSet);
+                .guard(RedisMsgKind::Set);
             let op = CancelKeyOp::StoreCancelKey {
                 key: self.key,
                 value: closure_json.clone(),
@@ -456,7 +491,7 @@ impl Session {
             );
 
             match tx.call((guard, op), cancel.as_mut()).await {
-                Ok(Ok(_)) => {
+                Ok(_) => {
                     tracing::debug!(
                         src=%self.key,
                         dest=?cancel_closure.cancel_token,
@@ -467,10 +502,10 @@ impl Session {
                     tokio::time::sleep(CANCEL_KEY_REFRESH).await;
                 }
                 // retry immediately.
-                Ok(Err(error)) => {
+                Err(BatchQueueError::Result(error)) => {
                     tracing::warn!(?error, "error registering cancellation key");
                 }
-                Err(Err(_cancelled)) => break,
+                Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
             }
         }
 
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 9d1a3d4358..8439082498 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -374,11 +374,9 @@ pub enum Waiting {
 #[label(singleton = "kind")]
 #[allow(clippy::enum_variant_names)]
 pub enum RedisMsgKind {
-    HSet,
-    HSetMultiple,
+    Set,
+    Get,
     HGet,
-    HGetAll,
-    HDel,
 }
 
 #[derive(Default, Clone)]
diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs
index 35a3fe4334..b0bf332e44 100644
--- a/proxy/src/redis/connection_with_credentials_provider.rs
+++ b/proxy/src/redis/connection_with_credentials_provider.rs
@@ -4,11 +4,12 @@ use std::time::Duration;
 
 use futures::FutureExt;
 use redis::aio::{ConnectionLike, MultiplexedConnection};
-use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult};
+use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisError, RedisResult};
 use tokio::task::AbortHandle;
 use tracing::{error, info, warn};
 
 use super::elasticache::CredentialsProvider;
+use crate::redis::elasticache::CredentialsProviderError;
 
 enum Credentials {
     Static(ConnectionInfo),
@@ -26,6 +27,14 @@ impl Clone for Credentials {
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+pub enum ConnectionProviderError {
+    #[error(transparent)]
+    Redis(#[from] RedisError),
+    #[error(transparent)]
+    CredentialsProvider(#[from] CredentialsProviderError),
+}
+
 /// A wrapper around `redis::MultiplexedConnection` that automatically refreshes the token.
 /// Provides PubSub connection without credentials refresh.
 pub struct ConnectionWithCredentialsProvider {
@@ -86,15 +95,18 @@ impl ConnectionWithCredentialsProvider {
         }
     }
 
-    async fn ping(con: &mut MultiplexedConnection) -> RedisResult<()> {
-        redis::cmd("PING").query_async(con).await
+    async fn ping(con: &mut MultiplexedConnection) -> Result<(), ConnectionProviderError> {
+        redis::cmd("PING")
+            .query_async(con)
+            .await
+            .map_err(Into::into)
     }
 
     pub(crate) fn credentials_refreshed(&self) -> bool {
         self.credentials_refreshed.load(Ordering::Relaxed)
     }
 
-    pub(crate) async fn connect(&mut self) -> anyhow::Result<()> {
+    pub(crate) async fn connect(&mut self) -> Result<(), ConnectionProviderError> {
         let _guard = self.mutex.lock().await;
         if let Some(con) = self.con.as_mut() {
             match Self::ping(con).await {
@@ -141,7 +153,7 @@ impl ConnectionWithCredentialsProvider {
         Ok(())
     }
 
-    async fn get_connection_info(&self) -> anyhow::Result<ConnectionInfo> {
+    async fn get_connection_info(&self) -> Result<ConnectionInfo, ConnectionProviderError> {
         match &self.credentials {
             Credentials::Static(info) => Ok(info.clone()),
             Credentials::Dynamic(provider, addr) => {
@@ -160,7 +172,7 @@ impl ConnectionWithCredentialsProvider {
         }
     }
 
-    async fn get_client(&self) -> anyhow::Result<redis::Client> {
+    async fn get_client(&self) -> Result<redis::Client, ConnectionProviderError> {
         let client = redis::Client::open(self.get_connection_info().await?)?;
         self.credentials_refreshed.store(true, Ordering::Relaxed);
         Ok(client)
diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs
index 58e3c889a7..6f3b34d381 100644
--- a/proxy/src/redis/elasticache.rs
+++ b/proxy/src/redis/elasticache.rs
@@ -9,10 +9,12 @@ use aws_config::meta::region::RegionProviderChain;
 use aws_config::profile::ProfileFileCredentialsProvider;
 use aws_config::provider_config::ProviderConfig;
 use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider;
+use aws_credential_types::provider::error::CredentialsError;
 use aws_sdk_iam::config::ProvideCredentials;
 use aws_sigv4::http_request::{
-    self, SignableBody, SignableRequest, SignatureLocation, SigningSettings,
+    self, SignableBody, SignableRequest, SignatureLocation, SigningError, SigningSettings,
 };
+use aws_sigv4::sign::v4::signing_params::BuildError;
 use tracing::info;
 
 #[derive(Debug)]
@@ -40,6 +42,18 @@ impl AWSIRSAConfig {
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+pub enum CredentialsProviderError {
+    #[error(transparent)]
+    AwsCredentials(#[from] CredentialsError),
+    #[error(transparent)]
+    AwsSigv4Build(#[from] BuildError),
+    #[error(transparent)]
+    AwsSigv4Singing(#[from] SigningError),
+    #[error(transparent)]
+    Http(#[from] http::Error),
+}
+
 /// Credentials provider for AWS elasticache authentication.
 ///
 /// Official documentation:
@@ -92,7 +106,9 @@ impl CredentialsProvider {
         })
     }
 
-    pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> {
+    pub(crate) async fn provide_credentials(
+        &self,
+    ) -> Result<(String, String), CredentialsProviderError> {
         let aws_credentials = self
             .credentials_provider
             .provide_credentials()
diff --git a/proxy/src/redis/kv_ops.rs b/proxy/src/redis/kv_ops.rs
index cfdbc21839..d1e97b6b09 100644
--- a/proxy/src/redis/kv_ops.rs
+++ b/proxy/src/redis/kv_ops.rs
@@ -2,9 +2,18 @@ use std::time::Duration;
 
 use futures::FutureExt;
 use redis::aio::ConnectionLike;
-use redis::{Cmd, FromRedisValue, Pipeline, RedisResult};
+use redis::{Cmd, FromRedisValue, Pipeline, RedisError, RedisResult};
 
 use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider;
+use crate::redis::connection_with_credentials_provider::ConnectionProviderError;
+
+#[derive(thiserror::Error, Debug)]
+pub enum RedisKVClientError {
+    #[error(transparent)]
+    Redis(#[from] RedisError),
+    #[error(transparent)]
+    ConnectionProvider(#[from] ConnectionProviderError),
+}
 
 pub struct RedisKVClient {
     client: ConnectionWithCredentialsProvider,
@@ -32,12 +41,13 @@ impl RedisKVClient {
         Self { client }
     }
 
-    pub async fn try_connect(&mut self) -> anyhow::Result<()> {
+    pub async fn try_connect(&mut self) -> Result<(), RedisKVClientError> {
         self.client
             .connect()
             .boxed()
             .await
             .inspect_err(|e| tracing::error!("failed to connect to redis: {e}"))
+            .map_err(Into::into)
     }
 
     pub(crate) fn credentials_refreshed(&self) -> bool {
@@ -47,7 +57,7 @@ impl RedisKVClient {
     pub(crate) async fn query<T: FromRedisValue>(
         &mut self,
         q: &impl Queryable,
-    ) -> anyhow::Result<T> {
+    ) -> Result<T, RedisKVClientError> {
         let e = match q.query(&mut self.client).await {
             Ok(t) => return Ok(t),
             Err(e) => e,

From 9bba31bf6805e1c179b75fbb5bcab96c96980c75 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Fri, 11 Jul 2025 20:39:08 +0100
Subject: [PATCH 096/163] proxy: encode json as we parse rows (#11992)

Serialize query row responses directly into JSON. Some of this code
should be using the `json::value_as_object/list` macros, but I've
avoided it for now to minimize the size of the diff.
---
 Cargo.lock                            |   1 +
 proxy/Cargo.toml                      |   1 +
 proxy/src/serverless/json.rs          |  95 +++++++---------
 proxy/src/serverless/sql_over_http.rs | 154 +++++++++++++-------------
 4 files changed, 122 insertions(+), 129 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 4323254f0a..14b460005a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5329,6 +5329,7 @@ dependencies = [
  "itoa",
  "jose-jwa",
  "jose-jwk",
+ "json",
  "lasso",
  "measured",
  "metrics",
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 0a406d1ca8..82fe6818e3 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -49,6 +49,7 @@ indexmap = { workspace = true, features = ["serde"] }
 ipnet.workspace = true
 itertools.workspace = true
 itoa.workspace = true
+json = { path = "../libs/proxy/json" }
 lasso = { workspace = true, features = ["multi-threaded"] }
 measured = { workspace = true, features = ["lasso"] }
 metrics.workspace = true
diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs
index 2e67d07079..ef7c8a4d82 100644
--- a/proxy/src/serverless/json.rs
+++ b/proxy/src/serverless/json.rs
@@ -1,6 +1,7 @@
+use json::{ListSer, ObjectSer, ValueSer};
 use postgres_client::Row;
 use postgres_client::types::{Kind, Type};
-use serde_json::{Map, Value};
+use serde_json::Value;
 
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
@@ -74,44 +75,40 @@ pub(crate) enum JsonConversionError {
     UnbalancedString,
 }
 
-enum OutputMode {
-    Array(Vec<Value>),
-    Object(Map<String, Value>),
+enum OutputMode<'a> {
+    Array(ListSer<'a>),
+    Object(ObjectSer<'a>),
 }
 
-impl OutputMode {
-    fn key(&mut self, key: &str) -> &mut Value {
+impl OutputMode<'_> {
+    fn key(&mut self, key: &str) -> ValueSer<'_> {
         match self {
-            OutputMode::Array(values) => push_entry(values, Value::Null),
-            OutputMode::Object(map) => map.entry(key.to_string()).or_insert(Value::Null),
+            OutputMode::Array(values) => values.entry(),
+            OutputMode::Object(map) => map.key(key),
         }
     }
 
-    fn finish(self) -> Value {
+    fn finish(self) {
         match self {
-            OutputMode::Array(values) => Value::Array(values),
-            OutputMode::Object(map) => Value::Object(map),
+            OutputMode::Array(values) => values.finish(),
+            OutputMode::Object(map) => map.finish(),
         }
     }
 }
 
-fn push_entry<T>(arr: &mut Vec<T>, t: T) -> &mut T {
-    arr.push(t);
-    arr.last_mut().expect("a value was just inserted")
-}
-
 //
 // Convert postgres row with text-encoded values to JSON object
 //
 pub(crate) fn pg_text_row_to_json(
+    output: ValueSer,
     row: &Row,
     raw_output: bool,
     array_mode: bool,
-) -> Result<Value, JsonConversionError> {
+) -> Result<(), JsonConversionError> {
     let mut entries = if array_mode {
-        OutputMode::Array(Vec::with_capacity(row.columns().len()))
+        OutputMode::Array(output.list())
     } else {
-        OutputMode::Object(Map::with_capacity(row.columns().len()))
+        OutputMode::Object(output.object())
     };
 
     for (i, column) in row.columns().iter().enumerate() {
@@ -120,53 +117,48 @@ pub(crate) fn pg_text_row_to_json(
         let value = entries.key(column.name());
 
         match pg_value {
-            Some(v) if raw_output => *value = Value::String(v.to_string()),
+            Some(v) if raw_output => value.value(v),
             Some(v) => pg_text_to_json(value, v, column.type_())?,
-            None => *value = Value::Null,
+            None => value.value(json::Null),
         }
     }
 
-    Ok(entries.finish())
+    entries.finish();
+    Ok(())
 }
 
 //
 // Convert postgres text-encoded value to JSON value
 //
-fn pg_text_to_json(
-    output: &mut Value,
-    val: &str,
-    pg_type: &Type,
-) -> Result<(), JsonConversionError> {
+fn pg_text_to_json(output: ValueSer, val: &str, pg_type: &Type) -> Result<(), JsonConversionError> {
     if let Kind::Array(elem_type) = pg_type.kind() {
         // todo: we should fetch this from postgres.
         let delimiter = ',';
 
-        let mut array = vec![];
-        pg_array_parse(&mut array, val, elem_type, delimiter)?;
-        *output = Value::Array(array);
+        json::value_as_list!(|output| pg_array_parse(output, val, elem_type, delimiter)?);
         return Ok(());
     }
 
     match *pg_type {
-        Type::BOOL => *output = Value::Bool(val == "t"),
+        Type::BOOL => output.value(val == "t"),
         Type::INT2 | Type::INT4 => {
             let val = val.parse::<i32>()?;
-            *output = Value::Number(serde_json::Number::from(val));
+            output.value(val);
         }
         Type::FLOAT4 | Type::FLOAT8 => {
             let fval = val.parse::<f64>()?;
-            let num = serde_json::Number::from_f64(fval);
-            if let Some(num) = num {
-                *output = Value::Number(num);
+            if fval.is_finite() {
+                output.value(fval);
             } else {
                 // Pass Nan, Inf, -Inf as strings
                 // JS JSON.stringify() does converts them to null, but we
                 // want to preserve them, so we pass them as strings
-                *output = Value::String(val.to_string());
+                output.value(val);
             }
         }
-        Type::JSON | Type::JSONB => *output = serde_json::from_str(val)?,
-        _ => *output = Value::String(val.to_string()),
+        // we assume that the string value is valid json.
+        Type::JSON | Type::JSONB => output.write_raw_json(val.as_bytes()),
+        _ => output.value(val),
     }
 
     Ok(())
@@ -192,7 +184,7 @@ fn pg_text_to_json(
 /// gets its own level of curly braces, and delimiters must be written between adjacent
 /// curly-braced entities of the same level.
 fn pg_array_parse(
-    elements: &mut Vec<Value>,
+    elements: &mut ListSer,
     mut pg_array: &str,
     elem: &Type,
     delim: char,
@@ -221,7 +213,7 @@ fn pg_array_parse(
 /// reads a single array from the `pg_array` string and pushes each values to `elements`.
 /// returns the rest of the `pg_array` string that was not read.
 fn pg_array_parse_inner<'a>(
-    elements: &mut Vec<Value>,
+    elements: &mut ListSer,
     mut pg_array: &'a str,
     elem: &Type,
     delim: char,
@@ -234,7 +226,7 @@ fn pg_array_parse_inner<'a>(
     let mut q = String::new();
 
     loop {
-        let value = push_entry(elements, Value::Null);
+        let value = elements.entry();
         pg_array = pg_array_parse_item(value, &mut q, pg_array, elem, delim)?;
 
         // check for separator.
@@ -260,7 +252,7 @@ fn pg_array_parse_inner<'a>(
 ///
 /// `quoted` is a scratch allocation that has no defined output.
 fn pg_array_parse_item<'a>(
-    output: &mut Value,
+    output: ValueSer,
     quoted: &mut String,
     mut pg_array: &'a str,
     elem: &Type,
@@ -276,9 +268,8 @@ fn pg_array_parse_item<'a>(
 
     if pg_array.starts_with('{') {
         // nested array.
-        let mut nested = vec![];
-        pg_array = pg_array_parse_inner(&mut nested, pg_array, elem, delim)?;
-        *output = Value::Array(nested);
+        pg_array =
+            json::value_as_list!(|output| pg_array_parse_inner(output, pg_array, elem, delim))?;
         return Ok(pg_array);
     }
 
@@ -306,7 +297,7 @@ fn pg_array_parse_item<'a>(
     // we might have an item string:
     // check for null
     if item == "NULL" {
-        *output = Value::Null;
+        output.value(json::Null);
     } else {
         pg_text_to_json(output, item, elem)?;
     }
@@ -440,15 +431,15 @@ mod tests {
     }
 
     fn pg_text_to_json(val: &str, pg_type: &Type) -> Value {
-        let mut v = Value::Null;
-        super::pg_text_to_json(&mut v, val, pg_type).unwrap();
-        v
+        let output = json::value_to_string!(|v| super::pg_text_to_json(v, val, pg_type).unwrap());
+        serde_json::from_str(&output).unwrap()
     }
 
     fn pg_array_parse(pg_array: &str, pg_type: &Type) -> Value {
-        let mut array = vec![];
-        super::pg_array_parse(&mut array, pg_array, pg_type, ',').unwrap();
-        Value::Array(array)
+        let output = json::value_to_string!(|v| json::value_as_list!(|v| {
+            super::pg_array_parse(v, pg_array, pg_type, ',').unwrap();
+        }));
+        serde_json::from_str(&output).unwrap()
     }
 
     #[test]
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 7a718d0280..8a14f804b6 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -14,10 +14,7 @@ use hyper::http::{HeaderName, HeaderValue};
 use hyper::{Request, Response, StatusCode, header};
 use indexmap::IndexMap;
 use postgres_client::error::{DbError, ErrorPosition, SqlState};
-use postgres_client::{
-    GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, RowStream, Transaction,
-};
-use serde::Serialize;
+use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction};
 use serde_json::Value;
 use serde_json::value::RawValue;
 use tokio::time::{self, Instant};
@@ -687,32 +684,21 @@ impl QueryData {
         let (inner, mut discard) = client.inner();
         let cancel_token = inner.cancel_token();
 
-        match select(
+        let mut json_buf = vec![];
+
+        let batch_result = match select(
             pin!(query_to_json(
                 config,
                 &mut *inner,
                 self,
-                &mut 0,
+                json::ValueSer::new(&mut json_buf),
                 parsed_headers
             )),
             pin!(cancel.cancelled()),
         )
         .await
         {
-            // The query successfully completed.
-            Either::Left((Ok((status, results)), __not_yet_cancelled)) => {
-                discard.check_idle(status);
-
-                let json_output =
-                    serde_json::to_string(&results).expect("json serialization should not fail");
-                Ok(json_output)
-            }
-            // The query failed with an error
-            Either::Left((Err(e), __not_yet_cancelled)) => {
-                discard.discard();
-                Err(e)
-            }
-            // The query was cancelled.
+            Either::Left((res, __not_yet_cancelled)) => res,
             Either::Right((_cancelled, query)) => {
                 tracing::info!("cancelling query");
                 if let Err(err) = cancel_token.cancel_query(NoTls).await {
@@ -721,13 +707,7 @@ impl QueryData {
                 // wait for the query cancellation
                 match time::timeout(time::Duration::from_millis(100), query).await {
                     // query successed before it was cancelled.
-                    Ok(Ok((status, results))) => {
-                        discard.check_idle(status);
-
-                        let json_output = serde_json::to_string(&results)
-                            .expect("json serialization should not fail");
-                        Ok(json_output)
-                    }
+                    Ok(Ok(status)) => Ok(status),
                     // query failed or was cancelled.
                     Ok(Err(error)) => {
                         let db_error = match &error {
@@ -743,14 +723,29 @@ impl QueryData {
                             discard.discard();
                         }
 
-                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                        return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                     }
                     Err(_timeout) => {
                         discard.discard();
-                        Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres))
+                        return Err(SqlOverHttpError::Cancelled(SqlOverHttpCancel::Postgres));
                     }
                 }
             }
+        };
+
+        match batch_result {
+            // The query successfully completed.
+            Ok(status) => {
+                discard.check_idle(status);
+
+                let json_output = String::from_utf8(json_buf).expect("json should be valid utf8");
+                Ok(json_output)
+            }
+            // The query failed with an error
+            Err(e) => {
+                discard.discard();
+                Err(e)
+            }
         }
     }
 }
@@ -787,7 +782,7 @@ impl BatchQueryData {
             })
             .map_err(SqlOverHttpError::Postgres)?;
 
-        let json_output = match query_batch(
+        let json_output = match query_batch_to_json(
             config,
             cancel.child_token(),
             &mut transaction,
@@ -845,24 +840,21 @@ async fn query_batch(
     transaction: &mut Transaction<'_>,
     queries: BatchQueryData,
     parsed_headers: HttpHeaders,
-) -> Result<String, SqlOverHttpError> {
-    let mut results = Vec::with_capacity(queries.queries.len());
-    let mut current_size = 0;
+    results: &mut json::ListSer<'_>,
+) -> Result<(), SqlOverHttpError> {
     for stmt in queries.queries {
         let query = pin!(query_to_json(
             config,
             transaction,
             stmt,
-            &mut current_size,
+            results.entry(),
             parsed_headers,
         ));
         let cancelled = pin!(cancel.cancelled());
         let res = select(query, cancelled).await;
         match res {
             // TODO: maybe we should check that the transaction bit is set here
-            Either::Left((Ok((_, values)), _cancelled)) => {
-                results.push(values);
-            }
+            Either::Left((Ok(_), _cancelled)) => {}
             Either::Left((Err(e), _cancelled)) => {
                 return Err(e);
             }
@@ -872,8 +864,22 @@ async fn query_batch(
         }
     }
 
-    let results = json!({ "results": results });
-    let json_output = serde_json::to_string(&results).expect("json serialization should not fail");
+    Ok(())
+}
+
+async fn query_batch_to_json(
+    config: &'static HttpConfig,
+    cancel: CancellationToken,
+    tx: &mut Transaction<'_>,
+    queries: BatchQueryData,
+    headers: HttpHeaders,
+) -> Result<String, SqlOverHttpError> {
+    let json_output = json::value_to_string!(|obj| json::value_as_object!(|obj| {
+        let results = obj.key("results");
+        json::value_as_list!(|results| {
+            query_batch(config, cancel, tx, queries, headers, results).await?;
+        });
+    }));
 
     Ok(json_output)
 }
@@ -882,54 +888,54 @@ async fn query_to_json<T: GenericClient>(
     config: &'static HttpConfig,
     client: &mut T,
     data: QueryData,
-    current_size: &mut usize,
+    output: json::ValueSer<'_>,
     parsed_headers: HttpHeaders,
-) -> Result<(ReadyForQueryStatus, impl Serialize + use<T>), SqlOverHttpError> {
+) -> Result<ReadyForQueryStatus, SqlOverHttpError> {
     let query_start = Instant::now();
 
-    let query_params = data.params;
+    let mut output = json::ObjectSer::new(output);
     let mut row_stream = client
-        .query_raw_txt(&data.query, query_params)
+        .query_raw_txt(&data.query, data.params)
         .await
         .map_err(SqlOverHttpError::Postgres)?;
     let query_acknowledged = Instant::now();
 
-    let columns_len = row_stream.statement.columns().len();
-    let mut fields = Vec::with_capacity(columns_len);
-
+    let mut json_fields = output.key("fields").list();
     for c in row_stream.statement.columns() {
-        fields.push(json!({
-            "name": c.name().to_owned(),
-            "dataTypeID": c.type_().oid(),
-            "tableID": c.table_oid(),
-            "columnID": c.column_id(),
-            "dataTypeSize": c.type_size(),
-            "dataTypeModifier": c.type_modifier(),
-            "format": "text",
-        }));
+        let json_field = json_fields.entry();
+        json::value_as_object!(|json_field| {
+            json_field.entry("name", c.name());
+            json_field.entry("dataTypeID", c.type_().oid());
+            json_field.entry("tableID", c.table_oid());
+            json_field.entry("columnID", c.column_id());
+            json_field.entry("dataTypeSize", c.type_size());
+            json_field.entry("dataTypeModifier", c.type_modifier());
+            json_field.entry("format", "text");
+        });
     }
+    json_fields.finish();
 
-    let raw_output = parsed_headers.raw_output;
     let array_mode = data.array_mode.unwrap_or(parsed_headers.default_array_mode);
+    let raw_output = parsed_headers.raw_output;
 
     // Manually drain the stream into a vector to leave row_stream hanging
     // around to get a command tag. Also check that the response is not too
     // big.
-    let mut rows = Vec::new();
+    let mut rows = 0;
+    let mut json_rows = output.key("rows").list();
     while let Some(row) = row_stream.next().await {
         let row = row.map_err(SqlOverHttpError::Postgres)?;
-        *current_size += row.body_len();
 
         // we don't have a streaming response support yet so this is to prevent OOM
         // from a malicious query (eg a cross join)
-        if *current_size > config.max_response_size_bytes {
+        if json_rows.as_buffer().len() > config.max_response_size_bytes {
             return Err(SqlOverHttpError::ResponseTooLarge(
                 config.max_response_size_bytes,
             ));
         }
 
-        let row = pg_text_row_to_json(&row, raw_output, array_mode)?;
-        rows.push(row);
+        pg_text_row_to_json(json_rows.entry(), &row, raw_output, array_mode)?;
+        rows += 1;
 
         // assumption: parsing pg text and converting to json takes CPU time.
         // let's assume it is slightly expensive, so we should consume some cooperative budget.
@@ -937,16 +943,14 @@ async fn query_to_json<T: GenericClient>(
         // of rows and never hit the tokio mpsc for a long time (although unlikely).
         tokio::task::consume_budget().await;
     }
+    json_rows.finish();
 
     let query_resp_end = Instant::now();
-    let RowStream {
-        command_tag,
-        status: ready,
-        ..
-    } = row_stream;
+
+    let ready = row_stream.status;
 
     // grab the command tag and number of rows affected
-    let command_tag = command_tag.unwrap_or_default();
+    let command_tag = row_stream.command_tag.unwrap_or_default();
     let mut command_tag_split = command_tag.split(' ');
     let command_tag_name = command_tag_split.next().unwrap_or_default();
     let command_tag_count = if command_tag_name == "INSERT" {
@@ -959,7 +963,7 @@ async fn query_to_json<T: GenericClient>(
     .and_then(|s| s.parse::<i64>().ok());
 
     info!(
-        rows = rows.len(),
+        rows,
         ?ready,
         command_tag,
         acknowledgement = ?(query_acknowledged - query_start),
@@ -967,16 +971,12 @@ async fn query_to_json<T: GenericClient>(
         "finished executing query"
     );
 
-    // Resulting JSON format is based on the format of node-postgres result.
-    let results = json!({
-        "command": command_tag_name.to_string(),
-        "rowCount": command_tag_count,
-        "rows": rows,
-        "fields": fields,
-        "rowAsArray": array_mode,
-    });
+    output.entry("command", command_tag_name);
+    output.entry("rowCount", command_tag_count);
+    output.entry("rowAsArray", array_mode);
 
-    Ok((ready, results))
+    output.finish();
+    Ok(ready)
 }
 
 enum Client {

From ee7bb1a66746e4bbbf1213792b8169e00ce08334 Mon Sep 17 00:00:00 2001
From: Dmitrii Kovalkov <34828390+DimasKovas@users.noreply.github.com>
Date: Sat, 12 Jul 2025 08:57:04 +0400
Subject: [PATCH 097/163] storcon: validate new_sk_set before starting
 safekeeper migration (#12546)

## Problem
We don't validate the validity of the `new_sk_set` before starting the
migration. It is validated later, so the migration to an invalid
safekeeper set will fail anyway. But at this point we might already
commited an invalid `new_sk_set` to the database and there is no `abort`
command yet (I ran into this issue in neon_local and ruined the timeline
:)

- Part of https://github.com/neondatabase/neon/issues/11669

## Summary of changes
- Add safekeeper count and safekeeper duplication checks before starting
the migration
- Test that we validate the `new_sk_set` before starting the migration
- Add `force` option to the `TimelineSafekeeperMigrateRequest` to
disable not-mandatory checks
---
 .../src/service/safekeeper_service.rs         | 45 +++++++++++++++----
 .../regress/test_safekeeper_migration.py      | 38 ++++++++++++++++
 2 files changed, 75 insertions(+), 8 deletions(-)

diff --git a/storage_controller/src/service/safekeeper_service.rs b/storage_controller/src/service/safekeeper_service.rs
index 42ddf81e3e..7521d7bd86 100644
--- a/storage_controller/src/service/safekeeper_service.rs
+++ b/storage_controller/src/service/safekeeper_service.rs
@@ -39,13 +39,13 @@ use utils::lsn::Lsn;
 use super::Service;
 
 impl Service {
-    fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, ApiError> {
+    fn make_member_set(safekeepers: &[Safekeeper]) -> Result<MemberSet, anyhow::Error> {
         let members = safekeepers
             .iter()
             .map(|sk| sk.get_safekeeper_id())
             .collect::<Vec<_>>();
 
-        MemberSet::new(members).map_err(ApiError::InternalServerError)
+        MemberSet::new(members)
     }
 
     fn get_safekeepers(&self, ids: &[i64]) -> Result<Vec<Safekeeper>, ApiError> {
@@ -80,7 +80,7 @@ impl Service {
     ) -> Result<Vec<NodeId>, ApiError> {
         let safekeepers = self.get_safekeepers(&timeline_persistence.sk_set)?;
 
-        let mset = Self::make_member_set(&safekeepers)?;
+        let mset = Self::make_member_set(&safekeepers).map_err(ApiError::InternalServerError)?;
         let mconf = safekeeper_api::membership::Configuration::new(mset);
 
         let req = safekeeper_api::models::TimelineCreateRequest {
@@ -1105,6 +1105,26 @@ impl Service {
             }
         }
 
+        if new_sk_set.is_empty() {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "new safekeeper set is empty"
+            )));
+        }
+
+        if new_sk_set.len() < self.config.timeline_safekeeper_count {
+            return Err(ApiError::BadRequest(anyhow::anyhow!(
+                "new safekeeper set must have at least {} safekeepers",
+                self.config.timeline_safekeeper_count
+            )));
+        }
+
+        let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
+        let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
+        // Construct new member set in advance to validate it.
+        // E.g. validates that there is no duplicate safekeepers.
+        let new_sk_member_set =
+            Self::make_member_set(&new_safekeepers).map_err(ApiError::BadRequest)?;
+
         // TODO(diko): per-tenant lock is too wide. Consider introducing per-timeline locks.
         let _tenant_lock = trace_shared_lock(
             &self.tenant_op_locks,
@@ -1135,6 +1155,18 @@ impl Service {
             .map(|&id| NodeId(id as u64))
             .collect::<Vec<_>>();
 
+        // Validate that we are not migrating to a decomissioned safekeeper.
+        for sk in new_safekeepers.iter() {
+            if !cur_sk_set.contains(&sk.get_id())
+                && sk.scheduling_policy() == SkSchedulingPolicy::Decomissioned
+            {
+                return Err(ApiError::BadRequest(anyhow::anyhow!(
+                    "safekeeper {} is decomissioned",
+                    sk.get_id()
+                )));
+            }
+        }
+
         tracing::info!(
             ?cur_sk_set,
             ?new_sk_set,
@@ -1177,11 +1209,8 @@ impl Service {
         }
 
         let cur_safekeepers = self.get_safekeepers(&timeline.sk_set)?;
-        let cur_sk_member_set = Self::make_member_set(&cur_safekeepers)?;
-
-        let new_sk_set_i64 = new_sk_set.iter().map(|id| id.0 as i64).collect::<Vec<_>>();
-        let new_safekeepers = self.get_safekeepers(&new_sk_set_i64)?;
-        let new_sk_member_set = Self::make_member_set(&new_safekeepers)?;
+        let cur_sk_member_set =
+            Self::make_member_set(&cur_safekeepers).map_err(ApiError::InternalServerError)?;
 
         let joint_config = membership::Configuration {
             generation,
diff --git a/test_runner/regress/test_safekeeper_migration.py b/test_runner/regress/test_safekeeper_migration.py
index b82d7b9bb0..170c1a3650 100644
--- a/test_runner/regress/test_safekeeper_migration.py
+++ b/test_runner/regress/test_safekeeper_migration.py
@@ -2,6 +2,9 @@ from __future__ import annotations
 
 from typing import TYPE_CHECKING
 
+import pytest
+from fixtures.neon_fixtures import StorageControllerApiException
+
 if TYPE_CHECKING:
     from fixtures.neon_fixtures import NeonEnvBuilder
 
@@ -75,3 +78,38 @@ def test_safekeeper_migration_simple(neon_env_builder: NeonEnvBuilder):
     ep.start(safekeeper_generation=1, safekeepers=[3])
 
     assert ep.safe_psql("SELECT * FROM t") == [(i,) for i in range(1, 4)]
+
+
+def test_new_sk_set_validation(neon_env_builder: NeonEnvBuilder):
+    """
+    Test that safekeeper_migrate validates the new_sk_set before starting the migration.
+    """
+    neon_env_builder.num_safekeepers = 3
+    neon_env_builder.storage_controller_config = {
+        "timelines_onto_safekeepers": True,
+        "timeline_safekeeper_count": 2,
+    }
+    env = neon_env_builder.init_start()
+
+    def expect_fail(sk_set: list[int], match: str):
+        with pytest.raises(StorageControllerApiException, match=match):
+            env.storage_controller.migrate_safekeepers(
+                env.initial_tenant, env.initial_timeline, sk_set
+            )
+        # Check that we failed before commiting to the database.
+        mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+        assert mconf["generation"] == 1
+
+    expect_fail([], "safekeeper set is empty")
+    expect_fail([1], "must have at least 2 safekeepers")
+    expect_fail([1, 1], "duplicate safekeeper")
+    expect_fail([1, 100500], "does not exist")
+
+    mconf = env.storage_controller.timeline_locate(env.initial_tenant, env.initial_timeline)
+    sk_set = mconf["sk_set"]
+    assert len(sk_set) == 2
+
+    decom_sk = [sk.id for sk in env.safekeepers if sk.id not in sk_set][0]
+    env.storage_controller.safekeeper_scheduling_policy(decom_sk, "Decomissioned")
+
+    expect_fail([sk_set[0], decom_sk], "decomissioned")

From 0d5f4dd979f3de9c792efb0775210900ba297238 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Sat, 12 Jul 2025 12:41:11 +0200
Subject: [PATCH 098/163] pageserver/client_grpc: improve retry logic

---
 libs/utils/src/logging.rs            |  50 ++++++++-
 pageserver/client_grpc/src/client.rs | 160 +++++++++++++++++++--------
 pageserver/client_grpc/src/retry.rs  |  72 ++++++------
 3 files changed, 191 insertions(+), 91 deletions(-)

diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 5828a400a0..35b64bf7f2 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,4 +1,5 @@
 use std::future::Future;
+use std::pin::Pin;
 use std::str::FromStr;
 use std::time::Duration;
 
@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};
 use tokio::time::Instant;
-use tracing::info;
+use tracing::{info, warn};
 
 /// Logs a critical error, similarly to `tracing::error!`. This will:
 ///
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
 ///
 /// TODO: consider upgrading this to a warning, but currently it fires too often.
 #[inline]
-pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
-where
-    F: Future<Output = O>,
-{
+pub async fn log_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
     monitor_slow_future(
         threshold,
         threshold, // period = threshold
@@ -410,13 +412,49 @@ where
     .await
 }
 
+/// Logs a periodic warning if a future is slow to complete.
+#[inline]
+pub async fn warn_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
+    monitor_slow_future(
+        threshold,
+        threshold, // period = threshold
+        f,
+        |MonitorSlowFutureCallback {
+             ready,
+             is_slow,
+             elapsed_total,
+             elapsed_since_last_callback: _,
+         }| {
+            if !is_slow {
+                return;
+            }
+            if ready {
+                warn!(
+                    "slow {name} completed after {:.3}s",
+                    elapsed_total.as_secs_f64()
+                );
+            } else {
+                warn!(
+                    "slow {name} still running after {:.3}s",
+                    elapsed_total.as_secs_f64()
+                );
+            }
+        },
+    )
+    .await
+}
+
 /// Poll future `fut` to completion, invoking callback `cb` at the given `threshold` and every
 /// `period` afterwards, and also unconditionally when the future completes.
 #[inline]
 pub async fn monitor_slow_future<F, O>(
     threshold: Duration,
     period: Duration,
-    mut fut: std::pin::Pin<&mut F>,
+    mut fut: Pin<&mut F>,
     mut cb: impl FnMut(MonitorSlowFutureCallback),
 ) -> O
 where
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 7049fbdb96..7732585f7c 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -1,13 +1,16 @@
 use std::collections::HashMap;
 use std::num::NonZero;
+use std::pin::pin;
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 
 use anyhow::anyhow;
 use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
 use tonic::codec::CompressionEncoding;
-use tracing::instrument;
+use tracing::{debug, instrument};
+use utils::logging::warn_slow;
 
 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
 use crate::retry::Retry;
@@ -44,6 +47,23 @@ const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
 /// get a larger queue depth.
 const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
 
+/// The overall request call timeout, including retries and pool acquisition.
+/// TODO: should we retry forever? Should the caller decide?
+const CALL_TIMEOUT: Duration = Duration::from_secs(60);
+
+/// The per-request (retry attempt) timeout, including any lazy connection establishment.
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// The initial request retry backoff duration. The first retry does not back off.
+/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
+const BASE_BACKOFF: Duration = Duration::from_millis(5);
+
+/// The maximum request retry backoff duration.
+const MAX_BACKOFF: Duration = Duration::from_secs(5);
+
+/// Threshold and interval for warning about slow operation.
+const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
+
 /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
 /// basic `page_api::Client` gRPC client, and supports:
 ///
@@ -67,8 +87,6 @@ pub struct PageserverClient {
     compression: Option<CompressionEncoding>,
     /// The shards for this tenant.
     shards: ArcSwap<Shards>,
-    /// The retry configuration.
-    retry: Retry,
 }
 
 impl PageserverClient {
@@ -94,7 +112,6 @@ impl PageserverClient {
             auth_token,
             compression,
             shards: ArcSwap::new(Arc::new(shards)),
-            retry: Retry,
         })
     }
 
@@ -142,13 +159,15 @@ impl PageserverClient {
         &self,
         req: page_api::CheckRelExistsRequest,
     ) -> tonic::Result<page_api::CheckRelExistsResponse> {
-        self.retry
-            .with(async |_| {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.check_rel_exists(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Returns the total size of a database, as # of bytes.
@@ -157,13 +176,15 @@ impl PageserverClient {
         &self,
         req: page_api::GetDbSizeRequest,
     ) -> tonic::Result<page_api::GetDbSizeResponse> {
-        self.retry
-            .with(async |_| {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.get_db_size(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
@@ -193,6 +214,8 @@ impl PageserverClient {
             return Err(tonic::Status::invalid_argument("request attempt must be 0"));
         }
 
+        debug!("sending request: {req:?}");
+
         // The shards may change while we're fetching pages. We execute the request using a stable
         // view of the shards (especially important for requests that span shards), but retry the
         // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
@@ -201,13 +224,16 @@ impl PageserverClient {
         //
         // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
         // once we figure out how to handle these.
-        self.retry
-            .with(async |attempt| {
-                let mut req = req.clone();
-                req.request_id.attempt = attempt as u32;
-                Self::get_page_with_shards(req, &self.shards.load_full()).await
-            })
-            .await
+        let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
+            let mut req = req.clone();
+            req.request_id.attempt = attempt as u32;
+            let shards = self.shards.load_full();
+            Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
+        })
+        .await?;
+
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
@@ -290,13 +316,15 @@ impl PageserverClient {
         &self,
         req: page_api::GetRelSizeRequest,
     ) -> tonic::Result<page_api::GetRelSizeResponse> {
-        self.retry
-            .with(async |_| {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.get_rel_size(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches an SLRU segment.
@@ -305,13 +333,45 @@ impl PageserverClient {
         &self,
         req: page_api::GetSlruSegmentRequest,
     ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
-        self.retry
-            .with(async |_| {
-                // SLRU segments are only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.get_slru_segment(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // SLRU segments are only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
+    /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
+    async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
+    where
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
+        O: Future<Output = tonic::Result<T>>,
+    {
+        Retry {
+            timeout: Some(timeout),
+            base_backoff: BASE_BACKOFF,
+            max_backoff: MAX_BACKOFF,
+        }
+        .with(f)
+        .await
+    }
+
+    /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
+    async fn with_timeout<T>(
+        timeout: Duration,
+        f: impl Future<Output = tonic::Result<T>>,
+    ) -> tonic::Result<T> {
+        let started = Instant::now();
+        tokio::time::timeout(timeout, f).await.map_err(|_| {
+            tonic::Status::deadline_exceeded(format!(
+                "request timed out after {:.3}s",
+                started.elapsed().as_secs_f64()
+            ))
+        })?
     }
 }
 
@@ -525,19 +585,25 @@ impl Shard {
     }
 
     /// Returns a pooled client for this shard.
+    #[instrument(skip_all)]
     async fn client(&self) -> tonic::Result<ClientGuard> {
-        self.client_pool
-            .get()
-            .await
-            .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
+        warn_slow(
+            "client pool acquisition",
+            SLOW_THRESHOLD,
+            pin!(self.client_pool.get()),
+        )
+        .await
+        .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
     }
 
     /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
     /// pool (e.g. for prefetches).
+    #[instrument(skip_all, fields(bulk))]
     async fn stream(&self, bulk: bool) -> StreamGuard {
-        match bulk {
-            false => self.stream_pool.get().await,
-            true => self.bulk_stream_pool.get().await,
-        }
+        let pool = match bulk {
+            false => &self.stream_pool,
+            true => &self.bulk_stream_pool,
+        };
+        warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
     }
 }
diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs
index a1e0b8636f..8a138711e8 100644
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -1,5 +1,6 @@
 use std::time::Duration;
 
+use futures::future::pending;
 use tokio::time::Instant;
 use tracing::{error, info, warn};
 
@@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration;
 /// A retry handler for Pageserver gRPC requests.
 ///
 /// This is used instead of backoff::retry for better control and observability.
-pub struct Retry;
+pub struct Retry {
+    /// Timeout across all retry attempts. If None, retries forever.
+    pub timeout: Option<Duration>,
+    /// The initial backoff duration. The first retry does not use a backoff.
+    pub base_backoff: Duration,
+    /// The maximum backoff duration.
+    pub max_backoff: Duration,
+}
 
 impl Retry {
-    /// The per-request timeout.
-    // TODO: tune these, and/or make them configurable. Should we retry forever?
-    const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
-    /// The total timeout across all attempts
-    const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
-    /// The initial backoff duration.
-    const BASE_BACKOFF: Duration = Duration::from_millis(10);
-    /// The maximum backoff duration.
-    const MAX_BACKOFF: Duration = Duration::from_secs(10);
-    /// If true, log successful requests. For debugging.
-    const LOG_SUCCESS: bool = false;
-
-    /// Runs the given async closure with timeouts and retries (exponential backoff), passing the
-    /// attempt number starting at 0. Logs errors, using the current tracing span for context.
+    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
+    /// using the current tracing span for context.
     ///
-    /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
-    /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
+    /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
     pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
     where
-        F: FnMut(usize) -> O, // takes attempt number, starting at 0
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
         O: Future<Output = tonic::Result<T>>,
     {
         let started = Instant::now();
-        let deadline = started + Self::TOTAL_TIMEOUT;
+        let deadline = self.timeout.map(|timeout| started + timeout);
         let mut last_error = None;
         let mut retries = 0;
         loop {
-            // Set up a future to wait for the backoff (if any) and run the request with a timeout.
+            // Set up a future to wait for the backoff, if any, and run the closure.
             let backoff_and_try = async {
                 // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
                 // https://github.com/tokio-rs/tokio/issues/6866
-                if let Some(backoff) = Self::backoff_duration(retries) {
+                if let Some(backoff) = self.backoff_duration(retries) {
                     tokio::time::sleep(backoff).await;
                 }
 
-                let request_started = Instant::now();
-                tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
-                    .await
-                    .map_err(|_| {
-                        tonic::Status::deadline_exceeded(format!(
-                            "request timed out after {:.3}s",
-                            request_started.elapsed().as_secs_f64()
-                        ))
-                    })?
+                f(retries).await
             };
 
-            // Wait for the backoff and request, or bail out if the total timeout is exceeded.
+            // Set up a future for the timeout, if any.
+            let timeout = async {
+                match deadline {
+                    Some(deadline) => tokio::time::sleep_until(deadline).await,
+                    None => pending().await,
+                }
+            };
+
+            // Wait for the backoff and request, or bail out if the timeout is exceeded.
             let result = tokio::select! {
                 result = backoff_and_try => result,
 
-                _ = tokio::time::sleep_until(deadline) => {
+                _ = timeout => {
                     let last_error = last_error.unwrap_or_else(|| {
                         tonic::Status::deadline_exceeded(format!(
                             "request timed out after {:.3}s",
@@ -79,7 +74,7 @@ impl Retry {
             match result {
                 // Success, return the result.
                 Ok(result) => {
-                    if retries > 0 || Self::LOG_SUCCESS {
+                    if retries > 0 {
                         info!(
                             "request succeeded after {retries} retries in {:.3}s",
                             started.elapsed().as_secs_f64(),
@@ -112,12 +107,13 @@ impl Retry {
         }
     }
 
-    /// Returns the backoff duration for the given retry attempt, or None for no backoff.
-    fn backoff_duration(retry: usize) -> Option<Duration> {
+    /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
+    /// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
+    fn backoff_duration(&self, retries: usize) -> Option<Duration> {
         let backoff = exponential_backoff_duration(
-            retry as u32,
-            Self::BASE_BACKOFF.as_secs_f64(),
-            Self::MAX_BACKOFF.as_secs_f64(),
+            (retries as u32).saturating_sub(1), // first retry does not back off
+            self.base_backoff.as_secs_f64(),
+            self.max_backoff.as_secs_f64(),
         );
         (!backoff.is_zero()).then_some(backoff)
     }

From ddeb3f3ed3f1294fe785e2de8472a1abe22cb1e9 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Sun, 13 Jul 2025 12:18:12 +0200
Subject: [PATCH 099/163] pageserver/client_grpc: don't pipeline GetPage
 requests

---
 Cargo.lock                           |   1 +
 Cargo.toml                           |   2 +-
 pageserver/client_grpc/src/client.rs |  19 +-
 pageserver/client_grpc/src/pool.rs   | 405 +++++++++++----------------
 pageserver/page_api/src/model.rs     |   7 +-
 5 files changed, 169 insertions(+), 265 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 906be972be..db2b19dc54 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7778,6 +7778,7 @@ dependencies = [
  "futures-core",
  "pin-project-lite",
  "tokio",
+ "tokio-util",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index dd414257a6..b622892392 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -204,7 +204,7 @@ tokio = { version = "1.43.1", features = ["macros"] }
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
-tokio-stream = "0.1"
+tokio-stream = { version = "0.1", features = ["sync"] }
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 7732585f7c..4b606d6939 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -32,21 +32,13 @@ const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
 /// Max number of concurrent unary request clients per shard.
 const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
-/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
+/// Max number of concurrent GetPage streams per shard.
 const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of pipelined requests per stream.
-const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
-
 /// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
-/// are more throughput-oriented, we have a smaller limit but higher queue depth.
+/// are more throughput-oriented, we have a smaller limit.
 const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
 
-/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
-/// get a larger queue depth.
-const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
-
 /// The overall request call timeout, including retries and pool acquisition.
 /// TODO: should we retry forever? Should the caller decide?
 const CALL_TIMEOUT: Duration = Duration::from_secs(60);
@@ -272,7 +264,7 @@ impl PageserverClient {
         req: page_api::GetPageRequest,
         shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let stream = shard.stream(req.request_class.is_bulk()).await;
+        let mut stream = shard.stream(req.request_class.is_bulk()).await?;
         let resp = stream.send(req.clone()).await?;
 
         // Convert per-request errors into a tonic::Status.
@@ -557,7 +549,6 @@ impl Shard {
                 None, // unbounded, limited by stream pool
             ),
             Some(MAX_STREAMS),
-            MAX_STREAM_QUEUE_DEPTH,
         );
 
         // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
@@ -573,7 +564,6 @@ impl Shard {
                 None, // unbounded, limited by stream pool
             ),
             Some(MAX_BULK_STREAMS),
-            MAX_BULK_STREAM_QUEUE_DEPTH,
         );
 
         Ok(Self {
@@ -593,13 +583,12 @@ impl Shard {
             pin!(self.client_pool.get()),
         )
         .await
-        .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
     }
 
     /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
     /// pool (e.g. for prefetches).
     #[instrument(skip_all, fields(bulk))]
-    async fn stream(&self, bulk: bool) -> StreamGuard {
+    async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
         let pool = match bulk {
             false => &self.stream_pool,
             true => &self.bulk_stream_pool,
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 906872e091..cbc4dff4ad 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -9,19 +9,34 @@
 //!
 //! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
 //!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
-//!   per-channel client limit. Channels may be closed when they are no longer used by any clients.
+//!   per-channel client limit.
 //!
 //! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
 //!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
-//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
-//!   from the pool after some time, to free up the channel.
+//!   single caller at a time, and is returned to the pool when dropped.
 //!
 //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
-//!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
-//!   returns a guard that can be used to send a single request, to properly enforce queue depth and
-//!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
-//!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
-//!   queue depth). Idle streams may be removed from the pool after a while to free up the client.
+//!   ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
+//!   time, and is returned to the pool when dropped. The stream only supports sending a single,
+//!   synchronous request at a time, and does not support pipelining multiple requests from
+//!   different callers onto the same stream -- instead, we scale out concurrent streams to improve
+//!   throughput. There are many reasons for this design choice:
+//!
+//!     * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
+//!       a single server task, which may block e.g. on layer downloads, LSN waits, etc.
+//!
+//!     * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
+//!       (e.g. because of a timeout), the request would still be processed by the server and block
+//!       requests behind it in the stream. It might even block its own timeout retry.
+//!
+//!     * Individual callers can use client-side batching for pipelining.
+//!
+//!     * Stream scheduling becomes significantly simpler and cheaper.
+//!
+//!     * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
+//!       per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
+//!
+//! Idle resources are removed from the pools periodically, to avoid holding onto server resources.
 //!
 //! Each channel corresponds to one TCP connection. Each client unary request and each stream
 //! corresponds to one HTTP/2 stream and server task.
@@ -29,20 +44,20 @@
 //! TODO: error handling (including custom error types).
 //! TODO: observability.
 
-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
 use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
+use std::pin::Pin;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
 use std::time::{Duration, Instant};
 
-use futures::StreamExt as _;
-use tokio::sync::mpsc::{Receiver, Sender};
-use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use futures::{Stream, StreamExt as _};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
+use tokio_stream::wrappers::WatchStream;
 use tokio_util::sync::CancellationToken;
 use tonic::codec::CompressionEncoding;
 use tonic::transport::{Channel, Endpoint};
-use tracing::{error, warn};
 
 use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
@@ -253,8 +268,7 @@ pub struct ClientPool {
     ///
     /// The first client in the map will be acquired next. The map is sorted by client ID, which in
     /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
-    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
-    /// clients are reaped.
+    /// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
     idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
     /// Reaps idle clients.
     idle_reaper: Reaper,
@@ -310,7 +324,7 @@ impl ClientPool {
     /// This is moderately performance-sensitive. It is called for every unary request, but these
     /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
     /// the `StreamPool` instead.
-    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
@@ -328,7 +342,7 @@ impl ClientPool {
             });
         }
 
-        // Slow path: construct a new client.
+        // Construct a new client.
         let mut channel_guard = self.channel_pool.get();
         let client = page_api::Client::new(
             channel_guard.take(),
@@ -337,7 +351,8 @@ impl ClientPool {
             self.shard_id,
             self.auth_token.clone(),
             self.compression,
-        )?;
+        )
+        .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;
 
         Ok(ClientGuard {
             pool: Arc::downgrade(self),
@@ -407,287 +422,188 @@ impl Drop for ClientGuard {
 /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
 /// acquires a client from the inner `ClientPool` for the stream's lifetime.
 ///
-/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
-/// a single request and await the response. Internally, requests are multiplexed across streams and
-/// channels. This allows proper queue depth enforcement and response routing.
+/// Individual streams only send a single request at a time, and do not pipeline multiple callers
+/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
+/// to eliminate head-of-line blocking. See the module documentation for more details.
 ///
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
     /// The client pool to acquire clients from. Must be unbounded.
     client_pool: Arc<ClientPool>,
-    /// All pooled streams.
+    /// Idle pooled streams. Acquired streams are removed from here and returned on drop.
     ///
-    /// Incoming requests will be sent over an existing stream with available capacity. If all
-    /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
-    /// stream has an associated Tokio task that processes requests and responses.
-    streams: Mutex<HashMap<StreamID, StreamEntry>>,
-    /// The max number of concurrent streams, or None if unbounded.
-    max_streams: Option<NonZero<usize>>,
-    /// The max number of concurrent requests per stream.
-    max_queue_depth: NonZero<usize>,
-    /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
-    /// None if the pool is unbounded.
+    /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
+    /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
+    /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
+    /// higher-ordered channels.
+    idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
+    /// Limits the max number of concurrent streams. None if the pool is unbounded.
     limiter: Option<Arc<Semaphore>>,
     /// Reaps idle streams.
     idle_reaper: Reaper,
-    /// Stream ID generator.
-    next_stream_id: AtomicUsize,
 }
 
-type StreamID = usize;
-type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
-type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
-type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
+/// The stream ID. Reuses the inner client ID.
+type StreamID = ClientID;
 
+/// A pooled stream.
 struct StreamEntry {
-    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
-    sender: RequestSender,
-    /// Number of in-flight requests on this stream.
-    queue_depth: usize,
-    /// The time when this stream went idle (queue_depth == 0).
-    /// INVARIANT: Some if queue_depth == 0, otherwise None.
-    idle_since: Option<Instant>,
+    /// The bidirectional stream.
+    stream: BiStream,
+    /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
+    idle_since: Instant,
+}
+
+/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
+struct BiStream {
+    /// The owning client. Holds onto the channel slot while the stream is alive.
+    client: ClientGuard,
+    /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
+    /// time, and the caller must await the response before sending another request. This is
+    /// enforced by `StreamGuard::send`.
+    sender: watch::Sender<page_api::GetPageRequest>,
+    /// Stream for receiving responses.
+    /// TODO: consider returning a concrete type from `Client::get_pages`.
+    receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
 }
 
 impl StreamPool {
-    /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
-    /// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
+    /// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
+    /// concurrent streams.
     ///
     /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
     /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
     /// The stream pool should generally have its own dedicated client pool (but it can share a
     /// channel pool with others since these are always unbounded).
-    pub fn new(
-        client_pool: Arc<ClientPool>,
-        max_streams: Option<NonZero<usize>>,
-        max_queue_depth: NonZero<usize>,
-    ) -> Arc<Self> {
+    pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
         assert!(client_pool.limiter.is_none(), "bounded client pool");
         let pool = Arc::new(Self {
             client_pool,
-            streams: Mutex::default(),
-            limiter: max_streams.map(|max_streams| {
-                Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
-            }),
-            max_streams,
-            max_queue_depth,
+            idle: Mutex::default(),
+            limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
             idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
-            next_stream_id: AtomicUsize::default(),
         });
         pool.idle_reaper.spawn(&pool);
         pool
     }
 
-    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
-    /// are full. Returns a guard that can be used to send a single request on the stream and await
-    /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
-    /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
+    /// Acquires an available stream from the pool, or spins up a new stream if all streams are
+    /// full. Returns a guard that can be used to send requests and await the responses. Blocks if
+    /// the pool is full.
     ///
     /// This is very performance-sensitive, as it is on the GetPage hot path.
     ///
-    /// TODO: this must do something more sophisticated for performance. We want:
-    ///
-    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
-    /// * Quick acquisition of pooled streams with available capacity.
-    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
-    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
-    /// * Don't hold a lock while spinning up new streams.
-    /// * Allow concurrent clients to join onto streams while they're spun up.
-    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
-    ///
-    /// For now, we just do something simple but inefficient (linear scan under mutex).
-    pub async fn get(self: &Arc<Self>) -> StreamGuard {
+    /// TODO: is a Mutex<BTreeMap> performant enough? Will it become too contended? We can't
+    /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
+    /// to free up higher-ordered channels.
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
             permit = Some(limiter.acquire_owned().await.expect("never closed"));
         }
-        let mut streams = self.streams.lock().unwrap();
 
-        // Look for a pooled stream with available capacity.
-        for (&id, entry) in streams.iter_mut() {
-            assert!(
-                entry.queue_depth <= self.max_queue_depth.get(),
-                "stream queue overflow"
-            );
-            assert_eq!(
-                entry.idle_since.is_some(),
-                entry.queue_depth == 0,
-                "incorrect stream idle state"
-            );
-            if entry.queue_depth < self.max_queue_depth.get() {
-                entry.queue_depth += 1;
-                entry.idle_since = None;
-                return StreamGuard {
-                    pool: Arc::downgrade(self),
-                    id,
-                    sender: entry.sender.clone(),
-                    permit,
-                };
-            }
+        // Fast path: acquire an idle stream from the pool.
+        if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
+            return Ok(StreamGuard {
+                pool: Arc::downgrade(self),
+                stream: Some(entry.stream),
+                active: false,
+                permit,
+            });
         }
 
-        // No available stream, spin up a new one. We install the stream entry in the pool first and
-        // return the guard, while spinning up the stream task async. This allows other callers to
-        // join onto this stream and also create additional streams concurrently if this fills up.
-        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
-        let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
-        let entry = StreamEntry {
-            sender: req_tx.clone(),
-            queue_depth: 1, // reserve quota for this caller
-            idle_since: None,
-        };
-        streams.insert(id, entry);
+        // Spin up a new stream. Uses a watch channel to send a single request at a time, since
+        // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
+        let mut client = self.client_pool.get().await?;
 
-        if let Some(max_streams) = self.max_streams {
-            assert!(streams.len() <= max_streams.get(), "stream overflow");
-        };
+        let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
+        let req_stream = WatchStream::from_changes(req_rx);
+        let resp_stream = client.get_pages(req_stream).await?;
 
-        let client_pool = self.client_pool.clone();
-        let pool = Arc::downgrade(self);
-
-        tokio::spawn(async move {
-            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
-                error!("stream failed: {err}");
-            }
-            // Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
-            if let Some(pool) = pool.upgrade() {
-                let entry = pool.streams.lock().unwrap().remove(&id);
-                assert!(entry.is_some(), "unknown stream ID: {id}");
-            }
-        });
-
-        StreamGuard {
+        Ok(StreamGuard {
             pool: Arc::downgrade(self),
-            id,
-            sender: req_tx,
+            stream: Some(BiStream {
+                client,
+                sender: req_tx,
+                receiver: Box::pin(resp_stream),
+            }),
+            active: false,
             permit,
-        }
-    }
-
-    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
-    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
-    /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
-    /// atomic with pool stream acquisition.
-    ///
-    /// The task exits when the request channel is closed, or on a stream error. The caller is
-    /// responsible for removing the stream from the pool on exit.
-    async fn run_stream(
-        client_pool: Arc<ClientPool>,
-        mut caller_rx: RequestReceiver,
-    ) -> anyhow::Result<()> {
-        // Acquire a client from the pool and create a stream.
-        let mut client = client_pool.get().await?;
-
-        // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
-        // theoretically deadlock if both the client and server block on sends (since we're not
-        // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
-        // low queue depths, but it was seen to happen with the libpq protocol so better safe than
-        // sorry. It should never buffer more than the queue depth anyway, but using an unbounded
-        // channel guarantees that it will never block.
-        let (req_tx, req_rx) = mpsc::unbounded_channel();
-        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
-        let mut resp_stream = client.get_pages(req_stream).await?;
-
-        // Track caller response channels by request ID. If the task returns early, these response
-        // channels will be dropped and the waiting callers will receive an error.
-        //
-        // NB: this will leak entries if the server doesn't respond to a request (by request ID).
-        // It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
-        // block further use. But we could consider reaping closed channels after some time.
-        let mut callers = HashMap::new();
-
-        // Process requests and responses.
-        loop {
-            tokio::select! {
-                // Receive requests from callers and send them to the stream.
-                req = caller_rx.recv() => {
-                    // Shut down if request channel is closed.
-                    let Some((req, resp_tx)) = req else {
-                        return Ok(());
-                    };
-
-                    // Store the response channel by request ID.
-                    if callers.contains_key(&req.request_id) {
-                        // Error on request ID duplicates. Ignore callers that went away.
-                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
-                            format!("duplicate request ID: {}", req.request_id),
-                        )));
-                        continue;
-                    }
-                    callers.insert(req.request_id, resp_tx);
-
-                    // Send the request on the stream. Bail out if the stream is closed.
-                    req_tx.send(req).map_err(|_| {
-                        tonic::Status::unavailable("stream closed")
-                    })?;
-                }
-
-                // Receive responses from the stream and send them to callers.
-                resp = resp_stream.next() => {
-                    // Shut down if the stream is closed, and bail out on stream errors.
-                    let Some(resp) = resp.transpose()? else {
-                        return Ok(())
-                    };
-
-                    // Send the response to the caller. Ignore errors if the caller went away.
-                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
-                        warn!("received response for unknown request ID: {}", resp.request_id);
-                        continue;
-                    };
-                    _ = resp_tx.send(Ok(resp));
-                }
-            }
-        }
+        })
     }
 }
 
 impl Reapable for StreamPool {
     /// Reaps streams that have been idle since before the cutoff.
     fn reap_idle(&self, cutoff: Instant) {
-        self.streams.lock().unwrap().retain(|_, entry| {
-            let Some(idle_since) = entry.idle_since else {
-                assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
-                return true;
-            };
-            assert_eq!(entry.queue_depth, 0, "idle stream has requests");
-            idle_since >= cutoff
-        });
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff);
     }
 }
 
-/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
-/// depth. Queue depth is already reserved and will be returned on drop.
+/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
+/// in-flight requests on the stream, or the stream failed.
 pub struct StreamGuard {
     pool: Weak<StreamPool>,
-    id: StreamID,
-    sender: RequestSender,
+    stream: Option<BiStream>,             // Some until dropped
+    active: bool,                         // not returned to pool if true
     permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }
 
 impl StreamGuard {
-    /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
-    /// valid for a single request (to enforce queue depth). This also drops the guard on return and
-    /// returns the queue depth quota to the pool.
+    /// Sends a request on the stream and awaits the response. If the future is dropped before it
+    /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
+    /// request and is not returned to the pool. The same is true if the stream errors, in which
+    /// case the caller can't send further requests on the stream.
     ///
-    /// The `GetPageRequest::request_id` must be unique across in-flight requests.
+    /// We only support sending a single request at a time, to eliminate head-of-line blocking. See
+    /// module documentation for details.
     ///
     /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
     /// to avoid tearing down the stream for per-request errors. Callers must check this.
     pub async fn send(
-        self,
+        &mut self,
         req: page_api::GetPageRequest,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let (resp_tx, resp_rx) = oneshot::channel();
+        let req_id = req.request_id;
+        let stream = self.stream.as_mut().expect("not dropped");
 
-        self.sender
-            .send((req, resp_tx))
-            .await
+        // Mark the stream as active. We only allow one request at a time, to avoid HoL-blocking.
+        // We also don't allow reuse of the stream after an error.
+        //
+        // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
+        if self.active {
+            return Err(tonic::Status::internal("stream already active"));
+        }
+        self.active = true;
+
+        // Send the request and receive the response. If the stream errors for whatever reason, we
+        // don't reset `active` such that the stream won't be returned to the pool.
+        stream
+            .sender
+            .send(req)
             .map_err(|_| tonic::Status::unavailable("stream closed"))?;
 
-        resp_rx
+        let resp = stream
+            .receiver
+            .next()
             .await
-            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+            .ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
+
+        if resp.request_id != req_id {
+            return Err(tonic::Status::internal(format!(
+                "response ID {} does not match request ID {}",
+                resp.request_id, req_id
+            )));
+        }
+
+        // Success, mark the stream as inactive.
+        self.active = false;
+
+        Ok(resp)
     }
 }
 
@@ -697,26 +613,23 @@ impl Drop for StreamGuard {
             return; // pool was dropped
         };
 
-        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
-        // before the response is received, but that's okay.
-        //
-        // TODO: actually, it's probably not okay. Queue depth release should be moved into the
-        // stream task, such that it continues to account for the queue depth slot until the server
-        // responds. Otherwise, if a slow request times out and keeps blocking the stream, the
-        // server will keep waiting on it and we can pile on subsequent requests (including the
-        // timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
-        // requests on e.g. LSN waits and layer downloads, instead returning early to free up the
-        // stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
-        // blocking. TBD.
-        let mut streams = pool.streams.lock().unwrap();
-        let entry = streams.get_mut(&self.id).expect("unknown stream");
-        assert!(entry.idle_since.is_none(), "active stream marked idle");
-        assert!(entry.queue_depth > 0, "stream queue underflow");
-        entry.queue_depth -= 1;
-        if entry.queue_depth == 0 {
-            entry.idle_since = Some(Instant::now()); // mark stream as idle
+        // If the stream is still active, we can't return it to the pool. The next caller could be
+        // head-of-line blocked by an in-flight request, receive a stale response, or the stream may
+        // have failed.
+        if self.active {
+            return;
         }
 
+        // Place the idle stream back into the pool.
+        let entry = StreamEntry {
+            stream: self.stream.take().expect("dropped once"),
+            idle_since: Instant::now(),
+        };
+        pool.idle
+            .lock()
+            .unwrap()
+            .insert(entry.stream.client.id, entry);
+
         _ = self.permit; // returned on drop, referenced for visibility
     }
 }
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index 6641dfd900..058839f20c 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -51,7 +51,7 @@ impl From<ProtocolError> for tonic::Status {
 }
 
 /// The LSN a request should read at.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Default)]
 pub struct ReadLsn {
     /// The request's read LSN.
     pub request_lsn: Lsn,
@@ -331,7 +331,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
 }
 
 /// Requests one or more pages.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct GetPageRequest {
     /// A request ID. Will be included in the response. Should be unique for in-flight requests on
     /// the stream.
@@ -432,12 +432,13 @@ impl From<RequestID> for proto::RequestId {
 }
 
 /// A GetPage request class.
-#[derive(Clone, Copy, Debug, strum_macros::Display)]
+#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
 pub enum GetPageClass {
     /// Unknown class. For backwards compatibility: used when an older client version sends a class
     /// that a newer server version has removed.
     Unknown,
     /// A normal request. This is the default.
+    #[default]
     Normal,
     /// A prefetch request. NB: can only be classified on pg < 18.
     Prefetch,

From 56eb511618937afa69f02c0ee5325aed2bea40b8 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Sun, 13 Jul 2025 13:29:00 +0200
Subject: [PATCH 100/163] pageserver/client_grpc: use unbounded pools

---
 pageserver/client_grpc/src/client.rs | 104 ++++++++++++++-------------
 pageserver/page_api/src/model.rs     |  13 ----
 2 files changed, 55 insertions(+), 62 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 4b606d6939..3a9edc7092 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -24,20 +24,23 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
 /// when full.
 ///
+/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
+/// streams per connection.
+///
 /// TODO: tune all of these constants, and consider making them configurable.
-/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
-/// with only streams.
-const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
+const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of concurrent unary request clients per shard.
-const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
+/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
+/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
+/// transmission delays. This also concentrates large window sizes on a smaller set of
+/// streams/connections, presumably reducing memory use.
+const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
 
-/// Max number of concurrent GetPage streams per shard.
-const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
-
-/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
-/// are more throughput-oriented, we have a smaller limit.
-const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
+/// The batch size threshold at which a GetPage request will use the bulk stream pool.
+///
+/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
+/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
+const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
 
 /// The overall request call timeout, including retries and pool acquisition.
 /// TODO: should we retry forever? Should the caller decide?
@@ -62,10 +65,19 @@ const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
 /// * Sharded tenants across multiple Pageservers.
 /// * Pooling of connections, clients, and streams for efficient resource use.
 /// * Concurrent use by many callers.
-/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
+/// * Internal handling of GetPage bidirectional streams.
 /// * Automatic retries.
 /// * Observability.
 ///
+/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
+/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
+/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
+/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
+/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
+/// one stream per backend, but without the TCP connection overhead. In the common case we expect
+/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
+/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
+///
 /// TODO: this client does not support base backups or LSN leases, as these are only used by
 /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
 pub struct PageserverClient {
@@ -264,7 +276,7 @@ impl PageserverClient {
         req: page_api::GetPageRequest,
         shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let mut stream = shard.stream(req.request_class.is_bulk()).await?;
+        let mut stream = shard.stream(Self::is_bulk(&req)).await?;
         let resp = stream.send(req.clone()).await?;
 
         // Convert per-request errors into a tonic::Status.
@@ -365,6 +377,11 @@ impl PageserverClient {
             ))
         })?
     }
+
+    /// Returns true if the request is considered a bulk request and should use the bulk pool.
+    fn is_bulk(req: &page_api::GetPageRequest) -> bool {
+        req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
+    }
 }
 
 /// Shard specification for a PageserverClient.
@@ -492,15 +509,23 @@ impl Shards {
     }
 }
 
-/// A single shard. Uses dedicated resource pools with the following structure:
+/// A single shard. Has dedicated resource pools with the following structure:
 ///
-/// * Channel pool: unbounded.
-///   * Unary client pool: MAX_UNARY_CLIENTS.
-///   * Stream client pool: unbounded.
-///     * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
-/// * Bulk channel pool: unbounded.
+/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
+///   * Client pool: unbounded.
+///     * Stream pool: unbounded.
+/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
 ///   * Bulk client pool: unbounded.
-///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
+///     * Bulk stream pool: unbounded.
+///
+/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
+/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
+/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
+/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly
+/// similar (except for TCP transmission time).
+///
+/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
+/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
 struct Shard {
     /// The shard ID.
     id: ShardIndex,
@@ -508,7 +533,7 @@ struct Shard {
     client_pool: Arc<ClientPool>,
     /// GetPage stream pool.
     stream_pool: Arc<StreamPool>,
-    /// GetPage stream pool for bulk requests, e.g. prefetches.
+    /// GetPage stream pool for bulk requests.
     bulk_stream_pool: Arc<StreamPool>,
 }
 
@@ -522,48 +547,30 @@ impl Shard {
         auth_token: Option<String>,
         compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
-        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
-        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
-
-        // Client pool for unary requests.
+        // Shard pools for unary requests and non-bulk GetPage requests.
         let client_pool = ClientPool::new(
-            channel_pool.clone(),
+            ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
             tenant_id,
             timeline_id,
             shard_id,
             auth_token.clone(),
             compression,
-            Some(MAX_UNARY_CLIENTS),
+            None, // unbounded
         );
+        let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
 
-        // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
-        // but shares a channel pool with it (as it's unbounded).
-        let stream_pool = StreamPool::new(
-            ClientPool::new(
-                channel_pool.clone(),
-                tenant_id,
-                timeline_id,
-                shard_id,
-                auth_token.clone(),
-                compression,
-                None, // unbounded, limited by stream pool
-            ),
-            Some(MAX_STREAMS),
-        );
-
-        // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
-        // to avoid head-of-line blocking of latency-sensitive requests.
+        // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
         let bulk_stream_pool = StreamPool::new(
             ClientPool::new(
-                ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
+                ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
                 tenant_id,
                 timeline_id,
                 shard_id,
                 auth_token,
                 compression,
-                None, // unbounded, limited by stream pool
+                None, // unbounded,
             ),
-            Some(MAX_BULK_STREAMS),
+            None, // unbounded
         );
 
         Ok(Self {
@@ -585,8 +592,7 @@ impl Shard {
         .await
     }
 
-    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
-    /// pool (e.g. for prefetches).
+    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
     #[instrument(skip_all, fields(bulk))]
     async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
         let pool = match bulk {
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index 058839f20c..7df7de6fc6 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -446,19 +446,6 @@ pub enum GetPageClass {
     Background,
 }
 
-impl GetPageClass {
-    /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
-    /// latency-sensitive).
-    pub fn is_bulk(&self) -> bool {
-        match self {
-            Self::Unknown => false,
-            Self::Normal => false,
-            Self::Prefetch => true,
-            Self::Background => true,
-        }
-    }
-}
-
 impl From<proto::GetPageClass> for GetPageClass {
     fn from(pb: proto::GetPageClass) -> Self {
         match pb {

From 87f01a25ab652e95396dba22891d15100ca680da Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Sun, 13 Jul 2025 18:40:27 +0200
Subject: [PATCH 101/163] pageserver/client_grpc: reap idle channels
 immediately

---
 pageserver/client_grpc/src/pool.rs | 60 ++++++++----------------------
 1 file changed, 16 insertions(+), 44 deletions(-)

diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index cbc4dff4ad..20d59f7ec7 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -36,7 +36,8 @@
 //!     * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
 //!       per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
 //!
-//! Idle resources are removed from the pools periodically, to avoid holding onto server resources.
+//! Idle clients/streams are removed from the pools periodically, to free up server resources.
+//! Channels are reaped immediately when unused, and indirectly rely on client/stream idle timeouts.
 //!
 //! Each channel corresponds to one TCP connection. Each client unary request and each stream
 //! corresponds to one HTTP/2 stream and server task.
@@ -63,14 +64,12 @@ use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;
 
-/// Reap channels/clients/streams that have been idle for this long.
+/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
+/// empty, and indirectly rely on the client/stream idle timeouts.
 ///
-/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
-/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
-/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
-/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
-/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
-/// channels, and/or stream pool clients.
+/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but
+/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
+/// keep its client around in the pool for a while.
 const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
     false => Duration::from_secs(180),
     true => Duration::from_secs(1), // exercise reaping in tests
@@ -98,8 +97,6 @@ pub struct ChannelPool {
     max_clients_per_channel: NonZero<usize>,
     /// Open channels.
     channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
-    /// Reaps idle channels.
-    idle_reaper: Reaper,
     /// Channel ID generator.
     next_channel_id: AtomicUsize,
 }
@@ -111,9 +108,6 @@ struct ChannelEntry {
     channel: Channel,
     /// Number of clients using this channel.
     clients: usize,
-    /// The channel has been idle (no clients) since this time. None if channel is in use.
-    /// INVARIANT: Some if clients == 0, otherwise None.
-    idle_since: Option<Instant>,
 }
 
 impl ChannelPool {
@@ -123,15 +117,12 @@ impl ChannelPool {
         E: TryInto<Endpoint> + Send + Sync + 'static,
         <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
     {
-        let pool = Arc::new(Self {
+        Ok(Arc::new(Self {
             endpoint: endpoint.try_into()?,
             max_clients_per_channel,
             channels: Mutex::default(),
-            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
             next_channel_id: AtomicUsize::default(),
-        });
-        pool.idle_reaper.spawn(&pool);
-        Ok(pool)
+        }))
     }
 
     /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
@@ -152,22 +143,17 @@ impl ChannelPool {
         let mut channels = self.channels.lock().unwrap();
 
         // Try to find an existing channel with available capacity. We check entries in BTreeMap
-        // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
-        // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
+        // order, to fill up the lower-ordered channels first. The client/stream pools also prefer
+        // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
         // channels, and free up higher-ordered channels such that they can be reaped.
         for (&id, entry) in channels.iter_mut() {
             assert!(
                 entry.clients <= self.max_clients_per_channel.get(),
                 "channel overflow"
             );
-            assert_eq!(
-                entry.idle_since.is_some(),
-                entry.clients == 0,
-                "incorrect channel idle state"
-            );
+            assert_ne!(entry.clients, 0, "empty channel not reaped");
             if entry.clients < self.max_clients_per_channel.get() {
                 entry.clients += 1;
-                entry.idle_since = None;
                 return ChannelGuard {
                     pool: Arc::downgrade(self),
                     id,
@@ -184,7 +170,6 @@ impl ChannelPool {
         let entry = ChannelEntry {
             channel: channel.clone(),
             clients: 1, // account for the guard below
-            idle_since: None,
         };
         channels.insert(id, entry);
 
@@ -196,20 +181,6 @@ impl ChannelPool {
     }
 }
 
-impl Reapable for ChannelPool {
-    /// Reaps channels that have been idle since before the cutoff.
-    fn reap_idle(&self, cutoff: Instant) {
-        self.channels.lock().unwrap().retain(|_, entry| {
-            let Some(idle_since) = entry.idle_since else {
-                assert_ne!(entry.clients, 0, "empty channel not marked idle");
-                return true;
-            };
-            assert_eq!(entry.clients, 0, "idle channel has clients");
-            idle_since >= cutoff
-        })
-    }
-}
-
 /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
 /// since the gRPC client requires an owned `Channel`.
 pub struct ChannelGuard {
@@ -226,7 +197,7 @@ impl ChannelGuard {
     }
 }
 
-/// Returns the channel to the pool.
+/// Returns the channel to the pool. The channel is closed when empty.
 impl Drop for ChannelGuard {
     fn drop(&mut self) {
         let Some(pool) = self.pool.upgrade() else {
@@ -235,11 +206,12 @@ impl Drop for ChannelGuard {
 
         let mut channels = pool.channels.lock().unwrap();
         let entry = channels.get_mut(&self.id).expect("unknown channel");
-        assert!(entry.idle_since.is_none(), "active channel marked idle");
         assert!(entry.clients > 0, "channel underflow");
         entry.clients -= 1;
+
+        // Reap empty channels immediately.
         if entry.clients == 0 {
-            entry.idle_since = Some(Instant::now()); // mark channel as idle
+            channels.remove(&self.id);
         }
     }
 }

From a5fe67f3616b55135fa3a58c2db89bf30a9eb955 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Sun, 13 Jul 2025 19:27:39 +0200
Subject: [PATCH 102/163] proxy: cancel maintain_cancel_key task immediately
 (#12586)

## Problem

When a connection terminates its maintain_cancel_key task keeps running
until the CANCEL_KEY_REFRESH sleep finishes and then it triggers another
cancel key TTL refresh before exiting.

## Summary of changes

* Check for cancellation while sleeping and interrupt sleep.
* If cancelled, break the loop, don't send a refresh cmd.
---
 proxy/src/cancellation.rs | 10 ++++++++--
 proxy/src/util.rs         | 14 +++++++++++---
 2 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 4ea4c4ea54..03be9dd4cf 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -28,6 +28,7 @@ use crate::pqproto::CancelKeyData;
 use crate::rate_limiter::LeakyBucketRateLimiter;
 use crate::redis::keys::KeyPrefix;
 use crate::redis::kv_ops::{RedisKVClient, RedisKVClientError};
+use crate::util::run_until;
 
 type IpSubnetKey = IpNet;
 
@@ -498,8 +499,13 @@ impl Session {
                         "registered cancellation key"
                     );
 
-                    // wait before continuing.
-                    tokio::time::sleep(CANCEL_KEY_REFRESH).await;
+                    // wait before continuing. break immediately if cancelled.
+                    if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut())
+                        .await
+                        .is_err()
+                    {
+                        break;
+                    }
                 }
                 // retry immediately.
                 Err(BatchQueueError::Result(error)) => {
diff --git a/proxy/src/util.rs b/proxy/src/util.rs
index 7fc2d9fbdb..0291216d94 100644
--- a/proxy/src/util.rs
+++ b/proxy/src/util.rs
@@ -7,8 +7,16 @@ pub async fn run_until_cancelled<F: Future>(
     f: F,
     cancellation_token: &CancellationToken,
 ) -> Option<F::Output> {
-    match select(pin!(f), pin!(cancellation_token.cancelled())).await {
-        Either::Left((f, _)) => Some(f),
-        Either::Right(((), _)) => None,
+    run_until(f, cancellation_token.cancelled()).await.ok()
+}
+
+/// Runs the future `f` unless interrupted by future `condition`.
+pub async fn run_until<F1: Future, F2: Future>(
+    f: F1,
+    condition: F2,
+) -> Result<F1::Output, F2::Output> {
+    match select(pin!(f), pin!(condition)).await {
+        Either::Left((f1, _)) => Ok(f1),
+        Either::Right((f2, _)) => Err(f2),
     }
 }

From 296c9190b2f6e12c571a2b71f070b1c5597738e8 Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Mon, 14 Jul 2025 00:49:23 +0200
Subject: [PATCH 103/163] proxy: Use EXPIRE command to refresh cancel entries
 (#12580)

## Problem

When refreshing cancellation data we resend the entire value again just
to reset the TTL, which causes unnecessary load in proxy, on network and
possibly on redis side.

## Summary of changes

* Switch from using SET with full value to using EXPIRE to reset TTL.
* Add a tiny delay between retries to prevent busy loop.
* Shorten CancelKeyOp variants: drop redundant suffix.
* Retry SET when EXPIRE failed.
---
 proxy/src/cancellation.rs | 130 +++++++++++++++++++++++++++-----------
 proxy/src/metrics.rs      |   1 +
 2 files changed, 95 insertions(+), 36 deletions(-)

diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 03be9dd4cf..77062d3bb4 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -32,20 +32,24 @@ use crate::util::run_until;
 
 type IpSubnetKey = IpNet;
 
-const CANCEL_KEY_TTL: std::time::Duration = std::time::Duration::from_secs(600);
-const CANCEL_KEY_REFRESH: std::time::Duration = std::time::Duration::from_secs(570);
+const CANCEL_KEY_TTL: Duration = Duration::from_secs(600);
+const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570);
 
 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
-    StoreCancelKey {
+    Store {
         key: CancelKeyData,
         value: Box<str>,
-        expire: std::time::Duration,
+        expire: Duration,
     },
-    GetCancelData {
+    Refresh {
+        key: CancelKeyData,
+        expire: Duration,
+    },
+    Get {
         key: CancelKeyData,
     },
-    GetCancelDataOld {
+    GetOld {
         key: CancelKeyData,
     },
 }
@@ -108,7 +112,7 @@ impl Pipeline {
 impl CancelKeyOp {
     fn register(&self, pipe: &mut Pipeline) {
         match self {
-            CancelKeyOp::StoreCancelKey { key, value, expire } => {
+            CancelKeyOp::Store { key, value, expire } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
                 pipe.add_command(Cmd::set_options(
                     &key,
@@ -116,11 +120,15 @@ impl CancelKeyOp {
                     SetOptions::default().with_expiration(SetExpiry::EX(expire.as_secs())),
                 ));
             }
-            CancelKeyOp::GetCancelDataOld { key } => {
+            CancelKeyOp::Refresh { key, expire } => {
+                let key = KeyPrefix::Cancel(*key).build_redis_key();
+                pipe.add_command(Cmd::expire(&key, expire.as_secs() as i64));
+            }
+            CancelKeyOp::GetOld { key } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
                 pipe.add_command(Cmd::hget(key, "data"));
             }
-            CancelKeyOp::GetCancelData { key } => {
+            CancelKeyOp::Get { key } => {
                 let key = KeyPrefix::Cancel(*key).build_redis_key();
                 pipe.add_command(Cmd::get(key));
             }
@@ -264,7 +272,7 @@ impl CancellationHandler {
             .proxy
             .cancel_channel_size
             .guard(RedisMsgKind::Get);
-        let op = CancelKeyOp::GetCancelData { key };
+        let op = CancelKeyOp::Get { key };
         let result = timeout(
             TIMEOUT,
             tx.call((guard, op), std::future::pending::<Infallible>()),
@@ -289,7 +297,7 @@ impl CancellationHandler {
                 .proxy
                 .cancel_channel_size
                 .guard(RedisMsgKind::HGet);
-            let op = CancelKeyOp::GetCancelDataOld { key };
+            let op = CancelKeyOp::GetOld { key };
             timeout(
                 TIMEOUT,
                 tx.call((guard, op), std::future::pending::<Infallible>()),
@@ -474,45 +482,95 @@ impl Session {
 
         let mut cancel = pin!(cancel);
 
+        enum State {
+            Set,
+            Refresh,
+        }
+        let mut state = State::Set;
+
         loop {
-            let guard = Metrics::get()
-                .proxy
-                .cancel_channel_size
-                .guard(RedisMsgKind::Set);
-            let op = CancelKeyOp::StoreCancelKey {
-                key: self.key,
-                value: closure_json.clone(),
-                expire: CANCEL_KEY_TTL,
+            let guard_op = match state {
+                State::Set => {
+                    let guard = Metrics::get()
+                        .proxy
+                        .cancel_channel_size
+                        .guard(RedisMsgKind::Set);
+                    let op = CancelKeyOp::Store {
+                        key: self.key,
+                        value: closure_json.clone(),
+                        expire: CANCEL_KEY_TTL,
+                    };
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "registering cancellation key"
+                    );
+                    (guard, op)
+                }
+
+                State::Refresh => {
+                    let guard = Metrics::get()
+                        .proxy
+                        .cancel_channel_size
+                        .guard(RedisMsgKind::Expire);
+                    let op = CancelKeyOp::Refresh {
+                        key: self.key,
+                        expire: CANCEL_KEY_TTL,
+                    };
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "refreshing cancellation key"
+                    );
+                    (guard, op)
+                }
             };
 
-            tracing::debug!(
-                src=%self.key,
-                dest=?cancel_closure.cancel_token,
-                "registering cancellation key"
-            );
-
-            match tx.call((guard, op), cancel.as_mut()).await {
-                Ok(_) => {
+            match tx.call(guard_op, cancel.as_mut()).await {
+                // SET returns OK
+                Ok(Value::Okay) => {
                     tracing::debug!(
                         src=%self.key,
                         dest=?cancel_closure.cancel_token,
                         "registered cancellation key"
                     );
-
-                    // wait before continuing. break immediately if cancelled.
-                    if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut())
-                        .await
-                        .is_err()
-                    {
-                        break;
-                    }
+                    state = State::Refresh;
                 }
+
+                // EXPIRE returns 1
+                Ok(Value::Int(1)) => {
+                    tracing::debug!(
+                        src=%self.key,
+                        dest=?cancel_closure.cancel_token,
+                        "refreshed cancellation key"
+                    );
+                }
+
+                Ok(_) => {
+                    // Any other response likely means the key expired.
+                    tracing::warn!(src=%self.key, "refreshing cancellation key failed");
+                    // Re-enter the SET loop to repush full data.
+                    state = State::Set;
+                }
+
                 // retry immediately.
                 Err(BatchQueueError::Result(error)) => {
-                    tracing::warn!(?error, "error registering cancellation key");
+                    tracing::warn!(?error, "error refreshing cancellation key");
+                    // Small delay to prevent busy loop with high cpu and logging.
+                    tokio::time::sleep(Duration::from_millis(10)).await;
+                    continue;
                 }
+
                 Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
             }
+
+            // wait before continuing. break immediately if cancelled.
+            if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut())
+                .await
+                .is_err()
+            {
+                break;
+            }
         }
 
         if let Err(err) = cancel_closure
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 8439082498..bf4d5a11eb 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -376,6 +376,7 @@ pub enum Waiting {
 pub enum RedisMsgKind {
     Set,
     Get,
+    Expire,
     HGet,
 }
 

From fecb707b19f6f14942e9cbc624890a0e371bb931 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 11:41:58 +0200
Subject: [PATCH 104/163] pagebench: add `idle-streams` (#12583)

## Problem

For the communicator scheduling policy, we need to understand the
server-side cost of idle gRPC streams.

Touches #11735.

## Summary of changes

Add an `idle-streams` benchmark to `pagebench` which opens a large
number of idle gRPC GetPage streams.
---
 pageserver/pagebench/src/cmd/idle_streams.rs | 127 +++++++++++++++++++
 pageserver/pagebench/src/main.rs             |   3 +
 2 files changed, 130 insertions(+)
 create mode 100644 pageserver/pagebench/src/cmd/idle_streams.rs

diff --git a/pageserver/pagebench/src/cmd/idle_streams.rs b/pageserver/pagebench/src/cmd/idle_streams.rs
new file mode 100644
index 0000000000..73bc9f3f46
--- /dev/null
+++ b/pageserver/pagebench/src/cmd/idle_streams.rs
@@ -0,0 +1,127 @@
+use std::sync::Arc;
+
+use anyhow::anyhow;
+use futures::StreamExt;
+use tonic::transport::Endpoint;
+use tracing::info;
+
+use pageserver_page_api::{GetPageClass, GetPageRequest, GetPageStatusCode, ReadLsn, RelTag};
+use utils::id::TenantTimelineId;
+use utils::lsn::Lsn;
+use utils::shard::ShardIndex;
+
+/// Starts a large number of idle gRPC GetPage streams.
+#[derive(clap::Parser)]
+pub(crate) struct Args {
+    /// The Pageserver to connect to. Must use grpc://.
+    #[clap(long, default_value = "grpc://localhost:51051")]
+    server: String,
+    /// The Pageserver HTTP API.
+    #[clap(long, default_value = "http://localhost:9898")]
+    http_server: String,
+    /// The number of streams to open.
+    #[clap(long, default_value = "100000")]
+    count: usize,
+    /// Number of streams per connection.
+    #[clap(long, default_value = "100")]
+    per_connection: usize,
+    /// Send a single GetPage request on each stream.
+    #[clap(long, default_value_t = false)]
+    send_request: bool,
+}
+
+pub(crate) fn main(args: Args) -> anyhow::Result<()> {
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()?;
+
+    rt.block_on(main_impl(args))
+}
+
+async fn main_impl(args: Args) -> anyhow::Result<()> {
+    // Discover a tenant and timeline to use.
+    let mgmt_api_client = Arc::new(pageserver_client::mgmt_api::Client::new(
+        reqwest::Client::new(),
+        args.http_server.clone(),
+        None,
+    ));
+    let timelines: Vec<TenantTimelineId> = crate::util::cli::targets::discover(
+        &mgmt_api_client,
+        crate::util::cli::targets::Spec {
+            limit_to_first_n_targets: Some(1),
+            targets: None,
+        },
+    )
+    .await?;
+    let ttid = timelines
+        .first()
+        .ok_or_else(|| anyhow!("no timelines found"))?;
+
+    // Set up the initial client.
+    let endpoint = Endpoint::from_shared(args.server.clone())?;
+
+    let connect = async || {
+        pageserver_page_api::Client::new(
+            endpoint.connect().await?,
+            ttid.tenant_id,
+            ttid.timeline_id,
+            ShardIndex::unsharded(),
+            None,
+            None,
+        )
+    };
+
+    let mut client = connect().await?;
+    let mut streams = Vec::with_capacity(args.count);
+
+    // Create streams.
+    for i in 0..args.count {
+        if i % 100 == 0 {
+            info!("opened {}/{} streams", i, args.count);
+        }
+        if i % args.per_connection == 0 && i > 0 {
+            client = connect().await?;
+        }
+
+        let (req_tx, req_rx) = tokio::sync::mpsc::unbounded_channel();
+        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
+        let mut resp_stream = client.get_pages(req_stream).await?;
+
+        // Send request if specified.
+        if args.send_request {
+            req_tx.send(GetPageRequest {
+                request_id: 1.into(),
+                request_class: GetPageClass::Normal,
+                read_lsn: ReadLsn {
+                    request_lsn: Lsn::MAX,
+                    not_modified_since_lsn: Some(Lsn(1)),
+                },
+                rel: RelTag {
+                    spcnode: 1664, // pg_global
+                    dbnode: 0,     // shared database
+                    relnode: 1262, // pg_authid
+                    forknum: 0,    // init
+                },
+                block_numbers: vec![0],
+            })?;
+            let resp = resp_stream
+                .next()
+                .await
+                .transpose()?
+                .ok_or_else(|| anyhow!("no response"))?;
+            if resp.status_code != GetPageStatusCode::Ok {
+                return Err(anyhow!("{} response", resp.status_code));
+            }
+        }
+
+        // Hold onto streams to avoid closing them.
+        streams.push((req_tx, resp_stream));
+    }
+
+    info!("opened {} streams, sleeping", args.count);
+
+    // Block forever, to hold the idle streams open for inspection.
+    futures::future::pending::<()>().await;
+
+    Ok(())
+}
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 5527557450..6498203de3 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -17,6 +17,7 @@ mod cmd {
     pub(super) mod aux_files;
     pub(super) mod basebackup;
     pub(super) mod getpage_latest_lsn;
+    pub(super) mod idle_streams;
     pub(super) mod ondemand_download_churn;
     pub(super) mod trigger_initial_size_calculation;
 }
@@ -29,6 +30,7 @@ enum Args {
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
     OndemandDownloadChurn(cmd::ondemand_download_churn::Args),
     AuxFiles(cmd::aux_files::Args),
+    IdleStreams(cmd::idle_streams::Args),
 }
 
 fn main() {
@@ -49,6 +51,7 @@ fn main() {
         }
         Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
         Args::AuxFiles(args) => cmd::aux_files::main(args),
+        Args::IdleStreams(args) => cmd::idle_streams::main(args),
     }
     .unwrap()
 }

From d14d8271b815b57adeab6707b84ee26909f647f7 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 12:43:10 +0200
Subject: [PATCH 105/163] pageserver/client_grpc: improve retry logic (#12579)

## Problem

gRPC client retries currently include pool acquisition under the
per-attempt timeout. If pool acquisition is slow (e.g. full pool), this
will cause spurious timeout warnings, and the caller will lose its place
in the pool queue.

Touches #11735.

## Summary of changes

Makes several improvements to retries and related logic:

* Don't include pool acquisition time under request timeouts.
* Move attempt timeouts out of `Retry` and into the closure.
* Make `Retry` configurable, move constants into main module.
* Don't backoff on the first retry, and reduce initial/max backoffs to
5ms and 5s respectively.
* Add `with_retries` and `with_timeout` helpers.
* Add slow logging for pool acquisition, and a `warn_slow` counterpart
to `log_slow`.
* Add debug logging for requests and responses at the client boundary.
---
 libs/utils/src/logging.rs            |  56 +++++++---
 pageserver/client_grpc/src/client.rs | 160 +++++++++++++++++++--------
 pageserver/client_grpc/src/retry.rs  |  72 ++++++------
 3 files changed, 189 insertions(+), 99 deletions(-)

diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 5828a400a0..d67c0f123b 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -1,4 +1,5 @@
 use std::future::Future;
+use std::pin::Pin;
 use std::str::FromStr;
 use std::time::Duration;
 
@@ -7,7 +8,7 @@ use metrics::{IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 use strum_macros::{EnumString, VariantNames};
 use tokio::time::Instant;
-use tracing::info;
+use tracing::{info, warn};
 
 /// Logs a critical error, similarly to `tracing::error!`. This will:
 ///
@@ -377,10 +378,11 @@ impl std::fmt::Debug for SecretString {
 ///
 /// TODO: consider upgrading this to a warning, but currently it fires too often.
 #[inline]
-pub async fn log_slow<F, O>(name: &str, threshold: Duration, f: std::pin::Pin<&mut F>) -> O
-where
-    F: Future<Output = O>,
-{
+pub async fn log_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
     monitor_slow_future(
         threshold,
         threshold, // period = threshold
@@ -394,16 +396,42 @@ where
             if !is_slow {
                 return;
             }
+            let elapsed = elapsed_total.as_secs_f64();
             if ready {
-                info!(
-                    "slow {name} completed after {:.3}s",
-                    elapsed_total.as_secs_f64()
-                );
+                info!("slow {name} completed after {elapsed:.3}s");
             } else {
-                info!(
-                    "slow {name} still running after {:.3}s",
-                    elapsed_total.as_secs_f64()
-                );
+                info!("slow {name} still running after {elapsed:.3}s");
+            }
+        },
+    )
+    .await
+}
+
+/// Logs a periodic warning if a future is slow to complete.
+#[inline]
+pub async fn warn_slow<O>(
+    name: &str,
+    threshold: Duration,
+    f: Pin<&mut impl Future<Output = O>>,
+) -> O {
+    monitor_slow_future(
+        threshold,
+        threshold, // period = threshold
+        f,
+        |MonitorSlowFutureCallback {
+             ready,
+             is_slow,
+             elapsed_total,
+             elapsed_since_last_callback: _,
+         }| {
+            if !is_slow {
+                return;
+            }
+            let elapsed = elapsed_total.as_secs_f64();
+            if ready {
+                warn!("slow {name} completed after {elapsed:.3}s");
+            } else {
+                warn!("slow {name} still running after {elapsed:.3}s");
             }
         },
     )
@@ -416,7 +444,7 @@ where
 pub async fn monitor_slow_future<F, O>(
     threshold: Duration,
     period: Duration,
-    mut fut: std::pin::Pin<&mut F>,
+    mut fut: Pin<&mut F>,
     mut cb: impl FnMut(MonitorSlowFutureCallback),
 ) -> O
 where
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 7049fbdb96..7732585f7c 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -1,13 +1,16 @@
 use std::collections::HashMap;
 use std::num::NonZero;
+use std::pin::pin;
 use std::sync::Arc;
+use std::time::{Duration, Instant};
 
 use anyhow::anyhow;
 use arc_swap::ArcSwap;
 use futures::stream::FuturesUnordered;
 use futures::{FutureExt as _, StreamExt as _};
 use tonic::codec::CompressionEncoding;
-use tracing::instrument;
+use tracing::{debug, instrument};
+use utils::logging::warn_slow;
 
 use crate::pool::{ChannelPool, ClientGuard, ClientPool, StreamGuard, StreamPool};
 use crate::retry::Retry;
@@ -44,6 +47,23 @@ const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
 /// get a larger queue depth.
 const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
 
+/// The overall request call timeout, including retries and pool acquisition.
+/// TODO: should we retry forever? Should the caller decide?
+const CALL_TIMEOUT: Duration = Duration::from_secs(60);
+
+/// The per-request (retry attempt) timeout, including any lazy connection establishment.
+const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// The initial request retry backoff duration. The first retry does not back off.
+/// TODO: use a different backoff for ResourceExhausted (rate limiting)? Needs server support.
+const BASE_BACKOFF: Duration = Duration::from_millis(5);
+
+/// The maximum request retry backoff duration.
+const MAX_BACKOFF: Duration = Duration::from_secs(5);
+
+/// Threshold and interval for warning about slow operation.
+const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
+
 /// A rich Pageserver gRPC client for a single tenant timeline. This client is more capable than the
 /// basic `page_api::Client` gRPC client, and supports:
 ///
@@ -67,8 +87,6 @@ pub struct PageserverClient {
     compression: Option<CompressionEncoding>,
     /// The shards for this tenant.
     shards: ArcSwap<Shards>,
-    /// The retry configuration.
-    retry: Retry,
 }
 
 impl PageserverClient {
@@ -94,7 +112,6 @@ impl PageserverClient {
             auth_token,
             compression,
             shards: ArcSwap::new(Arc::new(shards)),
-            retry: Retry,
         })
     }
 
@@ -142,13 +159,15 @@ impl PageserverClient {
         &self,
         req: page_api::CheckRelExistsRequest,
     ) -> tonic::Result<page_api::CheckRelExistsResponse> {
-        self.retry
-            .with(async |_| {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.check_rel_exists(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.check_rel_exists(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Returns the total size of a database, as # of bytes.
@@ -157,13 +176,15 @@ impl PageserverClient {
         &self,
         req: page_api::GetDbSizeRequest,
     ) -> tonic::Result<page_api::GetDbSizeResponse> {
-        self.retry
-            .with(async |_| {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.get_db_size(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_db_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches pages. The `request_id` must be unique across all in-flight requests, and the
@@ -193,6 +214,8 @@ impl PageserverClient {
             return Err(tonic::Status::invalid_argument("request attempt must be 0"));
         }
 
+        debug!("sending request: {req:?}");
+
         // The shards may change while we're fetching pages. We execute the request using a stable
         // view of the shards (especially important for requests that span shards), but retry the
         // top-level (pre-split) request to pick up shard changes. This can lead to unnecessary
@@ -201,13 +224,16 @@ impl PageserverClient {
         //
         // TODO: the gRPC server and client doesn't yet properly support shard splits. Revisit this
         // once we figure out how to handle these.
-        self.retry
-            .with(async |attempt| {
-                let mut req = req.clone();
-                req.request_id.attempt = attempt as u32;
-                Self::get_page_with_shards(req, &self.shards.load_full()).await
-            })
-            .await
+        let resp = Self::with_retries(CALL_TIMEOUT, async |attempt| {
+            let mut req = req.clone();
+            req.request_id.attempt = attempt as u32;
+            let shards = self.shards.load_full();
+            Self::with_timeout(REQUEST_TIMEOUT, Self::get_page_with_shards(req, &shards)).await
+        })
+        .await?;
+
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches pages using the given shards. This uses a stable view of the shards, regardless of
@@ -290,13 +316,15 @@ impl PageserverClient {
         &self,
         req: page_api::GetRelSizeRequest,
     ) -> tonic::Result<page_api::GetRelSizeResponse> {
-        self.retry
-            .with(async |_| {
-                // Relation metadata is only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.get_rel_size(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // Relation metadata is only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_rel_size(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
     }
 
     /// Fetches an SLRU segment.
@@ -305,13 +333,45 @@ impl PageserverClient {
         &self,
         req: page_api::GetSlruSegmentRequest,
     ) -> tonic::Result<page_api::GetSlruSegmentResponse> {
-        self.retry
-            .with(async |_| {
-                // SLRU segments are only available on shard 0.
-                let mut client = self.shards.load_full().get_zero().client().await?;
-                client.get_slru_segment(req).await
-            })
-            .await
+        debug!("sending request: {req:?}");
+        let resp = Self::with_retries(CALL_TIMEOUT, async |_| {
+            // SLRU segments are only available on shard 0.
+            let mut client = self.shards.load_full().get_zero().client().await?;
+            Self::with_timeout(REQUEST_TIMEOUT, client.get_slru_segment(req)).await
+        })
+        .await?;
+        debug!("received response: {resp:?}");
+        Ok(resp)
+    }
+
+    /// Runs the given async closure with retries up to the given timeout. Only certain gRPC status
+    /// codes are retried, see [`Retry::should_retry`]. Returns `DeadlineExceeded` on timeout.
+    async fn with_retries<T, F, O>(timeout: Duration, f: F) -> tonic::Result<T>
+    where
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
+        O: Future<Output = tonic::Result<T>>,
+    {
+        Retry {
+            timeout: Some(timeout),
+            base_backoff: BASE_BACKOFF,
+            max_backoff: MAX_BACKOFF,
+        }
+        .with(f)
+        .await
+    }
+
+    /// Runs the given future with a timeout. Returns `DeadlineExceeded` on timeout.
+    async fn with_timeout<T>(
+        timeout: Duration,
+        f: impl Future<Output = tonic::Result<T>>,
+    ) -> tonic::Result<T> {
+        let started = Instant::now();
+        tokio::time::timeout(timeout, f).await.map_err(|_| {
+            tonic::Status::deadline_exceeded(format!(
+                "request timed out after {:.3}s",
+                started.elapsed().as_secs_f64()
+            ))
+        })?
     }
 }
 
@@ -525,19 +585,25 @@ impl Shard {
     }
 
     /// Returns a pooled client for this shard.
+    #[instrument(skip_all)]
     async fn client(&self) -> tonic::Result<ClientGuard> {
-        self.client_pool
-            .get()
-            .await
-            .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
+        warn_slow(
+            "client pool acquisition",
+            SLOW_THRESHOLD,
+            pin!(self.client_pool.get()),
+        )
+        .await
+        .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
     }
 
     /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
     /// pool (e.g. for prefetches).
+    #[instrument(skip_all, fields(bulk))]
     async fn stream(&self, bulk: bool) -> StreamGuard {
-        match bulk {
-            false => self.stream_pool.get().await,
-            true => self.bulk_stream_pool.get().await,
-        }
+        let pool = match bulk {
+            false => &self.stream_pool,
+            true => &self.bulk_stream_pool,
+        };
+        warn_slow("stream pool acquisition", SLOW_THRESHOLD, pin!(pool.get())).await
     }
 }
diff --git a/pageserver/client_grpc/src/retry.rs b/pageserver/client_grpc/src/retry.rs
index a1e0b8636f..8a138711e8 100644
--- a/pageserver/client_grpc/src/retry.rs
+++ b/pageserver/client_grpc/src/retry.rs
@@ -1,5 +1,6 @@
 use std::time::Duration;
 
+use futures::future::pending;
 use tokio::time::Instant;
 use tracing::{error, info, warn};
 
@@ -8,60 +9,54 @@ use utils::backoff::exponential_backoff_duration;
 /// A retry handler for Pageserver gRPC requests.
 ///
 /// This is used instead of backoff::retry for better control and observability.
-pub struct Retry;
+pub struct Retry {
+    /// Timeout across all retry attempts. If None, retries forever.
+    pub timeout: Option<Duration>,
+    /// The initial backoff duration. The first retry does not use a backoff.
+    pub base_backoff: Duration,
+    /// The maximum backoff duration.
+    pub max_backoff: Duration,
+}
 
 impl Retry {
-    /// The per-request timeout.
-    // TODO: tune these, and/or make them configurable. Should we retry forever?
-    const REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
-    /// The total timeout across all attempts
-    const TOTAL_TIMEOUT: Duration = Duration::from_secs(60);
-    /// The initial backoff duration.
-    const BASE_BACKOFF: Duration = Duration::from_millis(10);
-    /// The maximum backoff duration.
-    const MAX_BACKOFF: Duration = Duration::from_secs(10);
-    /// If true, log successful requests. For debugging.
-    const LOG_SUCCESS: bool = false;
-
-    /// Runs the given async closure with timeouts and retries (exponential backoff), passing the
-    /// attempt number starting at 0. Logs errors, using the current tracing span for context.
+    /// Runs the given async closure with timeouts and retries (exponential backoff). Logs errors,
+    /// using the current tracing span for context.
     ///
-    /// Only certain gRPC status codes are retried, see [`Self::should_retry`]. For default
-    /// timeouts, see [`Self::REQUEST_TIMEOUT`] and [`Self::TOTAL_TIMEOUT`].
+    /// Only certain gRPC status codes are retried, see [`Self::should_retry`].
     pub async fn with<T, F, O>(&self, mut f: F) -> tonic::Result<T>
     where
-        F: FnMut(usize) -> O, // takes attempt number, starting at 0
+        F: FnMut(usize) -> O, // pass attempt number, starting at 0
         O: Future<Output = tonic::Result<T>>,
     {
         let started = Instant::now();
-        let deadline = started + Self::TOTAL_TIMEOUT;
+        let deadline = self.timeout.map(|timeout| started + timeout);
         let mut last_error = None;
         let mut retries = 0;
         loop {
-            // Set up a future to wait for the backoff (if any) and run the request with a timeout.
+            // Set up a future to wait for the backoff, if any, and run the closure.
             let backoff_and_try = async {
                 // NB: sleep() always sleeps 1ms, even when given a 0 argument. See:
                 // https://github.com/tokio-rs/tokio/issues/6866
-                if let Some(backoff) = Self::backoff_duration(retries) {
+                if let Some(backoff) = self.backoff_duration(retries) {
                     tokio::time::sleep(backoff).await;
                 }
 
-                let request_started = Instant::now();
-                tokio::time::timeout(Self::REQUEST_TIMEOUT, f(retries))
-                    .await
-                    .map_err(|_| {
-                        tonic::Status::deadline_exceeded(format!(
-                            "request timed out after {:.3}s",
-                            request_started.elapsed().as_secs_f64()
-                        ))
-                    })?
+                f(retries).await
             };
 
-            // Wait for the backoff and request, or bail out if the total timeout is exceeded.
+            // Set up a future for the timeout, if any.
+            let timeout = async {
+                match deadline {
+                    Some(deadline) => tokio::time::sleep_until(deadline).await,
+                    None => pending().await,
+                }
+            };
+
+            // Wait for the backoff and request, or bail out if the timeout is exceeded.
             let result = tokio::select! {
                 result = backoff_and_try => result,
 
-                _ = tokio::time::sleep_until(deadline) => {
+                _ = timeout => {
                     let last_error = last_error.unwrap_or_else(|| {
                         tonic::Status::deadline_exceeded(format!(
                             "request timed out after {:.3}s",
@@ -79,7 +74,7 @@ impl Retry {
             match result {
                 // Success, return the result.
                 Ok(result) => {
-                    if retries > 0 || Self::LOG_SUCCESS {
+                    if retries > 0 {
                         info!(
                             "request succeeded after {retries} retries in {:.3}s",
                             started.elapsed().as_secs_f64(),
@@ -112,12 +107,13 @@ impl Retry {
         }
     }
 
-    /// Returns the backoff duration for the given retry attempt, or None for no backoff.
-    fn backoff_duration(retry: usize) -> Option<Duration> {
+    /// Returns the backoff duration for the given retry attempt, or None for no backoff. The first
+    /// attempt and first retry never backs off, so this returns None for 0 and 1 retries.
+    fn backoff_duration(&self, retries: usize) -> Option<Duration> {
         let backoff = exponential_backoff_duration(
-            retry as u32,
-            Self::BASE_BACKOFF.as_secs_f64(),
-            Self::MAX_BACKOFF.as_secs_f64(),
+            (retries as u32).saturating_sub(1), // first retry does not back off
+            self.base_backoff.as_secs_f64(),
+            self.max_backoff.as_secs_f64(),
         );
         (!backoff.is_zero()).then_some(backoff)
     }

From f18cc808f09adcc5fd570cdb2a5bddd2c77a0da9 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 12:47:26 +0200
Subject: [PATCH 106/163] pageserver/client_grpc: reap idle channels
 immediately (#12587)

## Problem

It can take 3x the idle timeout to reap a channel. We have to wait for
the idle timeout to trigger first for the stream, then the client, then
the channel.

Touches #11735.

## Summary of changes

Reap empty channels immediately, and rely indirectly on the
channel/stream timeouts.

This can still lead to 2x the idle timeout for streams (first stream
then client), but that's okay -- if the stream closes abruptly (e.g. due
to timeout or error) we want to keep the client around in the pool for a
while.
---
 pageserver/client_grpc/src/pool.rs | 66 +++++++++---------------------
 1 file changed, 19 insertions(+), 47 deletions(-)

diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 906872e091..4a29252cd9 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -9,19 +9,20 @@
 //!
 //! * ChannelPool: manages gRPC channels (TCP connections) to a single Pageserver. Multiple clients
 //!   can acquire and use the same channel concurrently (via HTTP/2 stream multiplexing), up to a
-//!   per-channel client limit. Channels may be closed when they are no longer used by any clients.
+//!   per-channel client limit. Channels are closed immediately when empty, and indirectly rely on
+//!   client/stream idle timeouts.
 //!
 //! * ClientPool: manages gRPC clients for a single tenant shard. Each client acquires a (shared)
 //!   channel from the ChannelPool for the client's lifetime. A client can only be acquired by a
-//!   single caller at a time, and is returned to the pool when dropped. Idle clients may be removed
-//!   from the pool after some time, to free up the channel.
+//!   single caller at a time, and is returned to the pool when dropped. Idle clients are removed
+//!   from the pool after a while to free up resources.
 //!
 //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
 //!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
 //!   returns a guard that can be used to send a single request, to properly enforce queue depth and
 //!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
 //!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
-//!   queue depth). Idle streams may be removed from the pool after a while to free up the client.
+//!   queue depth). Idle streams are removed from the pool after a while to free up resources.
 //!
 //! Each channel corresponds to one TCP connection. Each client unary request and each stream
 //! corresponds to one HTTP/2 stream and server task.
@@ -48,14 +49,12 @@ use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
 use utils::shard::ShardIndex;
 
-/// Reap channels/clients/streams that have been idle for this long.
+/// Reap clients/streams that have been idle for this long. Channels are reaped immediately when
+/// empty, and indirectly rely on the client/stream idle timeouts.
 ///
-/// TODO: this is per-pool. For nested pools, it can take up to 3x as long for a TCP connection to
-/// be reaped. First, we must wait for an idle stream to be reaped, which marks its client as idle.
-/// Then, we must wait for the idle client to be reaped, which marks its channel as idle. Then, we
-/// must wait for the idle channel to be reaped. Is that a problem? Maybe not, we just have to
-/// account for it when setting the reap threshold. Alternatively, we can immediately reap empty
-/// channels, and/or stream pool clients.
+/// A stream's client will be reaped after 2x the idle threshold (first stream the client), but
+/// that's okay -- if the stream closes abruptly (e.g. due to timeout or cancellation), we want to
+/// keep its client around in the pool for a while.
 const REAP_IDLE_THRESHOLD: Duration = match cfg!(any(test, feature = "testing")) {
     false => Duration::from_secs(180),
     true => Duration::from_secs(1), // exercise reaping in tests
@@ -83,8 +82,6 @@ pub struct ChannelPool {
     max_clients_per_channel: NonZero<usize>,
     /// Open channels.
     channels: Mutex<BTreeMap<ChannelID, ChannelEntry>>,
-    /// Reaps idle channels.
-    idle_reaper: Reaper,
     /// Channel ID generator.
     next_channel_id: AtomicUsize,
 }
@@ -96,9 +93,6 @@ struct ChannelEntry {
     channel: Channel,
     /// Number of clients using this channel.
     clients: usize,
-    /// The channel has been idle (no clients) since this time. None if channel is in use.
-    /// INVARIANT: Some if clients == 0, otherwise None.
-    idle_since: Option<Instant>,
 }
 
 impl ChannelPool {
@@ -108,15 +102,12 @@ impl ChannelPool {
         E: TryInto<Endpoint> + Send + Sync + 'static,
         <E as TryInto<Endpoint>>::Error: std::error::Error + Send + Sync,
     {
-        let pool = Arc::new(Self {
+        Ok(Arc::new(Self {
             endpoint: endpoint.try_into()?,
             max_clients_per_channel,
             channels: Mutex::default(),
-            idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
             next_channel_id: AtomicUsize::default(),
-        });
-        pool.idle_reaper.spawn(&pool);
-        Ok(pool)
+        }))
     }
 
     /// Acquires a gRPC channel for a client. Multiple clients may acquire the same channel.
@@ -137,22 +128,17 @@ impl ChannelPool {
         let mut channels = self.channels.lock().unwrap();
 
         // Try to find an existing channel with available capacity. We check entries in BTreeMap
-        // order, to fill up the lower-ordered channels first. The ClientPool also prefers clients
-        // with lower-ordered channel IDs first. This will cluster clients in lower-ordered
+        // order, to fill up the lower-ordered channels first. The client/stream pools also prefer
+        // clients with lower-ordered channel IDs first. This will cluster clients in lower-ordered
         // channels, and free up higher-ordered channels such that they can be reaped.
         for (&id, entry) in channels.iter_mut() {
             assert!(
                 entry.clients <= self.max_clients_per_channel.get(),
                 "channel overflow"
             );
-            assert_eq!(
-                entry.idle_since.is_some(),
-                entry.clients == 0,
-                "incorrect channel idle state"
-            );
+            assert_ne!(entry.clients, 0, "empty channel not reaped");
             if entry.clients < self.max_clients_per_channel.get() {
                 entry.clients += 1;
-                entry.idle_since = None;
                 return ChannelGuard {
                     pool: Arc::downgrade(self),
                     id,
@@ -169,7 +155,6 @@ impl ChannelPool {
         let entry = ChannelEntry {
             channel: channel.clone(),
             clients: 1, // account for the guard below
-            idle_since: None,
         };
         channels.insert(id, entry);
 
@@ -181,20 +166,6 @@ impl ChannelPool {
     }
 }
 
-impl Reapable for ChannelPool {
-    /// Reaps channels that have been idle since before the cutoff.
-    fn reap_idle(&self, cutoff: Instant) {
-        self.channels.lock().unwrap().retain(|_, entry| {
-            let Some(idle_since) = entry.idle_since else {
-                assert_ne!(entry.clients, 0, "empty channel not marked idle");
-                return true;
-            };
-            assert_eq!(entry.clients, 0, "idle channel has clients");
-            idle_since >= cutoff
-        })
-    }
-}
-
 /// Tracks a channel acquired from the pool. The owned inner channel can be obtained with `take()`,
 /// since the gRPC client requires an owned `Channel`.
 pub struct ChannelGuard {
@@ -211,7 +182,7 @@ impl ChannelGuard {
     }
 }
 
-/// Returns the channel to the pool.
+/// Returns the channel to the pool. The channel is closed when empty.
 impl Drop for ChannelGuard {
     fn drop(&mut self) {
         let Some(pool) = self.pool.upgrade() else {
@@ -220,11 +191,12 @@ impl Drop for ChannelGuard {
 
         let mut channels = pool.channels.lock().unwrap();
         let entry = channels.get_mut(&self.id).expect("unknown channel");
-        assert!(entry.idle_since.is_none(), "active channel marked idle");
         assert!(entry.clients > 0, "channel underflow");
         entry.clients -= 1;
+
+        // Reap empty channels immediately.
         if entry.clients == 0 {
-            entry.idle_since = Some(Instant::now()); // mark channel as idle
+            channels.remove(&self.id);
         }
     }
 }

From 30b877074cda2580c677ec9527b83ab975dee181 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 13:44:53 +0200
Subject: [PATCH 107/163] pagebench: add CPU profiling support (#12478)

## Problem

The new communicator gRPC client has significantly worse Pagebench
performance than a basic gRPC client. We need to find out why.

## Summary of changes

Add a `pagebench --profile` flag which takes a client CPU profile of the
benchmark and writes a flamegraph to `profile.svg`.
---
 Cargo.lock                       |  1 +
 pageserver/pagebench/Cargo.toml  |  1 +
 pageserver/pagebench/src/main.rs | 59 +++++++++++++++++++++++++-------
 3 files changed, 49 insertions(+), 12 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 14b460005a..bea8d3a7fd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4296,6 +4296,7 @@ dependencies = [
  "pageserver_client",
  "pageserver_client_grpc",
  "pageserver_page_api",
+ "pprof",
  "rand 0.8.5",
  "reqwest",
  "serde",
diff --git a/pageserver/pagebench/Cargo.toml b/pageserver/pagebench/Cargo.toml
index 4086213830..609fef2b4f 100644
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -16,6 +16,7 @@ futures.workspace = true
 hdrhistogram.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
+pprof.workspace = true
 rand.workspace = true
 reqwest.workspace = true
 serde.workspace = true
diff --git a/pageserver/pagebench/src/main.rs b/pageserver/pagebench/src/main.rs
index 6498203de3..ceca58e032 100644
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -1,4 +1,7 @@
+use std::fs::File;
+
 use clap::Parser;
+use tracing::info;
 use utils::logging;
 
 /// Re-usable pieces of code that aren't CLI-specific.
@@ -24,7 +27,18 @@ mod cmd {
 
 /// Component-level performance test for pageserver.
 #[derive(clap::Parser)]
-enum Args {
+struct Args {
+    /// Takes a client CPU profile into profile.svg. The benchmark must exit cleanly before it's
+    /// written, e.g. via --runtime.
+    #[arg(long)]
+    profile: bool,
+
+    #[command(subcommand)]
+    subcommand: Subcommand,
+}
+
+#[derive(clap::Subcommand)]
+enum Subcommand {
     Basebackup(cmd::basebackup::Args),
     GetPageLatestLsn(cmd::getpage_latest_lsn::Args),
     TriggerInitialSizeCalculation(cmd::trigger_initial_size_calculation::Args),
@@ -33,25 +47,46 @@ enum Args {
     IdleStreams(cmd::idle_streams::Args),
 }
 
-fn main() {
+fn main() -> anyhow::Result<()> {
     logging::init(
         logging::LogFormat::Plain,
         logging::TracingErrorLayerEnablement::Disabled,
         logging::Output::Stderr,
-    )
-    .unwrap();
+    )?;
     logging::replace_panic_hook_with_tracing_panic_hook().forget();
 
     let args = Args::parse();
-    match args {
-        Args::Basebackup(args) => cmd::basebackup::main(args),
-        Args::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
-        Args::TriggerInitialSizeCalculation(args) => {
+
+    // Start a CPU profile if requested.
+    let mut profiler = None;
+    if args.profile {
+        profiler = Some(
+            pprof::ProfilerGuardBuilder::default()
+                .frequency(1000)
+                .blocklist(&["libc", "libgcc", "pthread", "vdso"])
+                .build()?,
+        );
+    }
+
+    match args.subcommand {
+        Subcommand::Basebackup(args) => cmd::basebackup::main(args),
+        Subcommand::GetPageLatestLsn(args) => cmd::getpage_latest_lsn::main(args),
+        Subcommand::TriggerInitialSizeCalculation(args) => {
             cmd::trigger_initial_size_calculation::main(args)
         }
-        Args::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
-        Args::AuxFiles(args) => cmd::aux_files::main(args),
-        Args::IdleStreams(args) => cmd::idle_streams::main(args),
+        Subcommand::OndemandDownloadChurn(args) => cmd::ondemand_download_churn::main(args),
+        Subcommand::AuxFiles(args) => cmd::aux_files::main(args),
+        Subcommand::IdleStreams(args) => cmd::idle_streams::main(args),
+    }?;
+
+    // Generate a CPU flamegraph if requested.
+    if let Some(profiler) = profiler {
+        let report = profiler.report().build()?;
+        drop(profiler); // stop profiling
+        let file = File::create("profile.svg")?;
+        report.flamegraph(file)?;
+        info!("wrote CPU profile flamegraph to profile.svg")
     }
-    .unwrap()
+
+    Ok(())
 }

From 42ab34dc362b1b54dc96c43202b43d5ece558aa7 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 14:11:33 +0200
Subject: [PATCH 108/163] pageserver/client_grpc: don't pipeline GetPage
 requests (#12584)

## Problem

The communicator gRPC client currently attempts to pipeline GetPage
requests from multiple callers onto the same gRPC stream. This has a
number of issues:

* Head-of-line blocking: the request may block on e.g. layer download or
LSN wait, delaying the next request.
* Cancellation: we can't easily cancel in-progress requests (e.g. due to
timeout or backend termination), so it may keep blocking the next
request (even its own retry).
* Complex stream scheduling: picking a stream becomes harder/slower, and
additional Tokio tasks and synchronization is needed for stream
management.

Touches #11735.
Requires #12579.

## Summary of changes

This patch removes pipelining of gRPC stream requests, and instead
prefers to scale out the number of streams to achieve the same
throughput. Stream scheduling has been rewritten, and mostly follows the
same pattern as the client pool with exclusive acquisition by a single
caller.

[Benchmarks](https://github.com/neondatabase/neon/pull/12583) show that
the cost of an idle server-side GetPage worker task is about 26 KB (2.5
GB for 100,000), so we can afford to scale out.

This has a number of advantages:

* It (mostly) eliminates head-of-line blocking (except at the TCP
level).
* Cancellation becomes trivial, by closing the stream.
* Stream scheduling becomes significantly simpler and cheaper.
* Individual callers can still use client-side batching for pipelining.
---
 Cargo.lock                           |   1 +
 Cargo.toml                           |   2 +-
 pageserver/client_grpc/src/client.rs |  19 +-
 pageserver/client_grpc/src/pool.rs   | 397 +++++++++++----------------
 pageserver/page_api/src/model.rs     |   7 +-
 workspace_hack/Cargo.toml            |   2 +-
 6 files changed, 165 insertions(+), 263 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index bea8d3a7fd..2f36790d30 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -7564,6 +7564,7 @@ dependencies = [
  "futures-core",
  "pin-project-lite",
  "tokio",
+ "tokio-util",
 ]
 
 [[package]]
diff --git a/Cargo.toml b/Cargo.toml
index 0d521ee4d9..df2064a4a7 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -201,7 +201,7 @@ tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.g
 tokio-io-timeout = "1.2.0"
 tokio-postgres-rustls = "0.12.0"
 tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]}
-tokio-stream = "0.1"
+tokio-stream = { version = "0.1", features = ["sync"] }
 tokio-tar = "0.3"
 tokio-util = { version = "0.7.10", features = ["io", "io-util", "rt"] }
 toml = "0.8"
diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 7732585f7c..4b606d6939 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -32,21 +32,13 @@ const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
 /// Max number of concurrent unary request clients per shard.
 const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of concurrent GetPage streams per shard. The max number of concurrent GetPage
-/// requests is given by `MAX_STREAMS * MAX_STREAM_QUEUE_DEPTH`.
+/// Max number of concurrent GetPage streams per shard.
 const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of pipelined requests per stream.
-const MAX_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(2).unwrap();
-
 /// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
-/// are more throughput-oriented, we have a smaller limit but higher queue depth.
+/// are more throughput-oriented, we have a smaller limit.
 const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
 
-/// Max number of pipelined requests per bulk stream. These are more throughput-oriented and thus
-/// get a larger queue depth.
-const MAX_BULK_STREAM_QUEUE_DEPTH: NonZero<usize> = NonZero::new(4).unwrap();
-
 /// The overall request call timeout, including retries and pool acquisition.
 /// TODO: should we retry forever? Should the caller decide?
 const CALL_TIMEOUT: Duration = Duration::from_secs(60);
@@ -272,7 +264,7 @@ impl PageserverClient {
         req: page_api::GetPageRequest,
         shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let stream = shard.stream(req.request_class.is_bulk()).await;
+        let mut stream = shard.stream(req.request_class.is_bulk()).await?;
         let resp = stream.send(req.clone()).await?;
 
         // Convert per-request errors into a tonic::Status.
@@ -557,7 +549,6 @@ impl Shard {
                 None, // unbounded, limited by stream pool
             ),
             Some(MAX_STREAMS),
-            MAX_STREAM_QUEUE_DEPTH,
         );
 
         // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
@@ -573,7 +564,6 @@ impl Shard {
                 None, // unbounded, limited by stream pool
             ),
             Some(MAX_BULK_STREAMS),
-            MAX_BULK_STREAM_QUEUE_DEPTH,
         );
 
         Ok(Self {
@@ -593,13 +583,12 @@ impl Shard {
             pin!(self.client_pool.get()),
         )
         .await
-        .map_err(|err| tonic::Status::internal(format!("failed to get client: {err}")))
     }
 
     /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
     /// pool (e.g. for prefetches).
     #[instrument(skip_all, fields(bulk))]
-    async fn stream(&self, bulk: bool) -> StreamGuard {
+    async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
         let pool = match bulk {
             false => &self.stream_pool,
             true => &self.bulk_stream_pool,
diff --git a/pageserver/client_grpc/src/pool.rs b/pageserver/client_grpc/src/pool.rs
index 4a29252cd9..98a649b4c8 100644
--- a/pageserver/client_grpc/src/pool.rs
+++ b/pageserver/client_grpc/src/pool.rs
@@ -18,11 +18,27 @@
 //!   from the pool after a while to free up resources.
 //!
 //! * StreamPool: manages bidirectional gRPC GetPage streams. Each stream acquires a client from the
-//!   ClientPool for the stream's lifetime. Internal streams are not exposed to callers; instead, it
-//!   returns a guard that can be used to send a single request, to properly enforce queue depth and
-//!   route responses. Internally, the pool will reuse or spin up a suitable stream for the request,
-//!   possibly pipelining multiple requests from multiple callers on the same stream (up to some
-//!   queue depth). Idle streams are removed from the pool after a while to free up resources.
+//!   ClientPool for the stream's lifetime. A stream can only be acquired by a single caller at a
+//!   time, and is returned to the pool when dropped. Idle streams are removed from the pool after
+//!   a while to free up resources.
+//!
+//!   The stream only supports sending a single, synchronous request at a time, and does not support
+//!   pipelining multiple requests from different callers onto the same stream -- instead, we scale
+//!   out concurrent streams to improve throughput. There are many reasons for this design choice:
+//!
+//!     * It (mostly) eliminates head-of-line blocking. A single stream is processed sequentially by
+//!       a single server task, which may block e.g. on layer downloads, LSN waits, etc.
+//!
+//!     * Cancellation becomes trivial, by closing the stream. Otherwise, if a caller goes away
+//!       (e.g. because of a timeout), the request would still be processed by the server and block
+//!       requests behind it in the stream. It might even block its own timeout retry.
+//!
+//!     * Stream scheduling becomes significantly simpler and cheaper.
+//!
+//!     * Individual callers can still use client-side batching for pipelining.
+//!
+//!     * Idle streams are cheap. Benchmarks show that an idle GetPage stream takes up about 26 KB
+//!       per stream (2.5 GB for 100,000 streams), so we can afford to scale out.
 //!
 //! Each channel corresponds to one TCP connection. Each client unary request and each stream
 //! corresponds to one HTTP/2 stream and server task.
@@ -30,20 +46,20 @@
 //! TODO: error handling (including custom error types).
 //! TODO: observability.
 
-use std::collections::{BTreeMap, HashMap};
+use std::collections::BTreeMap;
 use std::num::NonZero;
 use std::ops::{Deref, DerefMut};
+use std::pin::Pin;
 use std::sync::atomic::{AtomicUsize, Ordering};
 use std::sync::{Arc, Mutex, Weak};
 use std::time::{Duration, Instant};
 
-use futures::StreamExt as _;
-use tokio::sync::mpsc::{Receiver, Sender};
-use tokio::sync::{OwnedSemaphorePermit, Semaphore, mpsc, oneshot};
+use futures::{Stream, StreamExt as _};
+use tokio::sync::{OwnedSemaphorePermit, Semaphore, watch};
+use tokio_stream::wrappers::WatchStream;
 use tokio_util::sync::CancellationToken;
 use tonic::codec::CompressionEncoding;
 use tonic::transport::{Channel, Endpoint};
-use tracing::{error, warn};
 
 use pageserver_page_api as page_api;
 use utils::id::{TenantId, TimelineId};
@@ -225,8 +241,7 @@ pub struct ClientPool {
     ///
     /// The first client in the map will be acquired next. The map is sorted by client ID, which in
     /// turn is sorted by its channel ID, such that we prefer acquiring idle clients from
-    /// lower-ordered channels. This allows us to free up and reap higher-numbered channels as idle
-    /// clients are reaped.
+    /// lower-ordered channels. This allows us to free up and reap higher-ordered channels.
     idle: Mutex<BTreeMap<ClientID, ClientEntry>>,
     /// Reaps idle clients.
     idle_reaper: Reaper,
@@ -282,7 +297,7 @@ impl ClientPool {
     /// This is moderately performance-sensitive. It is called for every unary request, but these
     /// establish a new gRPC stream per request so they're already expensive. GetPage requests use
     /// the `StreamPool` instead.
-    pub async fn get(self: &Arc<Self>) -> anyhow::Result<ClientGuard> {
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<ClientGuard> {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
@@ -300,7 +315,7 @@ impl ClientPool {
             });
         }
 
-        // Slow path: construct a new client.
+        // Construct a new client.
         let mut channel_guard = self.channel_pool.get();
         let client = page_api::Client::new(
             channel_guard.take(),
@@ -309,7 +324,8 @@ impl ClientPool {
             self.shard_id,
             self.auth_token.clone(),
             self.compression,
-        )?;
+        )
+        .map_err(|err| tonic::Status::internal(format!("failed to create client: {err}")))?;
 
         Ok(ClientGuard {
             pool: Arc::downgrade(self),
@@ -379,287 +395,187 @@ impl Drop for ClientGuard {
 /// A pool of bidirectional gRPC streams. Currently only used for GetPage streams. Each stream
 /// acquires a client from the inner `ClientPool` for the stream's lifetime.
 ///
-/// Individual streams are not exposed to callers -- instead, the returned guard can be used to send
-/// a single request and await the response. Internally, requests are multiplexed across streams and
-/// channels. This allows proper queue depth enforcement and response routing.
+/// Individual streams only send a single request at a time, and do not pipeline multiple callers
+/// onto the same stream. Instead, we scale out the number of concurrent streams. This is primarily
+/// to eliminate head-of-line blocking. See the module documentation for more details.
 ///
 /// TODO: consider making this generic over request and response types; not currently needed.
 pub struct StreamPool {
     /// The client pool to acquire clients from. Must be unbounded.
     client_pool: Arc<ClientPool>,
-    /// All pooled streams.
+    /// Idle pooled streams. Acquired streams are removed from here and returned on drop.
     ///
-    /// Incoming requests will be sent over an existing stream with available capacity. If all
-    /// streams are full, a new one is spun up and added to the pool (up to `max_streams`). Each
-    /// stream has an associated Tokio task that processes requests and responses.
-    streams: Mutex<HashMap<StreamID, StreamEntry>>,
-    /// The max number of concurrent streams, or None if unbounded.
-    max_streams: Option<NonZero<usize>>,
-    /// The max number of concurrent requests per stream.
-    max_queue_depth: NonZero<usize>,
-    /// Limits the max number of concurrent requests, given by `max_streams * max_queue_depth`.
-    /// None if the pool is unbounded.
+    /// The first stream in the map will be acquired next. The map is sorted by stream ID, which is
+    /// equivalent to the client ID and in turn sorted by its channel ID. This way we prefer
+    /// acquiring idle streams from lower-ordered channels, which allows us to free up and reap
+    /// higher-ordered channels.
+    idle: Mutex<BTreeMap<StreamID, StreamEntry>>,
+    /// Limits the max number of concurrent streams. None if the pool is unbounded.
     limiter: Option<Arc<Semaphore>>,
     /// Reaps idle streams.
     idle_reaper: Reaper,
-    /// Stream ID generator.
-    next_stream_id: AtomicUsize,
 }
 
-type StreamID = usize;
-type RequestSender = Sender<(page_api::GetPageRequest, ResponseSender)>;
-type RequestReceiver = Receiver<(page_api::GetPageRequest, ResponseSender)>;
-type ResponseSender = oneshot::Sender<tonic::Result<page_api::GetPageResponse>>;
+/// The stream ID. Reuses the inner client ID.
+type StreamID = ClientID;
 
+/// A pooled stream.
 struct StreamEntry {
-    /// Sends caller requests to the stream task. The stream task exits when this is dropped.
-    sender: RequestSender,
-    /// Number of in-flight requests on this stream.
-    queue_depth: usize,
-    /// The time when this stream went idle (queue_depth == 0).
-    /// INVARIANT: Some if queue_depth == 0, otherwise None.
-    idle_since: Option<Instant>,
+    /// The bidirectional stream.
+    stream: BiStream,
+    /// The time when this stream was last used, i.e. when it was put back into `StreamPool::idle`.
+    idle_since: Instant,
+}
+
+/// A bidirectional GetPage stream and its client. Can send requests and receive responses.
+struct BiStream {
+    /// The owning client. Holds onto the channel slot while the stream is alive.
+    client: ClientGuard,
+    /// Stream for sending requests. Uses a watch channel, so it can only send a single request at a
+    /// time, and the caller must await the response before sending another request. This is
+    /// enforced by `StreamGuard::send`.
+    sender: watch::Sender<page_api::GetPageRequest>,
+    /// Stream for receiving responses.
+    receiver: Pin<Box<dyn Stream<Item = tonic::Result<page_api::GetPageResponse>> + Send>>,
 }
 
 impl StreamPool {
-    /// Creates a new stream pool, using the given client pool. It will send up to `max_queue_depth`
-    /// concurrent requests on each stream, and use up to `max_streams` concurrent streams.
+    /// Creates a new stream pool, using the given client pool. It will use up to `max_streams`
+    /// concurrent streams.
     ///
     /// The client pool must be unbounded. The stream pool will enforce its own limits, and because
     /// streams are long-lived they can cause persistent starvation if they exhaust the client pool.
     /// The stream pool should generally have its own dedicated client pool (but it can share a
     /// channel pool with others since these are always unbounded).
-    pub fn new(
-        client_pool: Arc<ClientPool>,
-        max_streams: Option<NonZero<usize>>,
-        max_queue_depth: NonZero<usize>,
-    ) -> Arc<Self> {
+    pub fn new(client_pool: Arc<ClientPool>, max_streams: Option<NonZero<usize>>) -> Arc<Self> {
         assert!(client_pool.limiter.is_none(), "bounded client pool");
         let pool = Arc::new(Self {
             client_pool,
-            streams: Mutex::default(),
-            limiter: max_streams.map(|max_streams| {
-                Arc::new(Semaphore::new(max_streams.get() * max_queue_depth.get()))
-            }),
-            max_streams,
-            max_queue_depth,
+            idle: Mutex::default(),
+            limiter: max_streams.map(|max_streams| Arc::new(Semaphore::new(max_streams.get()))),
             idle_reaper: Reaper::new(REAP_IDLE_THRESHOLD, REAP_IDLE_INTERVAL),
-            next_stream_id: AtomicUsize::default(),
         });
         pool.idle_reaper.spawn(&pool);
         pool
     }
 
-    /// Acquires an available stream from the pool, or spins up a new stream async if all streams
-    /// are full. Returns a guard that can be used to send a single request on the stream and await
-    /// the response, with queue depth quota already acquired. Blocks if the pool is at capacity
-    /// (i.e. `CLIENT_LIMIT * STREAM_QUEUE_DEPTH` requests in flight).
+    /// Acquires an available stream from the pool, or spins up a new stream if all streams are
+    /// full. Returns a guard that can be used to send requests and await the responses. Blocks if
+    /// the pool is full.
     ///
     /// This is very performance-sensitive, as it is on the GetPage hot path.
     ///
-    /// TODO: this must do something more sophisticated for performance. We want:
-    ///
-    /// * Cheap, concurrent access in the common case where we can use a pooled stream.
-    /// * Quick acquisition of pooled streams with available capacity.
-    /// * Prefer streams that belong to lower-numbered channels, to reap idle channels.
-    /// * Prefer filling up existing streams' queue depth before spinning up new streams.
-    /// * Don't hold a lock while spinning up new streams.
-    /// * Allow concurrent clients to join onto streams while they're spun up.
-    /// * Allow spinning up multiple streams concurrently, but don't overshoot limits.
-    ///
-    /// For now, we just do something simple but inefficient (linear scan under mutex).
-    pub async fn get(self: &Arc<Self>) -> StreamGuard {
+    /// TODO: is a `Mutex<BTreeMap>` performant enough? Will it become too contended? We can't
+    /// trivially use e.g. DashMap or sharding, because we want to pop lower-ordered streams first
+    /// to free up higher-ordered channels.
+    pub async fn get(self: &Arc<Self>) -> tonic::Result<StreamGuard> {
         // Acquire a permit if the pool is bounded.
         let mut permit = None;
         if let Some(limiter) = self.limiter.clone() {
             permit = Some(limiter.acquire_owned().await.expect("never closed"));
         }
-        let mut streams = self.streams.lock().unwrap();
 
-        // Look for a pooled stream with available capacity.
-        for (&id, entry) in streams.iter_mut() {
-            assert!(
-                entry.queue_depth <= self.max_queue_depth.get(),
-                "stream queue overflow"
-            );
-            assert_eq!(
-                entry.idle_since.is_some(),
-                entry.queue_depth == 0,
-                "incorrect stream idle state"
-            );
-            if entry.queue_depth < self.max_queue_depth.get() {
-                entry.queue_depth += 1;
-                entry.idle_since = None;
-                return StreamGuard {
-                    pool: Arc::downgrade(self),
-                    id,
-                    sender: entry.sender.clone(),
-                    permit,
-                };
-            }
+        // Fast path: acquire an idle stream from the pool.
+        if let Some((_, entry)) = self.idle.lock().unwrap().pop_first() {
+            return Ok(StreamGuard {
+                pool: Arc::downgrade(self),
+                stream: Some(entry.stream),
+                can_reuse: true,
+                permit,
+            });
         }
 
-        // No available stream, spin up a new one. We install the stream entry in the pool first and
-        // return the guard, while spinning up the stream task async. This allows other callers to
-        // join onto this stream and also create additional streams concurrently if this fills up.
-        let id = self.next_stream_id.fetch_add(1, Ordering::Relaxed);
-        let (req_tx, req_rx) = mpsc::channel(self.max_queue_depth.get());
-        let entry = StreamEntry {
-            sender: req_tx.clone(),
-            queue_depth: 1, // reserve quota for this caller
-            idle_since: None,
-        };
-        streams.insert(id, entry);
+        // Spin up a new stream. Uses a watch channel to send a single request at a time, since
+        // `StreamGuard::send` enforces this anyway and it avoids unnecessary channel overhead.
+        let mut client = self.client_pool.get().await?;
 
-        if let Some(max_streams) = self.max_streams {
-            assert!(streams.len() <= max_streams.get(), "stream overflow");
-        };
+        let (req_tx, req_rx) = watch::channel(page_api::GetPageRequest::default());
+        let req_stream = WatchStream::from_changes(req_rx);
+        let resp_stream = client.get_pages(req_stream).await?;
 
-        let client_pool = self.client_pool.clone();
-        let pool = Arc::downgrade(self);
-
-        tokio::spawn(async move {
-            if let Err(err) = Self::run_stream(client_pool, req_rx).await {
-                error!("stream failed: {err}");
-            }
-            // Remove stream from pool on exit. Weak reference to avoid holding the pool alive.
-            if let Some(pool) = pool.upgrade() {
-                let entry = pool.streams.lock().unwrap().remove(&id);
-                assert!(entry.is_some(), "unknown stream ID: {id}");
-            }
-        });
-
-        StreamGuard {
+        Ok(StreamGuard {
             pool: Arc::downgrade(self),
-            id,
-            sender: req_tx,
+            stream: Some(BiStream {
+                client,
+                sender: req_tx,
+                receiver: Box::pin(resp_stream),
+            }),
+            can_reuse: true,
             permit,
-        }
-    }
-
-    /// Runs a stream task. This acquires a client from the `ClientPool` and establishes a
-    /// bidirectional GetPage stream, then forwards requests and responses between callers and the
-    /// stream. It does not track or enforce queue depths -- that's done by `get()` since it must be
-    /// atomic with pool stream acquisition.
-    ///
-    /// The task exits when the request channel is closed, or on a stream error. The caller is
-    /// responsible for removing the stream from the pool on exit.
-    async fn run_stream(
-        client_pool: Arc<ClientPool>,
-        mut caller_rx: RequestReceiver,
-    ) -> anyhow::Result<()> {
-        // Acquire a client from the pool and create a stream.
-        let mut client = client_pool.get().await?;
-
-        // NB: use an unbounded channel such that the stream send never blocks. Otherwise, we could
-        // theoretically deadlock if both the client and server block on sends (since we're not
-        // reading responses while sending). This is unlikely to happen due to gRPC/TCP buffers and
-        // low queue depths, but it was seen to happen with the libpq protocol so better safe than
-        // sorry. It should never buffer more than the queue depth anyway, but using an unbounded
-        // channel guarantees that it will never block.
-        let (req_tx, req_rx) = mpsc::unbounded_channel();
-        let req_stream = tokio_stream::wrappers::UnboundedReceiverStream::new(req_rx);
-        let mut resp_stream = client.get_pages(req_stream).await?;
-
-        // Track caller response channels by request ID. If the task returns early, these response
-        // channels will be dropped and the waiting callers will receive an error.
-        //
-        // NB: this will leak entries if the server doesn't respond to a request (by request ID).
-        // It shouldn't happen, and if it does it will often hold onto queue depth quota anyway and
-        // block further use. But we could consider reaping closed channels after some time.
-        let mut callers = HashMap::new();
-
-        // Process requests and responses.
-        loop {
-            tokio::select! {
-                // Receive requests from callers and send them to the stream.
-                req = caller_rx.recv() => {
-                    // Shut down if request channel is closed.
-                    let Some((req, resp_tx)) = req else {
-                        return Ok(());
-                    };
-
-                    // Store the response channel by request ID.
-                    if callers.contains_key(&req.request_id) {
-                        // Error on request ID duplicates. Ignore callers that went away.
-                        _ = resp_tx.send(Err(tonic::Status::invalid_argument(
-                            format!("duplicate request ID: {}", req.request_id),
-                        )));
-                        continue;
-                    }
-                    callers.insert(req.request_id, resp_tx);
-
-                    // Send the request on the stream. Bail out if the stream is closed.
-                    req_tx.send(req).map_err(|_| {
-                        tonic::Status::unavailable("stream closed")
-                    })?;
-                }
-
-                // Receive responses from the stream and send them to callers.
-                resp = resp_stream.next() => {
-                    // Shut down if the stream is closed, and bail out on stream errors.
-                    let Some(resp) = resp.transpose()? else {
-                        return Ok(())
-                    };
-
-                    // Send the response to the caller. Ignore errors if the caller went away.
-                    let Some(resp_tx) = callers.remove(&resp.request_id) else {
-                        warn!("received response for unknown request ID: {}", resp.request_id);
-                        continue;
-                    };
-                    _ = resp_tx.send(Ok(resp));
-                }
-            }
-        }
+        })
     }
 }
 
 impl Reapable for StreamPool {
     /// Reaps streams that have been idle since before the cutoff.
     fn reap_idle(&self, cutoff: Instant) {
-        self.streams.lock().unwrap().retain(|_, entry| {
-            let Some(idle_since) = entry.idle_since else {
-                assert_ne!(entry.queue_depth, 0, "empty stream not marked idle");
-                return true;
-            };
-            assert_eq!(entry.queue_depth, 0, "idle stream has requests");
-            idle_since >= cutoff
-        });
+        self.idle
+            .lock()
+            .unwrap()
+            .retain(|_, entry| entry.idle_since >= cutoff);
     }
 }
 
-/// A pooled stream reference. Can be used to send a single request, to properly enforce queue
-/// depth. Queue depth is already reserved and will be returned on drop.
+/// A stream acquired from the pool. Returned to the pool when dropped, unless there are still
+/// in-flight requests on the stream, or the stream failed.
 pub struct StreamGuard {
     pool: Weak<StreamPool>,
-    id: StreamID,
-    sender: RequestSender,
+    stream: Option<BiStream>,             // Some until dropped
+    can_reuse: bool,                      // returned to pool if true
     permit: Option<OwnedSemaphorePermit>, // None if pool is unbounded
 }
 
 impl StreamGuard {
-    /// Sends a request on the stream and awaits the response. Consumes the guard, since it's only
-    /// valid for a single request (to enforce queue depth). This also drops the guard on return and
-    /// returns the queue depth quota to the pool.
+    /// Sends a request on the stream and awaits the response. If the future is dropped before it
+    /// resolves (e.g. due to a timeout or cancellation), the stream will be closed to cancel the
+    /// request and is not returned to the pool. The same is true if the stream errors, in which
+    /// case the caller can't send further requests on the stream.
     ///
-    /// The `GetPageRequest::request_id` must be unique across in-flight requests.
+    /// We only support sending a single request at a time, to eliminate head-of-line blocking. See
+    /// module documentation for details.
     ///
     /// NB: errors are often returned as `GetPageResponse::status_code` instead of `tonic::Status`
     /// to avoid tearing down the stream for per-request errors. Callers must check this.
     pub async fn send(
-        self,
+        &mut self,
         req: page_api::GetPageRequest,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let (resp_tx, resp_rx) = oneshot::channel();
+        let req_id = req.request_id;
+        let stream = self.stream.as_mut().expect("not dropped");
 
-        self.sender
-            .send((req, resp_tx))
-            .await
+        // Mark the stream as not reusable while the request is in flight. We can't return the
+        // stream to the pool until we receive the response, to avoid head-of-line blocking and
+        // stale responses. Failed streams can't be reused either.
+        if !self.can_reuse {
+            return Err(tonic::Status::internal("stream can't be reused"));
+        }
+        self.can_reuse = false;
+
+        // Send the request and receive the response.
+        //
+        // NB: this uses a watch channel, so it's unsafe to change this code to pipeline requests.
+        stream
+            .sender
+            .send(req)
             .map_err(|_| tonic::Status::unavailable("stream closed"))?;
 
-        resp_rx
+        let resp = stream
+            .receiver
+            .next()
             .await
-            .map_err(|_| tonic::Status::unavailable("stream closed"))?
+            .ok_or_else(|| tonic::Status::unavailable("stream closed"))??;
+
+        if resp.request_id != req_id {
+            return Err(tonic::Status::internal(format!(
+                "response ID {} does not match request ID {}",
+                resp.request_id, req_id
+            )));
+        }
+
+        // Success, mark the stream as reusable.
+        self.can_reuse = true;
+
+        Ok(resp)
     }
 }
 
@@ -669,26 +585,21 @@ impl Drop for StreamGuard {
             return; // pool was dropped
         };
 
-        // Release the queue depth reservation on drop. This can prematurely decrement it if dropped
-        // before the response is received, but that's okay.
-        //
-        // TODO: actually, it's probably not okay. Queue depth release should be moved into the
-        // stream task, such that it continues to account for the queue depth slot until the server
-        // responds. Otherwise, if a slow request times out and keeps blocking the stream, the
-        // server will keep waiting on it and we can pile on subsequent requests (including the
-        // timeout retry) in the same stream and get blocked. But we may also want to avoid blocking
-        // requests on e.g. LSN waits and layer downloads, instead returning early to free up the
-        // stream. Or just scale out streams with a queue depth of 1 to sidestep all head-of-line
-        // blocking. TBD.
-        let mut streams = pool.streams.lock().unwrap();
-        let entry = streams.get_mut(&self.id).expect("unknown stream");
-        assert!(entry.idle_since.is_none(), "active stream marked idle");
-        assert!(entry.queue_depth > 0, "stream queue underflow");
-        entry.queue_depth -= 1;
-        if entry.queue_depth == 0 {
-            entry.idle_since = Some(Instant::now()); // mark stream as idle
+        // If the stream isn't reusable, it can't be returned to the pool.
+        if !self.can_reuse {
+            return;
         }
 
+        // Place the idle stream back into the pool.
+        let entry = StreamEntry {
+            stream: self.stream.take().expect("dropped once"),
+            idle_since: Instant::now(),
+        };
+        pool.idle
+            .lock()
+            .unwrap()
+            .insert(entry.stream.client.id, entry);
+
         _ = self.permit; // returned on drop, referenced for visibility
     }
 }
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index a9dd154285..76355ae546 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -49,7 +49,7 @@ impl From<ProtocolError> for tonic::Status {
 }
 
 /// The LSN a request should read at.
-#[derive(Clone, Copy, Debug)]
+#[derive(Clone, Copy, Debug, Default)]
 pub struct ReadLsn {
     /// The request's read LSN.
     pub request_lsn: Lsn,
@@ -329,7 +329,7 @@ impl From<GetDbSizeResponse> for proto::GetDbSizeResponse {
 }
 
 /// Requests one or more pages.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, Default)]
 pub struct GetPageRequest {
     /// A request ID. Will be included in the response. Should be unique for in-flight requests on
     /// the stream.
@@ -430,12 +430,13 @@ impl From<RequestID> for proto::RequestId {
 }
 
 /// A GetPage request class.
-#[derive(Clone, Copy, Debug, strum_macros::Display)]
+#[derive(Clone, Copy, Debug, Default, strum_macros::Display)]
 pub enum GetPageClass {
     /// Unknown class. For backwards compatibility: used when an older client version sends a class
     /// that a newer server version has removed.
     Unknown,
     /// A normal request. This is the default.
+    #[default]
     Normal,
     /// A prefetch request. NB: can only be classified on pg < 18.
     Prefetch,
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index fc01deb92d..c61598cdf6 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -98,7 +98,7 @@ tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unpref
 time = { version = "0.3", features = ["macros", "serde-well-known"] }
 tokio = { version = "1", features = ["full", "test-util"] }
 tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] }
-tokio-stream = { version = "0.1", features = ["net"] }
+tokio-stream = { version = "0.1", features = ["net", "sync"] }
 tokio-util = { version = "0.7", features = ["codec", "compat", "io-util", "rt"] }
 toml_edit = { version = "0.22", features = ["serde"] }
 tower = { version = "0.5", default-features = false, features = ["balance", "buffer", "limit", "log"] }

From a203f9829a87fc47deece609b4a35b6239bd7322 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 14:30:28 +0200
Subject: [PATCH 109/163] pageserver: add timeline_id span when freezing layers
 (#12572)

## Problem

We don't log the timeline ID when rolling ephemeral layers during
housekeeping.

Resolves [LKB-179](https://databricks.atlassian.net/browse/LKB-179)

## Summary of changes

Add a span with timeline ID when calling `maybe_freeze_ephemeral_layer`
from the housekeeping loop.

We don't instrument the function itself, since future callers may not
have a span including the tenant_id already, but we don't want to
duplicate the tenant_id for these spans.
---
 pageserver/src/tenant.rs          | 8 +++++++-
 pageserver/src/tenant/timeline.rs | 2 ++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f75a03a508..1a3016e7f1 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3393,7 +3393,13 @@ impl TenantShard {
                 .collect_vec();
 
             for timeline in timelines {
-                timeline.maybe_freeze_ephemeral_layer().await;
+                // Include a span with the timeline ID. The parent span already has the tenant ID.
+                let span =
+                    info_span!("maybe_freeze_ephemeral_layer", timeline_id = %timeline.timeline_id);
+                timeline
+                    .maybe_freeze_ephemeral_layer()
+                    .instrument(span)
+                    .await;
             }
         }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fe622713e9..f2833674a9 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1893,6 +1893,8 @@ impl Timeline {
     // an ephemeral layer open forever when idle.  It also freezes layers if the global limit on
     // ephemeral layer bytes has been breached.
     pub(super) async fn maybe_freeze_ephemeral_layer(&self) {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
         let Ok(mut write_guard) = self.write_lock.try_lock() else {
             // If the write lock is held, there is an active wal receiver: rolling open layers
             // is their responsibility while they hold this lock.

From eb830fa547f61aaaa582d765b440b156b6a780f2 Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Mon, 14 Jul 2025 15:22:38 +0200
Subject: [PATCH 110/163] pageserver/client_grpc: use unbounded pools (#12585)

## Problem

The communicator gRPC client currently uses bounded client/stream pools.
This can artificially constrain clients, especially after we remove
pipelining in #12584.

[Benchmarks](https://github.com/neondatabase/neon/pull/12583) show that
the cost of an idle server-side GetPage worker task is about 26 KB (2.5
GB for 100,000), so we can afford to scale out.

In the worst case, we'll degenerate to the current libpq state with one
stream per backend, but without the TCP connection overhead. In the
common case we expect significantly lower stream counts due to stream
sharing, driven e.g. by idle backends, LFC hits, read coalescing,
sharding (backends typically only talk to one shard at a time), etc.

Currently, Pageservers rarely serve more than 4000 backend connections,
so we have at least 2 orders of magnitude of headroom.

Touches #11735.
Requires #12584.

## Summary of changes

Remove the pool limits, and restructure the pools.

We still keep a separate bulk pool for Getpage batches of >4 pages (>32
KB), with fewer streams per connection. This reduces TCP-level
congestion and head-of-line blocking for non-bulk requests, and
concentrates larger window sizes on a smaller set of
streams/connections, presumably reducing memory usage. Apart from this,
bulk requests don't have any latency penalty compared to other requests.
---
 pageserver/client_grpc/src/client.rs | 104 ++++++++++++++-------------
 pageserver/page_api/src/model.rs     |  13 ----
 2 files changed, 55 insertions(+), 62 deletions(-)

diff --git a/pageserver/client_grpc/src/client.rs b/pageserver/client_grpc/src/client.rs
index 4b606d6939..3a9edc7092 100644
--- a/pageserver/client_grpc/src/client.rs
+++ b/pageserver/client_grpc/src/client.rs
@@ -24,20 +24,23 @@ use utils::shard::{ShardCount, ShardIndex, ShardNumber};
 /// Max number of concurrent clients per channel (i.e. TCP connection). New channels will be spun up
 /// when full.
 ///
+/// Normal requests are small, and we don't pipeline them, so we can afford a large number of
+/// streams per connection.
+///
 /// TODO: tune all of these constants, and consider making them configurable.
-/// TODO: consider separate limits for unary and streaming clients, so we don't fill up channels
-/// with only streams.
-const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
+const MAX_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(64).unwrap();
 
-/// Max number of concurrent unary request clients per shard.
-const MAX_UNARY_CLIENTS: NonZero<usize> = NonZero::new(64).unwrap();
+/// Max number of concurrent bulk GetPage streams per channel (i.e. TCP connection). These use a
+/// dedicated channel pool with a lower client limit, to avoid TCP-level head-of-line blocking and
+/// transmission delays. This also concentrates large window sizes on a smaller set of
+/// streams/connections, presumably reducing memory use.
+const MAX_BULK_CLIENTS_PER_CHANNEL: NonZero<usize> = NonZero::new(16).unwrap();
 
-/// Max number of concurrent GetPage streams per shard.
-const MAX_STREAMS: NonZero<usize> = NonZero::new(64).unwrap();
-
-/// Max number of concurrent bulk GetPage streams per shard, used e.g. for prefetches. Because these
-/// are more throughput-oriented, we have a smaller limit.
-const MAX_BULK_STREAMS: NonZero<usize> = NonZero::new(16).unwrap();
+/// The batch size threshold at which a GetPage request will use the bulk stream pool.
+///
+/// The gRPC initial window size is 64 KB. Each page is 8 KB, so let's avoid increasing the window
+/// size for the normal stream pool, and route requests for >= 5 pages (>32 KB) to the bulk pool.
+const BULK_THRESHOLD_BATCH_SIZE: usize = 5;
 
 /// The overall request call timeout, including retries and pool acquisition.
 /// TODO: should we retry forever? Should the caller decide?
@@ -62,10 +65,19 @@ const SLOW_THRESHOLD: Duration = Duration::from_secs(3);
 /// * Sharded tenants across multiple Pageservers.
 /// * Pooling of connections, clients, and streams for efficient resource use.
 /// * Concurrent use by many callers.
-/// * Internal handling of GetPage bidirectional streams, with pipelining and error handling.
+/// * Internal handling of GetPage bidirectional streams.
 /// * Automatic retries.
 /// * Observability.
 ///
+/// The client has dedicated connection/client/stream pools per shard, for resource reuse. These
+/// pools are unbounded: we allow scaling out as many concurrent streams as needed to serve all
+/// concurrent callers, which mostly eliminates head-of-line blocking. Idle streams are fairly
+/// cheap: the server task currently uses 26 KB of memory, so we can comfortably fit 100,000
+/// concurrent idle streams (2.5 GB memory). The worst case degenerates to the old libpq case with
+/// one stream per backend, but without the TCP connection overhead. In the common case we expect
+/// significantly lower stream counts due to stream sharing, driven e.g. by idle backends, LFC hits,
+/// read coalescing, sharding (backends typically only talk to one shard at a time), etc.
+///
 /// TODO: this client does not support base backups or LSN leases, as these are only used by
 /// compute_ctl. Consider adding this, but LSN leases need concurrent requests on all shards.
 pub struct PageserverClient {
@@ -264,7 +276,7 @@ impl PageserverClient {
         req: page_api::GetPageRequest,
         shard: &Shard,
     ) -> tonic::Result<page_api::GetPageResponse> {
-        let mut stream = shard.stream(req.request_class.is_bulk()).await?;
+        let mut stream = shard.stream(Self::is_bulk(&req)).await?;
         let resp = stream.send(req.clone()).await?;
 
         // Convert per-request errors into a tonic::Status.
@@ -365,6 +377,11 @@ impl PageserverClient {
             ))
         })?
     }
+
+    /// Returns true if the request is considered a bulk request and should use the bulk pool.
+    fn is_bulk(req: &page_api::GetPageRequest) -> bool {
+        req.block_numbers.len() >= BULK_THRESHOLD_BATCH_SIZE
+    }
 }
 
 /// Shard specification for a PageserverClient.
@@ -492,15 +509,23 @@ impl Shards {
     }
 }
 
-/// A single shard. Uses dedicated resource pools with the following structure:
+/// A single shard. Has dedicated resource pools with the following structure:
 ///
-/// * Channel pool: unbounded.
-///   * Unary client pool: MAX_UNARY_CLIENTS.
-///   * Stream client pool: unbounded.
-///     * Stream pool: MAX_STREAMS and MAX_STREAM_QUEUE_DEPTH.
-/// * Bulk channel pool: unbounded.
+/// * Channel pool: MAX_CLIENTS_PER_CHANNEL.
+///   * Client pool: unbounded.
+///     * Stream pool: unbounded.
+/// * Bulk channel pool: MAX_BULK_CLIENTS_PER_CHANNEL.
 ///   * Bulk client pool: unbounded.
-///     * Bulk stream pool: MAX_BULK_STREAMS and MAX_BULK_STREAM_QUEUE_DEPTH.
+///     * Bulk stream pool: unbounded.
+///
+/// We use a separate bulk channel pool with a lower concurrency limit for large batch requests.
+/// This avoids TCP-level head-of-line blocking, and also concentrates large window sizes on a
+/// smaller set of streams/connections, which presumably reduces memory use. Neither of these pools
+/// are bounded, nor do they pipeline requests, so the latency characteristics should be mostly
+/// similar (except for TCP transmission time).
+///
+/// TODO: since we never use bounded pools, we could consider removing the pool limiters. However,
+/// the code is fairly trivial, so we may as well keep them around for now in case we need them.
 struct Shard {
     /// The shard ID.
     id: ShardIndex,
@@ -508,7 +533,7 @@ struct Shard {
     client_pool: Arc<ClientPool>,
     /// GetPage stream pool.
     stream_pool: Arc<StreamPool>,
-    /// GetPage stream pool for bulk requests, e.g. prefetches.
+    /// GetPage stream pool for bulk requests.
     bulk_stream_pool: Arc<StreamPool>,
 }
 
@@ -522,48 +547,30 @@ impl Shard {
         auth_token: Option<String>,
         compression: Option<CompressionEncoding>,
     ) -> anyhow::Result<Self> {
-        // Common channel pool for unary and stream requests. Bounded by client/stream pools.
-        let channel_pool = ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?;
-
-        // Client pool for unary requests.
+        // Shard pools for unary requests and non-bulk GetPage requests.
         let client_pool = ClientPool::new(
-            channel_pool.clone(),
+            ChannelPool::new(url.clone(), MAX_CLIENTS_PER_CHANNEL)?,
             tenant_id,
             timeline_id,
             shard_id,
             auth_token.clone(),
             compression,
-            Some(MAX_UNARY_CLIENTS),
+            None, // unbounded
         );
+        let stream_pool = StreamPool::new(client_pool.clone(), None); // unbounded
 
-        // GetPage stream pool. Uses a dedicated client pool to avoid starving out unary clients,
-        // but shares a channel pool with it (as it's unbounded).
-        let stream_pool = StreamPool::new(
-            ClientPool::new(
-                channel_pool.clone(),
-                tenant_id,
-                timeline_id,
-                shard_id,
-                auth_token.clone(),
-                compression,
-                None, // unbounded, limited by stream pool
-            ),
-            Some(MAX_STREAMS),
-        );
-
-        // Bulk GetPage stream pool, e.g. for prefetches. Uses dedicated channel/client/stream pools
-        // to avoid head-of-line blocking of latency-sensitive requests.
+        // Bulk GetPage stream pool for large batches (prefetches, sequential scans, vacuum, etc.).
         let bulk_stream_pool = StreamPool::new(
             ClientPool::new(
-                ChannelPool::new(url, MAX_CLIENTS_PER_CHANNEL)?,
+                ChannelPool::new(url, MAX_BULK_CLIENTS_PER_CHANNEL)?,
                 tenant_id,
                 timeline_id,
                 shard_id,
                 auth_token,
                 compression,
-                None, // unbounded, limited by stream pool
+                None, // unbounded,
             ),
-            Some(MAX_BULK_STREAMS),
+            None, // unbounded
         );
 
         Ok(Self {
@@ -585,8 +592,7 @@ impl Shard {
         .await
     }
 
-    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk stream
-    /// pool (e.g. for prefetches).
+    /// Returns a pooled stream for this shard. If `bulk` is `true`, uses the dedicated bulk pool.
     #[instrument(skip_all, fields(bulk))]
     async fn stream(&self, bulk: bool) -> tonic::Result<StreamGuard> {
         let pool = match bulk {
diff --git a/pageserver/page_api/src/model.rs b/pageserver/page_api/src/model.rs
index 76355ae546..a3286ecf15 100644
--- a/pageserver/page_api/src/model.rs
+++ b/pageserver/page_api/src/model.rs
@@ -444,19 +444,6 @@ pub enum GetPageClass {
     Background,
 }
 
-impl GetPageClass {
-    /// Returns true if this is considered a bulk request (i.e. more throughput-oriented rather than
-    /// latency-sensitive).
-    pub fn is_bulk(&self) -> bool {
-        match self {
-            Self::Unknown => false,
-            Self::Normal => false,
-            Self::Prefetch => true,
-            Self::Background => true,
-        }
-    }
-}
-
 impl From<proto::GetPageClass> for GetPageClass {
     fn from(pb: proto::GetPageClass) -> Self {
         match pb {

From 4fedcbc0ac94d399808384911b92f8417b74c286 Mon Sep 17 00:00:00 2001
From: a-masterov <72613290+a-masterov@users.noreply.github.com>
Date: Mon, 14 Jul 2025 15:25:25 +0200
Subject: [PATCH 111/163] Leverage the existing mechanism to retry 404 errors
 instead of implementing new code. (#12567)

## Problem
In https://github.com/neondatabase/neon/pull/12513, the new code was
implemented to retry 404 errors caused by the replication lag. However,
this implemented the new logic, making the script more complicated,
while we have an existing one in `neon_api.py`.
## Summary of changes
The existing mechanism is used to retry 404 errors.

---------

Co-authored-by: Alexey Masterov <alexey.masterov@databricks.com>
---
 test_runner/fixtures/neon_api.py          | 19 +++++++++++++------
 test_runner/random_ops/test_random_ops.py | 22 +++-------------------
 2 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index 9d85b9a332..e0f16abe77 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -34,7 +34,9 @@ class NeonAPI:
         self.retries524 = 0
         self.retries4xx = 0
 
-    def __request(self, method: str | bytes, endpoint: str, **kwargs: Any) -> requests.Response:
+    def __request(
+        self, method: str | bytes, endpoint: str, retry404: bool = False, **kwargs: Any
+    ) -> requests.Response:
         kwargs["headers"] = kwargs.get("headers", {})
         kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}"
 
@@ -55,10 +57,12 @@ class NeonAPI:
                 resp.raise_for_status()
                 break
             elif resp.status_code >= 400:
-                if resp.status_code == 422:
-                    if resp.json()["message"] == "branch not ready yet":
-                        retry = True
-                        self.retries4xx += 1
+                if resp.status_code == 404 and retry404:
+                    retry = True
+                    self.retries4xx += 1
+                elif resp.status_code == 422 and resp.json()["message"] == "branch not ready yet":
+                    retry = True
+                    self.retries4xx += 1
                 elif resp.status_code == 423 and resp.json()["message"] in {
                     "endpoint is in some transitive state, could not suspend",
                     "project already has running conflicting operations, scheduling of new ones is prohibited",
@@ -66,7 +70,7 @@ class NeonAPI:
                     retry = True
                     self.retries4xx += 1
                 elif resp.status_code == 524:
-                    log.info("The request was timed out, trying to get operations")
+                    log.info("The request was timed out")
                     retry = True
                     self.retries524 += 1
             if retry:
@@ -203,6 +207,9 @@ class NeonAPI:
         resp = self.__request(
             "GET",
             f"/projects/{project_id}/branches/{branch_id}",
+            # XXX Retry get parent details to work around the issue
+            # https://databricks.atlassian.net/browse/LKB-279
+            retry404=True,
             headers={
                 "Accept": "application/json",
             },
diff --git a/test_runner/random_ops/test_random_ops.py b/test_runner/random_ops/test_random_ops.py
index 5c43b06bc5..b106e9b729 100644
--- a/test_runner/random_ops/test_random_ops.py
+++ b/test_runner/random_ops/test_random_ops.py
@@ -13,7 +13,6 @@ from typing import TYPE_CHECKING, Any
 
 import pytest
 from fixtures.log_helper import log
-from requests import HTTPError
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -153,26 +152,11 @@ class NeonBranch:
             return
         self.updated_at = datetime.fromisoformat(res["branch"]["updated_at"])
         parent_id: str = res["branch"]["parent_id"]
-        # XXX Retry get parent details to work around the issue
-        # https://databricks.atlassian.net/browse/LKB-279
-        target_time = datetime.now() + timedelta(seconds=30)
-        while datetime.now() < target_time:
-            try:
-                parent_def = self.neon_api.get_branch_details(self.project_id, parent_id)
-            except HTTPError as he:
-                if he.response.status_code == 404:
-                    log.info("Branch not found, waiting...")
-                    time.sleep(1)
-                else:
-                    raise HTTPError(he) from he
-            else:
-                break
-        else:
-            raise RuntimeError(f"Branch {parent_id} not found")
-
         # Creates an object for the parent branch
         # After the reset operation a new parent branch is created
-        parent = NeonBranch(self.project, parent_def, True)
+        parent = NeonBranch(
+            self.project, self.neon_api.get_branch_details(self.project_id, parent_id), True
+        )
         self.project.branches[parent_id] = parent
         self.parent = parent
         parent.children[self.id] = self

From 2288efae662e41fcd2cf7369e3b4b9dc95d25e95 Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Mon, 14 Jul 2025 14:41:31 +0100
Subject: [PATCH 112/163] Performance test for LFC prewarm (#12524)

https://github.com/neondatabase/cloud/issues/19011

Measure relative performance for prewarmed and non-prewarmed endpoints.
Add test that runs on every commit, and one performance test with a
remote cluster.
---
 .github/actionlint.yml                      |   1 +
 .github/workflows/benchmarking.yml          |  72 +++++++++
 test_runner/fixtures/neon_api.py            |   4 +
 test_runner/performance/test_lfc_prewarm.py | 167 ++++++++++++++++++++
 4 files changed, 244 insertions(+)
 create mode 100644 test_runner/performance/test_lfc_prewarm.py

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 3142a36fa0..25b2fc702a 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -31,6 +31,7 @@ config-variables:
   - NEON_PROD_AWS_ACCOUNT_ID
   - PGREGRESS_PG16_PROJECT_ID
   - PGREGRESS_PG17_PROJECT_ID
+  - PREWARM_PGBENCH_SIZE
   - REMOTE_STORAGE_AZURE_CONTAINER
   - REMOTE_STORAGE_AZURE_REGION
   - SLACK_CICD_CHANNEL_ID
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index 79371ec704..df80bad579 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -219,6 +219,7 @@ jobs:
           --ignore test_runner/performance/test_cumulative_statistics_persistence.py
           --ignore test_runner/performance/test_perf_many_relations.py
           --ignore test_runner/performance/test_perf_oltp_large_tenant.py
+          --ignore test_runner/performance/test_lfc_prewarm.py
       env:
         BENCHMARK_CONNSTR: ${{ steps.create-neon-project.outputs.dsn }}
         VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
@@ -410,6 +411,77 @@ jobs:
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
 
+  prewarm-test:
+    if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
+    permissions:
+      contents: write
+      statuses: write
+      id-token: write # aws-actions/configure-aws-credentials
+    env:
+      PGBENCH_SIZE: ${{ vars.PREWARM_PGBENCH_SIZE }}
+      POSTGRES_DISTRIB_DIR: /tmp/neon/pg_install
+      DEFAULT_PG_VERSION: 17
+      TEST_OUTPUT: /tmp/test_output
+      BUILD_TYPE: remote
+      SAVE_PERF_REPORT: ${{ github.event.inputs.save_perf_report || ( github.ref_name == 'main' ) }}
+      PLATFORM: "neon-staging"
+
+    runs-on: [ self-hosted, us-east-2, x64 ]
+    container:
+      image: ghcr.io/neondatabase/build-tools:pinned-bookworm
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
+    steps:
+    - name: Harden the runner (Audit all outbound calls)
+      uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
+      with:
+        egress-policy: audit
+
+    - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
+
+    - name: Configure AWS credentials
+      uses: aws-actions/configure-aws-credentials@e3dd6a429d7300a6a4c196c26e071d42e0343502 # v4.0.2
+      with:
+        aws-region: eu-central-1
+        role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+        role-duration-seconds: 18000 # 5 hours
+
+    - name: Download Neon artifact
+      uses: ./.github/actions/download
+      with:
+        name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact
+        path: /tmp/neon/
+        prefix: latest
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+
+    - name: Run prewarm benchmark
+      uses: ./.github/actions/run-python-test-set
+      with:
+        build_type: ${{ env.BUILD_TYPE }}
+        test_selection: performance/test_lfc_prewarm.py
+        run_in_parallel: false
+        save_perf_report: ${{ env.SAVE_PERF_REPORT }}
+        extra_params: -m remote_cluster --timeout 5400
+        pg_version: ${{ env.DEFAULT_PG_VERSION }}
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}"
+        PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
+        NEON_API_KEY: ${{ secrets.NEON_STAGING_API_KEY }}
+
+    - name: Create Allure report
+      id: create-allure-report
+      if: ${{ !cancelled() }}
+      uses: ./.github/actions/allure-report-generate
+      with:
+        store-test-results-into-db: true
+        aws-oidc-role-arn: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }}
+      env:
+        REGRESS_TEST_RESULT_CONNSTR_NEW: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
+
   generate-matrices:
     if: ${{ github.event.inputs.run_only_pgvector_tests == 'false' || github.event.inputs.run_only_pgvector_tests == null }}
     # Create matrices for the benchmarking jobs, so we run benchmarks on rds only once a week (on Saturday)
diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py
index e0f16abe77..bb618325e0 100644
--- a/test_runner/fixtures/neon_api.py
+++ b/test_runner/fixtures/neon_api.py
@@ -314,6 +314,10 @@ class NeonAPI:
         if endpoint_type:
             data["endpoint"]["type"] = endpoint_type
         if settings:
+            # otherwise we get 400 "settings must not be nil"
+            # TODO(myrrc): fix on cplane side
+            if "pg_settings" not in settings:
+                settings["pg_settings"] = {}
             data["endpoint"]["settings"] = settings
 
         resp = self.__request(
diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py
new file mode 100644
index 0000000000..ad2c759a63
--- /dev/null
+++ b/test_runner/performance/test_lfc_prewarm.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import os
+import timeit
+import traceback
+from concurrent.futures import ThreadPoolExecutor as Exec
+from pathlib import Path
+from time import sleep
+from typing import TYPE_CHECKING, Any, cast
+
+import pytest
+from fixtures.benchmark_fixture import NeonBenchmarker, PgBenchRunResult
+from fixtures.log_helper import log
+from fixtures.neon_api import NeonAPI, connection_parameters_to_env
+
+if TYPE_CHECKING:
+    from fixtures.compare_fixtures import NeonCompare
+    from fixtures.neon_fixtures import Endpoint, PgBin
+    from fixtures.pg_version import PgVersion
+
+from performance.test_perf_pgbench import utc_now_timestamp
+
+# These tests compare performance for a write-heavy and read-heavy workloads of an ordinary endpoint
+# compared to the endpoint which saves its LFC and prewarms using it on startup.
+
+
+def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    pg_bin = neon_compare.pg_bin
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    for ep in [ep_normal, ep_prewarmed]:
+        connstr: str = ep.connstr()
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", connstr, "-s100"])
+        ep.safe_psql("CREATE EXTENSION neon")
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", "-T10", connstr])
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        name: str = cast("str", ep.branch_name)
+        neon_compare.zenbenchmark.record_pg_bench_result(name, res)
+
+
+@pytest.mark.remote_cluster
+@pytest.mark.timeout(30 * 60)
+def test_compare_prewarmed_pgbench_perf_benchmark(
+    pg_bin: PgBin,
+    neon_api: NeonAPI,
+    pg_version: PgVersion,
+    zenbenchmark: NeonBenchmarker,
+):
+    name = f"Test prewarmed pgbench performance, GITHUB_RUN_ID={os.getenv('GITHUB_RUN_ID')}"
+    project = neon_api.create_project(pg_version, name)
+    project_id = project["project"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+    err = False
+    try:
+        benchmark_impl(pg_bin, neon_api, project, zenbenchmark)
+    except Exception as e:
+        err = True
+        log.error(f"Caught exception: {e}")
+        log.error(traceback.format_exc())
+    finally:
+        assert not err
+        neon_api.delete_project(project_id)
+
+
+def benchmark_impl(
+    pg_bin: PgBin, neon_api: NeonAPI, project: dict[str, Any], zenbenchmark: NeonBenchmarker
+):
+    pgbench_size = int(os.getenv("PGBENCH_SIZE") or "3424")  # 50GB
+    offload_secs = 20
+    test_duration_min = 5
+    pgbench_duration = f"-T{test_duration_min * 60}"
+    # prewarm API is not publicly exposed. In order to test performance of a
+    # fully prewarmed endpoint, wait after it restarts
+    prewarmed_sleep_secs = 30
+
+    branch_id = project["branch"]["id"]
+    project_id = project["project"]["id"]
+    normal_env = connection_parameters_to_env(
+        project["connection_uris"][0]["connection_parameters"]
+    )
+    normal_id = project["endpoints"][0]["id"]
+
+    prewarmed_branch_id = neon_api.create_branch(
+        project_id, "prewarmed", parent_id=branch_id, add_endpoint=False
+    )["branch"]["id"]
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    ep_prewarmed = neon_api.create_endpoint(
+        project_id,
+        prewarmed_branch_id,
+        endpoint_type="read_write",
+        settings={"autoprewarm": True, "offload_lfc_interval_seconds": offload_secs},
+    )
+    neon_api.wait_for_operation_to_finish(project_id)
+
+    prewarmed_env = normal_env.copy()
+    prewarmed_env["PGHOST"] = ep_prewarmed["endpoint"]["host"]
+    prewarmed_id = ep_prewarmed["endpoint"]["id"]
+
+    def bench(endpoint_name, endpoint_id, env):
+        pg_bin.run(["pgbench", "-i", "-I", "dtGvp", f"-s{pgbench_size}"], env)
+        sleep(offload_secs * 2)  # ensure LFC is offloaded after pgbench finishes
+        neon_api.restart_endpoint(project_id, endpoint_id)
+        sleep(prewarmed_sleep_secs)
+
+        run_start_timestamp = utc_now_timestamp()
+        t0 = timeit.default_timer()
+        out = pg_bin.run_capture(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env)
+        run_duration = timeit.default_timer() - t0
+        run_end_timestamp = utc_now_timestamp()
+
+        stdout = Path(f"{out}.stdout").read_text()
+        res = PgBenchRunResult.parse_from_stdout(
+            stdout=stdout,
+            run_duration=run_duration,
+            run_start_timestamp=run_start_timestamp,
+            run_end_timestamp=run_end_timestamp,
+        )
+        zenbenchmark.record_pg_bench_result(endpoint_name, res)
+
+    with Exec(max_workers=2) as exe:
+        exe.submit(bench, "normal", normal_id, normal_env)
+        exe.submit(bench, "prewarmed", prewarmed_id, prewarmed_env)
+
+
+def test_compare_prewarmed_read_perf(neon_compare: NeonCompare):
+    env = neon_compare.env
+    env.create_branch("normal")
+    env.create_branch("prewarmed")
+    ep_normal: Endpoint = env.endpoints.create_start("normal")
+    ep_prewarmed: Endpoint = env.endpoints.create_start("prewarmed", autoprewarm=True)
+
+    sql = [
+        "CREATE EXTENSION neon",
+        "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')",
+        "INSERT INTO foo SELECT FROM generate_series(1,1000000)",
+    ]
+    for ep in [ep_normal, ep_prewarmed]:
+        ep.safe_psql_many(sql)
+        client = ep.http_client()
+        client.offload_lfc()
+        ep.stop()
+        ep.start()
+        client.prewarm_lfc_wait()
+        with neon_compare.record_duration(f"{ep.branch_name}_run_duration"):
+            ep.safe_psql("SELECT count(*) from foo")

From f67a8a173ec889a163f0b89b43dd6957da45b82c Mon Sep 17 00:00:00 2001
From: HaoyuHuang <haoyu.huang.68@gmail.com>
Date: Mon, 14 Jul 2025 09:37:04 -0700
Subject: [PATCH 113/163] A few SK changes (#12577)

# TLDR
This PR is a no-op.

## Problem
When a SK loses a disk, it must recover all WALs from the very
beginning. This may take days/weeks to catch up to the latest WALs for
all timelines it owns.

## Summary of changes
When SK starts up,
if it finds that it has 0 timelines,
- it will ask SC for the timeline it owns.
- Then, pulls the timeline from its peer safekeepers to restore the WAL
redundancy right away.

After pulling timeline is complete, it will become active and accepts
new WALs.

The current impl is a prototype. We can optimize the impl further, e.g.,
parallel pull timelines.

---------

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
---
 control_plane/storcon_cli/src/main.rs         |   1 +
 libs/pageserver_api/src/controller_api.rs     |  39 ++
 libs/utils/src/ip_address.rs                  |  73 ++++
 libs/utils/src/lib.rs                         |   3 +
 pageserver/src/controller_upcall_client.rs    |   1 +
 safekeeper/client/src/mgmt_api.rs             |   9 +-
 safekeeper/src/bin/safekeeper.rs              |  24 ++
 safekeeper/src/hadron.rs                      | 388 ++++++++++++++++++
 safekeeper/src/http/routes.rs                 |  11 +-
 safekeeper/src/lib.rs                         |  12 +
 safekeeper/src/metrics.rs                     |  37 ++
 safekeeper/src/pull_timeline.rs               | 128 ++++--
 .../tests/walproposer_sim/safekeeper.rs       |   5 +
 test_runner/regress/test_wal_restore.py       | 113 +++++
 14 files changed, 808 insertions(+), 36 deletions(-)
 create mode 100644 libs/utils/src/ip_address.rs
 create mode 100644 safekeeper/src/hadron.rs

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index 24fd34a87a..fcc5549beb 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -476,6 +476,7 @@ async fn main() -> anyhow::Result<()> {
                         listen_http_port,
                         listen_https_port,
                         availability_zone_id: AvailabilityZone(availability_zone_id),
+                        node_ip_addr: None,
                     }),
                 )
                 .await?;
diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index b02c6a613a..8f86b03f72 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -1,5 +1,6 @@
 use std::collections::{HashMap, HashSet};
 use std::fmt::Display;
+use std::net::IpAddr;
 use std::str::FromStr;
 use std::time::{Duration, Instant};
 
@@ -60,6 +61,11 @@ pub struct NodeRegisterRequest {
     pub listen_https_port: Option<u16>,
 
     pub availability_zone_id: AvailabilityZone,
+
+    // Reachable IP address of the PS/SK registering, if known.
+    // Hadron Cluster Coordiantor will update the DNS record of the registering node
+    // with this IP address.
+    pub node_ip_addr: Option<IpAddr>,
 }
 
 #[derive(Serialize, Deserialize)]
@@ -545,6 +551,39 @@ pub struct SafekeeperDescribeResponse {
     pub scheduling_policy: SkSchedulingPolicy,
 }
 
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct TimelineSafekeeperPeer {
+    pub node_id: NodeId,
+    pub listen_http_addr: String,
+    pub http_port: i32,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimeline {
+    // SC does not know the tenant id.
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SCSafekeeperTimelinesResponse {
+    pub timelines: Vec<SCSafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimeline {
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub peers: Vec<NodeId>,
+}
+
+#[derive(Serialize, Deserialize, Clone, Debug)]
+pub struct SafekeeperTimelinesResponse {
+    pub timelines: Vec<SafekeeperTimeline>,
+    pub safekeeper_peers: Vec<TimelineSafekeeperPeer>,
+}
+
 #[derive(Serialize, Deserialize, Clone)]
 pub struct SafekeeperSchedulingPolicyRequest {
     pub scheduling_policy: SkSchedulingPolicy,
diff --git a/libs/utils/src/ip_address.rs b/libs/utils/src/ip_address.rs
new file mode 100644
index 0000000000..d0834d0ba5
--- /dev/null
+++ b/libs/utils/src/ip_address.rs
@@ -0,0 +1,73 @@
+use std::env::{VarError, var};
+use std::error::Error;
+use std::net::IpAddr;
+use std::str::FromStr;
+
+/// Name of the environment variable containing the reachable IP address of the node. If set, the IP address contained in this
+/// environment variable is used as the reachable IP address of the pageserver or safekeeper node during node registration.
+/// In a Kubernetes environment, this environment variable should be set by Kubernetes to the Pod IP (specified in the Pod
+/// template).
+pub const HADRON_NODE_IP_ADDRESS: &str = "HADRON_NODE_IP_ADDRESS";
+
+/// Read the reachable IP address of this page server from env var HADRON_NODE_IP_ADDRESS.
+/// In Kubernetes this environment variable is set to the Pod IP (specified in the Pod template).
+pub fn read_node_ip_addr_from_env() -> Result<Option<IpAddr>, Box<dyn Error>> {
+    match var(HADRON_NODE_IP_ADDRESS) {
+        Ok(v) => {
+            if let Ok(addr) = IpAddr::from_str(&v) {
+                Ok(Some(addr))
+            } else {
+                Err(format!("Invalid IP address string: {v}. Cannot be parsed as either an IPv4 or an IPv6 address.").into())
+            }
+        }
+        Err(VarError::NotPresent) => Ok(None),
+        Err(e) => Err(e.into()),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::env;
+    use std::net::{Ipv4Addr, Ipv6Addr};
+
+    #[test]
+    fn test_read_node_ip_addr_from_env() {
+        // SAFETY: test code
+        unsafe {
+            // Test with a valid IPv4 address
+            env::set_var(HADRON_NODE_IP_ADDRESS, "192.168.1.1");
+            let result = read_node_ip_addr_from_env().unwrap();
+            assert_eq!(result, Some(IpAddr::V4(Ipv4Addr::new(192, 168, 1, 1))));
+
+            // Test with a valid IPv6 address
+            env::set_var(
+                HADRON_NODE_IP_ADDRESS,
+                "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
+            );
+        }
+        let result = read_node_ip_addr_from_env().unwrap();
+        assert_eq!(
+            result,
+            Some(IpAddr::V6(
+                Ipv6Addr::from_str("2001:0db8:85a3:0000:0000:8a2e:0370:7334").unwrap()
+            ))
+        );
+
+        // Test with an invalid IP address
+        // SAFETY: test code
+        unsafe {
+            env::set_var(HADRON_NODE_IP_ADDRESS, "invalid_ip");
+        }
+        let result = read_node_ip_addr_from_env();
+        assert!(result.is_err());
+
+        // Test with no environment variable set
+        // SAFETY: test code
+        unsafe {
+            env::remove_var(HADRON_NODE_IP_ADDRESS);
+        }
+        let result = read_node_ip_addr_from_env().unwrap();
+        assert_eq!(result, None);
+    }
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 2b81da017d..69771be5dc 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -26,6 +26,9 @@ pub mod auth;
 // utility functions and helper traits for unified unique id generation/serialization etc.
 pub mod id;
 
+// utility functions to obtain reachable IP addresses in PS/SK nodes.
+pub mod ip_address;
+
 pub mod shard;
 
 mod hex;
diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs
index f1f9aaf43c..be1de43d18 100644
--- a/pageserver/src/controller_upcall_client.rs
+++ b/pageserver/src/controller_upcall_client.rs
@@ -194,6 +194,7 @@ impl StorageControllerUpcallApi for StorageControllerUpcallClient {
                         listen_http_port: m.http_port,
                         listen_https_port: m.https_port,
                         availability_zone_id: az_id.expect("Checked above"),
+                        node_ip_addr: None,
                     })
                 }
                 Err(e) => {
diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index b4bb193a4b..3c8db3029e 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -6,10 +6,10 @@
 use std::error::Error as _;
 
 use http_utils::error::HttpErrorBody;
-use reqwest::{IntoUrl, Method, StatusCode};
+use reqwest::{IntoUrl, Method, Response, StatusCode};
 use safekeeper_api::models::{
     self, PullTimelineRequest, PullTimelineResponse, SafekeeperStatus, SafekeeperUtilization,
-    TimelineCreateRequest, TimelineStatus,
+    TimelineCreateRequest,
 };
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::logging::SecretString;
@@ -161,13 +161,12 @@ impl Client {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
-    ) -> Result<TimelineStatus> {
+    ) -> Result<Response> {
         let uri = format!(
             "{}/v1/tenant/{}/timeline/{}",
             self.mgmt_api_endpoint, tenant_id, timeline_id
         );
-        let resp = self.get(&uri).await?;
-        resp.json().await.map_err(Error::ReceiveBody)
+        self.get(&uri).await
     }
 
     pub async fn snapshot(
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index b2d5976ef4..79cf2f9149 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -23,6 +23,7 @@ use safekeeper::defaults::{
     DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
     DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
 };
+use safekeeper::hadron;
 use safekeeper::wal_backup::WalBackup;
 use safekeeper::{
     BACKGROUND_RUNTIME, BROKER_RUNTIME, GlobalTimelines, HTTP_RUNTIME, SafeKeeperConf,
@@ -252,6 +253,10 @@ struct Args {
     /// Run in development mode (disables security checks)
     #[arg(long, help = "Run in development mode (disables security checks)")]
     dev: bool,
+    /* BEGIN_HADRON */
+    #[arg(long)]
+    enable_pull_timeline_on_startup: bool,
+    /* END_HADRON */
 }
 
 // Like PathBufValueParser, but allows empty string.
@@ -435,6 +440,11 @@ async fn main() -> anyhow::Result<()> {
         use_https_safekeeper_api: args.use_https_safekeeper_api,
         enable_tls_wal_service_api: args.enable_tls_wal_service_api,
         force_metric_collection_on_scrape: args.force_metric_collection_on_scrape,
+        /* BEGIN_HADRON */
+        advertise_pg_addr_tenant_only: None,
+        enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
+        hcc_base_url: None,
+        /* END_HADRON */
     });
 
     // initialize sentry if SENTRY_DSN is provided
@@ -529,6 +539,20 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
     // Load all timelines from disk to memory.
     global_timelines.init().await?;
 
+    /* BEGIN_HADRON */
+    if conf.enable_pull_timeline_on_startup && global_timelines.timelines_count() == 0 {
+        match hadron::hcc_pull_timelines(&conf, global_timelines.clone()).await {
+            Ok(_) => {
+                info!("Successfully pulled all timelines from peer safekeepers");
+            }
+            Err(e) => {
+                error!("Failed to pull timelines from peer safekeepers: {:?}", e);
+                return Err(e);
+            }
+        }
+    }
+    /* END_HADRON */
+
     // Run everything in current thread rt, if asked.
     if conf.current_thread_runtime {
         info!("running in current thread runtime");
diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs
new file mode 100644
index 0000000000..b41bf2c3da
--- /dev/null
+++ b/safekeeper/src/hadron.rs
@@ -0,0 +1,388 @@
+use pem::Pem;
+use safekeeper_api::models::PullTimelineRequest;
+use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
+use tokio::time::sleep;
+use tokio_util::sync::CancellationToken;
+use url::Url;
+use utils::{backoff, id::TenantTimelineId, ip_address};
+
+use anyhow::Result;
+use pageserver_api::controller_api::{
+    AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
+};
+
+use crate::{
+    GlobalTimelines, SafeKeeperConf,
+    metrics::{
+        SK_RECOVERY_PULL_TIMELINE_ERRORS, SK_RECOVERY_PULL_TIMELINE_OKS,
+        SK_RECOVERY_PULL_TIMELINE_SECONDS, SK_RECOVERY_PULL_TIMELINES_SECONDS,
+    },
+    pull_timeline,
+    timelines_global_map::DeleteOrExclude,
+};
+
+// Extract information in the SafeKeeperConf to build a NodeRegisterRequest used to register the safekeeper with the HCC.
+fn build_node_registeration_request(
+    conf: &SafeKeeperConf,
+    node_ip_addr: Option<IpAddr>,
+) -> Result<NodeRegisterRequest> {
+    let advertise_pg_addr_with_port = conf
+        .advertise_pg_addr_tenant_only
+        .as_deref()
+        .expect("advertise_pg_addr_tenant_only is required to register with HCC");
+
+    // Extract host/port from the string.
+    let (advertise_host_addr, pg_port_str) = advertise_pg_addr_with_port.split_at(
+        advertise_pg_addr_with_port
+            .rfind(':')
+            .ok_or(anyhow::anyhow!("Invalid advertise_pg_addr"))?,
+    );
+    // Need the `[1..]` to remove the leading ':'.
+    let pg_port = pg_port_str[1..]
+        .parse::<u16>()
+        .map_err(|e| anyhow::anyhow!("Cannot parse PG port: {}", e))?;
+
+    let (_, http_port_str) = conf.listen_http_addr.split_at(
+        conf.listen_http_addr
+            .rfind(':')
+            .ok_or(anyhow::anyhow!("Invalid listen_http_addr"))?,
+    );
+    let http_port = http_port_str[1..]
+        .parse::<u16>()
+        .map_err(|e| anyhow::anyhow!("Cannot parse HTTP port: {}", e))?;
+
+    Ok(NodeRegisterRequest {
+        node_id: conf.my_id,
+        listen_pg_addr: advertise_host_addr.to_string(),
+        listen_pg_port: pg_port,
+        listen_http_addr: advertise_host_addr.to_string(),
+        listen_http_port: http_port,
+        node_ip_addr,
+        availability_zone_id: AvailabilityZone("todo".to_string()),
+        listen_grpc_addr: None,
+        listen_grpc_port: None,
+        listen_https_port: None,
+    })
+}
+
+// Retrieve the JWT token used for authenticating with HCC from the environment variable.
+// Returns None if the token cannot be retrieved.
+fn get_hcc_auth_token() -> Option<String> {
+    match std::env::var("HCC_AUTH_TOKEN") {
+        Ok(v) => {
+            tracing::info!("Loaded JWT token for authentication with HCC");
+            Some(v)
+        }
+        Err(VarError::NotPresent) => {
+            tracing::info!("No JWT token for authentication with HCC detected");
+            None
+        }
+        Err(_) => {
+            tracing::info!(
+                "Failed to either load to detect non-present HCC_AUTH_TOKEN environment variable"
+            );
+            None
+        }
+    }
+}
+
+async fn send_safekeeper_register_request(
+    request_url: &Url,
+    auth_token: &Option<String>,
+    request: &NodeRegisterRequest,
+) -> Result<()> {
+    let client = reqwest::Client::new();
+    let mut req_builder = client
+        .post(request_url.clone())
+        .header("Content-Type", "application/json");
+    if let Some(token) = auth_token {
+        req_builder = req_builder.bearer_auth(token);
+    }
+    req_builder
+        .json(&request)
+        .send()
+        .await?
+        .error_for_status()?;
+    Ok(())
+}
+
+/// Registers this safe keeper with the HCC.
+pub async fn register(conf: &SafeKeeperConf) -> Result<()> {
+    match conf.hcc_base_url.as_ref() {
+        None => {
+            tracing::info!("HCC base URL is not set, skipping registration");
+            Ok(())
+        }
+        Some(hcc_base_url) => {
+            // The following operations acquiring the auth token and the node IP address both read environment
+            // variables. It's fine for now as this `register()` function is only called once during startup.
+            // If we start to talk to HCC more regularly in the safekeeper we should probably consider
+            // refactoring things into a "HadronClusterCoordinatorClient" struct.
+            let auth_token = get_hcc_auth_token();
+            let node_ip_addr =
+                ip_address::read_node_ip_addr_from_env().expect("Error reading node IP address.");
+
+            let request = build_node_registeration_request(conf, node_ip_addr)?;
+            let cancel = CancellationToken::new();
+            let request_url = hcc_base_url.clone().join("/hadron-internal/v1/sk")?;
+
+            backoff::retry(
+                || async {
+                    send_safekeeper_register_request(&request_url, &auth_token, &request).await
+                },
+                |_| false,
+                3,
+                u32::MAX,
+                "Calling the HCC safekeeper register API",
+                &cancel,
+            )
+            .await
+            .ok_or(anyhow::anyhow!(
+                "Error in forever retry loop. This error should never be surfaced."
+            ))?
+        }
+    }
+}
+
+async fn safekeeper_list_timelines_request(
+    conf: &SafeKeeperConf,
+) -> Result<pageserver_api::controller_api::SafekeeperTimelinesResponse> {
+    if conf.hcc_base_url.is_none() {
+        tracing::info!("HCC base URL is not set, skipping registration");
+        return Err(anyhow::anyhow!("HCC base URL is not set"));
+    }
+
+    // The following operations acquiring the auth token and the node IP address both read environment
+    // variables. It's fine for now as this `register()` function is only called once during startup.
+    // If we start to talk to HCC more regularly in the safekeeper we should probably consider
+    // refactoring things into a "HadronClusterCoordinatorClient" struct.
+    let auth_token = get_hcc_auth_token();
+    let method = format!("/control/v1/safekeeper/{}/timelines", conf.my_id.0);
+    let request_url = conf.hcc_base_url.as_ref().unwrap().clone().join(&method)?;
+
+    let client = reqwest::Client::new();
+    let mut req_builder = client
+        .get(request_url.clone())
+        .header("Content-Type", "application/json")
+        .query(&[("id", conf.my_id.0)]);
+    if let Some(token) = auth_token {
+        req_builder = req_builder.bearer_auth(token);
+    }
+    let response = req_builder
+        .send()
+        .await?
+        .error_for_status()?
+        .json::<pageserver_api::controller_api::SafekeeperTimelinesResponse>()
+        .await?;
+    Ok(response)
+}
+
+// Returns true on success, false otherwise.
+pub async fn hcc_pull_timeline(
+    timeline: SafekeeperTimeline,
+    conf: &SafeKeeperConf,
+    global_timelines: Arc<GlobalTimelines>,
+    nodeid_http: &HashMap<u64, String>,
+) -> bool {
+    let mut request = PullTimelineRequest {
+        tenant_id: timeline.tenant_id,
+        timeline_id: timeline.timeline_id,
+        http_hosts: Vec::new(),
+        ignore_tombstone: None,
+    };
+    for host in timeline.peers {
+        if host.0 == conf.my_id.0 {
+            continue;
+        }
+        if let Some(http_host) = nodeid_http.get(&host.0) {
+            request.http_hosts.push(http_host.clone());
+        }
+    }
+
+    let ca_certs = match conf
+        .ssl_ca_certs
+        .iter()
+        .map(Pem::contents)
+        .map(reqwest::Certificate::from_der)
+        .collect::<Result<Vec<_>, _>>()
+    {
+        Ok(result) => result,
+        Err(_) => {
+            return false;
+        }
+    };
+    match pull_timeline::handle_request(
+        request,
+        conf.sk_auth_token.clone(),
+        ca_certs,
+        global_timelines.clone(),
+        true,
+    )
+    .await
+    {
+        Ok(resp) => {
+            tracing::info!(
+                "Completed pulling tenant {} timeline {} from SK {:?}",
+                timeline.tenant_id,
+                timeline.timeline_id,
+                resp.safekeeper_host
+            );
+            return true;
+        }
+        Err(e) => {
+            tracing::error!(
+                "Failed to pull tenant {} timeline {} from SK {}",
+                timeline.tenant_id,
+                timeline.timeline_id,
+                e
+            );
+
+            let ttid = TenantTimelineId {
+                tenant_id: timeline.tenant_id,
+                timeline_id: timeline.timeline_id,
+            };
+            // Revert the failed timeline pull.
+            // Notice that not found timeline returns OK also.
+            match global_timelines
+                .delete_or_exclude(&ttid, DeleteOrExclude::DeleteLocal)
+                .await
+            {
+                Ok(dr) => {
+                    tracing::info!(
+                        "Deleted tenant {} timeline {} DirExists: {}",
+                        timeline.tenant_id,
+                        timeline.timeline_id,
+                        dr.dir_existed,
+                    );
+                }
+                Err(e) => {
+                    tracing::error!(
+                        "Failed to delete tenant {} timeline {} from global_timelines: {}",
+                        timeline.tenant_id,
+                        timeline.timeline_id,
+                        e
+                    );
+                }
+            }
+        }
+    }
+    false
+}
+
+pub async fn hcc_pull_timeline_till_success(
+    timeline: SafekeeperTimeline,
+    conf: &SafeKeeperConf,
+    global_timelines: Arc<GlobalTimelines>,
+    nodeid_http: &HashMap<u64, String>,
+) {
+    const MAX_PULL_TIMELINE_RETRIES: u64 = 100;
+    for i in 0..MAX_PULL_TIMELINE_RETRIES {
+        if hcc_pull_timeline(
+            timeline.clone(),
+            conf,
+            global_timelines.clone(),
+            nodeid_http,
+        )
+        .await
+        {
+            SK_RECOVERY_PULL_TIMELINE_OKS.inc();
+            return;
+        }
+        tracing::error!(
+            "Failed to pull timeline {} from SK peers, retrying {}/{}",
+            timeline.timeline_id,
+            i + 1,
+            MAX_PULL_TIMELINE_RETRIES
+        );
+        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
+    }
+    SK_RECOVERY_PULL_TIMELINE_ERRORS.inc();
+}
+
+pub async fn hcc_pull_timelines(
+    conf: &SafeKeeperConf,
+    global_timelines: Arc<GlobalTimelines>,
+) -> Result<()> {
+    let _timer = SK_RECOVERY_PULL_TIMELINES_SECONDS.start_timer();
+    tracing::info!("Start pulling timelines from SK peers");
+
+    let mut response = SafekeeperTimelinesResponse {
+        timelines: Vec::new(),
+        safekeeper_peers: Vec::new(),
+    };
+    for i in 0..100 {
+        match safekeeper_list_timelines_request(conf).await {
+            Ok(timelines) => {
+                response = timelines;
+            }
+            Err(e) => {
+                tracing::error!("Failed to list timelines from HCC: {}", e);
+                if i == 99 {
+                    return Err(e);
+                }
+            }
+        }
+        sleep(Duration::from_millis(100)).await;
+    }
+
+    let mut nodeid_http = HashMap::new();
+    for sk in response.safekeeper_peers {
+        nodeid_http.insert(
+            sk.node_id.0,
+            format!("http://{}:{}", sk.listen_http_addr, sk.http_port),
+        );
+    }
+    tracing::info!("Received {} timelines from HCC", response.timelines.len());
+    for timeline in response.timelines {
+        let _timer = SK_RECOVERY_PULL_TIMELINE_SECONDS
+            .with_label_values(&[
+                &timeline.tenant_id.to_string(),
+                &timeline.timeline_id.to_string(),
+            ])
+            .start_timer();
+        hcc_pull_timeline_till_success(timeline, conf, global_timelines.clone(), &nodeid_http)
+            .await;
+    }
+    Ok(())
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use utils::id::NodeId;
+
+    #[test]
+    fn test_build_node_registeration_request() {
+        // Test that:
+        // 1. We always extract the host name and port used to register with the HCC from the
+        //    `advertise_pg_addr` if it is set.
+        // 2. The correct ports are extracted from `advertise_pg_addr` and `listen_http_addr`.
+        let mut conf = SafeKeeperConf::dummy();
+        conf.my_id = NodeId(1);
+        conf.advertise_pg_addr_tenant_only =
+            Some("safe-keeper-1.safe-keeper.hadron.svc.cluster.local:5454".to_string());
+        // `listen_pg_addr` and `listen_pg_addr_tenant_only` are not used for node registration. Set them to a different
+        // host and port values and make sure that they don't show up in the node registration request.
+        conf.listen_pg_addr = "0.0.0.0:5456".to_string();
+        conf.listen_pg_addr_tenant_only = Some("0.0.0.0:5456".to_string());
+        conf.listen_http_addr = "0.0.0.0:7676".to_string();
+        let node_ip_addr: Option<IpAddr> = Some("127.0.0.1".parse().unwrap());
+
+        let request = build_node_registeration_request(&conf, node_ip_addr).unwrap();
+        assert_eq!(request.node_id, NodeId(1));
+        assert_eq!(
+            request.listen_pg_addr,
+            "safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
+        );
+        assert_eq!(request.listen_pg_port, 5454);
+        assert_eq!(
+            request.listen_http_addr,
+            "safe-keeper-1.safe-keeper.hadron.svc.cluster.local"
+        );
+        assert_eq!(request.listen_http_port, 7676);
+        assert_eq!(
+            request.node_ip_addr,
+            Some(IpAddr::V4("127.0.0.1".parse().unwrap()))
+        );
+    }
+}
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 4b061c65d9..a0ee2facb5 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -241,9 +241,14 @@ async fn timeline_pull_handler(mut request: Request<Body>) -> Result<Response<Bo
             ApiError::InternalServerError(anyhow::anyhow!("failed to parse CA certs: {e}"))
         })?;
 
-    let resp =
-        pull_timeline::handle_request(data, conf.sk_auth_token.clone(), ca_certs, global_timelines)
-            .await?;
+    let resp = pull_timeline::handle_request(
+        data,
+        conf.sk_auth_token.clone(),
+        ca_certs,
+        global_timelines,
+        false,
+    )
+    .await?;
     json_response(StatusCode::OK, resp)
 }
 
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index c0b5403ebf..02533b804d 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -10,6 +10,7 @@ use pem::Pem;
 use remote_storage::RemoteStorageConfig;
 use storage_broker::Uri;
 use tokio::runtime::Runtime;
+use url::Url;
 use utils::auth::SwappableJwtAuth;
 use utils::id::NodeId;
 use utils::logging::SecretString;
@@ -20,6 +21,7 @@ pub mod control_file;
 pub mod control_file_upgrade;
 pub mod copy_timeline;
 pub mod debug_dump;
+pub mod hadron;
 pub mod handler;
 pub mod http;
 pub mod metrics;
@@ -100,6 +102,11 @@ pub struct SafeKeeperConf {
     pub advertise_pg_addr: Option<String>,
     pub availability_zone: Option<String>,
     pub no_sync: bool,
+    /* BEGIN_HADRON */
+    pub advertise_pg_addr_tenant_only: Option<String>,
+    pub enable_pull_timeline_on_startup: bool,
+    pub hcc_base_url: Option<Url>,
+    /* END_HADRON */
     pub broker_endpoint: Uri,
     pub broker_keepalive_interval: Duration,
     pub heartbeat_timeout: Duration,
@@ -185,6 +192,11 @@ impl SafeKeeperConf {
             use_https_safekeeper_api: false,
             enable_tls_wal_service_api: false,
             force_metric_collection_on_scrape: true,
+            /* BEGIN_HADRON */
+            advertise_pg_addr_tenant_only: None,
+            enable_pull_timeline_on_startup: false,
+            hcc_base_url: None,
+            /* END_HADRON */
         }
     }
 }
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index 1f98651e71..e1af51c115 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -85,6 +85,43 @@ pub static WAL_STORAGE_LIMIT_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
     )
     .expect("Failed to register safekeeper_wal_storage_limit_errors counter")
 });
+pub static SK_RECOVERY_PULL_TIMELINE_ERRORS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_recovery_pull_timeline_errors",
+        concat!(
+            "Number of errors due to pull_timeline errors during SK lost disk recovery.",
+            "An increase in this metric indicates pull timelines runs into error."
+        )
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timeline_errors counter")
+});
+pub static SK_RECOVERY_PULL_TIMELINE_OKS: Lazy<IntCounter> = Lazy::new(|| {
+    register_int_counter!(
+        "safekeeper_recovery_pull_timeline_oks",
+        concat!(
+            "Number of successful pull_timeline during SK lost disk recovery.",
+            "An increase in this metric indicates pull timelines is successful."
+        )
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timeline_oks counter")
+});
+pub static SK_RECOVERY_PULL_TIMELINES_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "safekeeper_recovery_pull_timelines_seconds",
+        "Seconds to pull timelines",
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timelines_seconds histogram")
+});
+pub static SK_RECOVERY_PULL_TIMELINE_SECONDS: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "safekeeper_recovery_pull_timeline_seconds",
+        "Seconds to pull timeline",
+        &["tenant_id", "timeline_id"],
+        DISK_FSYNC_SECONDS_BUCKETS.to_vec()
+    )
+    .expect("Failed to register safekeeper_recovery_pull_timeline_seconds histogram vec")
+});
 /* END_HADRON */
 pub static PERSIST_CONTROL_FILE_SECONDS: Lazy<Histogram> = Lazy::new(|| {
     register_histogram!(
diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs
index 1c9e5bade5..b4c4877b2c 100644
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -8,6 +8,7 @@ use bytes::Bytes;
 use camino::Utf8PathBuf;
 use chrono::{DateTime, Utc};
 use futures::{SinkExt, StreamExt, TryStreamExt};
+use http::StatusCode;
 use http_utils::error::ApiError;
 use postgres_ffi::{PG_TLI, XLogFileName, XLogSegNo};
 use remote_storage::GenericRemoteStorage;
@@ -21,10 +22,11 @@ use tokio::fs::OpenOptions;
 use tokio::io::AsyncWrite;
 use tokio::sync::mpsc;
 use tokio::task;
+use tokio::time::sleep;
 use tokio_tar::{Archive, Builder, Header};
 use tokio_util::io::{CopyToBytes, SinkWriter};
 use tokio_util::sync::PollSender;
-use tracing::{error, info, instrument};
+use tracing::{error, info, instrument, warn};
 use utils::crashsafe::fsync_async_opt;
 use utils::id::{NodeId, TenantTimelineId};
 use utils::logging::SecretString;
@@ -449,6 +451,7 @@ pub async fn handle_request(
     sk_auth_token: Option<SecretString>,
     ssl_ca_certs: Vec<Certificate>,
     global_timelines: Arc<GlobalTimelines>,
+    wait_for_peer_timeline_status: bool,
 ) -> Result<PullTimelineResponse, ApiError> {
     let existing_tli = global_timelines.get(TenantTimelineId::new(
         request.tenant_id,
@@ -472,37 +475,100 @@ pub async fn handle_request(
     let http_hosts = request.http_hosts.clone();
 
     // Figure out statuses of potential donors.
-    let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
-        futures::future::join_all(http_hosts.iter().map(|url| async {
-            let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
-            let info = cclient
-                .timeline_status(request.tenant_id, request.timeline_id)
-                .await?;
-            Ok(info)
-        }))
-        .await;
-
     let mut statuses = Vec::new();
-    for (i, response) in responses.into_iter().enumerate() {
-        match response {
-            Ok(status) => {
-                statuses.push((status, i));
-            }
-            Err(e) => {
-                info!("error fetching status from {}: {e}", http_hosts[i]);
+    if !wait_for_peer_timeline_status {
+        let responses: Vec<Result<TimelineStatus, mgmt_api::Error>> =
+            futures::future::join_all(http_hosts.iter().map(|url| async {
+                let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
+                let resp = cclient
+                    .timeline_status(request.tenant_id, request.timeline_id)
+                    .await?;
+                let info: TimelineStatus = resp
+                    .json()
+                    .await
+                    .context("Failed to deserialize timeline status")
+                    .map_err(|e| mgmt_api::Error::ReceiveErrorBody(e.to_string()))?;
+                Ok(info)
+            }))
+            .await;
+
+        for (i, response) in responses.into_iter().enumerate() {
+            match response {
+                Ok(status) => {
+                    statuses.push((status, i));
+                }
+                Err(e) => {
+                    info!("error fetching status from {}: {e}", http_hosts[i]);
+                }
             }
         }
-    }
 
-    // Allow missing responses from up to one safekeeper (say due to downtime)
-    // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
-    // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
-    let min_required_successful = (http_hosts.len() - 1).max(1);
-    if statuses.len() < min_required_successful {
-        return Err(ApiError::InternalServerError(anyhow::anyhow!(
-            "only got {} successful status responses. required: {min_required_successful}",
-            statuses.len()
-        )));
+        // Allow missing responses from up to one safekeeper (say due to downtime)
+        // e.g. if we created a timeline on PS A and B, with C being offline. Then B goes
+        // offline and C comes online. Then we want a pull on C with A and B as hosts to work.
+        let min_required_successful = (http_hosts.len() - 1).max(1);
+        if statuses.len() < min_required_successful {
+            return Err(ApiError::InternalServerError(anyhow::anyhow!(
+                "only got {} successful status responses. required: {min_required_successful}",
+                statuses.len()
+            )));
+        }
+    } else {
+        let mut retry = true;
+        // We must get status from all other peers.
+        // Otherwise, we may run into split-brain scenario.
+        while retry {
+            statuses.clear();
+            retry = false;
+            for (i, url) in http_hosts.iter().enumerate() {
+                let cclient = Client::new(http_client.clone(), url.clone(), sk_auth_token.clone());
+                match cclient
+                    .timeline_status(request.tenant_id, request.timeline_id)
+                    .await
+                {
+                    Ok(resp) => {
+                        if resp.status() == StatusCode::NOT_FOUND {
+                            warn!(
+                                "Timeline {} not found on peer SK {}, no need to pull it",
+                                TenantTimelineId::new(request.tenant_id, request.timeline_id),
+                                url
+                            );
+                            return Ok(PullTimelineResponse {
+                                safekeeper_host: None,
+                            });
+                        }
+                        let info: TimelineStatus = resp
+                            .json()
+                            .await
+                            .context("Failed to deserialize timeline status")
+                            .map_err(ApiError::InternalServerError)?;
+                        statuses.push((info, i));
+                    }
+                    Err(e) => {
+                        match e {
+                            // If we get a 404, it means the timeline doesn't exist on this safekeeper.
+                            // We can ignore this error.
+                            mgmt_api::Error::ApiError(status, _)
+                                if status == StatusCode::NOT_FOUND =>
+                            {
+                                warn!(
+                                    "Timeline {} not found on peer SK {}, no need to pull it",
+                                    TenantTimelineId::new(request.tenant_id, request.timeline_id),
+                                    url
+                                );
+                                return Ok(PullTimelineResponse {
+                                    safekeeper_host: None,
+                                });
+                            }
+                            _ => {}
+                        }
+                        retry = true;
+                        error!("Failed to get timeline status from {}: {:#}", url, e);
+                    }
+                }
+            }
+            sleep(std::time::Duration::from_millis(100)).await;
+        }
     }
 
     // Find the most advanced safekeeper
@@ -511,6 +577,12 @@ pub async fn handle_request(
         .max_by_key(|(status, _)| {
             (
                 status.acceptor_state.epoch,
+                /* BEGIN_HADRON */
+                // We need to pull from the SK with the highest term.
+                // This is because another compute may come online and vote the same highest term again on the other two SKs.
+                // Then, there will be 2 computes running on the same term.
+                status.acceptor_state.term,
+                /* END_HADRON */
                 status.flush_lsn,
                 status.commit_lsn,
             )
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 280cd790a4..393df6228e 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -191,6 +191,11 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         use_https_safekeeper_api: false,
         enable_tls_wal_service_api: false,
         force_metric_collection_on_scrape: true,
+        /* BEGIN_HADRON */
+        enable_pull_timeline_on_startup: false,
+        advertise_pg_addr_tenant_only: None,
+        hcc_base_url: None,
+        /* END_HADRON */
     };
 
     let mut global = GlobalMap::new(disk, conf.clone())?;
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 0bb63308bb..573016f772 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -3,6 +3,7 @@ from __future__ import annotations
 import sys
 import tarfile
 import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING
 
 import pytest
@@ -198,3 +199,115 @@ def test_wal_restore_http(neon_env_builder: NeonEnvBuilder, broken_tenant: bool)
     # the table is back now!
     restored = env.endpoints.create_start("main")
     assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
+
+
+# BEGIN_HADRON
+# TODO: re-enable once CM python is integreated.
+# def clear_directory(directory):
+#     for item in os.listdir(directory):
+#         item_path = os.path.join(directory, item)
+#         if os.path.isdir(item_path):
+#             log.info(f"removing SK directory: {item_path}")
+#             shutil.rmtree(item_path)
+#         else:
+#             log.info(f"removing SK file: {item_path}")
+#             os.remove(item_path)
+
+
+# def test_sk_pull_timelines(
+#     neon_env_builder: NeonEnvBuilder,
+# ):
+#     DBNAME = "regression"
+#     superuser_name = "databricks_superuser"
+#     neon_env_builder.num_safekeepers = 3
+#     neon_env_builder.num_pageservers = 4
+#     neon_env_builder.safekeeper_extra_opts = ["--enable-pull-timeline-on-startup"]
+#     neon_env_builder.enable_safekeeper_remote_storage(s3_storage())
+
+#     env = neon_env_builder.init_start(initial_tenant_shard_count=4)
+
+#     env.compute_manager.start(base_port=env.compute_manager_port)
+
+#     test_creator = "test_creator"
+#     test_metastore_id = uuid4()
+#     test_account_id = uuid4()
+#     test_workspace_id = 1
+#     test_workspace_url = "http://test_workspace_url"
+#     test_metadata_version = 1
+#     test_metadata = {
+#         "state": "INSTANCE_PROVISIONING",
+#         "admin_rolename": "admin",
+#         "admin_password_scram": "abc123456",
+#     }
+
+#     test_instance_name_1 = "test_instance_1"
+#     test_instance_read_write_compute_pool_1 = {
+#         "instance_name": test_instance_name_1,
+#         "compute_pool_name": "compute_pool_1",
+#         "creator": test_creator,
+#         "capacity": 2.0,
+#         "node_count": 1,
+#         "metadata_version": 0,
+#         "metadata": {
+#             "state": "INSTANCE_PROVISIONING",
+#         },
+#     }
+
+#     test_instance_1_readable_secondaries_enabled = False
+
+#     # Test creation
+#     create_instance_with_retries(
+#         env,
+#         test_instance_name_1,
+#         test_creator,
+#         test_metastore_id,
+#         test_account_id,
+#         test_workspace_id,
+#         test_workspace_url,
+#         test_instance_read_write_compute_pool_1,
+#         test_metadata_version,
+#         test_metadata,
+#         test_instance_1_readable_secondaries_enabled,
+#     )
+#     instance = env.compute_manager.get_instance_by_name(test_instance_name_1, test_workspace_id)
+#     log.info(f"haoyu Instance created: {instance}")
+#     assert instance["instance_name"] == test_instance_name_1
+#     test_instance_id = instance["instance_id"]
+#     instance_detail = env.compute_manager.describe_instance(test_instance_id)
+#     log.info(f"haoyu Instance detail: {instance_detail}")
+
+#     env.initial_tenant = instance_detail[0]["tenant_id"]
+#     env.initial_timeline = instance_detail[0]["timeline_id"]
+
+#     # Connect to postgres and create a database called "regression".
+#     endpoint = env.endpoints.create_start("main")
+#     endpoint.safe_psql(f"CREATE ROLE {superuser_name}")
+#     endpoint.safe_psql(f"CREATE DATABASE {DBNAME}")
+
+#     endpoint.safe_psql("CREATE TABLE usertable ( YCSB_KEY INT, FIELD0 TEXT);")
+#     # Write some data. ~20 MB.
+#     num_rows = 0
+#     for _i in range(0, 20000):
+#         endpoint.safe_psql(
+#             "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+#         )
+#         num_rows += 1
+
+#     log.info(f"SKs {env.storage_controller.hcc_sk_node_list()}")
+
+#     env.safekeepers[0].stop(immediate=True)
+#     clear_directory(env.safekeepers[0].data_dir)
+#     env.safekeepers[0].start()
+
+#     # PG can still write data. ~20 MB.
+#     for _i in range(0, 20000):
+#         endpoint.safe_psql(
+#             "INSERT INTO usertable SELECT random(), repeat('a', 1000);", log_query=False
+#         )
+#         num_rows += 1
+
+#     tuples = endpoint.safe_psql("SELECT COUNT(*) FROM usertable;")
+#     assert tuples[0][0] == num_rows
+#     endpoint.stop_and_destroy()
+
+# END_HADRON

From f8d3f86f586c6615e75251f9919c6c66feefa5d6 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Mon, 14 Jul 2025 17:37:28 +0100
Subject: [PATCH 114/163] pageserver: include records in get page debug handler
 (#12578)

Include records and image in the debug get page handler.
This endpoint does not update the metrics and does not support tracing.

Note that this now returns individual bytes which need to be encoded
properly for debugging.

Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
---
 pageserver/src/http/routes.rs          |  35 ++++--
 pageserver/src/tenant/storage_layer.rs |  28 ++++-
 pageserver/src/tenant/timeline.rs      | 143 +++++++++++++++++++++++++
 3 files changed, 196 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 0d40c5ecf7..3e844a375d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -10,6 +10,7 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{Context, Result, anyhow};
+use bytes::Bytes;
 use enumset::EnumSet;
 use futures::future::join_all;
 use futures::{StreamExt, TryFutureExt};
@@ -46,6 +47,7 @@ use pageserver_api::shard::{ShardCount, TenantShardId};
 use postgres_ffi::PgMajorVersion;
 use remote_storage::{DownloadError, GenericRemoteStorage, TimeTravelError};
 use scopeguard::defer;
+use serde::{Deserialize, Serialize};
 use serde_json::json;
 use tenant_size_model::svg::SvgBranchKind;
 use tenant_size_model::{SizeResult, StorageModel};
@@ -57,6 +59,7 @@ use utils::auth::SwappableJwtAuth;
 use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
+use wal_decoder::models::record::NeonWalRecord;
 
 use crate::config::PageServerConf;
 use crate::context;
@@ -77,6 +80,7 @@ use crate::tenant::remote_timeline_client::{
 };
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
+use crate::tenant::storage_layer::ValuesReconstructState;
 use crate::tenant::storage_layer::{IoConcurrency, LayerAccessStatsReset, LayerName};
 use crate::tenant::timeline::layer_manager::LayerManagerLockHolder;
 use crate::tenant::timeline::offload::{OffloadError, offload_timeline};
@@ -2708,6 +2712,16 @@ async fn deletion_queue_flush(
     }
 }
 
+/// Try if `GetPage@Lsn` is successful, useful for manual debugging.
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
+struct GetPageResponse {
+    pub page: Bytes,
+    pub layers_visited: u32,
+    pub delta_layers_visited: u32,
+    pub records: Vec<(Lsn, NeonWalRecord)>,
+    pub img: Option<(Lsn, Bytes)>,
+}
+
 async fn getpage_at_lsn_handler(
     request: Request<Body>,
     cancel: CancellationToken,
@@ -2758,21 +2772,24 @@ async fn getpage_at_lsn_handler_inner(
 
         // Use last_record_lsn if no lsn is provided
         let lsn = lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
-        let page = timeline.get(key.0, lsn, &ctx).await?;
 
         if touch {
             json_response(StatusCode::OK, ())
         } else {
-            Result::<_, ApiError>::Ok(
-                Response::builder()
-                    .status(StatusCode::OK)
-                    .header(header::CONTENT_TYPE, "application/octet-stream")
-                    .body(hyper::Body::from(page))
-                    .unwrap(),
-            )
+            let mut reconstruct_state = ValuesReconstructState::new_with_debug(IoConcurrency::sequential());
+            let page = timeline.debug_get(key.0, lsn, &ctx, &mut reconstruct_state).await?;
+            let response = GetPageResponse {
+                page,
+                layers_visited: reconstruct_state.get_layers_visited(),
+                delta_layers_visited: reconstruct_state.get_delta_layers_visited(),
+                records: reconstruct_state.debug_state.records.clone(),
+                img: reconstruct_state.debug_state.img.clone(),
+            };
+
+            json_response(StatusCode::OK, response)
         }
     }
-    .instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
+    .instrument(info_span!("timeline_debug_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
     .await
 }
 
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 9fbb9d2438..43ea8fffa3 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -75,7 +75,7 @@ where
 /// the same ValueReconstructState struct in the next 'get_value_reconstruct_data'
 /// call, to collect more records.
 ///
-#[derive(Debug, Default)]
+#[derive(Debug, Default, Clone)]
 pub(crate) struct ValueReconstructState {
     pub(crate) records: Vec<(Lsn, NeonWalRecord)>,
     pub(crate) img: Option<(Lsn, Bytes)>,
@@ -308,6 +308,9 @@ pub struct ValuesReconstructState {
     layers_visited: u32,
     delta_layers_visited: u32,
 
+    pub(crate) enable_debug: bool,
+    pub(crate) debug_state: ValueReconstructState,
+
     pub(crate) io_concurrency: IoConcurrency,
     num_active_ios: Arc<AtomicUsize>,
 
@@ -657,6 +660,23 @@ impl ValuesReconstructState {
             layers_visited: 0,
             delta_layers_visited: 0,
             io_concurrency,
+            enable_debug: false,
+            debug_state: ValueReconstructState::default(),
+            num_active_ios: Arc::new(AtomicUsize::new(0)),
+            read_path: None,
+        }
+    }
+
+    pub(crate) fn new_with_debug(io_concurrency: IoConcurrency) -> Self {
+        Self {
+            keys: HashMap::new(),
+            keys_done: KeySpaceRandomAccum::new(),
+            keys_with_image_coverage: None,
+            layers_visited: 0,
+            delta_layers_visited: 0,
+            io_concurrency,
+            enable_debug: true,
+            debug_state: ValueReconstructState::default(),
             num_active_ios: Arc::new(AtomicUsize::new(0)),
             read_path: None,
         }
@@ -670,6 +690,12 @@ impl ValuesReconstructState {
         self.io_concurrency.spawn_io(fut).await;
     }
 
+    pub(crate) fn set_debug_state(&mut self, debug_state: &ValueReconstructState) {
+        if self.enable_debug {
+            self.debug_state = debug_state.clone();
+        }
+    }
+
     pub(crate) fn on_layer_visited(&mut self, layer: &ReadableLayer) {
         self.layers_visited += 1;
         if let ReadableLayer::PersistentLayer(layer) = layer {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index f2833674a9..73d2d72b59 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1253,6 +1253,57 @@ impl Timeline {
         }
     }
 
+    #[inline(always)]
+    pub(crate) async fn debug_get(
+        &self,
+        key: Key,
+        lsn: Lsn,
+        ctx: &RequestContext,
+        reconstruct_state: &mut ValuesReconstructState,
+    ) -> Result<Bytes, PageReconstructError> {
+        if !lsn.is_valid() {
+            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
+        }
+
+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
+        let query = VersionedKeySpaceQuery::uniform(KeySpace::single(key..key.next()), lsn);
+        let vectored_res = self
+            .debug_get_vectored_impl(query, reconstruct_state, ctx)
+            .await;
+
+        let key_value = vectored_res?.pop_first();
+        match key_value {
+            Some((got_key, value)) => {
+                if got_key != key {
+                    error!(
+                        "Expected {}, but singular vectored get returned {}",
+                        key, got_key
+                    );
+                    Err(PageReconstructError::Other(anyhow!(
+                        "Singular vectored get returned wrong key"
+                    )))
+                } else {
+                    value
+                }
+            }
+            None => Err(PageReconstructError::MissingKey(Box::new(
+                MissingKeyError {
+                    keyspace: KeySpace::single(key..key.next()),
+                    shard: self.shard_identity.get_shard_number(&key),
+                    original_hwm_lsn: lsn,
+                    ancestor_lsn: None,
+                    backtrace: None,
+                    read_path: None,
+                    query: None,
+                },
+            ))),
+        }
+    }
+
     pub(crate) const LAYERS_VISITED_WARN_THRESHOLD: u32 = 100;
 
     /// Look up multiple page versions at a given LSN
@@ -1547,6 +1598,98 @@ impl Timeline {
         Ok(results)
     }
 
+    // A copy of the get_vectored_impl method except that we store the image and wal records into `reconstruct_state`.
+    // This is only used in the http getpage call for debugging purpose.
+    pub(super) async fn debug_get_vectored_impl(
+        &self,
+        query: VersionedKeySpaceQuery,
+        reconstruct_state: &mut ValuesReconstructState,
+        ctx: &RequestContext,
+    ) -> Result<BTreeMap<Key, Result<Bytes, PageReconstructError>>, GetVectoredError> {
+        if query.is_empty() {
+            return Ok(BTreeMap::default());
+        }
+
+        let read_path = if self.conf.enable_read_path_debugging || ctx.read_path_debug() {
+            Some(ReadPath::new(
+                query.total_keyspace(),
+                query.high_watermark_lsn()?,
+            ))
+        } else {
+            None
+        };
+
+        reconstruct_state.read_path = read_path;
+
+        let traversal_res: Result<(), _> = self
+            .get_vectored_reconstruct_data(query.clone(), reconstruct_state, ctx)
+            .await;
+
+        if let Err(err) = traversal_res {
+            // Wait for all the spawned IOs to complete.
+            // See comments on `spawn_io` inside `storage_layer` for more details.
+            let mut collect_futs = std::mem::take(&mut reconstruct_state.keys)
+                .into_values()
+                .map(|state| state.collect_pending_ios())
+                .collect::<FuturesUnordered<_>>();
+            while collect_futs.next().await.is_some() {}
+            return Err(err);
+        };
+
+        let reconstruct_state = Arc::new(Mutex::new(reconstruct_state));
+        let futs = FuturesUnordered::new();
+
+        for (key, state) in std::mem::take(&mut reconstruct_state.lock().unwrap().keys) {
+            let req_lsn_for_key = query.map_key_to_lsn(&key);
+            futs.push({
+                let walredo_self = self.myself.upgrade().expect("&self method holds the arc");
+                let rc_clone = Arc::clone(&reconstruct_state);
+
+                async move {
+                    assert_eq!(state.situation, ValueReconstructSituation::Complete);
+
+                    let converted = match state.collect_pending_ios().await {
+                        Ok(ok) => ok,
+                        Err(err) => {
+                            return (key, Err(err));
+                        }
+                    };
+                    DELTAS_PER_READ_GLOBAL.observe(converted.num_deltas() as f64);
+
+                    // The walredo module expects the records to be descending in terms of Lsn.
+                    // And we submit the IOs in that order, so, there shuold be no need to sort here.
+                    debug_assert!(
+                        converted
+                            .records
+                            .is_sorted_by_key(|(lsn, _)| std::cmp::Reverse(*lsn)),
+                        "{converted:?}"
+                    );
+                    {
+                        let mut guard = rc_clone.lock().unwrap();
+                        guard.set_debug_state(&converted);
+                    }
+                    (
+                        key,
+                        walredo_self
+                            .reconstruct_value(
+                                key,
+                                req_lsn_for_key,
+                                converted,
+                                RedoAttemptType::ReadPage,
+                            )
+                            .await,
+                    )
+                }
+            });
+        }
+
+        let results = futs
+            .collect::<BTreeMap<Key, Result<Bytes, PageReconstructError>>>()
+            .await;
+
+        Ok(results)
+    }
+
     /// Get last or prev record separately. Same as get_last_record_rlsn().last/prev.
     pub(crate) fn get_last_record_lsn(&self) -> Lsn {
         self.last_record_lsn.load().last

From 3e6fdb0aa671e876dddddaf167de4a036409019a Mon Sep 17 00:00:00 2001
From: Matthias van de Meent <matthias@neon.tech>
Date: Mon, 14 Jul 2025 18:47:07 +0200
Subject: [PATCH 115/163] Add and use [U]INT64_[HEX_]FORMAT for various
 [u]int64 needs (#12592)

We didn't consistently apply these, and it wasn't consistently solved.
With this patch we should have a more consistent approach to this, and
have less issues porting changes to newer versions.

This also removes some potentially buggy casts to `long` from `uint64` -
they could've truncated the value in systems where `long` only has 32
bits.
---
 pgxn/neon/communicator.c         | 50 ++++++++++++++++----------------
 pgxn/neon/neon_pgversioncompat.h |  4 +++
 2 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/pgxn/neon/communicator.c b/pgxn/neon/communicator.c
index bd53855eab..158b8940a3 100644
--- a/pgxn/neon/communicator.c
+++ b/pgxn/neon/communicator.c
@@ -421,7 +421,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
 {
 	if (resp->tag != T_NeonGetPageResponse && resp->tag != T_NeonErrorResponse)
 	{
-		neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=%ld, ring_flush=%ld, ring_unused=%ld",
+		neon_shard_log(slot->shard_no, PANIC, "Unexpected prefetch response %d, ring_receive=" UINT64_FORMAT ", ring_flush=" UINT64_FORMAT ", ring_unused=" UINT64_FORMAT "",
 					   resp->tag, MyPState->ring_receive, MyPState->ring_flush, MyPState->ring_unused);
 	}
 	if (neon_protocol_version >= 3)
@@ -438,7 +438,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
 				getpage_resp->req.blkno != slot->buftag.blockNum)
 			{
 				NEON_PANIC_CONNECTION_STATE(slot->shard_no, PANIC,
-											"Receive unexpected getpage response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
+											"Receive unexpected getpage response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u} to get page request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u, block=%u}",
 											resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(getpage_resp->req.rinfo), getpage_resp->req.forknum, getpage_resp->req.blkno,
 											slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since), RelFileInfoFmt(rinfo), slot->buftag.forkNum, slot->buftag.blockNum);
 			}
@@ -447,7 +447,7 @@ check_getpage_response(PrefetchRequest* slot, NeonResponse* resp)
 				 resp->lsn != slot->request_lsns.request_lsn ||
 				 resp->not_modified_since != slot->request_lsns.not_modified_since)
 		{
-			elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+			elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 				 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 				 slot->reqid, LSN_FORMAT_ARGS(slot->request_lsns.request_lsn), LSN_FORMAT_ARGS(slot->request_lsns.not_modified_since));
 		}
@@ -496,9 +496,9 @@ communicator_prefetch_pump_state(void)
 			slot->my_ring_index != MyPState->ring_receive)
 		{
 			neon_shard_log(slot->shard_no, PANIC,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
 						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+						   slot->my_ring_index, MyPState->ring_receive);
 		}
 		/* update prefetch state */
 		MyPState->n_responses_buffered += 1;
@@ -789,9 +789,9 @@ prefetch_read(PrefetchRequest *slot)
 		slot->my_ring_index != MyPState->ring_receive)
 	{
 		neon_shard_log(slot->shard_no, PANIC,
-					   "Incorrect prefetch read: status=%d response=%p my=%lu receive=%lu",
+					   "Incorrect prefetch read: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
 					   slot->status, slot->response,
-					   (long)slot->my_ring_index, (long)MyPState->ring_receive);
+					   slot->my_ring_index, MyPState->ring_receive);
 	}
 
 	/*
@@ -816,9 +816,9 @@ prefetch_read(PrefetchRequest *slot)
 			slot->my_ring_index != MyPState->ring_receive)
 		{
 			neon_shard_log(shard_no, PANIC,
-						   "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu",
+						   "Incorrect prefetch slot state after receive: status=%d response=%p my=" UINT64_FORMAT " receive=" UINT64_FORMAT "",
 						   slot->status, slot->response,
-						   (long) slot->my_ring_index, (long) MyPState->ring_receive);
+						   slot->my_ring_index, MyPState->ring_receive);
 		}
 
 		/* update prefetch state */
@@ -852,8 +852,8 @@ prefetch_read(PrefetchRequest *slot)
 		 * and the prefetch queue was flushed during the receive call
 		 */
 		neon_shard_log(shard_no, LOG,
-					   "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
-					   (long) my_ring_index,
+					   "No response from reading prefetch entry " UINT64_FORMAT ": %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect",
+					   my_ring_index,
 					   RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)),
 					   buftag.forkNum, buftag.blockNum);
 		return false;
@@ -1844,7 +1844,7 @@ nm_to_string(NeonMessage *msg)
 				NeonDbSizeResponse *msg_resp = (NeonDbSizeResponse *) msg;
 
 				appendStringInfoString(&s, "{\"type\": \"NeonDbSizeResponse\"");
-				appendStringInfo(&s, ", \"db_size\": %ld}",
+				appendStringInfo(&s, ", \"db_size\": " INT64_FORMAT "}",
 								 msg_resp->db_size);
 				appendStringInfoChar(&s, '}');
 
@@ -2045,7 +2045,7 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
 						exists_resp->req.forknum != request.forknum)
 					{
 						NEON_PANIC_CONNECTION_STATE(0, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to exits request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(exists_resp->req.rinfo), exists_resp->req.forknum,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), request.forknum);
 					}
@@ -2058,14 +2058,14 @@ communicator_exists(NRelFileInfo rinfo, ForkNumber forkNum, neon_request_lsns *r
 				{
 					if (!equal_requests(resp, &request.hdr))
 					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match exists request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 					}
 				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation existence of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 								resp->reqid,
 								RelFileInfoFmt(rinfo),
 								forkNum,
@@ -2241,7 +2241,7 @@ Retry:
 			case T_NeonErrorResponse:
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[shard %d, reqid %lx] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[shard %d, reqid " UINT64_HEX_FORMAT "] could not read block %u in rel %u/%u/%u.%u from page server at lsn %X/%08X",
 								slot->shard_no, resp->reqid, blockno, RelFileInfoFmt(rinfo),
 								forkNum, LSN_FORMAT_ARGS(reqlsns->effective_request_lsn)),
 						 errdetail("page server returned error: %s",
@@ -2294,7 +2294,7 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
 						relsize_resp->req.forknum != forknum)
 					{
 						NEON_PANIC_CONNECTION_STATE(0, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
+													"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u} to get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, rel=%u/%u/%u.%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), RelFileInfoFmt(relsize_resp->req.rinfo), relsize_resp->req.forknum,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), RelFileInfoFmt(request.rinfo), forknum);
 					}
@@ -2307,14 +2307,14 @@ communicator_nblocks(NRelFileInfo rinfo, ForkNumber forknum, neon_request_lsns *
 				{
 					if (!equal_requests(resp, &request.hdr))
 					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get relsize request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 					}
 				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read relation size of rel %u/%u/%u.%u from page server at lsn %X/%08X",
 								resp->reqid,
 								RelFileInfoFmt(rinfo),
 								forknum,
@@ -2364,7 +2364,7 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
 						dbsize_resp->req.dbNode != dbNode)
 					{
 						NEON_PANIC_CONNECTION_STATE(0, PANIC,
-													"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
+													"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u} to get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, dbNode=%u}",
 													resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), dbsize_resp->req.dbNode,
 													request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), dbNode);
 					}
@@ -2377,14 +2377,14 @@ communicator_dbsize(Oid dbNode, neon_request_lsns *request_lsns)
 				{
 					if (!equal_requests(resp, &request.hdr))
 					{
-						elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+						elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get DB size request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 							 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 							 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 					}
 				}
 				ereport(ERROR,
 						(errcode(ERRCODE_IO_ERROR),
-						 errmsg(NEON_TAG "[reqid %lx] could not read db size of db %u from page server at lsn %X/%08X",
+						 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read db size of db %u from page server at lsn %X/%08X",
 								resp->reqid,
 								dbNode, LSN_FORMAT_ARGS(request_lsns->effective_request_lsn)),
 						 errdetail("page server returned error: %s",
@@ -2455,7 +2455,7 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 					slru_resp->req.segno != segno)
 				{
 					NEON_PANIC_CONNECTION_STATE(0, PANIC,
-												"Unexpect response {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
+												"Unexpect response {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%u} to get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X, kind=%u, segno=%lluu}",
 												resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since), slru_resp->req.kind, slru_resp->req.segno,
 												request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since), kind, (unsigned long long) segno);
 				}
@@ -2469,14 +2469,14 @@ communicator_read_slru_segment(SlruKind kind, int64 segno, neon_request_lsns *re
 			{
 				if (!equal_requests(resp, &request.hdr))
 				{
-					elog(WARNING, NEON_TAG "Error message {reqid=%lx,lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=%lx,lsn=%X/%08X, since=%X/%08X}",
+					elog(WARNING, NEON_TAG "Error message {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X} doesn't match get SLRU segment request {reqid=" UINT64_HEX_FORMAT ",lsn=%X/%08X, since=%X/%08X}",
 						 resp->reqid, LSN_FORMAT_ARGS(resp->lsn), LSN_FORMAT_ARGS(resp->not_modified_since),
 						 request.hdr.reqid, LSN_FORMAT_ARGS(request.hdr.lsn), LSN_FORMAT_ARGS(request.hdr.not_modified_since));
 				}
 			}
 			ereport(ERROR,
 					(errcode(ERRCODE_IO_ERROR),
-					 errmsg(NEON_TAG "[reqid %lx] could not read SLRU %d segment %llu at lsn %X/%08X",
+					 errmsg(NEON_TAG "[reqid " UINT64_HEX_FORMAT "] could not read SLRU %d segment %llu at lsn %X/%08X",
 							resp->reqid,
 							kind,
 							(unsigned long long) segno,
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index 787bd552f8..c7574ef0f9 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -165,4 +165,8 @@ extern void InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags);
 extern TimeLineID GetWALInsertionTimeLine(void);
 #endif
 
+/* format codes not present in PG17-; but available in PG18+ */
+#define INT64_HEX_FORMAT "%" INT64_MODIFIER "x"
+#define UINT64_HEX_FORMAT "%" INT64_MODIFIER "x"
+
 #endif							/* NEON_PGVERSIONCOMPAT_H */

From a456e818afbf7a82be0bf72761d6025c1e17b99a Mon Sep 17 00:00:00 2001
From: Mikhail <to@myrrc.dev>
Date: Mon, 14 Jul 2025 18:37:47 +0100
Subject: [PATCH 116/163] LFC prewarm perftest: increase timeout for
 initialization job (#12594)

Tests on
https://github.com/neondatabase/neon/actions/runs/16268609007/job/45930162686
time out due to pgbench init job taking more than 30 minutes to run.
Increase test timeout duration to 2 hours.
---
 test_runner/performance/test_lfc_prewarm.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/test_runner/performance/test_lfc_prewarm.py b/test_runner/performance/test_lfc_prewarm.py
index ad2c759a63..6c0083de95 100644
--- a/test_runner/performance/test_lfc_prewarm.py
+++ b/test_runner/performance/test_lfc_prewarm.py
@@ -60,7 +60,7 @@ def test_compare_prewarmed_pgbench_perf(neon_compare: NeonCompare):
 
 
 @pytest.mark.remote_cluster
-@pytest.mark.timeout(30 * 60)
+@pytest.mark.timeout(2 * 60 * 60)
 def test_compare_prewarmed_pgbench_perf_benchmark(
     pg_bin: PgBin,
     neon_api: NeonAPI,
@@ -91,8 +91,9 @@ def benchmark_impl(
     test_duration_min = 5
     pgbench_duration = f"-T{test_duration_min * 60}"
     # prewarm API is not publicly exposed. In order to test performance of a
-    # fully prewarmed endpoint, wait after it restarts
-    prewarmed_sleep_secs = 30
+    # fully prewarmed endpoint, wait after it restarts.
+    # The number here is empirical, based on manual runs on staging
+    prewarmed_sleep_secs = 180
 
     branch_id = project["branch"]["id"]
     project_id = project["project"]["id"]

From 9a2456bea557b3f140fff9d3b40809b9b853af84 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Mon, 14 Jul 2025 21:42:36 +0300
Subject: [PATCH 117/163] Reduce noise from get_installed_extensions during e.g
 shut down (#12479)

All Errors that can occur during get_installed_extensions() come from
tokio-postgres functions, e.g. if the database is being shut down
("FATAL: terminating connection due to administrator command"). I'm
seeing a lot of such errors in the logs with the regression tests, with
very verbose stack traces. The compute_ctl stack trace is pretty useless
for errors originating from the Postgres connection, the error message
has all the information, so stop printing the stack trace.

I changed the result type of the functions to return the originating
tokio_postgres Error rather than anyhow::Error, so that if we introduce
other error sources to the functions where the stack trace might be
useful, we'll be forced to revisit this, probably by introducing a new
Error type that separates postgres errors from other errors. But this
will do for now.
---
 compute_tools/src/compute.rs              | 2 +-
 compute_tools/src/installed_extensions.rs | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 2e0b7d7b2e..8f42cf699b 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -2487,7 +2487,7 @@ pub async fn installed_extensions(conf: tokio_postgres::Config) -> Result<()> {
                 serde_json::to_string(&extensions).expect("failed to serialize extensions list")
             );
         }
-        Err(err) => error!("could not get installed extensions: {err:?}"),
+        Err(err) => error!("could not get installed extensions: {err}"),
     }
     Ok(())
 }
diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs
index 411e03b7ec..90e1a17be4 100644
--- a/compute_tools/src/installed_extensions.rs
+++ b/compute_tools/src/installed_extensions.rs
@@ -2,6 +2,7 @@ use std::collections::HashMap;
 
 use anyhow::Result;
 use compute_api::responses::{InstalledExtension, InstalledExtensions};
+use tokio_postgres::error::Error as PostgresError;
 use tokio_postgres::{Client, Config, NoTls};
 
 use crate::metrics::INSTALLED_EXTENSIONS;
@@ -10,7 +11,7 @@ use crate::metrics::INSTALLED_EXTENSIONS;
 /// and to make database listing query here more explicit.
 ///
 /// Limit the number of databases to 500 to avoid excessive load.
-async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
+async fn list_dbs(client: &mut Client) -> Result<Vec<String>, PostgresError> {
     // `pg_database.datconnlimit = -2` means that the database is in the
     // invalid state
     let databases = client
@@ -37,7 +38,9 @@ async fn list_dbs(client: &mut Client) -> Result<Vec<String>> {
 /// Same extension can be installed in multiple databases with different versions,
 /// so we report a separate metric (number of databases where it is installed)
 /// for each extension version.
-pub async fn get_installed_extensions(mut conf: Config) -> Result<InstalledExtensions> {
+pub async fn get_installed_extensions(
+    mut conf: Config,
+) -> Result<InstalledExtensions, PostgresError> {
     conf.application_name("compute_ctl:get_installed_extensions");
     let databases: Vec<String> = {
         let (mut client, connection) = conf.connect(NoTls).await?;

From ff526a1051b42443ad0cb6e81aff27a314b3482a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= <k.p.szafranski@gmail.com>
Date: Tue, 15 Jul 2025 09:42:48 +0200
Subject: [PATCH 118/163] [proxy] Recognize more cplane errors, use
 retry_delay_ms as TTL (#12543)

## Problem

Not all cplane errors are properly recognized and cached/retried.

## Summary of changes

Add more cplane error reasons. Also, use retry_delay_ms as cache TTL if
present.

Related to https://github.com/neondatabase/cloud/issues/19353
---
 proxy/src/cache/timed_lru.rs                  | 13 ++--
 .../control_plane/client/cplane_proxy_v1.rs   | 75 ++++++++++---------
 proxy/src/control_plane/errors.rs             | 49 ++++++------
 proxy/src/control_plane/messages.rs           | 33 ++++++--
 proxy/src/proxy/mod.rs                        | 10 ++-
 5 files changed, 107 insertions(+), 73 deletions(-)

diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs
index 183e1ea449..e87cf53ab9 100644
--- a/proxy/src/cache/timed_lru.rs
+++ b/proxy/src/cache/timed_lru.rs
@@ -14,8 +14,8 @@ use std::time::{Duration, Instant};
 use hashlink::{LruCache, linked_hash_map::RawEntryMut};
 use tracing::debug;
 
+use super::Cache;
 use super::common::Cached;
-use super::{Cache, timed_lru};
 
 /// An implementation of timed LRU cache with fixed capacity.
 /// Key properties:
@@ -30,7 +30,7 @@ use super::{Cache, timed_lru};
 ///
 /// * There's an API for immediate invalidation (removal) of a cache entry;
 ///   It's useful in case we know for sure that the entry is no longer correct.
-///   See [`timed_lru::Cached`] for more information.
+///   See [`Cached`] for more information.
 ///
 /// * Expired entries are kept in the cache, until they are evicted by the LRU policy,
 ///   or by a successful lookup (i.e. the entry hasn't expired yet).
@@ -217,15 +217,18 @@ impl<K: Hash + Eq + Clone, V: Clone> TimedLru<K, V> {
 }
 
 impl<K: Hash + Eq, V: Clone> TimedLru<K, V> {
-    /// Retrieve a cached entry in convenient wrapper.
-    pub(crate) fn get<Q>(&self, key: &Q) -> Option<timed_lru::Cached<&Self>>
+    /// Retrieve a cached entry in convenient wrapper, alongside timing information.
+    pub(crate) fn get_with_created_at<Q>(
+        &self,
+        key: &Q,
+    ) -> Option<Cached<&Self, (<Self as Cache>::Value, Instant)>>
     where
         K: Borrow<Q> + Clone,
         Q: Hash + Eq + ?Sized,
     {
         self.get_raw(key, |key, entry| Cached {
             token: Some((self, key.clone())),
-            value: entry.value.clone(),
+            value: (entry.value.clone(), entry.created_at),
         })
     }
 }
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
index fc263b73b1..bb785b8b0c 100644
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -23,12 +23,13 @@ use crate::control_plane::errors::{
     ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError,
 };
 use crate::control_plane::locks::ApiLocks;
-use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason};
+use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse};
 use crate::control_plane::{
     AccessBlockerFlags, AuthInfo, AuthSecret, CachedNodeInfo, EndpointAccessControl, NodeInfo,
     RoleAccessControl,
 };
 use crate::metrics::Metrics;
+use crate::proxy::retry::CouldRetry;
 use crate::rate_limiter::WakeComputeRateLimiter;
 use crate::types::{EndpointCacheKey, EndpointId, RoleName};
 use crate::{compute, http, scram};
@@ -382,16 +383,31 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
 
         macro_rules! check_cache {
             () => {
-                if let Some(cached) = self.caches.node_info.get(&key) {
-                    let (cached, info) = cached.take_value();
-                    let info = info.map_err(|c| {
-                        info!(key = &*key, "found cached wake_compute error");
-                        WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c)))
-                    })?;
+                if let Some(cached) = self.caches.node_info.get_with_created_at(&key) {
+                    let (cached, (info, created_at)) = cached.take_value();
+                    return match info {
+                        Err(mut msg) => {
+                            info!(key = &*key, "found cached wake_compute error");
 
-                    debug!(key = &*key, "found cached compute node info");
-                    ctx.set_project(info.aux.clone());
-                    return Ok(cached.map(|()| info));
+                            // if retry_delay_ms is set, reduce it by the amount of time it spent in cache
+                            if let Some(status) = &mut msg.status {
+                                if let Some(retry_info) = &mut status.details.retry_info {
+                                    retry_info.retry_delay_ms = retry_info
+                                        .retry_delay_ms
+                                        .saturating_sub(created_at.elapsed().as_millis() as u64)
+                                }
+                            }
+
+                            Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
+                                msg,
+                            )))
+                        }
+                        Ok(info) => {
+                            debug!(key = &*key, "found cached compute node info");
+                            ctx.set_project(info.aux.clone());
+                            Ok(cached.map(|()| info))
+                        }
+                    };
                 }
             };
         }
@@ -434,42 +450,29 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
                 Ok(cached.map(|()| node))
             }
             Err(err) => match err {
-                WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => {
-                    let Some(status) = &err.status else {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
-                    };
+                WakeComputeError::ControlPlane(ControlPlaneError::Message(ref msg)) => {
+                    let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info);
 
-                    let reason = status
-                        .details
-                        .error_info
-                        .map_or(Reason::Unknown, |x| x.reason);
-
-                    // if we can retry this error, do not cache it.
-                    if reason.can_retry() {
-                        return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                            err,
-                        )));
+                    // If we can retry this error, do not cache it,
+                    // unless we were given a retry delay.
+                    if msg.could_retry() && retry_info.is_none() {
+                        return Err(err);
                     }
 
-                    // at this point, we should only have quota errors.
                     debug!(
                         key = &*key,
                         "created a cache entry for the wake compute error"
                     );
 
-                    self.caches.node_info.insert_ttl(
-                        key,
-                        Err(err.clone()),
-                        Duration::from_secs(30),
-                    );
+                    let ttl = retry_info.map_or(Duration::from_secs(30), |r| {
+                        Duration::from_millis(r.retry_delay_ms)
+                    });
 
-                    Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
-                        err,
-                    )))
+                    self.caches.node_info.insert_ttl(key, Err(msg.clone()), ttl);
+
+                    Err(err)
                 }
-                err => return Err(err),
+                err => Err(err),
             },
         }
     }
diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs
index f640657d90..12843e48c7 100644
--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -43,28 +43,35 @@ impl UserFacingError for ControlPlaneError {
 }
 
 impl ReportableError for ControlPlaneError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
+    fn get_error_kind(&self) -> ErrorKind {
         match self {
             ControlPlaneError::Message(e) => match e.get_reason() {
-                Reason::RoleProtected => ErrorKind::User,
-                Reason::ResourceNotFound => ErrorKind::User,
-                Reason::ProjectNotFound => ErrorKind::User,
-                Reason::EndpointNotFound => ErrorKind::User,
-                Reason::BranchNotFound => ErrorKind::User,
+                Reason::RoleProtected
+                | Reason::ResourceNotFound
+                | Reason::ProjectNotFound
+                | Reason::EndpointNotFound
+                | Reason::EndpointDisabled
+                | Reason::BranchNotFound
+                | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User,
+
                 Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
-                Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota,
-                Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota,
-                Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota,
-                Reason::WrittenDataQuotaExceeded => ErrorKind::Quota,
-                Reason::DataTransferQuotaExceeded => ErrorKind::Quota,
-                Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota,
-                Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane,
-                Reason::LockAlreadyTaken => ErrorKind::ControlPlane,
-                Reason::RunningOperations => ErrorKind::ControlPlane,
-                Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane,
-                Reason::Unknown => ErrorKind::ControlPlane,
+
+                Reason::NonDefaultBranchComputeTimeExceeded
+                | Reason::ActiveTimeQuotaExceeded
+                | Reason::ComputeTimeQuotaExceeded
+                | Reason::WrittenDataQuotaExceeded
+                | Reason::DataTransferQuotaExceeded
+                | Reason::LogicalSizeQuotaExceeded
+                | Reason::ActiveEndpointsLimitExceeded => ErrorKind::Quota,
+
+                Reason::ConcurrencyLimitReached
+                | Reason::LockAlreadyTaken
+                | Reason::RunningOperations
+                | Reason::EndpointIdle
+                | Reason::ProjectUnderMaintenance
+                | Reason::Unknown => ErrorKind::ControlPlane,
             },
-            ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane,
+            ControlPlaneError::Transport(_) => ErrorKind::ControlPlane,
         }
     }
 }
@@ -120,10 +127,10 @@ impl UserFacingError for GetAuthInfoError {
 }
 
 impl ReportableError for GetAuthInfoError {
-    fn get_error_kind(&self) -> crate::error::ErrorKind {
+    fn get_error_kind(&self) -> ErrorKind {
         match self {
-            Self::BadSecret => crate::error::ErrorKind::ControlPlane,
-            Self::ApiError(_) => crate::error::ErrorKind::ControlPlane,
+            Self::BadSecret => ErrorKind::ControlPlane,
+            Self::ApiError(_) => ErrorKind::ControlPlane,
         }
     }
 }
diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs
index f0314f91f0..cf193ed268 100644
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -126,10 +126,16 @@ pub(crate) enum Reason {
     /// or that the subject doesn't have enough permissions to access the requested endpoint.
     #[serde(rename = "ENDPOINT_NOT_FOUND")]
     EndpointNotFound,
+    /// EndpointDisabled indicates that the endpoint has been disabled and does not accept connections.
+    #[serde(rename = "ENDPOINT_DISABLED")]
+    EndpointDisabled,
     /// BranchNotFound indicates that the branch wasn't found, usually due to the provided ID not being correct,
     /// or that the subject doesn't have enough permissions to access the requested branch.
     #[serde(rename = "BRANCH_NOT_FOUND")]
     BranchNotFound,
+    /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong.
+    #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")]
+    InvalidEphemeralEndpointOptions,
     /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
     #[serde(rename = "RATE_LIMIT_EXCEEDED")]
     RateLimitExceeded,
@@ -152,6 +158,9 @@ pub(crate) enum Reason {
     /// LogicalSizeQuotaExceeded indicates that the logical size quota was exceeded.
     #[serde(rename = "LOGICAL_SIZE_QUOTA_EXCEEDED")]
     LogicalSizeQuotaExceeded,
+    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
+    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
+    ActiveEndpointsLimitExceeded,
     /// RunningOperations indicates that the project already has some running operations
     /// and scheduling of new ones is prohibited.
     #[serde(rename = "RUNNING_OPERATIONS")]
@@ -162,9 +171,13 @@ pub(crate) enum Reason {
     /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken.
     #[serde(rename = "LOCK_ALREADY_TAKEN")]
     LockAlreadyTaken,
-    /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded.
-    #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")]
-    ActiveEndpointsLimitExceeded,
+    /// EndpointIdle indicates that the endpoint cannot become active, because it's idle.
+    #[serde(rename = "ENDPOINT_IDLE")]
+    EndpointIdle,
+    /// ProjectUnderMaintenance indicates that the project is currently ongoing maintenance,
+    /// and thus cannot accept connections.
+    #[serde(rename = "PROJECT_UNDER_MAINTENANCE")]
+    ProjectUnderMaintenance,
     #[default]
     #[serde(other)]
     Unknown,
@@ -184,13 +197,15 @@ impl Reason {
     pub(crate) fn can_retry(self) -> bool {
         match self {
             // do not retry role protected errors
-            // not a transitive error
+            // not a transient error
             Reason::RoleProtected => false,
-            // on retry, it will still not be found
+            // on retry, it will still not be found or valid
             Reason::ResourceNotFound
             | Reason::ProjectNotFound
             | Reason::EndpointNotFound
-            | Reason::BranchNotFound => false,
+            | Reason::EndpointDisabled
+            | Reason::BranchNotFound
+            | Reason::InvalidEphemeralEndpointOptions => false,
             // we were asked to go away
             Reason::RateLimitExceeded
             | Reason::NonDefaultBranchComputeTimeExceeded
@@ -200,11 +215,13 @@ impl Reason {
             | Reason::DataTransferQuotaExceeded
             | Reason::LogicalSizeQuotaExceeded
             | Reason::ActiveEndpointsLimitExceeded => false,
-            // transitive error. control plane is currently busy
+            // transient error. control plane is currently busy
             // but might be ready soon
             Reason::RunningOperations
             | Reason::ConcurrencyLimitReached
-            | Reason::LockAlreadyTaken => true,
+            | Reason::LockAlreadyTaken
+            | Reason::EndpointIdle
+            | Reason::ProjectUnderMaintenance => true,
             // unknown error. better not retry it.
             Reason::Unknown => false,
         }
diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs
index 08c81afa04..02651109e0 100644
--- a/proxy/src/proxy/mod.rs
+++ b/proxy/src/proxy/mod.rs
@@ -195,15 +195,18 @@ impl NeonOptions {
     // proxy options:
 
     /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute.
-    pub const PARAMS_COMPAT: &str = "proxy_params_compat";
+    pub const PARAMS_COMPAT: &'static str = "proxy_params_compat";
 
     // cplane options:
 
     /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN.
-    const LSN: &str = "lsn";
+    const LSN: &'static str = "lsn";
+
+    /// `TIMESTAMP` allows provisioning an ephemeral compute with time-travel to the provided timestamp.
+    const TIMESTAMP: &'static str = "timestamp";
 
     /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write.
-    const ENDPOINT_TYPE: &str = "endpoint_type";
+    const ENDPOINT_TYPE: &'static str = "endpoint_type";
 
     pub(crate) fn parse_params(params: &StartupMessageParams) -> Self {
         params
@@ -228,6 +231,7 @@ impl NeonOptions {
             // This is not a cplane option, we know it does not create ephemeral computes.
             Self::PARAMS_COMPAT => false,
             Self::LSN => true,
+            Self::TIMESTAMP => true,
             Self::ENDPOINT_TYPE => true,
             // err on the side of caution. any cplane options we don't know about
             // might lead to ephemeral computes.

From 7a7ab2a1d1c3ca8acfaa9664984b162a18607e87 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 15 Jul 2025 11:45:49 +0100
Subject: [PATCH 119/163] Move `build-tools.Dockerfile` ->
 `build-tools/Dockerfile` (#12590)

## Problem

This is a prerequisite for neondatabase/neon#12575 to keep all things
relevant to `build-tools` image in a single directory

## Summary of changes
- Rename `build_tools/` to `build-tools/`
- Move `build-tools.Dockerfile` to `build-tools/Dockerfile`
---
 .dockerignore                                           | 2 +-
 .github/workflows/_build-and-test-locally.yml           | 8 ++++----
 .github/workflows/build-build-tools-image.yml           | 4 ++--
 build-tools.Dockerfile => build-tools/Dockerfile        | 2 +-
 {build_tools => build-tools}/patches/pgcopydbv017.patch | 0
 compute/compute-node.Dockerfile                         | 6 +++---
 test_runner/regress/test_compute_metrics.py             | 2 +-
 7 files changed, 12 insertions(+), 12 deletions(-)
 rename build-tools.Dockerfile => build-tools/Dockerfile (99%)
 rename {build_tools => build-tools}/patches/pgcopydbv017.patch (100%)

diff --git a/.dockerignore b/.dockerignore
index 4d9433764e..aa44421fb6 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -27,4 +27,4 @@
 !storage_controller/
 !vendor/postgres-*/
 !workspace_hack/
-!build_tools/patches
+!build-tools/patches
diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml
index e2203a38ec..94115572df 100644
--- a/.github/workflows/_build-and-test-locally.yml
+++ b/.github/workflows/_build-and-test-locally.yml
@@ -150,7 +150,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v14
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
 
       - name: Cache postgres v15 build
         id: cache_pg_15
@@ -162,7 +162,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v15
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
 
       - name: Cache postgres v16 build
         id: cache_pg_16
@@ -174,7 +174,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v16
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
 
       - name: Cache postgres v17 build
         id: cache_pg_17
@@ -186,7 +186,7 @@ jobs:
           secretKey: ${{ secrets.HETZNER_CACHE_SECRET_KEY }}
           use-fallback: false
           path: pg_install/v17
-          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }}
+          key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools/Dockerfile') }}
 
       - name: Build all
         # Note: the Makefile picks up BUILD_TYPE and CARGO_PROFILE from the env variables
diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml
index 133c8635b6..24e4c8fa3d 100644
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -72,7 +72,7 @@ jobs:
           ARCHS: ${{ inputs.archs || '["x64","arm64"]' }}
           DEBIANS: ${{ inputs.debians || '["bullseye","bookworm"]' }}
           IMAGE_TAG: |
-            ${{ hashFiles('build-tools.Dockerfile',
+            ${{ hashFiles('build-tools/Dockerfile',
                           '.github/workflows/build-build-tools-image.yml') }}
         run: |
           echo "archs=${ARCHS}"           | tee -a ${GITHUB_OUTPUT}
@@ -144,7 +144,7 @@ jobs:
 
       - uses: docker/build-push-action@471d1dc4e07e5cdedd4c2171150001c434f0b7a4 # v6.15.0
         with:
-          file: build-tools.Dockerfile
+          file: build-tools/Dockerfile
           context: .
           provenance: false
           push: true
diff --git a/build-tools.Dockerfile b/build-tools/Dockerfile
similarity index 99%
rename from build-tools.Dockerfile
rename to build-tools/Dockerfile
index 14a52bd736..2ed7bb4f36 100644
--- a/build-tools.Dockerfile
+++ b/build-tools/Dockerfile
@@ -35,7 +35,7 @@ RUN echo 'Acquire::Retries "5";' > /etc/apt/apt.conf.d/80-retries && \
     echo -e "retry_connrefused=on\ntimeout=15\ntries=5\nretry-on-host-error=on\n" > /root/.wgetrc && \
     echo -e "--retry-connrefused\n--connect-timeout 15\n--retry 5\n--max-time 300\n" > /root/.curlrc
 
-COPY build_tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
+COPY build-tools/patches/pgcopydbv017.patch /pgcopydbv017.patch
 
 RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \
         set -e && \
diff --git a/build_tools/patches/pgcopydbv017.patch b/build-tools/patches/pgcopydbv017.patch
similarity index 100%
rename from build_tools/patches/pgcopydbv017.patch
rename to build-tools/patches/pgcopydbv017.patch
diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 39136fe573..232b1e3bd5 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -9,7 +9,7 @@
 #
 # build-tools:   This contains Rust compiler toolchain and other tools needed at compile
 #                time. This is also used for the storage builds. This image is defined in
-#                build-tools.Dockerfile.
+#                build-tools/Dockerfile.
 #
 # build-deps:    Contains C compiler, other build tools, and compile-time dependencies
 #                needed to compile PostgreSQL and most extensions. (Some extensions need
@@ -115,7 +115,7 @@ ARG EXTENSIONS=all
 FROM $BASE_IMAGE_SHA AS build-deps
 ARG DEBIAN_VERSION
 
-# Keep in sync with build-tools.Dockerfile
+# Keep in sync with build-tools/Dockerfile
 ENV PROTOC_VERSION=25.1
 
 # Use strict mode for bash to catch errors early
@@ -1790,7 +1790,7 @@ RUN set -e \
 #########################################################################################
 FROM build-deps AS exporters
 ARG TARGETARCH
-# Keep sql_exporter version same as in build-tools.Dockerfile and
+# Keep sql_exporter version same as in build-tools/Dockerfile and
 # test_runner/regress/test_compute_metrics.py
 # See comment on the top of the file regading `echo`, `-e` and `\n`
 RUN if [ "$TARGETARCH" = "amd64" ]; then\
diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py
index d1e61e597c..b776f58348 100644
--- a/test_runner/regress/test_compute_metrics.py
+++ b/test_runner/regress/test_compute_metrics.py
@@ -217,7 +217,7 @@ if SQL_EXPORTER is None:
             self, logs_dir: Path, config_file: Path, collector_file: Path, port: int
         ) -> None:
             # NOTE: Keep the version the same as in
-            # compute/compute-node.Dockerfile and build-tools.Dockerfile.
+            # compute/compute-node.Dockerfile and build-tools/Dockerfile.
             #
             # The "host" network mode allows sql_exporter to talk to the
             # endpoint which is running on the host.

From eb93c3e3c614f0735beea46bdab2c2d05b19c5ab Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 15 Jul 2025 11:06:58 +0000
Subject: [PATCH 120/163] build(deps): bump aiohttp from 3.10.11 to 3.12.14 in
 the pip group across 1 directory (#12600)

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 poetry.lock    | 209 ++++++++++++++++++++++++-------------------------
 pyproject.toml |   2 +-
 2 files changed, 104 insertions(+), 107 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 1bc5077eb7..b2072bf1bc 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2,127 +2,123 @@
 
 [[package]]
 name = "aiohappyeyeballs"
-version = "2.3.5"
+version = "2.6.1"
 description = "Happy Eyeballs for asyncio"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "aiohappyeyeballs-2.3.5-py3-none-any.whl", hash = "sha256:4d6dea59215537dbc746e93e779caea8178c866856a721c9c660d7a5a7b8be03"},
-    {file = "aiohappyeyeballs-2.3.5.tar.gz", hash = "sha256:6fa48b9f1317254f122a07a131a86b71ca6946ca989ce6326fff54a99a920105"},
+    {file = "aiohappyeyeballs-2.6.1-py3-none-any.whl", hash = "sha256:f349ba8f4b75cb25c99c5c2d84e997e485204d2902a9597802b0371f09331fb8"},
+    {file = "aiohappyeyeballs-2.6.1.tar.gz", hash = "sha256:c3f9d0113123803ccadfdf3f0faa505bc78e6a72d1cc4806cbd719826e943558"},
 ]
 
 [[package]]
 name = "aiohttp"
-version = "3.10.11"
+version = "3.12.14"
 description = "Async http client/server framework (asyncio)"
 optional = false
-python-versions = ">=3.8"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:5077b1a5f40ffa3ba1f40d537d3bec4383988ee51fbba6b74aa8fb1bc466599e"},
-    {file = "aiohttp-3.10.11-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:8d6a14a4d93b5b3c2891fca94fa9d41b2322a68194422bef0dd5ec1e57d7d298"},
-    {file = "aiohttp-3.10.11-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:ffbfde2443696345e23a3c597049b1dd43049bb65337837574205e7368472177"},
-    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:20b3d9e416774d41813bc02fdc0663379c01817b0874b932b81c7f777f67b217"},
-    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2b943011b45ee6bf74b22245c6faab736363678e910504dd7531a58c76c9015a"},
-    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48bc1d924490f0d0b3658fe5c4b081a4d56ebb58af80a6729d4bd13ea569797a"},
-    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e12eb3f4b1f72aaaf6acd27d045753b18101524f72ae071ae1c91c1cd44ef115"},
-    {file = "aiohttp-3.10.11-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f14ebc419a568c2eff3c1ed35f634435c24ead2fe19c07426af41e7adb68713a"},
-    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:72b191cdf35a518bfc7ca87d770d30941decc5aaf897ec8b484eb5cc8c7706f3"},
-    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:5ab2328a61fdc86424ee540d0aeb8b73bbcad7351fb7cf7a6546fc0bcffa0038"},
-    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:aa93063d4af05c49276cf14e419550a3f45258b6b9d1f16403e777f1addf4519"},
-    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:30283f9d0ce420363c24c5c2421e71a738a2155f10adbb1a11a4d4d6d2715cfc"},
-    {file = "aiohttp-3.10.11-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e5358addc8044ee49143c546d2182c15b4ac3a60be01c3209374ace05af5733d"},
-    {file = "aiohttp-3.10.11-cp310-cp310-win32.whl", hash = "sha256:e1ffa713d3ea7cdcd4aea9cddccab41edf6882fa9552940344c44e59652e1120"},
-    {file = "aiohttp-3.10.11-cp310-cp310-win_amd64.whl", hash = "sha256:778cbd01f18ff78b5dd23c77eb82987ee4ba23408cbed233009fd570dda7e674"},
-    {file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:80ff08556c7f59a7972b1e8919f62e9c069c33566a6d28586771711e0eea4f07"},
-    {file = "aiohttp-3.10.11-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:2c8f96e9ee19f04c4914e4e7a42a60861066d3e1abf05c726f38d9d0a466e695"},
-    {file = "aiohttp-3.10.11-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:fb8601394d537da9221947b5d6e62b064c9a43e88a1ecd7414d21a1a6fba9c24"},
-    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ea224cf7bc2d8856d6971cea73b1d50c9c51d36971faf1abc169a0d5f85a382"},
-    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:db9503f79e12d5d80b3efd4d01312853565c05367493379df76d2674af881caa"},
-    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0f449a50cc33f0384f633894d8d3cd020e3ccef81879c6e6245c3c375c448625"},
-    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82052be3e6d9e0c123499127782a01a2b224b8af8c62ab46b3f6197035ad94e9"},
-    {file = "aiohttp-3.10.11-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:20063c7acf1eec550c8eb098deb5ed9e1bb0521613b03bb93644b810986027ac"},
-    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:489cced07a4c11488f47aab1f00d0c572506883f877af100a38f1fedaa884c3a"},
-    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:ea9b3bab329aeaa603ed3bf605f1e2a6f36496ad7e0e1aa42025f368ee2dc07b"},
-    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:ca117819d8ad113413016cb29774b3f6d99ad23c220069789fc050267b786c16"},
-    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:2dfb612dcbe70fb7cdcf3499e8d483079b89749c857a8f6e80263b021745c730"},
-    {file = "aiohttp-3.10.11-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f9b615d3da0d60e7d53c62e22b4fd1c70f4ae5993a44687b011ea3a2e49051b8"},
-    {file = "aiohttp-3.10.11-cp311-cp311-win32.whl", hash = "sha256:29103f9099b6068bbdf44d6a3d090e0a0b2be6d3c9f16a070dd9d0d910ec08f9"},
-    {file = "aiohttp-3.10.11-cp311-cp311-win_amd64.whl", hash = "sha256:236b28ceb79532da85d59aa9b9bf873b364e27a0acb2ceaba475dc61cffb6f3f"},
-    {file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:7480519f70e32bfb101d71fb9a1f330fbd291655a4c1c922232a48c458c52710"},
-    {file = "aiohttp-3.10.11-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:f65267266c9aeb2287a6622ee2bb39490292552f9fbf851baabc04c9f84e048d"},
-    {file = "aiohttp-3.10.11-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7400a93d629a0608dc1d6c55f1e3d6e07f7375745aaa8bd7f085571e4d1cee97"},
-    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f34b97e4b11b8d4eb2c3a4f975be626cc8af99ff479da7de49ac2c6d02d35725"},
-    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1e7b825da878464a252ccff2958838f9caa82f32a8dbc334eb9b34a026e2c636"},
-    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f9f92a344c50b9667827da308473005f34767b6a2a60d9acff56ae94f895f385"},
-    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc6f1ab987a27b83c5268a17218463c2ec08dbb754195113867a27b166cd6087"},
-    {file = "aiohttp-3.10.11-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1dc0f4ca54842173d03322793ebcf2c8cc2d34ae91cc762478e295d8e361e03f"},
-    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:7ce6a51469bfaacff146e59e7fb61c9c23006495d11cc24c514a455032bcfa03"},
-    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:aad3cd91d484d065ede16f3cf15408254e2469e3f613b241a1db552c5eb7ab7d"},
-    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:f4df4b8ca97f658c880fb4b90b1d1ec528315d4030af1ec763247ebfd33d8b9a"},
-    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:2e4e18a0a2d03531edbc06c366954e40a3f8d2a88d2b936bbe78a0c75a3aab3e"},
-    {file = "aiohttp-3.10.11-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:6ce66780fa1a20e45bc753cda2a149daa6dbf1561fc1289fa0c308391c7bc0a4"},
-    {file = "aiohttp-3.10.11-cp312-cp312-win32.whl", hash = "sha256:a919c8957695ea4c0e7a3e8d16494e3477b86f33067478f43106921c2fef15bb"},
-    {file = "aiohttp-3.10.11-cp312-cp312-win_amd64.whl", hash = "sha256:b5e29706e6389a2283a91611c91bf24f218962717c8f3b4e528ef529d112ee27"},
-    {file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:703938e22434d7d14ec22f9f310559331f455018389222eed132808cd8f44127"},
-    {file = "aiohttp-3.10.11-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9bc50b63648840854e00084c2b43035a62e033cb9b06d8c22b409d56eb098413"},
-    {file = "aiohttp-3.10.11-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5f0463bf8b0754bc744e1feb61590706823795041e63edf30118a6f0bf577461"},
-    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f6c6dec398ac5a87cb3a407b068e1106b20ef001c344e34154616183fe684288"},
-    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:bcaf2d79104d53d4dcf934f7ce76d3d155302d07dae24dff6c9fffd217568067"},
-    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:25fd5470922091b5a9aeeb7e75be609e16b4fba81cdeaf12981393fb240dd10e"},
-    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bbde2ca67230923a42161b1f408c3992ae6e0be782dca0c44cb3206bf330dee1"},
-    {file = "aiohttp-3.10.11-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:249c8ff8d26a8b41a0f12f9df804e7c685ca35a207e2410adbd3e924217b9006"},
-    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:878ca6a931ee8c486a8f7b432b65431d095c522cbeb34892bee5be97b3481d0f"},
-    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:8663f7777ce775f0413324be0d96d9730959b2ca73d9b7e2c2c90539139cbdd6"},
-    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:6cd3f10b01f0c31481fba8d302b61603a2acb37b9d30e1d14e0f5a58b7b18a31"},
-    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:4e8d8aad9402d3aa02fdc5ca2fe68bcb9fdfe1f77b40b10410a94c7f408b664d"},
-    {file = "aiohttp-3.10.11-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:38e3c4f80196b4f6c3a85d134a534a56f52da9cb8d8e7af1b79a32eefee73a00"},
-    {file = "aiohttp-3.10.11-cp313-cp313-win32.whl", hash = "sha256:fc31820cfc3b2863c6e95e14fcf815dc7afe52480b4dc03393c4873bb5599f71"},
-    {file = "aiohttp-3.10.11-cp313-cp313-win_amd64.whl", hash = "sha256:4996ff1345704ffdd6d75fb06ed175938c133425af616142e7187f28dc75f14e"},
-    {file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:74baf1a7d948b3d640badeac333af581a367ab916b37e44cf90a0334157cdfd2"},
-    {file = "aiohttp-3.10.11-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:473aebc3b871646e1940c05268d451f2543a1d209f47035b594b9d4e91ce8339"},
-    {file = "aiohttp-3.10.11-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:c2f746a6968c54ab2186574e15c3f14f3e7f67aef12b761e043b33b89c5b5f95"},
-    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d110cabad8360ffa0dec8f6ec60e43286e9d251e77db4763a87dcfe55b4adb92"},
-    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e0099c7d5d7afff4202a0c670e5b723f7718810000b4abcbc96b064129e64bc7"},
-    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0316e624b754dbbf8c872b62fe6dcb395ef20c70e59890dfa0de9eafccd2849d"},
-    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5a5f7ab8baf13314e6b2485965cbacb94afff1e93466ac4d06a47a81c50f9cca"},
-    {file = "aiohttp-3.10.11-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c891011e76041e6508cbfc469dd1a8ea09bc24e87e4c204e05f150c4c455a5fa"},
-    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:9208299251370ee815473270c52cd3f7069ee9ed348d941d574d1457d2c73e8b"},
-    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:459f0f32c8356e8125f45eeff0ecf2b1cb6db1551304972702f34cd9e6c44658"},
-    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:14cdc8c1810bbd4b4b9f142eeee23cda528ae4e57ea0923551a9af4820980e39"},
-    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:971aa438a29701d4b34e4943e91b5e984c3ae6ccbf80dd9efaffb01bd0b243a9"},
-    {file = "aiohttp-3.10.11-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:9a309c5de392dfe0f32ee57fa43ed8fc6ddf9985425e84bd51ed66bb16bce3a7"},
-    {file = "aiohttp-3.10.11-cp38-cp38-win32.whl", hash = "sha256:9ec1628180241d906a0840b38f162a3215114b14541f1a8711c368a8739a9be4"},
-    {file = "aiohttp-3.10.11-cp38-cp38-win_amd64.whl", hash = "sha256:9c6e0ffd52c929f985c7258f83185d17c76d4275ad22e90aa29f38e211aacbec"},
-    {file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:cdc493a2e5d8dc79b2df5bec9558425bcd39aff59fc949810cbd0832e294b106"},
-    {file = "aiohttp-3.10.11-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:b3e70f24e7d0405be2348da9d5a7836936bf3a9b4fd210f8c37e8d48bc32eca6"},
-    {file = "aiohttp-3.10.11-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:968b8fb2a5eee2770eda9c7b5581587ef9b96fbdf8dcabc6b446d35ccc69df01"},
-    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:deef4362af9493d1382ef86732ee2e4cbc0d7c005947bd54ad1a9a16dd59298e"},
-    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:686b03196976e327412a1b094f4120778c7c4b9cff9bce8d2fdfeca386b89829"},
-    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:3bf6d027d9d1d34e1c2e1645f18a6498c98d634f8e373395221121f1c258ace8"},
-    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:099fd126bf960f96d34a760e747a629c27fb3634da5d05c7ef4d35ef4ea519fc"},
-    {file = "aiohttp-3.10.11-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c73c4d3dae0b4644bc21e3de546530531d6cdc88659cdeb6579cd627d3c206aa"},
-    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:0c5580f3c51eea91559db3facd45d72e7ec970b04528b4709b1f9c2555bd6d0b"},
-    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:fdf6429f0caabfd8a30c4e2eaecb547b3c340e4730ebfe25139779b9815ba138"},
-    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:d97187de3c276263db3564bb9d9fad9e15b51ea10a371ffa5947a5ba93ad6777"},
-    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:0acafb350cfb2eba70eb5d271f55e08bd4502ec35e964e18ad3e7d34d71f7261"},
-    {file = "aiohttp-3.10.11-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:c13ed0c779911c7998a58e7848954bd4d63df3e3575f591e321b19a2aec8df9f"},
-    {file = "aiohttp-3.10.11-cp39-cp39-win32.whl", hash = "sha256:22b7c540c55909140f63ab4f54ec2c20d2635c0289cdd8006da46f3327f971b9"},
-    {file = "aiohttp-3.10.11-cp39-cp39-win_amd64.whl", hash = "sha256:7b26b1551e481012575dab8e3727b16fe7dd27eb2711d2e63ced7368756268fb"},
-    {file = "aiohttp-3.10.11.tar.gz", hash = "sha256:9dc2b8f3dcab2e39e0fa309c8da50c3b55e6f34ab25f1a71d3288f24924d33a7"},
+    {file = "aiohttp-3.12.14-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:906d5075b5ba0dd1c66fcaaf60eb09926a9fef3ca92d912d2a0bbdbecf8b1248"},
+    {file = "aiohttp-3.12.14-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c875bf6fc2fd1a572aba0e02ef4e7a63694778c5646cdbda346ee24e630d30fb"},
+    {file = "aiohttp-3.12.14-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fbb284d15c6a45fab030740049d03c0ecd60edad9cd23b211d7e11d3be8d56fd"},
+    {file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:38e360381e02e1a05d36b223ecab7bc4a6e7b5ab15760022dc92589ee1d4238c"},
+    {file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:aaf90137b5e5d84a53632ad95ebee5c9e3e7468f0aab92ba3f608adcb914fa95"},
+    {file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e532a25e4a0a2685fa295a31acf65e027fbe2bea7a4b02cdfbbba8a064577663"},
+    {file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:eab9762c4d1b08ae04a6c77474e6136da722e34fdc0e6d6eab5ee93ac29f35d1"},
+    {file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:abe53c3812b2899889a7fca763cdfaeee725f5be68ea89905e4275476ffd7e61"},
+    {file = "aiohttp-3.12.14-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5760909b7080aa2ec1d320baee90d03b21745573780a072b66ce633eb77a8656"},
+    {file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:02fcd3f69051467bbaa7f84d7ec3267478c7df18d68b2e28279116e29d18d4f3"},
+    {file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_armv7l.whl", hash = "sha256:4dcd1172cd6794884c33e504d3da3c35648b8be9bfa946942d353b939d5f1288"},
+    {file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:224d0da41355b942b43ad08101b1b41ce633a654128ee07e36d75133443adcda"},
+    {file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e387668724f4d734e865c1776d841ed75b300ee61059aca0b05bce67061dcacc"},
+    {file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:dec9cde5b5a24171e0b0a4ca064b1414950904053fb77c707efd876a2da525d8"},
+    {file = "aiohttp-3.12.14-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:bbad68a2af4877cc103cd94af9160e45676fc6f0c14abb88e6e092b945c2c8e3"},
+    {file = "aiohttp-3.12.14-cp310-cp310-win32.whl", hash = "sha256:ee580cb7c00bd857b3039ebca03c4448e84700dc1322f860cf7a500a6f62630c"},
+    {file = "aiohttp-3.12.14-cp310-cp310-win_amd64.whl", hash = "sha256:cf4f05b8cea571e2ccc3ca744e35ead24992d90a72ca2cf7ab7a2efbac6716db"},
+    {file = "aiohttp-3.12.14-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f4552ff7b18bcec18b60a90c6982049cdb9dac1dba48cf00b97934a06ce2e597"},
+    {file = "aiohttp-3.12.14-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:8283f42181ff6ccbcf25acaae4e8ab2ff7e92b3ca4a4ced73b2c12d8cd971393"},
+    {file = "aiohttp-3.12.14-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:040afa180ea514495aaff7ad34ec3d27826eaa5d19812730fe9e529b04bb2179"},
+    {file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b413c12f14c1149f0ffd890f4141a7471ba4b41234fe4fd4a0ff82b1dc299dbb"},
+    {file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:1d6f607ce2e1a93315414e3d448b831238f1874b9968e1195b06efaa5c87e245"},
+    {file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:565e70d03e924333004ed101599902bba09ebb14843c8ea39d657f037115201b"},
+    {file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:4699979560728b168d5ab63c668a093c9570af2c7a78ea24ca5212c6cdc2b641"},
+    {file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ad5fdf6af93ec6c99bf800eba3af9a43d8bfd66dce920ac905c817ef4a712afe"},
+    {file = "aiohttp-3.12.14-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4ac76627c0b7ee0e80e871bde0d376a057916cb008a8f3ffc889570a838f5cc7"},
+    {file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:798204af1180885651b77bf03adc903743a86a39c7392c472891649610844635"},
+    {file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_armv7l.whl", hash = "sha256:4f1205f97de92c37dd71cf2d5bcfb65fdaed3c255d246172cce729a8d849b4da"},
+    {file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:76ae6f1dd041f85065d9df77c6bc9c9703da9b5c018479d20262acc3df97d419"},
+    {file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:a194ace7bc43ce765338ca2dfb5661489317db216ea7ea700b0332878b392cab"},
+    {file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:16260e8e03744a6fe3fcb05259eeab8e08342c4c33decf96a9dad9f1187275d0"},
+    {file = "aiohttp-3.12.14-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8c779e5ebbf0e2e15334ea404fcce54009dc069210164a244d2eac8352a44b28"},
+    {file = "aiohttp-3.12.14-cp311-cp311-win32.whl", hash = "sha256:a289f50bf1bd5be227376c067927f78079a7bdeccf8daa6a9e65c38bae14324b"},
+    {file = "aiohttp-3.12.14-cp311-cp311-win_amd64.whl", hash = "sha256:0b8a69acaf06b17e9c54151a6c956339cf46db4ff72b3ac28516d0f7068f4ced"},
+    {file = "aiohttp-3.12.14-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:a0ecbb32fc3e69bc25efcda7d28d38e987d007096cbbeed04f14a6662d0eee22"},
+    {file = "aiohttp-3.12.14-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:0400f0ca9bb3e0b02f6466421f253797f6384e9845820c8b05e976398ac1d81a"},
+    {file = "aiohttp-3.12.14-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:a56809fed4c8a830b5cae18454b7464e1529dbf66f71c4772e3cfa9cbec0a1ff"},
+    {file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:27f2e373276e4755691a963e5d11756d093e346119f0627c2d6518208483fb6d"},
+    {file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:ca39e433630e9a16281125ef57ece6817afd1d54c9f1bf32e901f38f16035869"},
+    {file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9c748b3f8b14c77720132b2510a7d9907a03c20ba80f469e58d5dfd90c079a1c"},
+    {file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a568abe1b15ce69d4cc37e23020720423f0728e3cb1f9bcd3f53420ec3bfe7"},
+    {file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9888e60c2c54eaf56704b17feb558c7ed6b7439bca1e07d4818ab878f2083660"},
+    {file = "aiohttp-3.12.14-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3006a1dc579b9156de01e7916d38c63dc1ea0679b14627a37edf6151bc530088"},
+    {file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:aa8ec5c15ab80e5501a26719eb48a55f3c567da45c6ea5bb78c52c036b2655c7"},
+    {file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_armv7l.whl", hash = "sha256:39b94e50959aa07844c7fe2206b9f75d63cc3ad1c648aaa755aa257f6f2498a9"},
+    {file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:04c11907492f416dad9885d503fbfc5dcb6768d90cad8639a771922d584609d3"},
+    {file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:88167bd9ab69bb46cee91bd9761db6dfd45b6e76a0438c7e884c3f8160ff21eb"},
+    {file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:791504763f25e8f9f251e4688195e8b455f8820274320204f7eafc467e609425"},
+    {file = "aiohttp-3.12.14-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:2785b112346e435dd3a1a67f67713a3fe692d288542f1347ad255683f066d8e0"},
+    {file = "aiohttp-3.12.14-cp312-cp312-win32.whl", hash = "sha256:15f5f4792c9c999a31d8decf444e79fcfd98497bf98e94284bf390a7bb8c1729"},
+    {file = "aiohttp-3.12.14-cp312-cp312-win_amd64.whl", hash = "sha256:3b66e1a182879f579b105a80d5c4bd448b91a57e8933564bf41665064796a338"},
+    {file = "aiohttp-3.12.14-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:3143a7893d94dc82bc409f7308bc10d60285a3cd831a68faf1aa0836c5c3c767"},
+    {file = "aiohttp-3.12.14-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:3d62ac3d506cef54b355bd34c2a7c230eb693880001dfcda0bf88b38f5d7af7e"},
+    {file = "aiohttp-3.12.14-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:48e43e075c6a438937c4de48ec30fa8ad8e6dfef122a038847456bfe7b947b63"},
+    {file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:077b4488411a9724cecc436cbc8c133e0d61e694995b8de51aaf351c7578949d"},
+    {file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:d8c35632575653f297dcbc9546305b2c1133391089ab925a6a3706dfa775ccab"},
+    {file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6b8ce87963f0035c6834b28f061df90cf525ff7c9b6283a8ac23acee6502afd4"},
+    {file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f0a2cf66e32a2563bb0766eb24eae7e9a269ac0dc48db0aae90b575dc9583026"},
+    {file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cdea089caf6d5cde975084a884c72d901e36ef9c2fd972c9f51efbbc64e96fbd"},
+    {file = "aiohttp-3.12.14-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a7865f27db67d49e81d463da64a59365ebd6b826e0e4847aa111056dcb9dc88"},
+    {file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:0ab5b38a6a39781d77713ad930cb5e7feea6f253de656a5f9f281a8f5931b086"},
+    {file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_armv7l.whl", hash = "sha256:9b3b15acee5c17e8848d90a4ebc27853f37077ba6aec4d8cb4dbbea56d156933"},
+    {file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:e4c972b0bdaac167c1e53e16a16101b17c6d0ed7eac178e653a07b9f7fad7151"},
+    {file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:7442488b0039257a3bdbc55f7209587911f143fca11df9869578db6c26feeeb8"},
+    {file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:f68d3067eecb64c5e9bab4a26aa11bd676f4c70eea9ef6536b0a4e490639add3"},
+    {file = "aiohttp-3.12.14-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f88d3704c8b3d598a08ad17d06006cb1ca52a1182291f04979e305c8be6c9758"},
+    {file = "aiohttp-3.12.14-cp313-cp313-win32.whl", hash = "sha256:a3c99ab19c7bf375c4ae3debd91ca5d394b98b6089a03231d4c580ef3c2ae4c5"},
+    {file = "aiohttp-3.12.14-cp313-cp313-win_amd64.whl", hash = "sha256:3f8aad695e12edc9d571f878c62bedc91adf30c760c8632f09663e5f564f4baa"},
+    {file = "aiohttp-3.12.14-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:b8cc6b05e94d837bcd71c6531e2344e1ff0fb87abe4ad78a9261d67ef5d83eae"},
+    {file = "aiohttp-3.12.14-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:d1dcb015ac6a3b8facd3677597edd5ff39d11d937456702f0bb2b762e390a21b"},
+    {file = "aiohttp-3.12.14-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3779ed96105cd70ee5e85ca4f457adbce3d9ff33ec3d0ebcdf6c5727f26b21b3"},
+    {file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:717a0680729b4ebd7569c1dcd718c46b09b360745fd8eb12317abc74b14d14d0"},
+    {file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_armv7l.manylinux2014_armv7l.manylinux_2_31_armv7l.whl", hash = "sha256:b5dd3a2ef7c7e968dbbac8f5574ebeac4d2b813b247e8cec28174a2ba3627170"},
+    {file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:4710f77598c0092239bc12c1fcc278a444e16c7032d91babf5abbf7166463f7b"},
+    {file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f3e9f75ae842a6c22a195d4a127263dbf87cbab729829e0bd7857fb1672400b2"},
+    {file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5f9c8d55d6802086edd188e3a7d85a77787e50d56ce3eb4757a3205fa4657922"},
+    {file = "aiohttp-3.12.14-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:79b29053ff3ad307880d94562cca80693c62062a098a5776ea8ef5ef4b28d140"},
+    {file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:23e1332fff36bebd3183db0c7a547a1da9d3b4091509f6d818e098855f2f27d3"},
+    {file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_armv7l.whl", hash = "sha256:a564188ce831fd110ea76bcc97085dd6c625b427db3f1dbb14ca4baa1447dcbc"},
+    {file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:a7a1b4302f70bb3ec40ca86de82def532c97a80db49cac6a6700af0de41af5ee"},
+    {file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:1b07ccef62950a2519f9bfc1e5b294de5dd84329f444ca0b329605ea787a3de5"},
+    {file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:938bd3ca6259e7e48b38d84f753d548bd863e0c222ed6ee6ace3fd6752768a84"},
+    {file = "aiohttp-3.12.14-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:8bc784302b6b9f163b54c4e93d7a6f09563bd01ff2b841b29ed3ac126e5040bf"},
+    {file = "aiohttp-3.12.14-cp39-cp39-win32.whl", hash = "sha256:a3416f95961dd7d5393ecff99e3f41dc990fb72eda86c11f2a60308ac6dcd7a0"},
+    {file = "aiohttp-3.12.14-cp39-cp39-win_amd64.whl", hash = "sha256:196858b8820d7f60578f8b47e5669b3195c21d8ab261e39b1d705346458f445f"},
+    {file = "aiohttp-3.12.14.tar.gz", hash = "sha256:6e06e120e34d93100de448fd941522e11dafa78ef1a893c179901b7d66aa29f2"},
 ]
 
 [package.dependencies]
-aiohappyeyeballs = ">=2.3.0"
-aiosignal = ">=1.1.2"
+aiohappyeyeballs = ">=2.5.0"
+aiosignal = ">=1.4.0"
 attrs = ">=17.3.0"
 frozenlist = ">=1.1.1"
 multidict = ">=4.5,<7.0"
-yarl = ">=1.12.0,<2.0"
+propcache = ">=0.2.0"
+yarl = ">=1.17.0,<2.0"
 
 [package.extras]
-speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.2.0) ; sys_platform == \"linux\" or sys_platform == \"darwin\"", "brotlicffi ; platform_python_implementation != \"CPython\""]
+speedups = ["Brotli ; platform_python_implementation == \"CPython\"", "aiodns (>=3.3.0)", "brotlicffi ; platform_python_implementation != \"CPython\""]
 
 [[package]]
 name = "aiopg"
@@ -145,18 +141,19 @@ sa = ["sqlalchemy[postgresql-psycopg2binary] (>=1.3,<1.5)"]
 
 [[package]]
 name = "aiosignal"
-version = "1.3.1"
+version = "1.4.0"
 description = "aiosignal: a list of registered asynchronous callbacks"
 optional = false
-python-versions = ">=3.7"
+python-versions = ">=3.9"
 groups = ["main"]
 files = [
-    {file = "aiosignal-1.3.1-py3-none-any.whl", hash = "sha256:f8376fb07dd1e86a584e4fcdec80b36b7f81aac666ebc724e2c090300dd83b17"},
-    {file = "aiosignal-1.3.1.tar.gz", hash = "sha256:54cd96e15e1649b75d6c87526a6ff0b6c1b0dd3459f43d9ca11d48c339b68cfc"},
+    {file = "aiosignal-1.4.0-py3-none-any.whl", hash = "sha256:053243f8b92b990551949e63930a839ff0cf0b0ebbe0597b0f3fb19e1a0fe82e"},
+    {file = "aiosignal-1.4.0.tar.gz", hash = "sha256:f47eecd9468083c2029cc99945502cb7708b082c232f9aca65da147157b251c7"},
 ]
 
 [package.dependencies]
 frozenlist = ">=1.1.0"
+typing-extensions = {version = ">=4.2", markers = "python_version < \"3.13\""}
 
 [[package]]
 name = "allure-pytest"
@@ -3847,4 +3844,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.11"
-content-hash = "bd93313f110110aa53b24a3ed47ba2d7f60e2c658a79cdff7320fed1bb1b57b5"
+content-hash = "6a1e8ba06b8194bf28d87fd5e184e2ddc2b4a19dffcbe3953b26da3d55c9212f"
diff --git a/pyproject.toml b/pyproject.toml
index e7e314d144..e992e81fe7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,7 +32,7 @@ psutil = "^5.9.4"
 types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
-aiohttp = "3.10.11"
+aiohttp = "3.12.14"
 pytest-rerunfailures = "^15.0"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"

From 921a4f20099c5f56c6c1d79692f0710bf563f420 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Tue, 15 Jul 2025 12:16:29 +0100
Subject: [PATCH 121/163] CI(run-python-test-set): don't collect code coverage
 (#12601)

## Problem

We don't use code coverage produced by `regress-tests`
(neondatabase/neon#6798), so there's no need to collect it. Potentially,
disabling it should reduce the load on disks and improve the stability
of debug builds.

## Summary of changes
- Disable code coverage collection for regression tests
---
 .github/actions/run-python-test-set/action.yml | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 6f2b48444a..b3e68ab606 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -176,7 +176,11 @@ runs:
         fi
 
         if [[ $BUILD_TYPE == "debug" && $RUNNER_ARCH == 'X64' ]]; then
-          cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+          # We don't use code coverage for regression tests (the step is disabled),
+          # so there's no need to collect it.
+          # Ref https://github.com/neondatabase/neon/issues/4540
+          # cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
+          cov_prefix=()
         else
           cov_prefix=()
         fi

From 62af2a14e2f614b2a7f16a15c80030700e491f93 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 15 Jul 2025 16:06:49 +0300
Subject: [PATCH 122/163] Improve comments a little

---
 pgxn/neon/communicator/src/neon_request.rs | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs
index 9f5d134194..d68ec24ed9 100644
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -6,6 +6,7 @@ pub const MAX_GETPAGEV_PAGES: usize = 32;
 
 use pageserver_page_api as page_api;
 
+/// Request from a Postgres backend to the communicator process
 #[allow(clippy::large_enum_variant)]
 #[repr(C)]
 #[derive(Copy, Clone, Debug)]
@@ -60,6 +61,10 @@ pub enum NeonIOResult {
 }
 
 impl NeonIORequest {
+    /// All requests include a unique request ID, which can be used to trace the execution
+    /// of a request all the way to the pageservers. The request ID needs to be unique
+    /// within the lifetime of the Postgres instance (but not across servers or across
+    /// restarts of the same server).
     pub fn request_id(&self) -> u64 {
         use NeonIORequest::*;
         match self {
@@ -80,6 +85,9 @@ impl NeonIORequest {
     }
 }
 
+/// Special quick result to a CGetPageVRequest request, indicating that the
+/// the requested pages are present in the local file cache. The backend can
+/// read the blocks directly from the given LFC blocks.
 #[repr(C)]
 #[derive(Copy, Clone, Debug)]
 pub struct CCachedGetPageVResult {
@@ -96,7 +104,7 @@ pub struct CCachedGetPageVResult {
 #[repr(C)]
 #[derive(Copy, Clone, Debug)]
 pub struct ShmemBuf {
-    // These fields define where the result is written. Must point into a buffer in shared memory!
+    // Pointer to where the result is written or where to read from. Must point into a buffer in shared memory!
     pub ptr: *mut u8,
 }
 
@@ -204,7 +212,7 @@ pub struct CWritePageRequest {
     pub block_number: u32,
     pub lsn: CLsn,
 
-    // These fields define where the result is written. Must point into a buffer in shared memory!
+    // `src` defines the new page contents. Must point into a buffer in shared memory!
     pub src: ShmemBuf,
 }
 
@@ -219,7 +227,7 @@ pub struct CRelExtendRequest {
     pub block_number: u32,
     pub lsn: CLsn,
 
-    // These fields define page contents. Must point into a buffer in shared memory!
+    // `src` defines the new page contents. Must point into a buffer in shared memory!
     pub src: ShmemBuf,
 }
 

From 5c9c3b3317239d6d55f45fccdde801002c0dd21b Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 15 Jul 2025 17:36:28 +0300
Subject: [PATCH 123/163] Misc cosmetic cleanups (#12598)

- Remove a few obsolete "allowed error messages" from tests. The
pageserver doesn't emit those messages anymore.

- Remove misplaced and outdated docstring comment from
`test_tenants.py`. A docstring is supposed to be the first thing in a
function, but we had added some code before it. And it was outdated, as
we haven't supported running without safekeepers for a long time.

- Fix misc typos in comments

- Remove obsolete comment about backwards compatibility with safekeepers
without `TIMELINE_STATUS` API. All safekeepers have it by now.
---
 compute_tools/src/compute.rs                | 4 +---
 control_plane/src/endpoint.rs               | 2 +-
 docs/pageserver-services.md                 | 2 +-
 pageserver/src/deletion_queue/validator.rs  | 2 +-
 pageserver/src/utilization.rs               | 2 +-
 pageserver/src/walingest.rs                 | 2 +-
 proxy/src/proxy/connect_compute.rs          | 2 +-
 test_runner/fixtures/neon_fixtures.py       | 2 +-
 test_runner/regress/test_broken_timeline.py | 3 ---
 test_runner/regress/test_compaction.py      | 2 +-
 test_runner/regress/test_tenants.py         | 1 -
 11 files changed, 9 insertions(+), 15 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 8f42cf699b..6608eb5154 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1286,9 +1286,7 @@ impl ComputeNode {
 
         // In case of error, log and fail the check, but don't crash.
         // We're playing it safe because these errors could be transient
-        // and we don't yet retry. Also being careful here allows us to
-        // be backwards compatible with safekeepers that don't have the
-        // TIMELINE_STATUS API yet.
+        // and we don't yet retry.
         if responses.len() < quorum {
             error!(
                 "failed sync safekeepers check {:?} {:?} {:?}",
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 91a62b0ca4..792da14a32 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -464,7 +464,7 @@ impl Endpoint {
         conf.append("max_connections", "100");
         conf.append("wal_level", "logical");
         // wal_sender_timeout is the maximum time to wait for WAL replication.
-        // It also defines how often the walreciever will send a feedback message to the wal sender.
+        // It also defines how often the walreceiver will send a feedback message to the wal sender.
         conf.append("wal_sender_timeout", "5s");
         conf.append("listen_addresses", &self.pg_address.ip().to_string());
         conf.append("port", &self.pg_address.port().to_string());
diff --git a/docs/pageserver-services.md b/docs/pageserver-services.md
index 11d984eb08..3c430c6236 100644
--- a/docs/pageserver-services.md
+++ b/docs/pageserver-services.md
@@ -75,7 +75,7 @@ CLI examples:
 * AWS S3  : `env AWS_ACCESS_KEY_ID='SOMEKEYAAAAASADSAH*#' AWS_SECRET_ACCESS_KEY='SOMEsEcReTsd292v' ${PAGESERVER_BIN} -c "remote_storage={bucket_name='some-sample-bucket',bucket_region='eu-north-1', prefix_in_bucket='/test_prefix/'}"`
 
 For Amazon AWS S3, a key id and secret access key could be located in `~/.aws/credentials` if awscli was ever configured to work with the desired bucket, on the AWS Settings page for a certain user. Also note, that the bucket names does not contain any protocols when used on AWS.
-For local S3 installations, refer to the their documentation for name format and credentials.
+For local S3 installations, refer to their documentation for name format and credentials.
 
 Similar to other pageserver settings, toml config file can be used to configure either of the storages as backup targets.
 Required sections are:
diff --git a/pageserver/src/deletion_queue/validator.rs b/pageserver/src/deletion_queue/validator.rs
index 363b1427f5..c9bfbd8adc 100644
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -1,5 +1,5 @@
 //! The validator is responsible for validating DeletionLists for execution,
-//! based on whethe the generation in the DeletionList is still the latest
+//! based on whether the generation in the DeletionList is still the latest
 //! generation for a tenant.
 //!
 //! The purpose of validation is to ensure split-brain safety in the cluster
diff --git a/pageserver/src/utilization.rs b/pageserver/src/utilization.rs
index ccfad7a391..0dafa5c4bb 100644
--- a/pageserver/src/utilization.rs
+++ b/pageserver/src/utilization.rs
@@ -1,6 +1,6 @@
 //! An utilization metric which is used to decide on which pageserver to put next tenant.
 //!
-//! The metric is exposed via `GET /v1/utilization`. Refer and maintain it's openapi spec as the
+//! The metric is exposed via `GET /v1/utilization`. Refer and maintain its openapi spec as the
 //! truth.
 
 use std::path::Path;
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index f852051178..dfd0071ce3 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1069,7 +1069,7 @@ impl WalIngest {
         // NB: In PostgreSQL, the next-multi-xid stored in the control file is allowed to
         // go to 0, and it's fixed up by skipping to FirstMultiXactId in functions that
         // read it, like GetNewMultiXactId(). This is different from how nextXid is
-        // incremented! nextXid skips over < FirstNormalTransactionId when the the value
+        // incremented! nextXid skips over < FirstNormalTransactionId when the value
         // is stored, so it's never 0 in a checkpoint.
         //
         // I don't know why it's done that way, it seems less error-prone to skip over 0
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
index 9f642f52ab..ce9774e3eb 100644
--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -110,7 +110,7 @@ where
     debug!(error = ?err, COULD_NOT_CONNECT);
 
     let node_info = if !node_info.cached() || !err.should_retry_wake_compute() {
-        // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry.
+        // If we just received this from cplane and not from the cache, we shouldn't retry.
         // Do not need to retrieve a new node_info, just return the old one.
         if !should_retry(&err, num_retries, compute.retry) {
             Metrics::get().proxy.retries_metric.observe(
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b9fff05c6c..ea1b045b78 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -728,7 +728,7 @@ class NeonEnvBuilder:
         # NB: neon_local rewrites postgresql.conf on each start based on neon_local config. No need to patch it.
         # However, in this new NeonEnv, the pageservers and safekeepers listen on different ports, and the storage
         # controller will currently reject re-attach requests from them because the NodeMetadata isn't identical.
-        # So, from_repo_dir patches up the the storcon database.
+        # So, from_repo_dir patches up the storcon database.
         patch_script_path = self.repo_dir / "storage_controller_db.startup.sql"
         assert not patch_script_path.exists()
         patch_script = ""
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 1209b3a818..0d92bf8406 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -24,10 +24,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
         [
             ".*get_values_reconstruct_data for layer .*",
             ".*could not find data for key.*",
-            ".*is not active. Current state: Broken.*",
             ".*will not become active. Current state: Broken.*",
-            ".*failed to load metadata.*",
-            ".*load failed.*load local timeline.*",
             ".*: layer load failed, assuming permanent failure:.*",
             ".*failed to get checkpoint bytes.*",
             ".*failed to get control bytes.*",
diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py
index 963a19d640..76485c8321 100644
--- a/test_runner/regress/test_compaction.py
+++ b/test_runner/regress/test_compaction.py
@@ -687,7 +687,7 @@ def test_sharding_compaction(
     for _i in range(0, 10):
         # Each of these does some writes then a checkpoint: because we set image_creation_threshold to 1,
         # these should result in image layers each time we write some data into a shard, and also shards
-        # recieving less data hitting their "empty image layer" path (wherre they should skip writing the layer,
+        # receiving less data hitting their "empty image layer" path (where they should skip writing the layer,
         # rather than asserting)
         workload.churn_rows(64)
 
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index c54dd8b38d..7f32f34d36 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -76,7 +76,6 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
 
     env = neon_env_builder.init_start()
-    """Tests tenants with and without wal acceptors"""
     tenant_1, _ = env.create_tenant()
     tenant_2, _ = env.create_tenant()
 

From 5c934efb29c08847f49b8db474a68da7f1d1cbe4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Tue, 15 Jul 2025 19:28:08 +0200
Subject: [PATCH 124/163] Don't depend on the postgres_ffi just for one type
 (#12610)

We don't want to depend on postgres_ffi in an API crate. If there is no
such dependency, we can compile stuff like `storcon_cli` without needing
a full working postgres build. Fixes regression of #12548 (before we
could compile it).
---
 Cargo.lock                            | 3 ++-
 libs/postgres_ffi/build.rs            | 1 -
 libs/postgres_ffi/src/lib.rs          | 3 +--
 libs/postgres_ffi/src/walrecord.rs    | 6 ++++--
 libs/postgres_ffi/src/xlog_utils.rs   | 3 ++-
 libs/postgres_ffi_types/src/lib.rs    | 1 +
 libs/safekeeper_api/Cargo.toml        | 2 +-
 libs/safekeeper_api/src/models.rs     | 2 +-
 libs/wal_decoder/src/models/record.rs | 3 ++-
 pageserver/src/pgdatadir_mapping.rs   | 4 ++--
 pageserver/src/walingest.rs           | 5 +++--
 safekeeper/Cargo.toml                 | 1 +
 safekeeper/src/send_wal.rs            | 3 ++-
 13 files changed, 22 insertions(+), 15 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 2f36790d30..3474211ac6 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6211,6 +6211,7 @@ dependencies = [
  "postgres-protocol",
  "postgres_backend",
  "postgres_ffi",
+ "postgres_ffi_types",
  "postgres_versioninfo",
  "pprof",
  "pq_proto",
@@ -6255,7 +6256,7 @@ dependencies = [
  "anyhow",
  "const_format",
  "pageserver_api",
- "postgres_ffi",
+ "postgres_ffi_types",
  "postgres_versioninfo",
  "pq_proto",
  "serde",
diff --git a/libs/postgres_ffi/build.rs b/libs/postgres_ffi/build.rs
index cdebd43f6f..190d9a78c4 100644
--- a/libs/postgres_ffi/build.rs
+++ b/libs/postgres_ffi/build.rs
@@ -110,7 +110,6 @@ fn main() -> anyhow::Result<()> {
             .allowlist_type("XLogRecPtr")
             .allowlist_type("XLogSegNo")
             .allowlist_type("TimeLineID")
-            .allowlist_type("TimestampTz")
             .allowlist_type("MultiXactId")
             .allowlist_type("MultiXactOffset")
             .allowlist_type("MultiXactStatus")
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index 9297ac46c9..a88b520a41 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -227,8 +227,7 @@ pub mod walrecord;
 // Export some widely used datatypes that are unlikely to change across Postgres versions
 pub use v14::bindings::{
     BlockNumber, CheckPoint, ControlFileData, MultiXactId, OffsetNumber, Oid, PageHeaderData,
-    RepOriginId, TimeLineID, TimestampTz, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32,
-    uint64,
+    RepOriginId, TimeLineID, TransactionId, XLogRecPtr, XLogRecord, XLogSegNo, uint32, uint64,
 };
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs
index d593123dc0..7ed07b0e77 100644
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -4,13 +4,14 @@
 //! TODO: Generate separate types for each supported PG version
 
 use bytes::{Buf, Bytes};
+use postgres_ffi_types::TimestampTz;
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;
 
 use crate::{
     BLCKSZ, BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, PgMajorVersion,
-    RepOriginId, TimestampTz, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
+    RepOriginId, TransactionId, XLOG_SIZE_OF_XLOG_RECORD, XLogRecord, pg_constants,
 };
 
 #[repr(C)]
@@ -863,7 +864,8 @@ pub mod v17 {
         XlHeapDelete, XlHeapInsert, XlHeapLock, XlHeapMultiInsert, XlHeapUpdate, XlParameterChange,
         rm_neon,
     };
-    pub use crate::{TimeLineID, TimestampTz};
+    pub use crate::TimeLineID;
+    pub use postgres_ffi_types::TimestampTz;
 
     #[repr(C)]
     #[derive(Debug)]
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index f7b6296053..134baf5ff7 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -9,10 +9,11 @@
 
 use super::super::waldecoder::WalStreamDecoder;
 use super::bindings::{
-    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz,
+    CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID,
     XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC,
     MY_PGVERSION
 };
+use postgres_ffi_types::TimestampTz;
 use super::wal_generator::LogicalMessageGenerator;
 use crate::pg_constants;
 use crate::PG_TLI;
diff --git a/libs/postgres_ffi_types/src/lib.rs b/libs/postgres_ffi_types/src/lib.rs
index 84ef499b9f..86e8259e8a 100644
--- a/libs/postgres_ffi_types/src/lib.rs
+++ b/libs/postgres_ffi_types/src/lib.rs
@@ -11,3 +11,4 @@ pub mod forknum;
 
 pub type Oid = u32;
 pub type RepOriginId = u16;
+pub type TimestampTz = i64;
diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 928e583b0b..1d09d6fc6d 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -9,7 +9,7 @@ anyhow.workspace = true
 const_format.workspace = true
 serde.workspace = true
 serde_json.workspace = true
-postgres_ffi.workspace = true
+postgres_ffi_types.workspace = true
 postgres_versioninfo.workspace = true
 pq_proto.workspace = true
 tokio.workspace = true
diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index 59e112654b..a300c8464f 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -3,7 +3,7 @@
 use std::net::SocketAddr;
 
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::TimestampTz;
+use postgres_ffi_types::TimestampTz;
 use postgres_versioninfo::PgVersionId;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
diff --git a/libs/wal_decoder/src/models/record.rs b/libs/wal_decoder/src/models/record.rs
index 51659ed904..a37e1473e0 100644
--- a/libs/wal_decoder/src/models/record.rs
+++ b/libs/wal_decoder/src/models/record.rs
@@ -2,7 +2,8 @@
 
 use bytes::Bytes;
 use postgres_ffi::walrecord::{MultiXactMember, describe_postgres_wal_record};
-use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId};
+use postgres_ffi::{MultiXactId, MultiXactOffset, TransactionId};
+use postgres_ffi_types::TimestampTz;
 use serde::{Deserialize, Serialize};
 use utils::bin_ser::DeserializeError;
 
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 8532a6938f..08828ec4eb 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -25,9 +25,9 @@ use pageserver_api::keyspace::{KeySpaceRandomAccum, SparseKeySpace};
 use pageserver_api::models::RelSizeMigration;
 use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
-use postgres_ffi::{BLCKSZ, PgMajorVersion, TimestampTz, TransactionId};
+use postgres_ffi::{BLCKSZ, PgMajorVersion, TransactionId};
 use postgres_ffi_types::forknum::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
-use postgres_ffi_types::{Oid, RepOriginId};
+use postgres_ffi_types::{Oid, RepOriginId, TimestampTz};
 use serde::{Deserialize, Serialize};
 use strum::IntoEnumIterator;
 use tokio_util::sync::CancellationToken;
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index dfd0071ce3..3acf98b020 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -32,9 +32,10 @@ use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::walrecord::*;
 use postgres_ffi::{
-    PgMajorVersion, TimestampTz, TransactionId, dispatch_pgversion, enum_pgversion,
-    enum_pgversion_dispatch, fsm_logical_to_physical, pg_constants,
+    PgMajorVersion, TransactionId, dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch,
+    fsm_logical_to_physical, pg_constants,
 };
+use postgres_ffi_types::TimestampTz;
 use postgres_ffi_types::forknum::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM};
 use tracing::*;
 use utils::bin_ser::{DeserializeError, SerializeError};
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 6955028c73..539e931983 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -58,6 +58,7 @@ metrics.workspace = true
 pem.workspace = true
 postgres_backend.workspace = true
 postgres_ffi.workspace = true
+postgres_ffi_types.workspace = true
 postgres_versioninfo.workspace = true
 pq_proto.workspace = true
 remote_storage.workspace = true
diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs
index 177e759db5..5891fa88a4 100644
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -12,7 +12,8 @@ use futures::FutureExt;
 use itertools::Itertools;
 use parking_lot::Mutex;
 use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend, PostgresBackendReader, QueryError};
-use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, TimestampTz, get_current_timestamp};
+use postgres_ffi::{MAX_SEND_SIZE, PgMajorVersion, get_current_timestamp};
+use postgres_ffi_types::TimestampTz;
 use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody};
 use safekeeper_api::Term;
 use safekeeper_api::models::{

From 809633903d15f10942117978398ef6f2ecb82a55 Mon Sep 17 00:00:00 2001
From: quantumish <freifeld.david@gmail.com>
Date: Tue, 15 Jul 2025 10:40:40 -0700
Subject: [PATCH 125/163] Move `ShmemHandle` into separate module, tweak
 documentation (#12595)

Initial PR for the hashmap behind the updated LFC implementation. This
refactors `neon-shmem` so that the actual shared memory utilities are in
a separate module within the crate. Beyond that, it slightly changes
some of the docstrings so that they play nicer with `cargo doc`.
---
 libs/neon-shmem/src/lib.rs   | 419 +----------------------------------
 libs/neon-shmem/src/shmem.rs | 409 ++++++++++++++++++++++++++++++++++
 2 files changed, 410 insertions(+), 418 deletions(-)
 create mode 100644 libs/neon-shmem/src/shmem.rs

diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs
index c689959b68..50d3fbb3cf 100644
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1,418 +1 @@
-//! Shared memory utilities for neon communicator
-
-use std::num::NonZeroUsize;
-use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
-use std::ptr::NonNull;
-use std::sync::atomic::{AtomicUsize, Ordering};
-
-use nix::errno::Errno;
-use nix::sys::mman::MapFlags;
-use nix::sys::mman::ProtFlags;
-use nix::sys::mman::mmap as nix_mmap;
-use nix::sys::mman::munmap as nix_munmap;
-use nix::unistd::ftruncate as nix_ftruncate;
-
-/// ShmemHandle represents a shared memory area that can be shared by processes over fork().
-/// Unlike shared memory allocated by Postgres, this area is resizable, up to 'max_size' that's
-/// specified at creation.
-///
-/// The area is backed by an anonymous file created with memfd_create(). The full address space for
-/// 'max_size' is reserved up-front with mmap(), but whenever you call [`ShmemHandle::set_size`],
-/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
-/// will cause the file to be expanded, but we might use mprotect() etc. to enforce that in the
-/// future.
-pub struct ShmemHandle {
-    /// memfd file descriptor
-    fd: OwnedFd,
-
-    max_size: usize,
-
-    // Pointer to the beginning of the shared memory area. The header is stored there.
-    shared_ptr: NonNull<SharedStruct>,
-
-    // Pointer to the beginning of the user data
-    pub data_ptr: NonNull<u8>,
-}
-
-/// This is stored at the beginning in the shared memory area.
-struct SharedStruct {
-    max_size: usize,
-
-    /// Current size of the backing file. The high-order bit is used for the RESIZE_IN_PROGRESS flag
-    current_size: AtomicUsize,
-}
-
-const RESIZE_IN_PROGRESS: usize = 1 << 63;
-
-const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
-
-/// Error type returned by the ShmemHandle functions.
-#[derive(thiserror::Error, Debug)]
-#[error("{msg}: {errno}")]
-pub struct Error {
-    pub msg: String,
-    pub errno: Errno,
-}
-
-impl Error {
-    fn new(msg: &str, errno: Errno) -> Error {
-        Error {
-            msg: msg.to_string(),
-            errno,
-        }
-    }
-}
-
-impl ShmemHandle {
-    /// Create a new shared memory area. To communicate between processes, the processes need to be
-    /// fork()'d after calling this, so that the ShmemHandle is inherited by all processes.
-    ///
-    /// If the ShmemHandle is dropped, the memory is unmapped from the current process. Other
-    /// processes can continue using it, however.
-    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<ShmemHandle, Error> {
-        // create the backing anonymous file.
-        let fd = create_backing_file(name)?;
-
-        Self::new_with_fd(fd, initial_size, max_size)
-    }
-
-    fn new_with_fd(
-        fd: OwnedFd,
-        initial_size: usize,
-        max_size: usize,
-    ) -> Result<ShmemHandle, Error> {
-        // We reserve the high-order bit for the RESIZE_IN_PROGRESS flag, and the actual size
-        // is a little larger than this because of the SharedStruct header. Make the upper limit
-        // somewhat smaller than that, because with anything close to that, you'll run out of
-        // memory anyway.
-        if max_size >= 1 << 48 {
-            panic!("max size {max_size} too large");
-        }
-        if initial_size > max_size {
-            panic!("initial size {initial_size} larger than max size {max_size}");
-        }
-
-        // The actual initial / max size is the one given by the caller, plus the size of
-        // 'SharedStruct'.
-        let initial_size = HEADER_SIZE + initial_size;
-        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
-
-        // Reserve address space for it with mmap
-        //
-        // TODO: Use MAP_HUGETLB if possible
-        let start_ptr = unsafe {
-            nix_mmap(
-                None,
-                max_size,
-                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
-                MapFlags::MAP_SHARED,
-                &fd,
-                0,
-            )
-        }
-        .map_err(|e| Error::new("mmap failed: {e}", e))?;
-
-        // Reserve space for the initial size
-        enlarge_file(fd.as_fd(), initial_size as u64)?;
-
-        // Initialize the header
-        let shared: NonNull<SharedStruct> = start_ptr.cast();
-        unsafe {
-            shared.write(SharedStruct {
-                max_size: max_size.into(),
-                current_size: AtomicUsize::new(initial_size),
-            })
-        };
-
-        // The user data begins after the header
-        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
-
-        Ok(ShmemHandle {
-            fd,
-            max_size: max_size.into(),
-            shared_ptr: shared,
-            data_ptr,
-        })
-    }
-
-    // return reference to the header
-    fn shared(&self) -> &SharedStruct {
-        unsafe { self.shared_ptr.as_ref() }
-    }
-
-    /// Resize the shared memory area. 'new_size' must not be larger than the 'max_size' specified
-    /// when creating the area.
-    ///
-    /// This may only be called from one process/thread concurrently. We detect that case
-    /// and return an Error.
-    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
-        let new_size = new_size + HEADER_SIZE;
-        let shared = self.shared();
-
-        if new_size > self.max_size {
-            panic!(
-                "new size ({} is greater than max size ({})",
-                new_size, self.max_size
-            );
-        }
-        assert_eq!(self.max_size, shared.max_size);
-
-        // Lock the area by setting the bit in 'current_size'
-        //
-        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
-        // and the posix_fallocate/ftruncate call is surely a synchronization point anyway. But
-        // since this is not performance-critical, better safe than sorry .
-        let mut old_size = shared.current_size.load(Ordering::Acquire);
-        loop {
-            if (old_size & RESIZE_IN_PROGRESS) != 0 {
-                return Err(Error::new(
-                    "concurrent resize detected",
-                    Errno::UnknownErrno,
-                ));
-            }
-            match shared.current_size.compare_exchange(
-                old_size,
-                new_size,
-                Ordering::Acquire,
-                Ordering::Relaxed,
-            ) {
-                Ok(_) => break,
-                Err(x) => old_size = x,
-            }
-        }
-
-        // Ok, we got the lock.
-        //
-        // NB: If anything goes wrong, we *must* clear the bit!
-        let result = {
-            use std::cmp::Ordering::{Equal, Greater, Less};
-            match new_size.cmp(&old_size) {
-                Less => nix_ftruncate(&self.fd, new_size as i64).map_err(|e| {
-                    Error::new("could not shrink shmem segment, ftruncate failed: {e}", e)
-                }),
-                Equal => Ok(()),
-                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
-            }
-        };
-
-        // Unlock
-        shared.current_size.store(
-            if result.is_ok() { new_size } else { old_size },
-            Ordering::Release,
-        );
-
-        result
-    }
-
-    /// Returns the current user-visible size of the shared memory segment.
-    ///
-    /// NOTE: a concurrent set_size() call can change the size at any time. It is the caller's
-    /// responsibility not to access the area beyond the current size.
-    pub fn current_size(&self) -> usize {
-        let total_current_size =
-            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
-        total_current_size - HEADER_SIZE
-    }
-}
-
-impl Drop for ShmemHandle {
-    fn drop(&mut self) {
-        // SAFETY: The pointer was obtained from mmap() with the given size.
-        // We unmap the entire region.
-        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
-        // The fd is dropped automatically by OwnedFd.
-    }
-}
-
-/// Create a "backing file" for the shared memory area. On Linux, use memfd_create(), to create an
-/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
-/// development and testing, but in production we want the file to stay in memory.
-///
-/// disable 'unused_variables' warnings, because in the macos path, 'name' is unused.
-#[allow(unused_variables)]
-fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
-            .map_err(|e| Error::new("memfd_create failed: {e}", e))
-    }
-    #[cfg(target_os = "macos")]
-    {
-        let file = tempfile::tempfile().map_err(|e| {
-            Error::new(
-                "could not create temporary file to back shmem area: {e}",
-                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
-            )
-        })?;
-        Ok(OwnedFd::from(file))
-    }
-}
-
-fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
-    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
-    // we don't get a segfault later when trying to actually use it.
-    #[cfg(not(target_os = "macos"))]
-    {
-        nix::fcntl::posix_fallocate(fd, 0, size as i64).map_err(|e| {
-            Error::new(
-                "could not grow shmem segment, posix_fallocate failed: {e}",
-                e,
-            )
-        })
-    }
-    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
-    #[cfg(target_os = "macos")]
-    {
-        nix::unistd::ftruncate(fd, size as i64)
-            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed: {e}", e))
-    }
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    use nix::unistd::ForkResult;
-    use std::ops::Range;
-
-    /// check that all bytes in given range have the expected value.
-    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
-        for i in range {
-            let b = unsafe { *(ptr.add(i)) };
-            assert_eq!(expected, b, "unexpected byte at offset {i}");
-        }
-    }
-
-    /// Write 'b' to all bytes in the given range
-    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
-        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
-    }
-
-    // simple single-process test of growing and shrinking
-    #[test]
-    fn test_shmem_resize() -> Result<(), Error> {
-        let max_size = 1024 * 1024;
-        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
-
-        assert_eq!(init_struct.current_size(), 0);
-
-        // Initial grow
-        let size1 = 10000;
-        init_struct.set_size(size1).unwrap();
-        assert_eq!(init_struct.current_size(), size1);
-
-        // Write some data
-        let data_ptr = init_struct.data_ptr.as_ptr();
-        write_range(data_ptr, 0xAA, 0..size1);
-        assert_range(data_ptr, 0xAA, 0..size1);
-
-        // Shrink
-        let size2 = 5000;
-        init_struct.set_size(size2).unwrap();
-        assert_eq!(init_struct.current_size(), size2);
-
-        // Grow again
-        let size3 = 20000;
-        init_struct.set_size(size3).unwrap();
-        assert_eq!(init_struct.current_size(), size3);
-
-        // Try to read it. The area that was shrunk and grown again should read as all zeros now
-        assert_range(data_ptr, 0xAA, 0..5000);
-        assert_range(data_ptr, 0, 5000..size1);
-
-        // Try to grow beyond max_size
-        //let size4 = max_size + 1;
-        //assert!(init_struct.set_size(size4).is_err());
-
-        // Dropping init_struct should unmap the memory
-        drop(init_struct);
-
-        Ok(())
-    }
-
-    /// This is used in tests to coordinate between test processes. It's like std::sync::Barrier,
-    /// but is stored in the shared memory area and works across processes. It's implemented by
-    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
-    struct SimpleBarrier {
-        num_procs: usize,
-        count: AtomicUsize,
-    }
-
-    impl SimpleBarrier {
-        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
-            unsafe {
-                *ptr = SimpleBarrier {
-                    num_procs,
-                    count: AtomicUsize::new(0),
-                }
-            }
-        }
-
-        pub fn wait(&self) {
-            let old = self.count.fetch_add(1, Ordering::Relaxed);
-
-            let generation = old / self.num_procs;
-
-            let mut current = old + 1;
-            while current < (generation + 1) * self.num_procs {
-                std::thread::sleep(std::time::Duration::from_millis(10));
-                current = self.count.load(Ordering::Relaxed);
-            }
-        }
-    }
-
-    #[test]
-    fn test_multi_process() {
-        // Initialize
-        let max_size = 1_000_000_000_000;
-        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
-        let ptr = init_struct.data_ptr.as_ptr();
-
-        // Store the SimpleBarrier in the first 1k of the area.
-        init_struct.set_size(10000).unwrap();
-        let barrier_ptr: *mut SimpleBarrier = unsafe {
-            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
-                .cast()
-        };
-        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
-        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
-
-        // Fork another test process. The code after this runs in both processes concurrently.
-        let fork_result = unsafe { nix::unistd::fork().unwrap() };
-
-        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, 1000..2000);
-        } else {
-            write_range(ptr, 0xBB, 2000..3000);
-        }
-        barrier.wait();
-        // Verify the contents. (in both processes)
-        assert_range(ptr, 0xAA, 1000..2000);
-        assert_range(ptr, 0xBB, 2000..3000);
-
-        // Grow, from the child this time
-        let size = 10_000_000;
-        if !fork_result.is_parent() {
-            init_struct.set_size(size).unwrap();
-        }
-        barrier.wait();
-
-        // make some writes at the end
-        if fork_result.is_parent() {
-            write_range(ptr, 0xAA, (size - 10)..size);
-        } else {
-            write_range(ptr, 0xBB, (size - 20)..(size - 10));
-        }
-        barrier.wait();
-
-        // Verify the contents. (This runs in both processes)
-        assert_range(ptr, 0, (size - 1000)..(size - 20));
-        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
-        assert_range(ptr, 0xAA, (size - 10)..size);
-
-        if let ForkResult::Parent { child } = fork_result {
-            nix::sys::wait::waitpid(child, None).unwrap();
-        }
-    }
-}
+pub mod shmem;
diff --git a/libs/neon-shmem/src/shmem.rs b/libs/neon-shmem/src/shmem.rs
new file mode 100644
index 0000000000..f19f402859
--- /dev/null
+++ b/libs/neon-shmem/src/shmem.rs
@@ -0,0 +1,409 @@
+//! Dynamically resizable contiguous chunk of shared memory
+
+use std::num::NonZeroUsize;
+use std::os::fd::{AsFd, BorrowedFd, OwnedFd};
+use std::ptr::NonNull;
+use std::sync::atomic::{AtomicUsize, Ordering};
+
+use nix::errno::Errno;
+use nix::sys::mman::MapFlags;
+use nix::sys::mman::ProtFlags;
+use nix::sys::mman::mmap as nix_mmap;
+use nix::sys::mman::munmap as nix_munmap;
+use nix::unistd::ftruncate as nix_ftruncate;
+
+/// `ShmemHandle` represents a shared memory area that can be shared by processes over `fork()`.
+/// Unlike shared memory allocated by Postgres, this area is resizable, up to `max_size` that's
+/// specified at creation.
+///
+/// The area is backed by an anonymous file created with `memfd_create()`. The full address space for
+/// `max_size` is reserved up-front with `mmap()`, but whenever you call [`ShmemHandle::set_size`],
+/// the underlying file is resized. Do not access the area beyond the current size. Currently, that
+/// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
+/// future.
+pub struct ShmemHandle {
+    /// memfd file descriptor
+    fd: OwnedFd,
+
+    max_size: usize,
+
+    // Pointer to the beginning of the shared memory area. The header is stored there.
+    shared_ptr: NonNull<SharedStruct>,
+
+    // Pointer to the beginning of the user data
+    pub data_ptr: NonNull<u8>,
+}
+
+/// This is stored at the beginning in the shared memory area.
+struct SharedStruct {
+    max_size: usize,
+
+    /// Current size of the backing file. The high-order bit is used for the [`RESIZE_IN_PROGRESS`] flag.
+    current_size: AtomicUsize,
+}
+
+const RESIZE_IN_PROGRESS: usize = 1 << 63;
+
+const HEADER_SIZE: usize = std::mem::size_of::<SharedStruct>();
+
+/// Error type returned by the [`ShmemHandle`] functions.
+#[derive(thiserror::Error, Debug)]
+#[error("{msg}: {errno}")]
+pub struct Error {
+    pub msg: String,
+    pub errno: Errno,
+}
+
+impl Error {
+    fn new(msg: &str, errno: Errno) -> Self {
+        Self {
+            msg: msg.to_string(),
+            errno,
+        }
+    }
+}
+
+impl ShmemHandle {
+    /// Create a new shared memory area. To communicate between processes, the processes need to be
+    /// `fork()`'d after calling this, so that the `ShmemHandle` is inherited by all processes.
+    ///
+    /// If the `ShmemHandle` is dropped, the memory is unmapped from the current process. Other
+    /// processes can continue using it, however.
+    pub fn new(name: &str, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // create the backing anonymous file.
+        let fd = create_backing_file(name)?;
+
+        Self::new_with_fd(fd, initial_size, max_size)
+    }
+
+    fn new_with_fd(fd: OwnedFd, initial_size: usize, max_size: usize) -> Result<Self, Error> {
+        // We reserve the high-order bit for the `RESIZE_IN_PROGRESS` flag, and the actual size
+        // is a little larger than this because of the SharedStruct header. Make the upper limit
+        // somewhat smaller than that, because with anything close to that, you'll run out of
+        // memory anyway.
+        assert!(max_size < 1 << 48, "max size {max_size} too large");
+
+        assert!(
+            initial_size <= max_size,
+            "initial size {initial_size} larger than max size {max_size}"
+        );
+
+        // The actual initial / max size is the one given by the caller, plus the size of
+        // 'SharedStruct'.
+        let initial_size = HEADER_SIZE + initial_size;
+        let max_size = NonZeroUsize::new(HEADER_SIZE + max_size).unwrap();
+
+        // Reserve address space for it with mmap
+        //
+        // TODO: Use MAP_HUGETLB if possible
+        let start_ptr = unsafe {
+            nix_mmap(
+                None,
+                max_size,
+                ProtFlags::PROT_READ | ProtFlags::PROT_WRITE,
+                MapFlags::MAP_SHARED,
+                &fd,
+                0,
+            )
+        }
+        .map_err(|e| Error::new("mmap failed", e))?;
+
+        // Reserve space for the initial size
+        enlarge_file(fd.as_fd(), initial_size as u64)?;
+
+        // Initialize the header
+        let shared: NonNull<SharedStruct> = start_ptr.cast();
+        unsafe {
+            shared.write(SharedStruct {
+                max_size: max_size.into(),
+                current_size: AtomicUsize::new(initial_size),
+            });
+        }
+
+        // The user data begins after the header
+        let data_ptr = unsafe { start_ptr.cast().add(HEADER_SIZE) };
+
+        Ok(Self {
+            fd,
+            max_size: max_size.into(),
+            shared_ptr: shared,
+            data_ptr,
+        })
+    }
+
+    // return reference to the header
+    fn shared(&self) -> &SharedStruct {
+        unsafe { self.shared_ptr.as_ref() }
+    }
+
+    /// Resize the shared memory area. `new_size` must not be larger than the `max_size` specified
+    /// when creating the area.
+    ///
+    /// This may only be called from one process/thread concurrently. We detect that case
+    /// and return an [`shmem::Error`](Error).
+    pub fn set_size(&self, new_size: usize) -> Result<(), Error> {
+        let new_size = new_size + HEADER_SIZE;
+        let shared = self.shared();
+
+        assert!(
+            new_size <= self.max_size,
+            "new size ({new_size}) is greater than max size ({})",
+            self.max_size
+        );
+
+        assert_eq!(self.max_size, shared.max_size);
+
+        // Lock the area by setting the bit in `current_size`
+        //
+        // Ordering::Relaxed would probably be sufficient here, as we don't access any other memory
+        // and the `posix_fallocate`/`ftruncate` call is surely a synchronization point anyway. But
+        // since this is not performance-critical, better safe than sorry.
+        let mut old_size = shared.current_size.load(Ordering::Acquire);
+        loop {
+            if (old_size & RESIZE_IN_PROGRESS) != 0 {
+                return Err(Error::new(
+                    "concurrent resize detected",
+                    Errno::UnknownErrno,
+                ));
+            }
+            match shared.current_size.compare_exchange(
+                old_size,
+                new_size,
+                Ordering::Acquire,
+                Ordering::Relaxed,
+            ) {
+                Ok(_) => break,
+                Err(x) => old_size = x,
+            }
+        }
+
+        // Ok, we got the lock.
+        //
+        // NB: If anything goes wrong, we *must* clear the bit!
+        let result = {
+            use std::cmp::Ordering::{Equal, Greater, Less};
+            match new_size.cmp(&old_size) {
+                Less => nix_ftruncate(&self.fd, new_size as i64)
+                    .map_err(|e| Error::new("could not shrink shmem segment, ftruncate failed", e)),
+                Equal => Ok(()),
+                Greater => enlarge_file(self.fd.as_fd(), new_size as u64),
+            }
+        };
+
+        // Unlock
+        shared.current_size.store(
+            if result.is_ok() { new_size } else { old_size },
+            Ordering::Release,
+        );
+
+        result
+    }
+
+    /// Returns the current user-visible size of the shared memory segment.
+    ///
+    /// NOTE: a concurrent [`ShmemHandle::set_size()`] call can change the size at any time.
+    /// It is the caller's responsibility not to access the area beyond the current size.
+    pub fn current_size(&self) -> usize {
+        let total_current_size =
+            self.shared().current_size.load(Ordering::Relaxed) & !RESIZE_IN_PROGRESS;
+        total_current_size - HEADER_SIZE
+    }
+}
+
+impl Drop for ShmemHandle {
+    fn drop(&mut self) {
+        // SAFETY: The pointer was obtained from mmap() with the given size.
+        // We unmap the entire region.
+        let _ = unsafe { nix_munmap(self.shared_ptr.cast(), self.max_size) };
+        // The fd is dropped automatically by OwnedFd.
+    }
+}
+
+/// Create a "backing file" for the shared memory area. On Linux, use `memfd_create()`, to create an
+/// anonymous in-memory file. One macos, fall back to a regular file. That's good enough for
+/// development and testing, but in production we want the file to stay in memory.
+///
+/// Disable unused variables warnings because `name` is unused in the macos path.
+#[allow(unused_variables)]
+fn create_backing_file(name: &str) -> Result<OwnedFd, Error> {
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::sys::memfd::memfd_create(name, nix::sys::memfd::MFdFlags::empty())
+            .map_err(|e| Error::new("memfd_create failed", e))
+    }
+    #[cfg(target_os = "macos")]
+    {
+        let file = tempfile::tempfile().map_err(|e| {
+            Error::new(
+                "could not create temporary file to back shmem area",
+                nix::errno::Errno::from_raw(e.raw_os_error().unwrap_or(0)),
+            )
+        })?;
+        Ok(OwnedFd::from(file))
+    }
+}
+
+fn enlarge_file(fd: BorrowedFd, size: u64) -> Result<(), Error> {
+    // Use posix_fallocate() to enlarge the file. It reserves the space correctly, so that
+    // we don't get a segfault later when trying to actually use it.
+    #[cfg(not(target_os = "macos"))]
+    {
+        nix::fcntl::posix_fallocate(fd, 0, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, posix_fallocate failed", e))
+    }
+    // As a fallback on macos, which doesn't have posix_fallocate, use plain 'fallocate'
+    #[cfg(target_os = "macos")]
+    {
+        nix::unistd::ftruncate(fd, size as i64)
+            .map_err(|e| Error::new("could not grow shmem segment, ftruncate failed", e))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    use nix::unistd::ForkResult;
+    use std::ops::Range;
+
+    /// check that all bytes in given range have the expected value.
+    fn assert_range(ptr: *const u8, expected: u8, range: Range<usize>) {
+        for i in range {
+            let b = unsafe { *(ptr.add(i)) };
+            assert_eq!(expected, b, "unexpected byte at offset {i}");
+        }
+    }
+
+    /// Write 'b' to all bytes in the given range
+    fn write_range(ptr: *mut u8, b: u8, range: Range<usize>) {
+        unsafe { std::ptr::write_bytes(ptr.add(range.start), b, range.end - range.start) };
+    }
+
+    // simple single-process test of growing and shrinking
+    #[test]
+    fn test_shmem_resize() -> Result<(), Error> {
+        let max_size = 1024 * 1024;
+        let init_struct = ShmemHandle::new("test_shmem_resize", 0, max_size)?;
+
+        assert_eq!(init_struct.current_size(), 0);
+
+        // Initial grow
+        let size1 = 10000;
+        init_struct.set_size(size1).unwrap();
+        assert_eq!(init_struct.current_size(), size1);
+
+        // Write some data
+        let data_ptr = init_struct.data_ptr.as_ptr();
+        write_range(data_ptr, 0xAA, 0..size1);
+        assert_range(data_ptr, 0xAA, 0..size1);
+
+        // Shrink
+        let size2 = 5000;
+        init_struct.set_size(size2).unwrap();
+        assert_eq!(init_struct.current_size(), size2);
+
+        // Grow again
+        let size3 = 20000;
+        init_struct.set_size(size3).unwrap();
+        assert_eq!(init_struct.current_size(), size3);
+
+        // Try to read it. The area that was shrunk and grown again should read as all zeros now
+        assert_range(data_ptr, 0xAA, 0..5000);
+        assert_range(data_ptr, 0, 5000..size1);
+
+        // Try to grow beyond max_size
+        //let size4 = max_size + 1;
+        //assert!(init_struct.set_size(size4).is_err());
+
+        // Dropping init_struct should unmap the memory
+        drop(init_struct);
+
+        Ok(())
+    }
+
+    /// This is used in tests to coordinate between test processes. It's like `std::sync::Barrier`,
+    /// but is stored in the shared memory area and works across processes. It's implemented by
+    /// polling, because e.g. standard rust mutexes are not guaranteed to work across processes.
+    struct SimpleBarrier {
+        num_procs: usize,
+        count: AtomicUsize,
+    }
+
+    impl SimpleBarrier {
+        unsafe fn init(ptr: *mut SimpleBarrier, num_procs: usize) {
+            unsafe {
+                *ptr = SimpleBarrier {
+                    num_procs,
+                    count: AtomicUsize::new(0),
+                }
+            }
+        }
+
+        pub fn wait(&self) {
+            let old = self.count.fetch_add(1, Ordering::Relaxed);
+
+            let generation = old / self.num_procs;
+
+            let mut current = old + 1;
+            while current < (generation + 1) * self.num_procs {
+                std::thread::sleep(std::time::Duration::from_millis(10));
+                current = self.count.load(Ordering::Relaxed);
+            }
+        }
+    }
+
+    #[test]
+    fn test_multi_process() {
+        // Initialize
+        let max_size = 1_000_000_000_000;
+        let init_struct = ShmemHandle::new("test_multi_process", 0, max_size).unwrap();
+        let ptr = init_struct.data_ptr.as_ptr();
+
+        // Store the SimpleBarrier in the first 1k of the area.
+        init_struct.set_size(10000).unwrap();
+        let barrier_ptr: *mut SimpleBarrier = unsafe {
+            ptr.add(ptr.align_offset(std::mem::align_of::<SimpleBarrier>()))
+                .cast()
+        };
+        unsafe { SimpleBarrier::init(barrier_ptr, 2) };
+        let barrier = unsafe { barrier_ptr.as_ref().unwrap() };
+
+        // Fork another test process. The code after this runs in both processes concurrently.
+        let fork_result = unsafe { nix::unistd::fork().unwrap() };
+
+        // In the parent, fill bytes between 1000..2000. In the child, between 2000..3000
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, 1000..2000);
+        } else {
+            write_range(ptr, 0xBB, 2000..3000);
+        }
+        barrier.wait();
+        // Verify the contents. (in both processes)
+        assert_range(ptr, 0xAA, 1000..2000);
+        assert_range(ptr, 0xBB, 2000..3000);
+
+        // Grow, from the child this time
+        let size = 10_000_000;
+        if !fork_result.is_parent() {
+            init_struct.set_size(size).unwrap();
+        }
+        barrier.wait();
+
+        // make some writes at the end
+        if fork_result.is_parent() {
+            write_range(ptr, 0xAA, (size - 10)..size);
+        } else {
+            write_range(ptr, 0xBB, (size - 20)..(size - 10));
+        }
+        barrier.wait();
+
+        // Verify the contents. (This runs in both processes)
+        assert_range(ptr, 0, (size - 1000)..(size - 20));
+        assert_range(ptr, 0xBB, (size - 20)..(size - 10));
+        assert_range(ptr, 0xAA, (size - 10)..size);
+
+        if let ForkResult::Parent { child } = fork_result {
+            nix::sys::wait::waitpid(child, None).unwrap();
+        }
+    }
+}

From dd7fff655a96417c56d3cb57dd38747965ffba44 Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Tue, 15 Jul 2025 22:22:57 +0200
Subject: [PATCH 126/163] feat(compute): Introduce privileged_role_name
 parameter (#12539)

## Problem

Currently `neon_superuser` is hardcoded in many places. It makes it
harder to reuse the same code in different envs.

## Summary of changes

Parametrize `neon_superuser` in `compute_ctl` via
`--privileged-role-name` and in `neon` extensions via
`neon.privileged_role_name`, so it's now possible to use different
'superuser' role names if needed. Everything still defaults to
`neon_superuser`, so no control plane code changes are needed and I
intentionally do not touch regression and migrations tests.

Postgres PRs:
- https://github.com/neondatabase/postgres/pull/674
- https://github.com/neondatabase/postgres/pull/675
- https://github.com/neondatabase/postgres/pull/676
- https://github.com/neondatabase/postgres/pull/677

Cloud PR:
- https://github.com/neondatabase/cloud/pull/31138
---
 compute/compute-node.Dockerfile               | 55 +++++-------
 compute/patches/anon_v2.patch                 | 20 +++--
 compute/patches/pg_duckdb_v031.patch          | 18 ++--
 .../patches/pg_stat_statements_pg14-16.patch  | 34 ++++++++
 compute/patches/pg_stat_statements_pg17.patch | 52 +++++++++++
 compute/patches/postgres_fdw.patch            | 17 ++++
 compute_tools/src/bin/compute_ctl.rs          | 69 +++++++++++++++
 compute_tools/src/compute.rs                  | 15 +++-
 compute_tools/src/config.rs                   |  8 ++
 ...0001-add_bypass_rls_to_privileged_role.sql |  1 +
 .../0001-neon_superuser_bypass_rls.sql        |  1 -
 .../src/migrations/0002-alter_roles.sql       |  4 +-
 ...reate_subscription_to_privileged_role.sql} |  2 +-
 ...004-grant_pg_monitor_to_neon_superuser.sql |  1 -
 ...04-grant_pg_monitor_to_privileged_role.sql |  1 +
 ...rant_all_on_tables_to_privileged_role.sql} |  4 +-
 ...t_all_on_sequences_to_privileged_role.sql} |  4 +-
 ..._with_grant_option_to_privileged_role.sql} |  2 +-
 ..._with_grant_option_to_privileged_role.sql} |  2 +-
 ...chronization_funcs_to_privileged_role.sql} |  4 +-
 ...cation_origin_status_to_neon_superuser.sql |  1 -
 ...ation_origin_status_to_privileged_role.sql |  1 +
 ...nt_pg_signal_backend_to_neon_superuser.sql |  1 -
 ...t_pg_signal_backend_to_privileged_role.sql |  1 +
 ...001-add_bypass_rls_to_privileged_role.sql} |  0
 ...reate_subscription_to_privileged_role.sql} |  0
 ...4-grant_pg_monitor_to_privileged_role.sql} |  0
 ...rant_all_on_tables_to_privileged_role.sql} |  0
 ...t_all_on_sequences_to_privileged_role.sql} |  0
 ..._with_grant_option_to_privileged_role.sql} |  0
 ..._with_grant_option_to_privileged_role.sql} |  0
 ...chronization_funcs_to_privileged_role.sql} |  0
 ...tion_origin_status_to_privileged_role.sql} |  0
 ..._pg_signal_backend_to_privileged_role.sql} |  0
 compute_tools/src/spec.rs                     | 66 ++++++++++----
 compute_tools/src/spec_apply.rs               | 37 +++++---
 .../src/sql/create_neon_superuser.sql         |  8 --
 .../src/sql/create_privileged_role.sql        |  8 ++
 control_plane/src/bin/neon_local.rs           |  5 ++
 control_plane/src/endpoint.rs                 | 12 +++
 pgxn/neon/neon.c                              |  9 ++
 pgxn/neon/neon.h                              |  1 -
 pgxn/neon/neon_ddl_handler.c                  | 59 ++++++-------
 test_runner/fixtures/neon_cli.py              |  3 +
 test_runner/fixtures/neon_fixtures.py         |  4 +
 test_runner/regress/test_neon_superuser.py    | 87 +++++++++++++++++++
 vendor/postgres-v14                           |  2 +-
 vendor/postgres-v15                           |  2 +-
 vendor/postgres-v16                           |  2 +-
 vendor/postgres-v17                           |  2 +-
 vendor/revisions.json                         |  8 +-
 51 files changed, 499 insertions(+), 134 deletions(-)
 create mode 100644 compute/patches/pg_stat_statements_pg14-16.patch
 create mode 100644 compute/patches/pg_stat_statements_pg17.patch
 create mode 100644 compute/patches/postgres_fdw.patch
 create mode 100644 compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql
 delete mode 100644 compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql
 rename compute_tools/src/migrations/{0003-grant_pg_create_subscription_to_neon_superuser.sql => 0003-grant_pg_create_subscription_to_privileged_role.sql} (63%)
 delete mode 100644 compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql
 create mode 100644 compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql
 rename compute_tools/src/migrations/{0005-grant_all_on_tables_to_neon_superuser.sql => 0005-grant_all_on_tables_to_privileged_role.sql} (58%)
 rename compute_tools/src/migrations/{0006-grant_all_on_sequences_to_neon_superuser.sql => 0006-grant_all_on_sequences_to_privileged_role.sql} (57%)
 rename compute_tools/src/migrations/{0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql => 0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql} (73%)
 rename compute_tools/src/migrations/{0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql => 0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql} (72%)
 rename compute_tools/src/migrations/{0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql => 0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql} (82%)
 delete mode 100644 compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
 create mode 100644 compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql
 delete mode 100644 compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
 create mode 100644 compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql
 rename compute_tools/src/migrations/tests/{0001-neon_superuser_bypass_rls.sql => 0001-add_bypass_rls_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0003-grant_pg_create_subscription_to_neon_superuser.sql => 0003-grant_pg_create_subscription_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0004-grant_pg_monitor_to_neon_superuser.sql => 0004-grant_pg_monitor_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0005-grant_all_on_tables_to_neon_superuser.sql => 0005-grant_all_on_tables_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0006-grant_all_on_sequences_to_neon_superuser.sql => 0006-grant_all_on_sequences_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql => 0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql => 0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql => 0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql => 0011-grant_pg_show_replication_origin_status_to_privileged_role.sql} (100%)
 rename compute_tools/src/migrations/tests/{0012-grant_pg_signal_backend_to_neon_superuser.sql => 0012-grant_pg_signal_backend_to_privileged_role.sql} (100%)
 delete mode 100644 compute_tools/src/sql/create_neon_superuser.sql
 create mode 100644 compute_tools/src/sql/create_privileged_role.sql

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 232b1e3bd5..a658738d76 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -170,7 +170,29 @@ RUN case $DEBIAN_VERSION in \
 FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION:?} postgres
+COPY compute/patches/postgres_fdw.patch .
+COPY compute/patches/pg_stat_statements_pg14-16.patch .
+COPY compute/patches/pg_stat_statements_pg17.patch .
 RUN cd postgres && \
+    # Apply patches to some contrib extensions
+    # For example, we need to grant EXECUTE on pg_stat_statements_reset() to {privileged_role_name}.
+    # In vanilla Postgres this function is limited to Postgres role superuser.
+    # In Neon we have {privileged_role_name} role that is not a superuser but replaces superuser in some cases.
+    # We could add the additional grant statements to the Postgres repository but it would be hard to maintain,
+    # whenever we need to pick up a new Postgres version and we want to limit the changes in our Postgres fork,
+    # so we do it here.
+    case "${PG_VERSION}" in \
+    "v14" | "v15" | "v16") \
+    patch -p1 < /pg_stat_statements_pg14-16.patch; \
+    ;; \
+    "v17") \
+    patch -p1 < /pg_stat_statements_pg17.patch; \
+    ;; \
+    *) \
+    # To do not forget to migrate patches to the next major version
+    echo "No contrib patches for this PostgreSQL version" && exit 1;; \
+    esac && \
+    patch -p1 < /postgres_fdw.patch && \
     export CONFIGURE_CMD="./configure CFLAGS='-O2 -g3 -fsigned-char' --enable-debug --with-openssl --with-uuid=ossp \
     --with-icu --with-libxml --with-libxslt --with-lz4" && \
     if [ "${PG_VERSION:?}" != "v14" ]; then \
@@ -184,8 +206,6 @@ RUN cd postgres && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgres_fdw.control && \
-    file=/usr/local/pgsql/share/extension/postgres_fdw--1.0.sql && [ -e $file ] && \
-    echo 'GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO neon_superuser;' >> $file && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \
@@ -195,34 +215,7 @@ RUN cd postgres && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/refint.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control && \
-    # We need to grant EXECUTE on pg_stat_statements_reset() to neon_superuser.
-    # In vanilla postgres this function is limited to Postgres role superuser.
-    # In neon we have neon_superuser role that is not a superuser but replaces superuser in some cases.
-    # We could add the additional grant statements to the postgres repository but it would be hard to maintain,
-    # whenever we need to pick up a new postgres version and we want to limit the changes in our postgres fork,
-    # so we do it here.
-    for file in /usr/local/pgsql/share/extension/pg_stat_statements--*.sql; do \
-        filename=$(basename "$file"); \
-        # Note that there are no downgrade scripts for pg_stat_statements, so we \
-        # don't have to modify any downgrade paths or (much) older versions: we only \
-        # have to make sure every creation of the pg_stat_statements_reset function \
-        # also adds execute permissions to the neon_superuser.
-        case $filename in \
-          pg_stat_statements--1.4.sql) \
-            # pg_stat_statements_reset is first created with 1.4
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO neon_superuser;' >> $file; \
-            ;; \
-          pg_stat_statements--1.6--1.7.sql) \
-            # Then with the 1.6-1.7 migration it is re-created with a new signature, thus add the permissions back
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO neon_superuser;' >> $file; \
-            ;; \
-          pg_stat_statements--1.10--1.11.sql) \
-            # Then with the 1.10-1.11 migration it is re-created with a new signature again, thus add the permissions back
-            echo 'GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO neon_superuser;' >> $file; \
-            ;; \
-        esac; \
-    done;
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
 
 # Set PATH for all the subsequent build steps
 ENV PATH="/usr/local/pgsql/bin:$PATH"
@@ -1524,7 +1517,7 @@ WORKDIR /ext-src
 COPY compute/patches/pg_duckdb_v031.patch .
 COPY compute/patches/duckdb_v120.patch .
 # pg_duckdb build requires source dir to be a git repo to get submodules
-# allow neon_superuser to execute some functions that in pg_duckdb are available to superuser only:
+# allow {privileged_role_name} to execute some functions that in pg_duckdb are available to superuser only:
 # - extension management function duckdb.install_extension()
 # - access to duckdb.extensions table and its sequence
 RUN git clone --depth 1 --branch v0.3.1 https://github.com/duckdb/pg_duckdb.git pg_duckdb-src && \
diff --git a/compute/patches/anon_v2.patch b/compute/patches/anon_v2.patch
index 4faf927e39..ba9d7a8fe6 100644
--- a/compute/patches/anon_v2.patch
+++ b/compute/patches/anon_v2.patch
@@ -1,22 +1,26 @@
 diff --git a/sql/anon.sql b/sql/anon.sql
-index 0cdc769..b450327 100644
+index 0cdc769..5eab1d6 100644
 --- a/sql/anon.sql
 +++ b/sql/anon.sql
-@@ -1141,3 +1141,15 @@ $$
+@@ -1141,3 +1141,19 @@ $$
  -- TODO : https://en.wikipedia.org/wiki/L-diversity
  
  -- TODO : https://en.wikipedia.org/wiki/T-closeness
 +
 +-- NEON Patches
 +
-+GRANT ALL ON SCHEMA anon to neon_superuser;
-+GRANT ALL ON ALL TABLES IN SCHEMA anon TO neon_superuser;
-+
 +DO $$
++DECLARE
++  privileged_role_name text;
 +BEGIN
-+    IF current_setting('server_version_num')::int >= 150000 THEN
-+        GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO neon_superuser;
-+    END IF;
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT ALL ON SCHEMA anon to %I', privileged_role_name);
++  EXECUTE format('GRANT ALL ON ALL TABLES IN SCHEMA anon TO %I', privileged_role_name);
++
++  IF current_setting('server_version_num')::int >= 150000 THEN
++    EXECUTE format('GRANT SET ON PARAMETER anon.transparent_dynamic_masking TO %I', privileged_role_name);
++  END IF;
 +END $$;
 diff --git a/sql/init.sql b/sql/init.sql
 index 7da6553..9b6164b 100644
diff --git a/compute/patches/pg_duckdb_v031.patch b/compute/patches/pg_duckdb_v031.patch
index edc7fbf69d..f7aa374116 100644
--- a/compute/patches/pg_duckdb_v031.patch
+++ b/compute/patches/pg_duckdb_v031.patch
@@ -21,13 +21,21 @@ index 3235cc8..6b892bc 100644
  include Makefile.global
  
 diff --git a/sql/pg_duckdb--0.2.0--0.3.0.sql b/sql/pg_duckdb--0.2.0--0.3.0.sql
-index d777d76..af60106 100644
+index d777d76..3b54396 100644
 --- a/sql/pg_duckdb--0.2.0--0.3.0.sql
 +++ b/sql/pg_duckdb--0.2.0--0.3.0.sql
-@@ -1056,3 +1056,6 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
+@@ -1056,3 +1056,14 @@ GRANT ALL ON FUNCTION duckdb.cache(TEXT, TEXT) TO PUBLIC;
  GRANT ALL ON FUNCTION duckdb.cache_info() TO PUBLIC;
  GRANT ALL ON FUNCTION duckdb.cache_delete(TEXT) TO PUBLIC;
  GRANT ALL ON PROCEDURE duckdb.recycle_ddb() TO PUBLIC;
-+GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO neon_superuser;
-+GRANT ALL ON TABLE duckdb.extensions TO neon_superuser;
-+GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO neon_superuser;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT ALL ON FUNCTION duckdb.install_extension(TEXT) TO %I', privileged_role_name);
++  EXECUTE format('GRANT ALL ON TABLE duckdb.extensions TO %I', privileged_role_name);
++  EXECUTE format('GRANT ALL ON SEQUENCE duckdb.extensions_table_seq TO %I', privileged_role_name);
++END $$;
diff --git a/compute/patches/pg_stat_statements_pg14-16.patch b/compute/patches/pg_stat_statements_pg14-16.patch
new file mode 100644
index 0000000000..368c6791c7
--- /dev/null
+++ b/compute/patches/pg_stat_statements_pg14-16.patch
@@ -0,0 +1,34 @@
+diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
+index 58cdf600fce..8be57a996f6 100644
+--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
++++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
+@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC;
+ 
+ -- Don't want this to be available to non-superusers.
+ REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name);
++END $$;
+diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
+index 6fc3fed4c93..256345a8f79 100644
+--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
++++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
+@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
+ 
+ -- Don't want this to be available to non-superusers.
+ REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name);
++END $$;
diff --git a/compute/patches/pg_stat_statements_pg17.patch b/compute/patches/pg_stat_statements_pg17.patch
new file mode 100644
index 0000000000..ff63b3255c
--- /dev/null
+++ b/compute/patches/pg_stat_statements_pg17.patch
@@ -0,0 +1,52 @@
+diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
+index 0bb2c397711..32764db1d8b 100644
+--- a/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
++++ b/contrib/pg_stat_statements/pg_stat_statements--1.10--1.11.sql
+@@ -80,3 +80,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
+ 
+ -- Don't want this to be available to non-superusers.
+ REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) FROM PUBLIC;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint, boolean) TO %I', privileged_role_name);
++END $$;
+\ No newline at end of file
+diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
+index 58cdf600fce..8be57a996f6 100644
+--- a/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
++++ b/contrib/pg_stat_statements/pg_stat_statements--1.4.sql
+@@ -46,3 +46,12 @@ GRANT SELECT ON pg_stat_statements TO PUBLIC;
+ 
+ -- Don't want this to be available to non-superusers.
+ REVOKE ALL ON FUNCTION pg_stat_statements_reset() FROM PUBLIC;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset() TO %I', privileged_role_name);
++END $$;
+diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
+index 6fc3fed4c93..256345a8f79 100644
+--- a/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
++++ b/contrib/pg_stat_statements/pg_stat_statements--1.6--1.7.sql
+@@ -20,3 +20,12 @@ LANGUAGE C STRICT PARALLEL SAFE;
+ 
+ -- Don't want this to be available to non-superusers.
+ REVOKE ALL ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) FROM PUBLIC;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT EXECUTE ON FUNCTION pg_stat_statements_reset(Oid, Oid, bigint) TO %I', privileged_role_name);
++END $$;
diff --git a/compute/patches/postgres_fdw.patch b/compute/patches/postgres_fdw.patch
new file mode 100644
index 0000000000..d0007ffea5
--- /dev/null
+++ b/compute/patches/postgres_fdw.patch
@@ -0,0 +1,17 @@
+diff --git a/contrib/postgres_fdw/postgres_fdw--1.0.sql b/contrib/postgres_fdw/postgres_fdw--1.0.sql
+index a0f0fc1bf45..ee077f2eea6 100644
+--- a/contrib/postgres_fdw/postgres_fdw--1.0.sql
++++ b/contrib/postgres_fdw/postgres_fdw--1.0.sql
+@@ -16,3 +16,12 @@ LANGUAGE C STRICT;
+ CREATE FOREIGN DATA WRAPPER postgres_fdw
+   HANDLER postgres_fdw_handler
+   VALIDATOR postgres_fdw_validator;
++
++DO $$
++DECLARE
++  privileged_role_name text;
++BEGIN
++  privileged_role_name := current_setting('neon.privileged_role_name');
++
++  EXECUTE format('GRANT USAGE ON FOREIGN DATA WRAPPER postgres_fdw TO %I', privileged_role_name);
++END $$;
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index db7746b8eb..78e2c6308f 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -87,6 +87,14 @@ struct Cli {
     #[arg(short = 'C', long, value_name = "DATABASE_URL")]
     pub connstr: String,
 
+    #[arg(
+        long,
+        default_value = "neon_superuser",
+        value_name = "PRIVILEGED_ROLE_NAME",
+        value_parser = Self::parse_privileged_role_name
+    )]
+    pub privileged_role_name: String,
+
     #[cfg(target_os = "linux")]
     #[arg(long, default_value = "neon-postgres")]
     pub cgroup: String,
@@ -149,6 +157,21 @@ impl Cli {
 
         Ok(url)
     }
+
+    /// For simplicity, we do not escape `privileged_role_name` anywhere in the code.
+    /// Since it's a system role, which we fully control, that's fine. Still, let's
+    /// validate it to avoid any surprises.
+    fn parse_privileged_role_name(value: &str) -> Result<String> {
+        use regex::Regex;
+
+        let pattern = Regex::new(r"^[a-z_]+$").unwrap();
+
+        if !pattern.is_match(value) {
+            bail!("--privileged-role-name can only contain lowercase letters and underscores")
+        }
+
+        Ok(value.to_string())
+    }
 }
 
 fn main() -> Result<()> {
@@ -178,6 +201,7 @@ fn main() -> Result<()> {
         ComputeNodeParams {
             compute_id: cli.compute_id,
             connstr,
+            privileged_role_name: cli.privileged_role_name.clone(),
             pgdata: cli.pgdata.clone(),
             pgbin: cli.pgbin.clone(),
             pgversion: get_pg_version_string(&cli.pgbin),
@@ -327,4 +351,49 @@ mod test {
         ])
         .expect_err("URL parameters are not allowed");
     }
+
+    #[test]
+    fn verify_privileged_role_name() {
+        // Valid name
+        let cli = Cli::parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "my_superuser",
+        ]);
+        assert_eq!(cli.privileged_role_name, "my_superuser");
+
+        // Invalid names
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "NeonSuperuser",
+        ])
+        .expect_err("uppercase letters are not allowed");
+
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "$'neon_superuser",
+        ])
+        .expect_err("special characters are not allowed");
+
+        Cli::try_parse_from([
+            "compute_ctl",
+            "--pgdata=test",
+            "--connstr=test",
+            "--compute-id=test",
+            "--privileged-role-name",
+            "",
+        ])
+        .expect_err("empty name is not allowed");
+    }
 }
diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 6608eb5154..941a21806f 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -74,12 +74,20 @@ const DEFAULT_INSTALLED_EXTENSIONS_COLLECTION_INTERVAL: u64 = 3600;
 
 /// Static configuration params that don't change after startup. These mostly
 /// come from the CLI args, or are derived from them.
+#[derive(Clone, Debug)]
 pub struct ComputeNodeParams {
     /// The ID of the compute
     pub compute_id: String,
-    // Url type maintains proper escaping
+
+    /// Url type maintains proper escaping
     pub connstr: url::Url,
 
+    /// The name of the 'weak' superuser role, which we give to the users.
+    /// It follows the allow list approach, i.e., we take a standard role
+    /// and grant it extra permissions with explicit GRANTs here and there,
+    /// and core patches.
+    pub privileged_role_name: String,
+
     pub resize_swap_on_bind: bool,
     pub set_disk_quota_for_fs: Option<String>,
 
@@ -1389,6 +1397,7 @@ impl ComputeNode {
         self.create_pgdata()?;
         config::write_postgres_conf(
             pgdata_path,
+            &self.params,
             &pspec.spec,
             self.params.internal_http_port,
             tls_config,
@@ -1737,6 +1746,7 @@ impl ComputeNode {
         }
 
         // Run migrations separately to not hold up cold starts
+        let params = self.params.clone();
         tokio::spawn(async move {
             let mut conf = conf.as_ref().clone();
             conf.application_name("compute_ctl:migrations");
@@ -1748,7 +1758,7 @@ impl ComputeNode {
                             eprintln!("connection error: {e}");
                         }
                     });
-                    if let Err(e) = handle_migrations(&mut client).await {
+                    if let Err(e) = handle_migrations(params, &mut client).await {
                         error!("Failed to run migrations: {}", e);
                     }
                 }
@@ -1827,6 +1837,7 @@ impl ComputeNode {
         let pgdata_path = Path::new(&self.params.pgdata);
         config::write_postgres_conf(
             pgdata_path,
+            &self.params,
             &spec,
             self.params.internal_http_port,
             tls_config,
diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index 169de5c963..f6487d33b3 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -9,6 +9,7 @@ use std::path::Path;
 use compute_api::responses::TlsConfig;
 use compute_api::spec::{ComputeAudit, ComputeMode, ComputeSpec, GenericOption};
 
+use crate::compute::ComputeNodeParams;
 use crate::pg_helpers::{
     GenericOptionExt, GenericOptionsSearch, PgOptionsSerialize, escape_conf_value,
 };
@@ -41,6 +42,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
 /// Create or completely rewrite configuration file specified by `path`
 pub fn write_postgres_conf(
     pgdata_path: &Path,
+    params: &ComputeNodeParams,
     spec: &ComputeSpec,
     extension_server_port: u16,
     tls_config: &Option<TlsConfig>,
@@ -161,6 +163,12 @@ pub fn write_postgres_conf(
         }
     }
 
+    writeln!(
+        file,
+        "neon.privileged_role_name={}",
+        escape_conf_value(params.privileged_role_name.as_str())
+    )?;
+
     // If there are any extra options in the 'settings' field, append those
     if spec.cluster.settings.is_some() {
         writeln!(file, "# Managed by compute_ctl: begin")?;
diff --git a/compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql b/compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql
new file mode 100644
index 0000000000..6443645336
--- /dev/null
+++ b/compute_tools/src/migrations/0001-add_bypass_rls_to_privileged_role.sql
@@ -0,0 +1 @@
+ALTER ROLE {privileged_role_name} BYPASSRLS;
diff --git a/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql
deleted file mode 100644
index 73b36a37f6..0000000000
--- a/compute_tools/src/migrations/0001-neon_superuser_bypass_rls.sql
+++ /dev/null
@@ -1 +0,0 @@
-ALTER ROLE neon_superuser BYPASSRLS;
diff --git a/compute_tools/src/migrations/0002-alter_roles.sql b/compute_tools/src/migrations/0002-alter_roles.sql
index 8fc371eb8f..367356e6eb 100644
--- a/compute_tools/src/migrations/0002-alter_roles.sql
+++ b/compute_tools/src/migrations/0002-alter_roles.sql
@@ -15,7 +15,7 @@ DO $$
 DECLARE
     role_name text;
 BEGIN
-    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, 'neon_superuser', 'member')
+    FOR role_name IN SELECT rolname FROM pg_roles WHERE pg_has_role(rolname, '{privileged_role_name}', 'member')
     LOOP
         RAISE NOTICE 'EXECUTING ALTER ROLE % INHERIT', quote_ident(role_name);
         EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' INHERIT';
@@ -23,7 +23,7 @@ BEGIN
 
     FOR role_name IN SELECT rolname FROM pg_roles
         WHERE
-            NOT pg_has_role(rolname, 'neon_superuser', 'member') AND NOT starts_with(rolname, 'pg_')
+            NOT pg_has_role(rolname, '{privileged_role_name}', 'member') AND NOT starts_with(rolname, 'pg_')
     LOOP
         RAISE NOTICE 'EXECUTING ALTER ROLE % NOBYPASSRLS', quote_ident(role_name);
         EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOBYPASSRLS';
diff --git a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql
similarity index 63%
rename from compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql
rename to compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql
index 37f0ce211f..adf159dc06 100644
--- a/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0003-grant_pg_create_subscription_to_privileged_role.sql
@@ -1,6 +1,6 @@
 DO $$
 BEGIN
     IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-        EXECUTE 'GRANT pg_create_subscription TO neon_superuser';
+        EXECUTE 'GRANT pg_create_subscription TO {privileged_role_name}';
     END IF;
 END $$;
diff --git a/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql
deleted file mode 100644
index 11afd3b635..0000000000
--- a/compute_tools/src/migrations/0004-grant_pg_monitor_to_neon_superuser.sql
+++ /dev/null
@@ -1 +0,0 @@
-GRANT pg_monitor TO neon_superuser WITH ADMIN OPTION;
diff --git a/compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql b/compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql
new file mode 100644
index 0000000000..6a7ed4007f
--- /dev/null
+++ b/compute_tools/src/migrations/0004-grant_pg_monitor_to_privileged_role.sql
@@ -0,0 +1 @@
+GRANT pg_monitor TO {privileged_role_name} WITH ADMIN OPTION;
diff --git a/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql
similarity index 58%
rename from compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql
rename to compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql
index 8abe052494..c31f99f3cb 100644
--- a/compute_tools/src/migrations/0005-grant_all_on_tables_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0005-grant_all_on_tables_to_privileged_role.sql
@@ -1,4 +1,4 @@
 -- SKIP: Deemed insufficient for allowing relations created by extensions to be
---       interacted with by neon_superuser without permission issues.
+--       interacted with by {privileged_role_name} without permission issues.
 
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser;
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name};
diff --git a/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql
similarity index 57%
rename from compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql
rename to compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql
index 5bcb026e0c..fadac9ac3b 100644
--- a/compute_tools/src/migrations/0006-grant_all_on_sequences_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0006-grant_all_on_sequences_to_privileged_role.sql
@@ -1,4 +1,4 @@
 -- SKIP: Deemed insufficient for allowing relations created by extensions to be
---       interacted with by neon_superuser without permission issues.
+--       interacted with by {privileged_role_name} without permission issues.
 
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser;
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name};
diff --git a/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql
similarity index 73%
rename from compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
rename to compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql
index ce7c96753e..5caa9b7829 100644
--- a/compute_tools/src/migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql
@@ -1,3 +1,3 @@
 -- SKIP: Moved inline to the handle_grants() functions.
 
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO neon_superuser WITH GRANT OPTION;
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON TABLES TO {privileged_role_name} WITH GRANT OPTION;
diff --git a/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql
similarity index 72%
rename from compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
rename to compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql
index 72baf920cd..03de0c37ac 100644
--- a/compute_tools/src/migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
+++ b/compute_tools/src/migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql
@@ -1,3 +1,3 @@
 -- SKIP: Moved inline to the handle_grants() functions.
 
-ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO neon_superuser WITH GRANT OPTION;
+ALTER DEFAULT PRIVILEGES IN SCHEMA public GRANT ALL ON SEQUENCES TO {privileged_role_name} WITH GRANT OPTION;
diff --git a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql
similarity index 82%
rename from compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
rename to compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql
index 28750e00dd..84fcb36391 100644
--- a/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
+++ b/compute_tools/src/migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql
@@ -1,7 +1,7 @@
 DO $$
 BEGIN
     IF (SELECT setting::numeric >= 160000 FROM pg_settings WHERE name = 'server_version_num') THEN
-       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO neon_superuser';
-       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO neon_superuser';
+       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_export_snapshot TO {privileged_role_name}';
+       EXECUTE 'GRANT EXECUTE ON FUNCTION pg_log_standby_snapshot TO {privileged_role_name}';
     END IF;
 END $$;
diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
deleted file mode 100644
index 425ed8cd3d..0000000000
--- a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
+++ /dev/null
@@ -1 +0,0 @@
-GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO neon_superuser;
diff --git a/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql
new file mode 100644
index 0000000000..125a9f463f
--- /dev/null
+++ b/compute_tools/src/migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql
@@ -0,0 +1 @@
+GRANT EXECUTE ON FUNCTION pg_show_replication_origin_status TO {privileged_role_name};
diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
deleted file mode 100644
index 36e31544be..0000000000
--- a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql
+++ /dev/null
@@ -1 +0,0 @@
-GRANT pg_signal_backend TO neon_superuser WITH ADMIN OPTION;
diff --git a/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql
new file mode 100644
index 0000000000..1b54ec8a3b
--- /dev/null
+++ b/compute_tools/src/migrations/0012-grant_pg_signal_backend_to_privileged_role.sql
@@ -0,0 +1 @@
+GRANT pg_signal_backend TO {privileged_role_name} WITH ADMIN OPTION;
diff --git a/compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql b/compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0001-neon_superuser_bypass_rls.sql
rename to compute_tools/src/migrations/tests/0001-add_bypass_rls_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0003-grant_pg_create_subscription_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0004-grant_pg_monitor_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0005-grant_all_on_tables_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0006-grant_all_on_sequences_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql
rename to compute_tools/src/migrations/tests/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql b/compute_tools/src/migrations/tests/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql
rename to compute_tools/src/migrations/tests/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql
diff --git a/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql b/compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql
similarity index 100%
rename from compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_neon_superuser.sql
rename to compute_tools/src/migrations/tests/0012-grant_pg_signal_backend_to_privileged_role.sql
diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs
index b6382b2f56..4525a0e831 100644
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -9,6 +9,7 @@ use reqwest::StatusCode;
 use tokio_postgres::Client;
 use tracing::{error, info, instrument};
 
+use crate::compute::ComputeNodeParams;
 use crate::config;
 use crate::metrics::{CPLANE_REQUESTS_TOTAL, CPlaneRequestRPC, UNKNOWN_HTTP_STATUS};
 use crate::migration::MigrationRunner;
@@ -169,7 +170,7 @@ pub async fn handle_neon_extension_upgrade(client: &mut Client) -> Result<()> {
 }
 
 #[instrument(skip_all)]
-pub async fn handle_migrations(client: &mut Client) -> Result<()> {
+pub async fn handle_migrations(params: ComputeNodeParams, client: &mut Client) -> Result<()> {
     info!("handle migrations");
 
     // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
@@ -178,26 +179,59 @@ pub async fn handle_migrations(client: &mut Client) -> Result<()> {
 
     // Add new migrations in numerical order.
     let migrations = [
-        include_str!("./migrations/0001-neon_superuser_bypass_rls.sql"),
-        include_str!("./migrations/0002-alter_roles.sql"),
-        include_str!("./migrations/0003-grant_pg_create_subscription_to_neon_superuser.sql"),
-        include_str!("./migrations/0004-grant_pg_monitor_to_neon_superuser.sql"),
-        include_str!("./migrations/0005-grant_all_on_tables_to_neon_superuser.sql"),
-        include_str!("./migrations/0006-grant_all_on_sequences_to_neon_superuser.sql"),
-        include_str!(
-            "./migrations/0007-grant_all_on_tables_to_neon_superuser_with_grant_option.sql"
+        &format!(
+            include_str!("./migrations/0001-add_bypass_rls_to_privileged_role.sql"),
+            privileged_role_name = params.privileged_role_name
         ),
-        include_str!(
-            "./migrations/0008-grant_all_on_sequences_to_neon_superuser_with_grant_option.sql"
+        &format!(
+            include_str!("./migrations/0002-alter_roles.sql"),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!("./migrations/0003-grant_pg_create_subscription_to_privileged_role.sql"),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!("./migrations/0004-grant_pg_monitor_to_privileged_role.sql"),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!("./migrations/0005-grant_all_on_tables_to_privileged_role.sql"),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!("./migrations/0006-grant_all_on_sequences_to_privileged_role.sql"),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!(
+                "./migrations/0007-grant_all_on_tables_with_grant_option_to_privileged_role.sql"
+            ),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!(
+                "./migrations/0008-grant_all_on_sequences_with_grant_option_to_privileged_role.sql"
+            ),
+            privileged_role_name = params.privileged_role_name
         ),
         include_str!("./migrations/0009-revoke_replication_for_previously_allowed_roles.sql"),
-        include_str!(
-            "./migrations/0010-grant_snapshot_synchronization_funcs_to_neon_superuser.sql"
+        &format!(
+            include_str!(
+                "./migrations/0010-grant_snapshot_synchronization_funcs_to_privileged_role.sql"
+            ),
+            privileged_role_name = params.privileged_role_name
         ),
-        include_str!(
-            "./migrations/0011-grant_pg_show_replication_origin_status_to_neon_superuser.sql"
+        &format!(
+            include_str!(
+                "./migrations/0011-grant_pg_show_replication_origin_status_to_privileged_role.sql"
+            ),
+            privileged_role_name = params.privileged_role_name
+        ),
+        &format!(
+            include_str!("./migrations/0012-grant_pg_signal_backend_to_privileged_role.sql"),
+            privileged_role_name = params.privileged_role_name
         ),
-        include_str!("./migrations/0012-grant_pg_signal_backend_to_neon_superuser.sql"),
     ];
 
     MigrationRunner::new(client, &migrations)
diff --git a/compute_tools/src/spec_apply.rs b/compute_tools/src/spec_apply.rs
index fcd072263a..ec7e75922b 100644
--- a/compute_tools/src/spec_apply.rs
+++ b/compute_tools/src/spec_apply.rs
@@ -13,14 +13,14 @@ use tokio_postgres::Client;
 use tokio_postgres::error::SqlState;
 use tracing::{Instrument, debug, error, info, info_span, instrument, warn};
 
-use crate::compute::{ComputeNode, ComputeState};
+use crate::compute::{ComputeNode, ComputeNodeParams, ComputeState};
 use crate::pg_helpers::{
     DatabaseExt, Escaping, GenericOptionsSearch, RoleExt, get_existing_dbs_async,
     get_existing_roles_async,
 };
 use crate::spec_apply::ApplySpecPhase::{
-    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreateNeonSuperuser,
-    CreatePgauditExtension, CreatePgauditlogtofileExtension, CreateSchemaNeon,
+    CreateAndAlterDatabases, CreateAndAlterRoles, CreateAvailabilityCheck, CreatePgauditExtension,
+    CreatePgauditlogtofileExtension, CreatePrivilegedRole, CreateSchemaNeon,
     DisablePostgresDBPgAudit, DropInvalidDatabases, DropRoles, FinalizeDropLogicalSubscriptions,
     HandleNeonExtension, HandleOtherExtensions, RenameAndDeleteDatabases, RenameRoles,
     RunInEachDatabase,
@@ -49,6 +49,7 @@ impl ComputeNode {
             // Proceed with post-startup configuration. Note, that order of operations is important.
             let client = Self::get_maintenance_client(&conf).await?;
             let spec = spec.clone();
+            let params = Arc::new(self.params.clone());
 
             let databases = get_existing_dbs_async(&client).await?;
             let roles = get_existing_roles_async(&client)
@@ -157,6 +158,7 @@ impl ComputeNode {
 
                     let conf = Arc::new(conf);
                     let fut = Self::apply_spec_sql_db(
+                        params.clone(),
                         spec.clone(),
                         conf,
                         ctx.clone(),
@@ -185,7 +187,7 @@ impl ComputeNode {
             }
 
             for phase in [
-                CreateNeonSuperuser,
+                CreatePrivilegedRole,
                 DropInvalidDatabases,
                 RenameRoles,
                 CreateAndAlterRoles,
@@ -195,6 +197,7 @@ impl ComputeNode {
             ] {
                 info!("Applying phase {:?}", &phase);
                 apply_operations(
+                    params.clone(),
                     spec.clone(),
                     ctx.clone(),
                     jwks_roles.clone(),
@@ -243,6 +246,7 @@ impl ComputeNode {
                     }
 
                     let fut = Self::apply_spec_sql_db(
+                        params.clone(),
                         spec.clone(),
                         conf,
                         ctx.clone(),
@@ -293,6 +297,7 @@ impl ComputeNode {
             for phase in phases {
                 debug!("Applying phase {:?}", &phase);
                 apply_operations(
+                    params.clone(),
                     spec.clone(),
                     ctx.clone(),
                     jwks_roles.clone(),
@@ -313,7 +318,9 @@ impl ComputeNode {
     /// May opt to not connect to databases that don't have any scheduled
     /// operations.  The function is concurrency-controlled with the provided
     /// semaphore.  The caller has to make sure the semaphore isn't exhausted.
+    #[allow(clippy::too_many_arguments)] // TODO: needs bigger refactoring
     async fn apply_spec_sql_db(
+        params: Arc<ComputeNodeParams>,
         spec: Arc<ComputeSpec>,
         conf: Arc<tokio_postgres::Config>,
         ctx: Arc<tokio::sync::RwLock<MutableApplyContext>>,
@@ -328,6 +335,7 @@ impl ComputeNode {
 
         for subphase in subphases {
             apply_operations(
+                params.clone(),
                 spec.clone(),
                 ctx.clone(),
                 jwks_roles.clone(),
@@ -467,7 +475,7 @@ pub enum PerDatabasePhase {
 
 #[derive(Clone, Debug)]
 pub enum ApplySpecPhase {
-    CreateNeonSuperuser,
+    CreatePrivilegedRole,
     DropInvalidDatabases,
     RenameRoles,
     CreateAndAlterRoles,
@@ -510,6 +518,7 @@ pub struct MutableApplyContext {
 /// - No timeouts have (yet) been implemented.
 /// - The caller is responsible for limiting and/or applying concurrency.
 pub async fn apply_operations<'a, Fut, F>(
+    params: Arc<ComputeNodeParams>,
     spec: Arc<ComputeSpec>,
     ctx: Arc<RwLock<MutableApplyContext>>,
     jwks_roles: Arc<HashSet<String>>,
@@ -527,7 +536,7 @@ where
         debug!("Processing phase {:?}", &apply_spec_phase);
         let ctx = ctx;
 
-        let mut ops = get_operations(&spec, &ctx, &jwks_roles, &apply_spec_phase)
+        let mut ops = get_operations(&params, &spec, &ctx, &jwks_roles, &apply_spec_phase)
             .await?
             .peekable();
 
@@ -588,14 +597,18 @@ where
 /// sort/merge/batch execution, but for now this is a nice way to improve
 /// batching behavior of the commands.
 async fn get_operations<'a>(
+    params: &'a ComputeNodeParams,
     spec: &'a ComputeSpec,
     ctx: &'a RwLock<MutableApplyContext>,
     jwks_roles: &'a HashSet<String>,
     apply_spec_phase: &'a ApplySpecPhase,
 ) -> Result<Box<dyn Iterator<Item = Operation> + 'a + Send>> {
     match apply_spec_phase {
-        ApplySpecPhase::CreateNeonSuperuser => Ok(Box::new(once(Operation {
-            query: include_str!("sql/create_neon_superuser.sql").to_string(),
+        ApplySpecPhase::CreatePrivilegedRole => Ok(Box::new(once(Operation {
+            query: format!(
+                include_str!("sql/create_privileged_role.sql"),
+                privileged_role_name = params.privileged_role_name
+            ),
             comment: None,
         }))),
         ApplySpecPhase::DropInvalidDatabases => {
@@ -697,8 +710,9 @@ async fn get_operations<'a>(
                         None => {
                             let query = if !jwks_roles.contains(role.name.as_str()) {
                                 format!(
-                                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser {}",
+                                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE {} {}",
                                     role.name.pg_quote(),
+                                    params.privileged_role_name,
                                     role.to_pg_options(),
                                 )
                             } else {
@@ -849,8 +863,9 @@ async fn get_operations<'a>(
                                 // ALL PRIVILEGES grants CREATE, CONNECT, and TEMPORARY on the database
                                 // (see https://www.postgresql.org/docs/current/ddl-priv.html)
                                 query: format!(
-                                    "GRANT ALL PRIVILEGES ON DATABASE {} TO neon_superuser",
-                                    db.name.pg_quote()
+                                    "GRANT ALL PRIVILEGES ON DATABASE {} TO {}",
+                                    db.name.pg_quote(),
+                                    params.privileged_role_name
                                 ),
                                 comment: None,
                             },
diff --git a/compute_tools/src/sql/create_neon_superuser.sql b/compute_tools/src/sql/create_neon_superuser.sql
deleted file mode 100644
index 300645627b..0000000000
--- a/compute_tools/src/sql/create_neon_superuser.sql
+++ /dev/null
@@ -1,8 +0,0 @@
-DO $$
-    BEGIN
-        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
-        THEN
-            CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
-        END IF;
-    END
-$$;
diff --git a/compute_tools/src/sql/create_privileged_role.sql b/compute_tools/src/sql/create_privileged_role.sql
new file mode 100644
index 0000000000..df27ac32fc
--- /dev/null
+++ b/compute_tools/src/sql/create_privileged_role.sql
@@ -0,0 +1,8 @@
+DO $$
+    BEGIN
+        IF NOT EXISTS (SELECT FROM pg_catalog.pg_roles WHERE rolname = '{privileged_role_name}')
+        THEN
+            CREATE ROLE {privileged_role_name} CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
+        END IF;
+    END
+$$;
diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs
index 6021933d6a..efc135ed91 100644
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -631,6 +631,10 @@ struct EndpointCreateCmdArgs {
         help = "Allow multiple primary endpoints running on the same branch. Shouldn't be used normally, but useful for tests."
     )]
     allow_multiple: bool,
+
+    /// Only allow changing it on creation
+    #[clap(long, help = "Name of the privileged role for the endpoint")]
+    privileged_role_name: Option<String>,
 }
 
 #[derive(clap::Args)]
@@ -1480,6 +1484,7 @@ async fn handle_endpoint(subcmd: &EndpointCmd, env: &local_env::LocalEnv) -> Res
                 args.grpc,
                 !args.update_catalog,
                 false,
+                args.privileged_role_name.clone(),
             )?;
         }
         EndpointCmd::Start(args) => {
diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs
index 792da14a32..24956e3ac9 100644
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -99,6 +99,7 @@ pub struct EndpointConf {
     features: Vec<ComputeFeature>,
     cluster: Option<Cluster>,
     compute_ctl_config: ComputeCtlConfig,
+    privileged_role_name: Option<String>,
 }
 
 //
@@ -199,6 +200,7 @@ impl ComputeControlPlane {
         grpc: bool,
         skip_pg_catalog_updates: bool,
         drop_subscriptions_before_start: bool,
+        privileged_role_name: Option<String>,
     ) -> Result<Arc<Endpoint>> {
         let pg_port = pg_port.unwrap_or_else(|| self.get_port());
         let external_http_port = external_http_port.unwrap_or_else(|| self.get_port() + 1);
@@ -236,6 +238,7 @@ impl ComputeControlPlane {
             features: vec![],
             cluster: None,
             compute_ctl_config: compute_ctl_config.clone(),
+            privileged_role_name: privileged_role_name.clone(),
         });
 
         ep.create_endpoint_dir()?;
@@ -257,6 +260,7 @@ impl ComputeControlPlane {
                 features: vec![],
                 cluster: None,
                 compute_ctl_config,
+                privileged_role_name,
             })?,
         )?;
         std::fs::write(
@@ -332,6 +336,9 @@ pub struct Endpoint {
 
     /// The compute_ctl config for the endpoint's compute.
     compute_ctl_config: ComputeCtlConfig,
+
+    /// The name of the privileged role for the endpoint.
+    privileged_role_name: Option<String>,
 }
 
 #[derive(PartialEq, Eq)]
@@ -432,6 +439,7 @@ impl Endpoint {
             features: conf.features,
             cluster: conf.cluster,
             compute_ctl_config: conf.compute_ctl_config,
+            privileged_role_name: conf.privileged_role_name,
         })
     }
 
@@ -870,6 +878,10 @@ impl Endpoint {
             cmd.arg("--dev");
         }
 
+        if let Some(privileged_role_name) = self.privileged_role_name.clone() {
+            cmd.args(["--privileged-role-name", &privileged_role_name]);
+        }
+
         let child = cmd.spawn()?;
         // set up a scopeguard to kill & wait for the child in case we panic or bail below
         let child = scopeguard::guard(child, |mut child| {
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 7b749f1080..df5dcf5334 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -543,6 +543,15 @@ _PG_init(void)
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
+
+	DefineCustomStringVariable(
+							"neon.privileged_role_name",
+							"Name of the 'weak' superuser role, which we give to the users",
+							NULL,
+							&privileged_role_name,
+							"neon_superuser",
+							PGC_POSTMASTER, 0, NULL, NULL, NULL);
+
 	/*
 	 * Important: This must happen after other parts of the extension are
 	 * loaded, otherwise any settings to GUCs that were set before the
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index 431dacb708..215396ef7a 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -16,7 +16,6 @@
 extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;
-
 extern char *wal_acceptors_list;
 extern int	wal_acceptor_reconnect_timeout;
 extern int	wal_acceptor_connection_timeout;
diff --git a/pgxn/neon/neon_ddl_handler.c b/pgxn/neon/neon_ddl_handler.c
index 1f03e52c67..74a90ea4d4 100644
--- a/pgxn/neon/neon_ddl_handler.c
+++ b/pgxn/neon/neon_ddl_handler.c
@@ -13,7 +13,7 @@
  *        accumulate changes. On subtransaction commit, the top of the stack
  *        is merged with the table below it.
  *
- *    Support event triggers for neon_superuser
+ *    Support event triggers for {privileged_role_name}
  *
  * IDENTIFICATION
  *	 contrib/neon/neon_dll_handler.c
@@ -49,6 +49,7 @@
 
 #include "neon_ddl_handler.h"
 #include "neon_utils.h"
+#include "neon.h"
 
 static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL;
 static fmgr_hook_type next_fmgr_hook = NULL;
@@ -541,11 +542,11 @@ NeonXactCallback(XactEvent event, void *arg)
 }
 
 static bool
-RoleIsNeonSuperuser(const char *role_name)
+IsPrivilegedRole(const char *role_name)
 {
 	Assert(role_name);
 
-	return strcmp(role_name, "neon_superuser") == 0;
+	return strcmp(role_name, privileged_role_name) == 0;
 }
 
 static void
@@ -578,8 +579,9 @@ HandleCreateDb(CreatedbStmt *stmt)
 	{
 		const char *owner_name = defGetString(downer);
 
-		if (RoleIsNeonSuperuser(owner_name))
-			elog(ERROR, "can't create a database with owner neon_superuser");
+		if (IsPrivilegedRole(owner_name))
+			elog(ERROR, "could not create a database with owner %s", privileged_role_name);
+
 		entry->owner = get_role_oid(owner_name, false);
 	}
 	else
@@ -609,8 +611,9 @@ HandleAlterOwner(AlterOwnerStmt *stmt)
 		memset(entry->old_name, 0, sizeof(entry->old_name));
 
 	new_owner = get_rolespec_name(stmt->newowner);
-	if (RoleIsNeonSuperuser(new_owner))
-		elog(ERROR, "can't alter owner to neon_superuser");
+	if (IsPrivilegedRole(new_owner))
+		elog(ERROR, "could not alter owner to %s", privileged_role_name);
+
 	entry->owner = get_role_oid(new_owner, false);
 	entry->type = Op_Set;
 }
@@ -716,8 +719,8 @@ HandleAlterRole(AlterRoleStmt *stmt)
 	InitRoleTableIfNeeded();
 
 	role_name = get_rolespec_name(stmt->role);
-	if (RoleIsNeonSuperuser(role_name) && !superuser())
-		elog(ERROR, "can't ALTER neon_superuser");
+	if (IsPrivilegedRole(role_name) && !superuser())
+		elog(ERROR, "could not ALTER %s", privileged_role_name);
 
 	dpass = NULL;
 	foreach(option, stmt->options)
@@ -831,7 +834,7 @@ HandleRename(RenameStmt *stmt)
  *
  * In vanilla only superuser can create Event Triggers.
  *
- * We allow it for neon_superuser by temporary switching to superuser. But as
+ * We allow it for {privileged_role_name} by temporary switching to superuser. But as
  * far as event trigger can fire in superuser context we should protect
  * superuser from execution of arbitrary user's code.
  *
@@ -891,7 +894,7 @@ force_noop(FmgrInfo *finfo)
  * Also skip executing Event Triggers when GUC neon.event_triggers has been
  * set to false. This might be necessary to be able to connect again after a
  * LOGIN Event Trigger has been installed that would prevent connections as
- * neon_superuser.
+ * {privileged_role_name}.
  */
 static void
 neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
@@ -910,24 +913,24 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 	}
 
 	/*
-	 * The neon_superuser role can use the GUC neon.event_triggers to disable
+	 * The {privileged_role_name} role can use the GUC neon.event_triggers to disable
 	 * firing Event Trigger.
 	 *
 	 *   SET neon.event_triggers TO false;
 	 *
-	 * This only applies to the neon_superuser role though, and only allows
-	 * skipping Event Triggers owned by neon_superuser, which we check by
-	 * proxy of the Event Trigger function being owned by neon_superuser.
+	 * This only applies to the {privileged_role_name} role though, and only allows
+	 * skipping Event Triggers owned by {privileged_role_name}, which we check by
+	 * proxy of the Event Trigger function being owned by {privileged_role_name}.
 	 *
-	 * A role that is created in role neon_superuser should be allowed to also
+	 * A role that is created in role {privileged_role_name} should be allowed to also
 	 * benefit from the neon_event_triggers GUC, and will be considered the
-	 * same as the neon_superuser role.
+	 * same as the {privileged_role_name} role.
 	 */
 	if (event == FHET_START
 		&& !neon_event_triggers
-		&& is_neon_superuser())
+		&& is_privileged_role())
 	{
-		Oid neon_superuser_oid = get_role_oid("neon_superuser", false);
+		Oid weak_superuser_oid = get_role_oid(privileged_role_name, false);
 
 		/* Find the Function Attributes (owner Oid, security definer) */
 		const char *fun_owner_name = NULL;
@@ -937,8 +940,8 @@ neon_fmgr_hook(FmgrHookEventType event, FmgrInfo *flinfo, Datum *private)
 		LookupFuncOwnerSecDef(flinfo->fn_oid, &fun_owner, &fun_is_secdef);
 		fun_owner_name = GetUserNameFromId(fun_owner, false);
 
-		if (RoleIsNeonSuperuser(fun_owner_name)
-			|| has_privs_of_role(fun_owner, neon_superuser_oid))
+		if (IsPrivilegedRole(fun_owner_name)
+			|| has_privs_of_role(fun_owner, weak_superuser_oid))
 		{
 			elog(WARNING,
 				 "Skipping Event Trigger: neon.event_triggers is false");
@@ -1149,13 +1152,13 @@ ProcessCreateEventTrigger(
 	}
 
 	/*
-	 * Allow neon_superuser to create Event Trigger, while keeping the
+	 * Allow {privileged_role_name} to create Event Trigger, while keeping the
 	 * ownership of the object.
 	 *
 	 * For that we give superuser membership to the role for the execution of
 	 * the command.
 	 */
-	if (IsTransactionState() && is_neon_superuser())
+	if (IsTransactionState() && is_privileged_role())
 	{
 		/* Find the Event Trigger function Oid */
 		Oid func_oid = LookupFuncName(stmt->funcname, 0, NULL, false);
@@ -1232,7 +1235,7 @@ ProcessCreateEventTrigger(
 		 *
 		 * That way [ ALTER | DROP ] EVENT TRIGGER commands just work.
 		 */
-		if (IsTransactionState() && is_neon_superuser())
+		if (IsTransactionState() && is_privileged_role())
 		{
 			if (!current_user_is_super)
 			{
@@ -1352,19 +1355,17 @@ NeonProcessUtility(
 }
 
 /*
- * Only neon_superuser is granted privilege to edit neon.event_triggers GUC.
+ * Only {privileged_role_name} is granted privilege to edit neon.event_triggers GUC.
  */
 static void
 neon_event_triggers_assign_hook(bool newval, void *extra)
 {
-	/* MyDatabaseId == InvalidOid || !OidIsValid(GetUserId())	 */
-
-	if (IsTransactionState() && !is_neon_superuser())
+	if (IsTransactionState() && !is_privileged_role())
 	{
 		ereport(ERROR,
 				(errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
 				 errmsg("permission denied to set neon.event_triggers"),
-				 errdetail("Only \"neon_superuser\" is allowed to set the GUC")));
+				 errdetail("Only \"%s\" is allowed to set the GUC", privileged_role_name)));
 	}
 }
 
diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py
index 1abd3396e4..f33d4a0d22 100644
--- a/test_runner/fixtures/neon_cli.py
+++ b/test_runner/fixtures/neon_cli.py
@@ -503,6 +503,7 @@ class NeonLocalCli(AbstractNeonCli):
         pageserver_id: int | None = None,
         allow_multiple=False,
         update_catalog: bool = False,
+        privileged_role_name: str | None = None,
     ) -> subprocess.CompletedProcess[str]:
         args = [
             "endpoint",
@@ -534,6 +535,8 @@ class NeonLocalCli(AbstractNeonCli):
             args.extend(["--allow-multiple"])
         if update_catalog:
             args.extend(["--update-catalog"])
+        if privileged_role_name is not None:
+            args.extend(["--privileged-role-name", privileged_role_name])
 
         res = self.raw_cli(args)
         res.check_returncode()
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ea1b045b78..ae73ace9bb 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4324,6 +4324,7 @@ class Endpoint(PgProtocol, LogUtils):
         pageserver_id: int | None = None,
         allow_multiple: bool = False,
         update_catalog: bool = False,
+        privileged_role_name: str | None = None,
     ) -> Self:
         """
         Create a new Postgres endpoint.
@@ -4351,6 +4352,7 @@ class Endpoint(PgProtocol, LogUtils):
             pageserver_id=pageserver_id,
             allow_multiple=allow_multiple,
             update_catalog=update_catalog,
+            privileged_role_name=privileged_role_name,
         )
         path = Path("endpoints") / self.endpoint_id / "pgdata"
         self.pgdata_dir = self.env.repo_dir / path
@@ -4800,6 +4802,7 @@ class EndpointFactory:
         config_lines: list[str] | None = None,
         pageserver_id: int | None = None,
         update_catalog: bool = False,
+        privileged_role_name: str | None = None,
     ) -> Endpoint:
         ep = Endpoint(
             self.env,
@@ -4823,6 +4826,7 @@ class EndpointFactory:
             config_lines=config_lines,
             pageserver_id=pageserver_id,
             update_catalog=update_catalog,
+            privileged_role_name=privileged_role_name,
         )
 
     def stop_all(self, fail_on_error=True) -> Self:
diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py
index f99d79e138..9a28f22e78 100644
--- a/test_runner/regress/test_neon_superuser.py
+++ b/test_runner/regress/test_neon_superuser.py
@@ -103,3 +103,90 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion):
         query = "DROP SUBSCRIPTION sub CASCADE"
         log.info(f"Dropping subscription: {query}")
         cur.execute(query)
+
+
+def test_privileged_role_override(neon_simple_env: NeonEnv, pg_version: PgVersion):
+    """
+    Test that we can override the privileged role for an endpoint and when we do it,
+    everything is correctly bootstrapped inside Postgres and we don't have neon_superuser
+    role in the database.
+    """
+    PRIVILEGED_ROLE_NAME = "my_superuser"
+
+    env = neon_simple_env
+    env.create_branch("test_privileged_role_override")
+    ep = env.endpoints.create(
+        "test_privileged_role_override",
+        privileged_role_name=PRIVILEGED_ROLE_NAME,
+        update_catalog=True,
+    )
+
+    ep.start()
+
+    ep.wait_for_migrations()
+
+    member_roles = [
+        "pg_read_all_data",
+        "pg_write_all_data",
+        "pg_monitor",
+        "pg_signal_backend",
+    ]
+
+    non_member_roles = [
+        "pg_execute_server_program",
+        "pg_read_server_files",
+        "pg_write_server_files",
+    ]
+
+    role_attributes = {
+        "rolsuper": False,
+        "rolinherit": True,
+        "rolcreaterole": True,
+        "rolcreatedb": True,
+        "rolcanlogin": False,
+        "rolreplication": True,
+        "rolconnlimit": -1,
+        "rolbypassrls": True,
+    }
+
+    if pg_version >= PgVersion.V15:
+        non_member_roles.append("pg_checkpoint")
+
+    if pg_version >= PgVersion.V16:
+        member_roles.append("pg_create_subscription")
+        non_member_roles.append("pg_use_reserved_connections")
+
+    with ep.cursor() as cur:
+        cur.execute(f"SELECT rolname FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
+        assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME
+
+        cur.execute("SELECT rolname FROM pg_roles WHERE rolname = 'neon_superuser'")
+        assert len(cur.fetchall()) == 0
+
+        cur.execute("SHOW neon.privileged_role_name")
+        assert cur.fetchall()[0][0] == PRIVILEGED_ROLE_NAME
+
+        # check PRIVILEGED_ROLE_NAME role is created
+        cur.execute(f"select * from pg_roles where rolname = '{PRIVILEGED_ROLE_NAME}'")
+        assert cur.fetchone() is not None
+
+        # check PRIVILEGED_ROLE_NAME role has the correct member roles
+        for role in member_roles:
+            cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
+            assert cur.fetchone() == (True,), (
+                f"Role {role} should be a member of {PRIVILEGED_ROLE_NAME}"
+            )
+
+        for role in non_member_roles:
+            cur.execute(f"SELECT pg_has_role('{PRIVILEGED_ROLE_NAME}', '{role}', 'member')")
+            assert cur.fetchone() == (False,), (
+                f"Role {role} should not be a member of {PRIVILEGED_ROLE_NAME}"
+            )
+
+        # check PRIVILEGED_ROLE_NAME role has the correct role attributes
+        for attr, val in role_attributes.items():
+            cur.execute(f"SELECT {attr} FROM pg_roles WHERE rolname = '{PRIVILEGED_ROLE_NAME}'")
+            curr_val = cur.fetchone()
+            assert curr_val == (val,), (
+                f"Role attribute {attr} should be {val} instead of {curr_val}"
+            )
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index 8ce1f52303..af550a80c6 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit 8ce1f52303aec29e098309347b57c01a1962e221
+Subproject commit af550a80c6b86d0fec378ee929e2bb2e591e5cd3
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index afd46987f3..21cb86b814 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit afd46987f3da50c9146a8aa59380052df0862c06
+Subproject commit 21cb86b81454522870d3634cac3e10b821da09fe
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index e08c8d5f15..c148871ead 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit e08c8d5f1576ca0487d14d154510499c5f12adfb
+Subproject commit c148871eada02c0cf15d553d8ff7c389d01810f2
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 353c725b0c..8de764e44b 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 353c725b0c76cc82b15af21d8360d03391dc6814
+Subproject commit 8de764e44b56d1cffb3644368d4d689f482b611a
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 992aa405b1..3c8067a23d 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "353c725b0c76cc82b15af21d8360d03391dc6814"
+    "8de764e44b56d1cffb3644368d4d689f482b611a"
   ],
   "v16": [
     "16.9",
-    "e08c8d5f1576ca0487d14d154510499c5f12adfb"
+    "c148871eada02c0cf15d553d8ff7c389d01810f2"
   ],
   "v15": [
     "15.13",
-    "afd46987f3da50c9146a8aa59380052df0862c06"
+    "21cb86b81454522870d3634cac3e10b821da09fe"
   ],
   "v14": [
     "14.18",
-    "8ce1f52303aec29e098309347b57c01a1962e221"
+    "af550a80c6b86d0fec378ee929e2bb2e591e5cd3"
   ]
 }

From 0c99f16c608ee38f30de11c345d452a574c303c4 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 16 Jul 2025 09:26:52 +0100
Subject: [PATCH 127/163] CI(run-python-test-set): don't collect code coverage
 for real (#12611)

## Problem

neondatabase/neon#12601 did't compleatly disable writing `*.profraw`
files, but instead of `/tmp/coverage` it started to write into the
current directory

## Summary of changes
- Set `LLVM_PROFILE_FILE=/dev/null` to avoing writing `*.profraw` at all
---
 .github/actions/run-python-test-set/action.yml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index b3e68ab606..1f2012358e 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -181,6 +181,8 @@ runs:
           # Ref https://github.com/neondatabase/neon/issues/4540
           # cov_prefix=(scripts/coverage "--profraw-prefix=$GITHUB_JOB" --dir=/tmp/coverage run)
           cov_prefix=()
+          # Explicitly set LLVM_PROFILE_FILE to /dev/null to avoid writing *.profraw files
+          export LLVM_PROFILE_FILE=/dev/null
         else
           cov_prefix=()
         fi

From caca08fe78d73496804ab04ddf214c22ad77fd8a Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 16 Jul 2025 12:08:27 +0100
Subject: [PATCH 128/163] CI: rework and merge `lint-openapi-spec` and
 `validate-compute-manifest` jobs (#12575)

## Problem

We have several linters that use Node.js, but they are currently set up
differently, both locally and on CI.

## Summary of changes
- Add Node.js to `build-tools` image
- Move `compute/package.json` -> `build-tools/package.json` and add
`redocly` to it `@redocly/cli`
- Unify and merge into one job `lint-openapi-spec` and
`validate-compute-manifest`
---
 .github/workflows/build_and_test.yml |   43 +-
 .gitignore                           |    3 +
 Makefile                             |    8 +-
 build-tools/Dockerfile               |    6 +
 build-tools/package-lock.json        | 3189 ++++++++++++++++++++++++++
 build-tools/package.json             |    8 +
 compute/Makefile                     |   10 +-
 compute/package.json                 |    7 -
 8 files changed, 3230 insertions(+), 44 deletions(-)
 create mode 100644 build-tools/package-lock.json
 create mode 100644 build-tools/package.json
 delete mode 100644 compute/package.json

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index cc9534f05d..2977f642bc 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -87,22 +87,27 @@ jobs:
     uses: ./.github/workflows/build-build-tools-image.yml
     secrets: inherit
 
-  lint-openapi-spec:
-    runs-on: ubuntu-22.04
-    needs: [ meta, check-permissions ]
+  lint-yamls:
+    needs: [ meta, check-permissions, build-build-tools-image ]
     # We do need to run this in `.*-rc-pr` because of hotfixes.
     if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
+    runs-on: [ self-hosted, small ]
+    container:
+      image: ${{ needs.build-build-tools-image.outputs.image }}
+      credentials:
+        username: ${{ github.actor }}
+        password: ${{ secrets.GITHUB_TOKEN }}
+      options: --init
+
     steps:
       - name: Harden the runner (Audit all outbound calls)
         uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
         with:
           egress-policy: audit
+
       - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-      - uses: docker/login-action@74a5d142397b4f367a81961eba4e8cd7edddf772 # v3.4.0
-        with:
-          registry: ghcr.io
-          username: ${{ github.actor }}
-          password: ${{ secrets.GITHUB_TOKEN }}
+
+      - run: make -C compute manifest-schema-validation
       - run: make lint-openapi-spec
 
   check-codestyle-python:
@@ -217,28 +222,6 @@ jobs:
       build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm
     secrets: inherit
 
-  validate-compute-manifest:
-    runs-on: ubuntu-22.04
-    needs: [ meta, check-permissions ]
-    # We do need to run this in `.*-rc-pr` because of hotfixes.
-    if: ${{ contains(fromJSON('["pr", "push-main", "storage-rc-pr", "proxy-rc-pr", "compute-rc-pr"]'), needs.meta.outputs.run-kind) }}
-    steps:
-      - name: Harden the runner (Audit all outbound calls)
-        uses: step-security/harden-runner@4d991eb9b905ef189e4c376166672c3f2f230481 # v2.11.0
-        with:
-          egress-policy: audit
-
-      - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2
-
-      - name: Set up Node.js
-        uses: actions/setup-node@49933ea5288caeca8642d1e84afbd3f7d6820020 # v4.4.0
-        with:
-          node-version: '24'
-
-      - name: Validate manifest against schema
-        run: |
-          make -C compute manifest-schema-validation
-
   build-and-test-locally:
     needs: [ meta, build-build-tools-image ]
     # We do need to run this in `.*-rc-pr` because of hotfixes.
diff --git a/.gitignore b/.gitignore
index 4857972f1d..835cceb123 100644
--- a/.gitignore
+++ b/.gitignore
@@ -29,3 +29,6 @@ docker-compose/docker-compose-parallel.yml
 
 # pgindent typedef lists
 *.list
+
+# Node
+**/node_modules/
diff --git a/Makefile b/Makefile
index d07ac907b4..749e527ac3 100644
--- a/Makefile
+++ b/Makefile
@@ -220,11 +220,15 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 setup-pre-commit-hook:
 	ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
 
+build-tools/node_modules: build-tools/package.json
+	cd build-tools && $(if $(CI),npm ci,npm install)
+	touch build-tools/node_modules
+
 .PHONY: lint-openapi-spec
-lint-openapi-spec:
+lint-openapi-spec: build-tools/node_modules
 	# operation-2xx-response: pageserver timeline delete returns 404 on success
 	find . -iname "openapi_spec.y*ml" -exec\
-		docker run --rm -v ${PWD}:/spec ghcr.io/redocly/cli:1.34.4\
+		npx --prefix=build-tools/ redocly\
 			--skip-rule=operation-operationId --skip-rule=operation-summary --extends=minimal\
 			--skip-rule=no-server-example.com --skip-rule=operation-2xx-response\
 			lint {} \+
diff --git a/build-tools/Dockerfile b/build-tools/Dockerfile
index 2ed7bb4f36..e02707a5eb 100644
--- a/build-tools/Dockerfile
+++ b/build-tools/Dockerfile
@@ -188,6 +188,12 @@ RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
     && bash -c 'for f in /usr/bin/clang*-${LLVM_VERSION} /usr/bin/llvm*-${LLVM_VERSION}; do ln -s "${f}" "${f%-${LLVM_VERSION}}"; done' \
     && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
 
+# Install node
+ENV NODE_VERSION=24
+RUN curl -fsSL https://deb.nodesource.com/setup_${NODE_VERSION}.x | bash - \
+    && apt install -y nodejs \
+    && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/*
+
 # Install docker
 RUN curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg \
     && echo "deb [arch=$(dpkg --print-architecture) signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/debian ${DEBIAN_VERSION} stable" > /etc/apt/sources.list.d/docker.list \
diff --git a/build-tools/package-lock.json b/build-tools/package-lock.json
new file mode 100644
index 0000000000..b2c44ed9b4
--- /dev/null
+++ b/build-tools/package-lock.json
@@ -0,0 +1,3189 @@
+{
+  "name": "build-tools",
+  "lockfileVersion": 3,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "build-tools",
+      "devDependencies": {
+        "@redocly/cli": "1.34.4",
+        "@sourcemeta/jsonschema": "10.0.0"
+      }
+    },
+    "node_modules/@babel/code-frame": {
+      "version": "7.27.1",
+      "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz",
+      "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/helper-validator-identifier": "^7.27.1",
+        "js-tokens": "^4.0.0",
+        "picocolors": "^1.1.1"
+      },
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/helper-validator-identifier": {
+      "version": "7.27.1",
+      "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.27.1.tgz",
+      "integrity": "sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@babel/runtime": {
+      "version": "7.27.6",
+      "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.27.6.tgz",
+      "integrity": "sha512-vbavdySgbTTrmFE+EsiqUTzlOr5bzlnJtUv9PynGCAKvfQqjIXbvFdumPM/GxMDfyuGMJaJAU6TO4zc1Jf1i8Q==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6.9.0"
+      }
+    },
+    "node_modules/@emotion/is-prop-valid": {
+      "version": "1.2.2",
+      "resolved": "https://registry.npmjs.org/@emotion/is-prop-valid/-/is-prop-valid-1.2.2.tgz",
+      "integrity": "sha512-uNsoYd37AFmaCdXlg6EYD1KaPOaRWRByMCYzbKUX4+hhMfrxdVSelShywL4JVaAeM/eHUOSprYBQls+/neX3pw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@emotion/memoize": "^0.8.1"
+      }
+    },
+    "node_modules/@emotion/memoize": {
+      "version": "0.8.1",
+      "resolved": "https://registry.npmjs.org/@emotion/memoize/-/memoize-0.8.1.tgz",
+      "integrity": "sha512-W2P2c/VRW1/1tLox0mVUalvnWXxavmv/Oum2aPsRcoDJuob75FC3Y8FbpfLwUegRcxINtGUMPq0tFCvYNTBXNA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@emotion/unitless": {
+      "version": "0.8.1",
+      "resolved": "https://registry.npmjs.org/@emotion/unitless/-/unitless-0.8.1.tgz",
+      "integrity": "sha512-KOEGMu6dmJZtpadb476IsZBclKvILjopjUii3V+7MnXIQCYh8W3NgNcgwo21n9LXZX6EDIKvqfjYxXebDwxKmQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@exodus/schemasafe": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/@exodus/schemasafe/-/schemasafe-1.3.0.tgz",
+      "integrity": "sha512-5Aap/GaRupgNx/feGBwLLTVv8OQFfv3pq2lPRzPg9R+IOBnDgghTGW7l7EuVXOvg5cc/xSAlRW8rBrjIC3Nvqw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@faker-js/faker": {
+      "version": "7.6.0",
+      "resolved": "https://registry.npmjs.org/@faker-js/faker/-/faker-7.6.0.tgz",
+      "integrity": "sha512-XK6BTq1NDMo9Xqw/YkYyGjSsg44fbNwYRx7QK2CuoQgyy+f1rrTDHoExVM5PsyXCtfl2vs2vVJ0MN0yN6LppRw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=14.0.0",
+        "npm": ">=6.0.0"
+      }
+    },
+    "node_modules/@humanwhocodes/momoa": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/@humanwhocodes/momoa/-/momoa-2.0.4.tgz",
+      "integrity": "sha512-RE815I4arJFtt+FVeU1Tgp9/Xvecacji8w/V6XtXsWWH/wz/eNkNbhb+ny/+PlVZjV0rxQpRSQKNKE3lcktHEA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=10.10.0"
+      }
+    },
+    "node_modules/@jest/schemas": {
+      "version": "29.6.3",
+      "resolved": "https://registry.npmjs.org/@jest/schemas/-/schemas-29.6.3.tgz",
+      "integrity": "sha512-mo5j5X+jIZmJQveBKeS/clAueipV7KgiX1vMgCxam1RNYiqE1w62n0/tJJnHtjW8ZHcQco5gY85jA3mi0L+nSA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@sinclair/typebox": "^0.27.8"
+      },
+      "engines": {
+        "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
+      }
+    },
+    "node_modules/@jsep-plugin/assignment": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/@jsep-plugin/assignment/-/assignment-1.3.0.tgz",
+      "integrity": "sha512-VVgV+CXrhbMI3aSusQyclHkenWSAm95WaiKrMxRFam3JSUiIaQjoMIw2sEs/OX4XifnqeQUN4DYbJjlA8EfktQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 10.16.0"
+      },
+      "peerDependencies": {
+        "jsep": "^0.4.0||^1.0.0"
+      }
+    },
+    "node_modules/@jsep-plugin/regex": {
+      "version": "1.0.4",
+      "resolved": "https://registry.npmjs.org/@jsep-plugin/regex/-/regex-1.0.4.tgz",
+      "integrity": "sha512-q7qL4Mgjs1vByCaTnDFcBnV9HS7GVPJX5vyVoCgZHNSC9rjwIlmbXG5sUuorR5ndfHAIlJ8pVStxvjXHbNvtUg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 10.16.0"
+      },
+      "peerDependencies": {
+        "jsep": "^0.4.0||^1.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/api": {
+      "version": "1.9.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz",
+      "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/api-logs": {
+      "version": "0.53.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/api-logs/-/api-logs-0.53.0.tgz",
+      "integrity": "sha512-8HArjKx+RaAI8uEIgcORbZIPklyh1YLjPSBus8hjRmvLi6DeFzgOcdZ7KwPabKj8mXF8dX0hyfAyGfycz0DbFw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api": "^1.0.0"
+      },
+      "engines": {
+        "node": ">=14"
+      }
+    },
+    "node_modules/@opentelemetry/context-async-hooks": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/context-async-hooks/-/context-async-hooks-1.26.0.tgz",
+      "integrity": "sha512-HedpXXYzzbaoutw6DFLWLDket2FwLkLpil4hGCZ1xYEIMTcivdfwEOISgdbLEWyG3HW52gTq2V9mOVJrONgiwg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/core": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/core/-/core-1.26.0.tgz",
+      "integrity": "sha512-1iKxXXE8415Cdv0yjG3G6hQnB5eVEsJce3QaawX8SjDn0mAS0ZM8fAbZZJD4ajvhC15cePvosSCut404KrIIvQ==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/semantic-conventions": "1.27.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/exporter-trace-otlp-http": {
+      "version": "0.53.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/exporter-trace-otlp-http/-/exporter-trace-otlp-http-0.53.0.tgz",
+      "integrity": "sha512-m7F5ZTq+V9mKGWYpX8EnZ7NjoqAU7VemQ1E2HAG+W/u0wpY1x0OmbxAXfGKFHCspdJk8UKlwPGrpcB8nay3P8A==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/otlp-exporter-base": "0.53.0",
+        "@opentelemetry/otlp-transformer": "0.53.0",
+        "@opentelemetry/resources": "1.26.0",
+        "@opentelemetry/sdk-trace-base": "1.26.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-exporter-base": {
+      "version": "0.53.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-exporter-base/-/otlp-exporter-base-0.53.0.tgz",
+      "integrity": "sha512-UCWPreGQEhD6FjBaeDuXhiMf6kkBODF0ZQzrk/tuQcaVDJ+dDQ/xhJp192H9yWnKxVpEjFrSSLnpqmX4VwX+eA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/otlp-transformer": "0.53.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.0.0"
+      }
+    },
+    "node_modules/@opentelemetry/otlp-transformer": {
+      "version": "0.53.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/otlp-transformer/-/otlp-transformer-0.53.0.tgz",
+      "integrity": "sha512-rM0sDA9HD8dluwuBxLetUmoqGJKSAbWenwD65KY9iZhUxdBHRLrIdrABfNDP7aiTjcgK8XFyTn5fhDz7N+W6DA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api-logs": "0.53.0",
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/resources": "1.26.0",
+        "@opentelemetry/sdk-logs": "0.53.0",
+        "@opentelemetry/sdk-metrics": "1.26.0",
+        "@opentelemetry/sdk-trace-base": "1.26.0",
+        "protobufjs": "^7.3.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": "^1.3.0"
+      }
+    },
+    "node_modules/@opentelemetry/propagator-b3": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-b3/-/propagator-b3-1.26.0.tgz",
+      "integrity": "sha512-vvVkQLQ/lGGyEy9GT8uFnI047pajSOVnZI2poJqVGD3nJ+B9sFGdlHNnQKophE3lHfnIH0pw2ubrCTjZCgIj+Q==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/propagator-jaeger": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/propagator-jaeger/-/propagator-jaeger-1.26.0.tgz",
+      "integrity": "sha512-DelFGkCdaxA1C/QA0Xilszfr0t4YbGd3DjxiCDPh34lfnFr+VkkrjV9S8ZTJvAzfdKERXhfOxIKBoGPJwoSz7Q==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/resources": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/resources/-/resources-1.26.0.tgz",
+      "integrity": "sha512-CPNYchBE7MBecCSVy0HKpUISEeJOniWqcHaAHpmasZ3j9o6V3AyBzhRc90jdmemq0HOxDr6ylhUbDhBqqPpeNw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/semantic-conventions": "1.27.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-logs": {
+      "version": "0.53.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-logs/-/sdk-logs-0.53.0.tgz",
+      "integrity": "sha512-dhSisnEgIj/vJZXZV6f6KcTnyLDx/VuQ6l3ejuZpMpPlh9S1qMHiZU9NMmOkVkwwHkMy3G6mEBwdP23vUZVr4g==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/api-logs": "0.53.0",
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/resources": "1.26.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.4.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-metrics": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-metrics/-/sdk-metrics-1.26.0.tgz",
+      "integrity": "sha512-0SvDXmou/JjzSDOjUmetAAvcKQW6ZrvosU0rkbDGpXvvZN+pQF6JbK/Kd4hNdK4q/22yeruqvukXEJyySTzyTQ==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/resources": "1.26.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.3.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-trace-base": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-base/-/sdk-trace-base-1.26.0.tgz",
+      "integrity": "sha512-olWQldtvbK4v22ymrKLbIcBi9L2SpMO84sCPY54IVsJhP9fRsxJT194C/AVaAuJzLE30EdhhM1VmvVYR7az+cw==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/resources": "1.26.0",
+        "@opentelemetry/semantic-conventions": "1.27.0"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/sdk-trace-node": {
+      "version": "1.26.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/sdk-trace-node/-/sdk-trace-node-1.26.0.tgz",
+      "integrity": "sha512-Fj5IVKrj0yeUwlewCRwzOVcr5avTuNnMHWf7GPc1t6WaT78J6CJyF3saZ/0RkZfdeNO8IcBl/bNcWMVZBMRW8Q==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@opentelemetry/context-async-hooks": "1.26.0",
+        "@opentelemetry/core": "1.26.0",
+        "@opentelemetry/propagator-b3": "1.26.0",
+        "@opentelemetry/propagator-jaeger": "1.26.0",
+        "@opentelemetry/sdk-trace-base": "1.26.0",
+        "semver": "^7.5.2"
+      },
+      "engines": {
+        "node": ">=14"
+      },
+      "peerDependencies": {
+        "@opentelemetry/api": ">=1.0.0 <1.10.0"
+      }
+    },
+    "node_modules/@opentelemetry/semantic-conventions": {
+      "version": "1.27.0",
+      "resolved": "https://registry.npmjs.org/@opentelemetry/semantic-conventions/-/semantic-conventions-1.27.0.tgz",
+      "integrity": "sha512-sAay1RrB+ONOem0OZanAR1ZI/k7yDpnOQSQmTMuGImUQb2y8EbSaCJ94FQluM74xoU03vlb2d2U90hZluL6nQg==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "engines": {
+        "node": ">=14"
+      }
+    },
+    "node_modules/@protobufjs/aspromise": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz",
+      "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/base64": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz",
+      "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/codegen": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.4.tgz",
+      "integrity": "sha512-YyFaikqM5sH0ziFZCN3xDC7zeGaB/d0IUb9CATugHWbd1FRFwWwt4ld4OYMPWu5a3Xe01mGAULCdqhMlPl29Jg==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/eventemitter": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.0.tgz",
+      "integrity": "sha512-j9ednRT81vYJ9OfVuXG6ERSTdEL1xVsNgqpkxMsbIabzSo3goCjDIveeGv5d03om39ML71RdmrGNjG5SReBP/Q==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/fetch": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.0.tgz",
+      "integrity": "sha512-lljVXpqXebpsijW71PZaCYeIcE5on1w5DlQy5WH6GLbFryLUrBD4932W/E2BSpfRJWseIL4v/KPgBFxDOIdKpQ==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.1",
+        "@protobufjs/inquire": "^1.1.0"
+      }
+    },
+    "node_modules/@protobufjs/float": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz",
+      "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/inquire": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/inquire/-/inquire-1.1.0.tgz",
+      "integrity": "sha512-kdSefcPdruJiFMVSbn801t4vFK7KB/5gd2fYvrxhuJYg8ILrmn9SKSX2tZdV6V+ksulWqS7aXjBcRXl3wHoD9Q==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/path": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz",
+      "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/pool": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz",
+      "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@protobufjs/utf8": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.0.tgz",
+      "integrity": "sha512-Vvn3zZrhQZkkBE8LSuW3em98c0FwgO4nxzv6OdSxPKJIEKY2bGbHn+mhGIPerzI4twdxaP8/0+06HBpwf345Lw==",
+      "dev": true,
+      "license": "BSD-3-Clause"
+    },
+    "node_modules/@redocly/ajv": {
+      "version": "8.11.2",
+      "resolved": "https://registry.npmjs.org/@redocly/ajv/-/ajv-8.11.2.tgz",
+      "integrity": "sha512-io1JpnwtIcvojV7QKDUSIuMN/ikdOUd1ReEnUnMKGfDVridQZ31J0MmIuqwuRjWDZfmvr+Q0MqCcfHM2gTivOg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "fast-deep-equal": "^3.1.1",
+        "json-schema-traverse": "^1.0.0",
+        "require-from-string": "^2.0.2",
+        "uri-js-replace": "^1.0.1"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/epoberezkin"
+      }
+    },
+    "node_modules/@redocly/cli": {
+      "version": "1.34.4",
+      "resolved": "https://registry.npmjs.org/@redocly/cli/-/cli-1.34.4.tgz",
+      "integrity": "sha512-seH/GgrjSB1EeOsgJ/4Ct6Jk2N7sh12POn/7G8UQFARMyUMJpe1oHtBwT2ndfp4EFCpgBAbZ/82Iw6dwczNxEA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@opentelemetry/api": "1.9.0",
+        "@opentelemetry/exporter-trace-otlp-http": "0.53.0",
+        "@opentelemetry/resources": "1.26.0",
+        "@opentelemetry/sdk-trace-node": "1.26.0",
+        "@opentelemetry/semantic-conventions": "1.27.0",
+        "@redocly/config": "^0.22.0",
+        "@redocly/openapi-core": "1.34.4",
+        "@redocly/respect-core": "1.34.4",
+        "abort-controller": "^3.0.0",
+        "chokidar": "^3.5.1",
+        "colorette": "^1.2.0",
+        "core-js": "^3.32.1",
+        "dotenv": "16.4.7",
+        "form-data": "^4.0.0",
+        "get-port-please": "^3.0.1",
+        "glob": "^7.1.6",
+        "handlebars": "^4.7.6",
+        "mobx": "^6.0.4",
+        "pluralize": "^8.0.0",
+        "react": "^17.0.0 || ^18.2.0 || ^19.0.0",
+        "react-dom": "^17.0.0 || ^18.2.0 || ^19.0.0",
+        "redoc": "2.5.0",
+        "semver": "^7.5.2",
+        "simple-websocket": "^9.0.0",
+        "styled-components": "^6.0.7",
+        "yargs": "17.0.1"
+      },
+      "bin": {
+        "openapi": "bin/cli.js",
+        "redocly": "bin/cli.js"
+      },
+      "engines": {
+        "node": ">=18.17.0",
+        "npm": ">=9.5.0"
+      }
+    },
+    "node_modules/@redocly/config": {
+      "version": "0.22.2",
+      "resolved": "https://registry.npmjs.org/@redocly/config/-/config-0.22.2.tgz",
+      "integrity": "sha512-roRDai8/zr2S9YfmzUfNhKjOF0NdcOIqF7bhf4MVC5UxpjIysDjyudvlAiVbpPHp3eDRWbdzUgtkK1a7YiDNyQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@redocly/openapi-core": {
+      "version": "1.34.4",
+      "resolved": "https://registry.npmjs.org/@redocly/openapi-core/-/openapi-core-1.34.4.tgz",
+      "integrity": "sha512-hf53xEgpXIgWl3b275PgZU3OTpYh1RoD2LHdIfQ1JzBNTWsiNKczTEsI/4Tmh2N1oq9YcphhSMyk3lDh85oDjg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@redocly/ajv": "^8.11.2",
+        "@redocly/config": "^0.22.0",
+        "colorette": "^1.2.0",
+        "https-proxy-agent": "^7.0.5",
+        "js-levenshtein": "^1.1.6",
+        "js-yaml": "^4.1.0",
+        "minimatch": "^5.0.1",
+        "pluralize": "^8.0.0",
+        "yaml-ast-parser": "0.0.43"
+      },
+      "engines": {
+        "node": ">=18.17.0",
+        "npm": ">=9.5.0"
+      }
+    },
+    "node_modules/@redocly/respect-core": {
+      "version": "1.34.4",
+      "resolved": "https://registry.npmjs.org/@redocly/respect-core/-/respect-core-1.34.4.tgz",
+      "integrity": "sha512-MitKyKyQpsizA4qCVv+MjXL4WltfhFQAoiKiAzrVR1Kusro3VhYb6yJuzoXjiJhR0ukLP5QOP19Vcs7qmj9dZg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@faker-js/faker": "^7.6.0",
+        "@redocly/ajv": "8.11.2",
+        "@redocly/openapi-core": "1.34.4",
+        "better-ajv-errors": "^1.2.0",
+        "colorette": "^2.0.20",
+        "concat-stream": "^2.0.0",
+        "cookie": "^0.7.2",
+        "dotenv": "16.4.7",
+        "form-data": "4.0.0",
+        "jest-diff": "^29.3.1",
+        "jest-matcher-utils": "^29.3.1",
+        "js-yaml": "4.1.0",
+        "json-pointer": "^0.6.2",
+        "jsonpath-plus": "^10.0.6",
+        "open": "^10.1.0",
+        "openapi-sampler": "^1.6.1",
+        "outdent": "^0.8.0",
+        "set-cookie-parser": "^2.3.5",
+        "undici": "^6.21.1"
+      },
+      "engines": {
+        "node": ">=18.17.0",
+        "npm": ">=9.5.0"
+      }
+    },
+    "node_modules/@redocly/respect-core/node_modules/colorette": {
+      "version": "2.0.20",
+      "resolved": "https://registry.npmjs.org/colorette/-/colorette-2.0.20.tgz",
+      "integrity": "sha512-IfEDxwoWIjkeXL1eXcDiow4UbKjhLdq6/EuSVR9GMN7KVH3r9gQ83e73hsz1Nd1T3ijd5xv1wcWRYO+D6kCI2w==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@redocly/respect-core/node_modules/form-data": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.0.tgz",
+      "integrity": "sha512-ETEklSGi5t0QMZuiXoA/Q6vcnxcLQP5vdugSpuAyi6SVGi2clPPp+xgEhuMaHC+zGgn31Kd235W35f7Hykkaww==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "asynckit": "^0.4.0",
+        "combined-stream": "^1.0.8",
+        "mime-types": "^2.1.12"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/@sinclair/typebox": {
+      "version": "0.27.8",
+      "resolved": "https://registry.npmjs.org/@sinclair/typebox/-/typebox-0.27.8.tgz",
+      "integrity": "sha512-+Fj43pSMwJs4KRrH/938Uf+uAELIgVBmQzg/q1YG10djyfA3TnrU8N8XzqCh/okZdszqBQTZf96idMfE5lnwTA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@sourcemeta/jsonschema": {
+      "version": "10.0.0",
+      "resolved": "https://registry.npmjs.org/@sourcemeta/jsonschema/-/jsonschema-10.0.0.tgz",
+      "integrity": "sha512-NyRjy3JxFrcDU9zci4fTe4dhoUZu61UNONgxJ13hmhaUAYF51gYvVEoWpDtl1ckikdboMuAm/QVeelh/+B8hGQ==",
+      "cpu": [
+        "x64",
+        "arm64"
+      ],
+      "dev": true,
+      "license": "AGPL-3.0",
+      "os": [
+        "darwin",
+        "linux",
+        "win32"
+      ],
+      "bin": {
+        "jsonschema": "cli.js"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sourcemeta"
+      }
+    },
+    "node_modules/@types/json-schema": {
+      "version": "7.0.15",
+      "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz",
+      "integrity": "sha512-5+fP8P8MFNC+AyZCDxrB2pkZFPGzqQWUzpSeuuVLvm8VMcorNYavBqoFcxK8bQz4Qsbn4oUEEem4wDLfcysGHA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@types/node": {
+      "version": "24.0.13",
+      "resolved": "https://registry.npmjs.org/@types/node/-/node-24.0.13.tgz",
+      "integrity": "sha512-Qm9OYVOFHFYg3wJoTSrz80hoec5Lia/dPp84do3X7dZvLikQvM1YpmvTBEdIr/e+U8HTkFjLHLnl78K/qjf+jQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "undici-types": "~7.8.0"
+      }
+    },
+    "node_modules/@types/stylis": {
+      "version": "4.2.5",
+      "resolved": "https://registry.npmjs.org/@types/stylis/-/stylis-4.2.5.tgz",
+      "integrity": "sha512-1Xve+NMN7FWjY14vLoY5tL3BVEQ/n42YLwaqJIPYhotZ9uBHt87VceMwWQpzmdEt2TNXIorIFG+YeCUUW7RInw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/@types/trusted-types": {
+      "version": "2.0.7",
+      "resolved": "https://registry.npmjs.org/@types/trusted-types/-/trusted-types-2.0.7.tgz",
+      "integrity": "sha512-ScaPdn1dQczgbl0QFTeTOmVHFULt394XJgOQNoyVhZ6r2vLnMLJfBPd53SB52T/3G36VI1/g2MZaX0cwDuXsfw==",
+      "dev": true,
+      "license": "MIT",
+      "optional": true
+    },
+    "node_modules/abort-controller": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/abort-controller/-/abort-controller-3.0.0.tgz",
+      "integrity": "sha512-h8lQ8tacZYnR3vNQTgibj+tODHI5/+l06Au2Pcriv/Gmet0eaj4TwWH41sO9wnHDiQsEj19q0drzdWdeAHtweg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "event-target-shim": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=6.5"
+      }
+    },
+    "node_modules/agent-base": {
+      "version": "7.1.4",
+      "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.4.tgz",
+      "integrity": "sha512-MnA+YT8fwfJPgBx3m60MNqakm30XOkyIoH1y6huTQvC0PwZG7ki8NacLBcrPbNoo8vEZy7Jpuk7+jMO+CUovTQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 14"
+      }
+    },
+    "node_modules/ajv": {
+      "version": "8.17.1",
+      "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.17.1.tgz",
+      "integrity": "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g==",
+      "dev": true,
+      "license": "MIT",
+      "peer": true,
+      "dependencies": {
+        "fast-deep-equal": "^3.1.3",
+        "fast-uri": "^3.0.1",
+        "json-schema-traverse": "^1.0.0",
+        "require-from-string": "^2.0.2"
+      },
+      "funding": {
+        "type": "github",
+        "url": "https://github.com/sponsors/epoberezkin"
+      }
+    },
+    "node_modules/ansi-regex": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
+      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/ansi-styles": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz",
+      "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "color-convert": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/anymatch": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/anymatch/-/anymatch-3.1.3.tgz",
+      "integrity": "sha512-KMReFUr0B4t+D+OBkjR3KYqvocp2XaSzO55UcB6mgQMd3KbcE+mWTyvVV7D/zsdEbNnV6acZUutkiHQXvTr1Rw==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "normalize-path": "^3.0.0",
+        "picomatch": "^2.0.4"
+      },
+      "engines": {
+        "node": ">= 8"
+      }
+    },
+    "node_modules/argparse": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
+      "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
+      "dev": true,
+      "license": "Python-2.0"
+    },
+    "node_modules/asynckit": {
+      "version": "0.4.0",
+      "resolved": "https://registry.npmjs.org/asynckit/-/asynckit-0.4.0.tgz",
+      "integrity": "sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/balanced-match": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz",
+      "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/better-ajv-errors": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/better-ajv-errors/-/better-ajv-errors-1.2.0.tgz",
+      "integrity": "sha512-UW+IsFycygIo7bclP9h5ugkNH8EjCSgqyFB/yQ4Hqqa1OEYDtb0uFIkYE0b6+CjkgJYVM5UKI/pJPxjYe9EZlA==",
+      "dev": true,
+      "license": "Apache-2.0",
+      "dependencies": {
+        "@babel/code-frame": "^7.16.0",
+        "@humanwhocodes/momoa": "^2.0.2",
+        "chalk": "^4.1.2",
+        "jsonpointer": "^5.0.0",
+        "leven": "^3.1.0 < 4"
+      },
+      "engines": {
+        "node": ">= 12.13.0"
+      },
+      "peerDependencies": {
+        "ajv": "4.11.8 - 8"
+      }
+    },
+    "node_modules/binary-extensions": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/binary-extensions/-/binary-extensions-2.3.0.tgz",
+      "integrity": "sha512-Ceh+7ox5qe7LJuLHoY0feh3pHuUDHAcRUeyL2VYghZwfpkNIy/+8Ocg0a3UuSoYzavmylwuLWQOf3hl0jjMMIw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/brace-expansion": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz",
+      "integrity": "sha512-Jt0vHyM+jmUBqojB7E1NIYadt0vI0Qxjxd2TErW94wDz+E2LAm5vKMXXwg6ZZBTHPuUlDgQHKXvjGBdfcF1ZDQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^1.0.0"
+      }
+    },
+    "node_modules/braces": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/braces/-/braces-3.0.3.tgz",
+      "integrity": "sha512-yQbXgO/OSZVD2IsiLlro+7Hf6Q18EJrKSEsdoMzKePKXct3gvD8oLcOQdIzGupr5Fj+EDe8gO/lxc1BzfMpxvA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "fill-range": "^7.1.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/buffer-from": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/buffer-from/-/buffer-from-1.1.2.tgz",
+      "integrity": "sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/bundle-name": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/bundle-name/-/bundle-name-4.1.0.tgz",
+      "integrity": "sha512-tjwM5exMg6BGRI+kNmTntNsvdZS1X8BFYS6tnJ2hdH0kVxM6/eVZ2xy+FqStSWvYmtfFMDLIxurorHwDKfDz5Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "run-applescript": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/call-bind-apply-helpers": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/call-bind-apply-helpers/-/call-bind-apply-helpers-1.0.2.tgz",
+      "integrity": "sha512-Sp1ablJ0ivDkSzjcaJdxEunN5/XvksFJ2sMBFfq6x0ryhQV/2b/KwFe21cMpmHtPOSij8K99/wSfoEuTObmuMQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/call-me-maybe": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/call-me-maybe/-/call-me-maybe-1.0.2.tgz",
+      "integrity": "sha512-HpX65o1Hnr9HH25ojC1YGs7HCQLq0GCOibSaWER0eNpgJ/Z1MZv2mTc7+xh6WOPxbRVcmgbv4hGU+uSQ/2xFZQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/camelize": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/camelize/-/camelize-1.0.1.tgz",
+      "integrity": "sha512-dU+Tx2fsypxTgtLoE36npi3UqcjSSMNYfkqgmoEhtZrraP5VWq0K7FkWVTYa8eMPtnU/G2txVsfdCJTn9uzpuQ==",
+      "dev": true,
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/chalk": {
+      "version": "4.1.2",
+      "resolved": "https://registry.npmjs.org/chalk/-/chalk-4.1.2.tgz",
+      "integrity": "sha512-oKnbhFyRIXpUuez8iBMmyEa4nbj4IOQyuhc/wy9kY7/WVPcwIO9VA668Pu8RkO7+0G76SLROeyw9CpQ061i4mA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^4.1.0",
+        "supports-color": "^7.1.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/chalk?sponsor=1"
+      }
+    },
+    "node_modules/chokidar": {
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/chokidar/-/chokidar-3.6.0.tgz",
+      "integrity": "sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "anymatch": "~3.1.2",
+        "braces": "~3.0.2",
+        "glob-parent": "~5.1.2",
+        "is-binary-path": "~2.1.0",
+        "is-glob": "~4.0.1",
+        "normalize-path": "~3.0.0",
+        "readdirp": "~3.6.0"
+      },
+      "engines": {
+        "node": ">= 8.10.0"
+      },
+      "funding": {
+        "url": "https://paulmillr.com/funding/"
+      },
+      "optionalDependencies": {
+        "fsevents": "~2.3.2"
+      }
+    },
+    "node_modules/classnames": {
+      "version": "2.5.1",
+      "resolved": "https://registry.npmjs.org/classnames/-/classnames-2.5.1.tgz",
+      "integrity": "sha512-saHYOzhIQs6wy2sVxTM6bUDsQO4F50V9RQ22qBpEdCW+I+/Wmke2HOl6lS6dTpdxVhb88/I6+Hs+438c3lfUow==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/cliui": {
+      "version": "7.0.4",
+      "resolved": "https://registry.npmjs.org/cliui/-/cliui-7.0.4.tgz",
+      "integrity": "sha512-OcRE68cOsVMXp1Yvonl/fzkQOyjLSu/8bhPDfQt0e0/Eb283TKP20Fs2MqoPsr9SwA595rRCA+QMzYc9nBP+JQ==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "string-width": "^4.2.0",
+        "strip-ansi": "^6.0.0",
+        "wrap-ansi": "^7.0.0"
+      }
+    },
+    "node_modules/clsx": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/clsx/-/clsx-2.1.1.tgz",
+      "integrity": "sha512-eYm0QWBtUrBWZWG0d386OGAw16Z995PiOVo2B7bjWSbHedGl5e0ZWaq65kOGgUSNesEIDkB9ISbTg/JK9dhCZA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/color-convert": {
+      "version": "2.0.1",
+      "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz",
+      "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "color-name": "~1.1.4"
+      },
+      "engines": {
+        "node": ">=7.0.0"
+      }
+    },
+    "node_modules/color-name": {
+      "version": "1.1.4",
+      "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz",
+      "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/colorette": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/colorette/-/colorette-1.4.0.tgz",
+      "integrity": "sha512-Y2oEozpomLn7Q3HFP7dpww7AtMJplbM9lGZP6RDfHqmbeRjiwRg4n6VM6j4KLmRke85uWEI7JqF17f3pqdRA0g==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/combined-stream": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/combined-stream/-/combined-stream-1.0.8.tgz",
+      "integrity": "sha512-FQN4MRfuJeHf7cBbBMJFXhKSDq+2kAArBlmRBvcvFE5BB1HZKXtSFASDhdlz9zOYwxh8lDdnvmMOe/+5cdoEdg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "delayed-stream": "~1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.8"
+      }
+    },
+    "node_modules/concat-map": {
+      "version": "0.0.1",
+      "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz",
+      "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/concat-stream": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/concat-stream/-/concat-stream-2.0.0.tgz",
+      "integrity": "sha512-MWufYdFw53ccGjCA+Ol7XJYpAlW6/prSMzuPOTRnJGcGzuhLn4Scrz7qf6o8bROZ514ltazcIFJZevcfbo0x7A==",
+      "dev": true,
+      "engines": [
+        "node >= 6.0"
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "buffer-from": "^1.0.0",
+        "inherits": "^2.0.3",
+        "readable-stream": "^3.0.2",
+        "typedarray": "^0.0.6"
+      }
+    },
+    "node_modules/cookie": {
+      "version": "0.7.2",
+      "resolved": "https://registry.npmjs.org/cookie/-/cookie-0.7.2.tgz",
+      "integrity": "sha512-yki5XnKuf750l50uGTllt6kKILY4nQ1eNIQatoXEByZ5dWgnKqbnqmTrBE5B4N7lrMJKQ2ytWMiTO2o0v6Ew/w==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/core-js": {
+      "version": "3.44.0",
+      "resolved": "https://registry.npmjs.org/core-js/-/core-js-3.44.0.tgz",
+      "integrity": "sha512-aFCtd4l6GvAXwVEh3XbbVqJGHDJt0OZRa+5ePGx3LLwi12WfexqQxcsohb2wgsa/92xtl19Hd66G/L+TaAxDMw==",
+      "dev": true,
+      "hasInstallScript": true,
+      "license": "MIT",
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/core-js"
+      }
+    },
+    "node_modules/css-color-keywords": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/css-color-keywords/-/css-color-keywords-1.0.0.tgz",
+      "integrity": "sha512-FyyrDHZKEjXDpNJYvVsV960FiqQyXc/LlYmsxl2BcdMb2WPx0OGRVgTg55rPSyLSNMqP52R9r8geSp7apN3Ofg==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/css-to-react-native": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/css-to-react-native/-/css-to-react-native-3.2.0.tgz",
+      "integrity": "sha512-e8RKaLXMOFii+02mOlqwjbD00KSEKqblnpO9e++1aXS1fPQOpS1YoqdVHBqPjHNoxeF2mimzVqawm2KCbEdtHQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "camelize": "^1.0.0",
+        "css-color-keywords": "^1.0.0",
+        "postcss-value-parser": "^4.0.2"
+      }
+    },
+    "node_modules/csstype": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/csstype/-/csstype-3.1.3.tgz",
+      "integrity": "sha512-M1uQkMl8rQK/szD0LNhtqxIPLpimGm8sOBwU7lLnCpSbTyY3yeU1Vc7l4KT5zT4s/yOxHH5O7tIuuLOCnLADRw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/debug": {
+      "version": "4.4.1",
+      "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz",
+      "integrity": "sha512-KcKCqiftBJcZr++7ykoDIEwSa3XWowTfNPo92BYxjXiyYEVrUQh2aLyhxBCwww+heortUFxEJYcRzosstTEBYQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ms": "^2.1.3"
+      },
+      "engines": {
+        "node": ">=6.0"
+      },
+      "peerDependenciesMeta": {
+        "supports-color": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/decko": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/decko/-/decko-1.2.0.tgz",
+      "integrity": "sha512-m8FnyHXV1QX+S1cl+KPFDIl6NMkxtKsy6+U/aYyjrOqWMuwAwYWu7ePqrsUHtDR5Y8Yk2pi/KIDSgF+vT4cPOQ==",
+      "dev": true
+    },
+    "node_modules/default-browser": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/default-browser/-/default-browser-5.2.1.tgz",
+      "integrity": "sha512-WY/3TUME0x3KPYdRRxEJJvXRHV4PyPoUsxtZa78lwItwRQRHhd2U9xOscaT/YTf8uCXIAjeJOFBVEh/7FtD8Xg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "bundle-name": "^4.1.0",
+        "default-browser-id": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/default-browser-id": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/default-browser-id/-/default-browser-id-5.0.0.tgz",
+      "integrity": "sha512-A6p/pu/6fyBcA1TRz/GqWYPViplrftcW2gZC9q79ngNCKAeR/X3gcEdXQHl4KNXV+3wgIJ1CPkJQ3IHM6lcsyA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/define-lazy-prop": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/define-lazy-prop/-/define-lazy-prop-3.0.0.tgz",
+      "integrity": "sha512-N+MeXYoqr3pOgn8xfyRPREN7gHakLYjhsHhWGT3fWAiL4IkAt0iDw14QiiEm2bE30c5XX5q0FtAA3CK5f9/BUg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/delayed-stream": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz",
+      "integrity": "sha512-ZySD7Nf91aLB0RxL4KGrKHBXl7Eds1DAmEdcoVawXnLD7SDhpNgtuII2aAkg7a7QS41jxPSZ17p4VdGnMHk3MQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.4.0"
+      }
+    },
+    "node_modules/diff-sequences": {
+      "version": "29.6.3",
+      "resolved": "https://registry.npmjs.org/diff-sequences/-/diff-sequences-29.6.3.tgz",
+      "integrity": "sha512-EjePK1srD3P08o2j4f0ExnylqRs5B9tJjcp9t1krH2qRi8CCdsYfwe9JgSLurFBWwq4uOlipzfk5fHNvwFKr8Q==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
+      }
+    },
+    "node_modules/dompurify": {
+      "version": "3.2.6",
+      "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-3.2.6.tgz",
+      "integrity": "sha512-/2GogDQlohXPZe6D6NOgQvXLPSYBqIWMnZ8zzOhn09REE4eyAzb+Hed3jhoM9OkuaJ8P6ZGTTVWQKAi8ieIzfQ==",
+      "dev": true,
+      "license": "(MPL-2.0 OR Apache-2.0)",
+      "optionalDependencies": {
+        "@types/trusted-types": "^2.0.7"
+      }
+    },
+    "node_modules/dotenv": {
+      "version": "16.4.7",
+      "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz",
+      "integrity": "sha512-47qPchRCykZC03FhkYAhrvwU4xDBFIj1QPqaarj6mdM/hgUzfPHcpkHJOn3mJAufFeeAxAzeGsr5X0M4k6fLZQ==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "engines": {
+        "node": ">=12"
+      },
+      "funding": {
+        "url": "https://dotenvx.com"
+      }
+    },
+    "node_modules/dunder-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz",
+      "integrity": "sha512-KIN/nDJBQRcXw0MLVhZE9iQHmG68qAVIBg9CqmUYjmQIhgij9U5MFvrqkUL5FbtyyzZuOeOt0zdeRe4UY7ct+A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "gopd": "^1.2.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/emoji-regex": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz",
+      "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/es-define-property": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/es-define-property/-/es-define-property-1.0.1.tgz",
+      "integrity": "sha512-e3nRfgfUZ4rNGL232gUgX06QNyyez04KdjFrF+LTRoOXmrOgFKDg4BCdsjW8EnT69eqdYGmRpJwiPVYNrCaW3g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-errors": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/es-errors/-/es-errors-1.3.0.tgz",
+      "integrity": "sha512-Zf5H2Kxt2xjTvbJvP2ZWLEICxA6j+hAmMzIlypy4xcBg1vKVnx89Wy0GbS+kf5cwCVFFzdCFh2XSCFNULS6csw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-object-atoms": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/es-object-atoms/-/es-object-atoms-1.1.1.tgz",
+      "integrity": "sha512-FGgH2h8zKNim9ljj7dankFPcICIK9Cp5bm+c2gQSYePhpaG5+esrLODihIorn+Pe6FGJzWhXQotPv73jTaldXA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es-set-tostringtag": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/es-set-tostringtag/-/es-set-tostringtag-2.1.0.tgz",
+      "integrity": "sha512-j6vWzfrGVfyXxge+O0x5sh6cvxAog0a/4Rdd2K36zCMV5eJ+/+tOAngRO8cODMNWbVRdVlmGZQL2YS3yR8bIUA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "es-errors": "^1.3.0",
+        "get-intrinsic": "^1.2.6",
+        "has-tostringtag": "^1.0.2",
+        "hasown": "^2.0.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/es6-promise": {
+      "version": "3.3.1",
+      "resolved": "https://registry.npmjs.org/es6-promise/-/es6-promise-3.3.1.tgz",
+      "integrity": "sha512-SOp9Phqvqn7jtEUxPWdWfWoLmyt2VaJ6MpvP9Comy1MceMXqE6bxvaTu4iaxpYYPzhny28Lc+M87/c2cPK6lDg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/escalade": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz",
+      "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/event-target-shim": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/event-target-shim/-/event-target-shim-5.0.1.tgz",
+      "integrity": "sha512-i/2XbnSz/uxRCU6+NdVJgKWDTM427+MqYbkQzD321DuCQJUqOuJKIA0IM2+W2xtYHdKOmZ4dR6fExsd4SXL+WQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/eventemitter3": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/eventemitter3/-/eventemitter3-5.0.1.tgz",
+      "integrity": "sha512-GWkBvjiSZK87ELrYOSESUYeVIc9mvLLf/nXalMOS5dYrgZq9o5OVkbZAVM06CVxYsCwH9BDZFPlQTlPA1j4ahA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/fast-deep-equal": {
+      "version": "3.1.3",
+      "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz",
+      "integrity": "sha512-f3qQ9oQy9j2AhBe/H9VC91wLmKBCCU/gDOnKNAYG5hswO7BLKj09Hc5HYNz9cGI++xlpDCIgDaitVs03ATR84Q==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/fast-safe-stringify": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/fast-safe-stringify/-/fast-safe-stringify-2.1.1.tgz",
+      "integrity": "sha512-W+KJc2dmILlPplD/H4K9l9LcAHAfPtP6BY84uVLXQ6Evcz9Lcg33Y2z1IVblT6xdY54PXYVHEv+0Wpq8Io6zkA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/fast-uri": {
+      "version": "3.0.6",
+      "resolved": "https://registry.npmjs.org/fast-uri/-/fast-uri-3.0.6.tgz",
+      "integrity": "sha512-Atfo14OibSv5wAp4VWNsFYE1AchQRTv9cBGWET4pZWHzYshFSS9NQI6I57rdKn9croWVMbYFbLhJ+yJvmZIIHw==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/fastify"
+        },
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/fastify"
+        }
+      ],
+      "license": "BSD-3-Clause",
+      "peer": true
+    },
+    "node_modules/fast-xml-parser": {
+      "version": "4.5.3",
+      "resolved": "https://registry.npmjs.org/fast-xml-parser/-/fast-xml-parser-4.5.3.tgz",
+      "integrity": "sha512-RKihhV+SHsIUGXObeVy9AXiBbFwkVk7Syp8XgwN5U3JV416+Gwp/GO9i0JYKmikykgz/UHRrrV4ROuZEo/T0ig==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "strnum": "^1.1.1"
+      },
+      "bin": {
+        "fxparser": "src/cli/cli.js"
+      }
+    },
+    "node_modules/fill-range": {
+      "version": "7.1.1",
+      "resolved": "https://registry.npmjs.org/fill-range/-/fill-range-7.1.1.tgz",
+      "integrity": "sha512-YsGpe3WHLK8ZYi4tWDg2Jy3ebRz2rXowDxnld4bkQB00cc/1Zw9AWnC0i9ztDJitivtQvaI9KaLyKrc+hBW0yg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "to-regex-range": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/foreach": {
+      "version": "2.0.6",
+      "resolved": "https://registry.npmjs.org/foreach/-/foreach-2.0.6.tgz",
+      "integrity": "sha512-k6GAGDyqLe9JaebCsFCoudPPWfihKu8pylYXRlqP1J7ms39iPoTtk2fviNglIeQEwdh0bQeKJ01ZPyuyQvKzwg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/form-data": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/form-data/-/form-data-4.0.3.tgz",
+      "integrity": "sha512-qsITQPfmvMOSAdeyZ+12I1c+CKSstAFAwu+97zrnWAbIr5u8wfsExUzCesVLC8NgHuRUqNN4Zy6UPWUTRGslcA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "asynckit": "^0.4.0",
+        "combined-stream": "^1.0.8",
+        "es-set-tostringtag": "^2.1.0",
+        "hasown": "^2.0.2",
+        "mime-types": "^2.1.12"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/fs.realpath": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz",
+      "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==",
+      "dev": true,
+      "license": "ISC"
+    },
+    "node_modules/fsevents": {
+      "version": "2.3.3",
+      "resolved": "https://registry.npmjs.org/fsevents/-/fsevents-2.3.3.tgz",
+      "integrity": "sha512-5xoDfX+fL7faATnagmWPpbFtwh/R77WmMMqqHGS65C3vvB0YHrgF+B1YmZ3441tMj5n63k0212XNoJwzlhffQw==",
+      "dev": true,
+      "hasInstallScript": true,
+      "license": "MIT",
+      "optional": true,
+      "os": [
+        "darwin"
+      ],
+      "engines": {
+        "node": "^8.16.0 || ^10.6.0 || >=11.0.0"
+      }
+    },
+    "node_modules/function-bind": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/function-bind/-/function-bind-1.1.2.tgz",
+      "integrity": "sha512-7XHNxH7qX9xG5mIwxkhumTox/MIRNcOgDrxWsMt2pAr23WHp6MrRlN7FBSFpCpr+oVO0F744iUgR82nJMfG2SA==",
+      "dev": true,
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/get-caller-file": {
+      "version": "2.0.5",
+      "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz",
+      "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": "6.* || 8.* || >= 10.*"
+      }
+    },
+    "node_modules/get-intrinsic": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/get-intrinsic/-/get-intrinsic-1.3.0.tgz",
+      "integrity": "sha512-9fSjSaos/fRIVIp+xSJlE6lfwhES7LNtKaCBIamHsjr2na1BiABJPo0mOjjz8GJDURarmCPGqaiVg5mfjb98CQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "call-bind-apply-helpers": "^1.0.2",
+        "es-define-property": "^1.0.1",
+        "es-errors": "^1.3.0",
+        "es-object-atoms": "^1.1.1",
+        "function-bind": "^1.1.2",
+        "get-proto": "^1.0.1",
+        "gopd": "^1.2.0",
+        "has-symbols": "^1.1.0",
+        "hasown": "^2.0.2",
+        "math-intrinsics": "^1.1.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/get-port-please": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/get-port-please/-/get-port-please-3.2.0.tgz",
+      "integrity": "sha512-I9QVvBw5U/hw3RmWpYKRumUeaDgxTPd401x364rLmWBJcOQ753eov1eTgzDqRG9bqFIfDc7gfzcQEWrUri3o1A==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/get-proto": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/get-proto/-/get-proto-1.0.1.tgz",
+      "integrity": "sha512-sTSfBjoXBp89JvIKIefqw7U2CCebsc74kiY6awiGogKtoSGbgjYE/G/+l9sF3MWFPNc9IcoOC4ODfKHfxFmp0g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "dunder-proto": "^1.0.1",
+        "es-object-atoms": "^1.0.0"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/glob": {
+      "version": "7.2.3",
+      "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz",
+      "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==",
+      "deprecated": "Glob versions prior to v9 are no longer supported",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "fs.realpath": "^1.0.0",
+        "inflight": "^1.0.4",
+        "inherits": "2",
+        "minimatch": "^3.1.1",
+        "once": "^1.3.0",
+        "path-is-absolute": "^1.0.0"
+      },
+      "engines": {
+        "node": "*"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/isaacs"
+      }
+    },
+    "node_modules/glob-parent": {
+      "version": "5.1.2",
+      "resolved": "https://registry.npmjs.org/glob-parent/-/glob-parent-5.1.2.tgz",
+      "integrity": "sha512-AOIgSQCepiJYwP3ARnGx+5VnTu2HBYdzbGP45eLw1vr3zB3vZLeyed1sC9hnbcOc9/SrMyM5RPQrkGz4aS9Zow==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "is-glob": "^4.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/glob/node_modules/brace-expansion": {
+      "version": "1.1.12",
+      "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz",
+      "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "balanced-match": "^1.0.0",
+        "concat-map": "0.0.1"
+      }
+    },
+    "node_modules/glob/node_modules/minimatch": {
+      "version": "3.1.2",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz",
+      "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "brace-expansion": "^1.1.7"
+      },
+      "engines": {
+        "node": "*"
+      }
+    },
+    "node_modules/gopd": {
+      "version": "1.2.0",
+      "resolved": "https://registry.npmjs.org/gopd/-/gopd-1.2.0.tgz",
+      "integrity": "sha512-ZUKRh6/kUFoAiTAtTYPZJ3hw9wNxx+BIBOijnlG9PnrJsCcSjs1wyyD6vJpaYtgnzDrKYRSqf3OO6Rfa93xsRg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/handlebars": {
+      "version": "4.7.8",
+      "resolved": "https://registry.npmjs.org/handlebars/-/handlebars-4.7.8.tgz",
+      "integrity": "sha512-vafaFqs8MZkRrSX7sFVUdo3ap/eNiLnb4IakshzvP56X5Nr1iGKAIqdX6tMlm6HcNRIkr6AxO5jFEoJzzpT8aQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "minimist": "^1.2.5",
+        "neo-async": "^2.6.2",
+        "source-map": "^0.6.1",
+        "wordwrap": "^1.0.0"
+      },
+      "bin": {
+        "handlebars": "bin/handlebars"
+      },
+      "engines": {
+        "node": ">=0.4.7"
+      },
+      "optionalDependencies": {
+        "uglify-js": "^3.1.4"
+      }
+    },
+    "node_modules/has-flag": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/has-flag/-/has-flag-4.0.0.tgz",
+      "integrity": "sha512-EykJT/Q1KjTWctppgIAgfSO0tKVuZUjhgMr17kqTumMl6Afv3EISleU7qZUzoXDFTAHTDC4NOoG/ZxU3EvlMPQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/has-symbols": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/has-symbols/-/has-symbols-1.1.0.tgz",
+      "integrity": "sha512-1cDNdwJ2Jaohmb3sg4OmKaMBwuC48sYni5HUw2DvsC8LjGTLK9h+eb1X6RyuOHe4hT0ULCW68iomhjUoKUqlPQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/has-tostringtag": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/has-tostringtag/-/has-tostringtag-1.0.2.tgz",
+      "integrity": "sha512-NqADB8VjPFLM2V0VvHUewwwsw0ZWBaIdgo+ieHtK3hasLz4qeCRjYcqfB6AQrBggRKppKF8L52/VqdVsO47Dlw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "has-symbols": "^1.0.3"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/hasown": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/hasown/-/hasown-2.0.2.tgz",
+      "integrity": "sha512-0hJU9SCPvmMzIBdZFqNPXWa6dqh7WdH0cII9y+CyS8rG3nL48Bclra9HmKhVVUHyPWNH5Y7xDwAB7bfgSjkUMQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "function-bind": "^1.1.2"
+      },
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/http2-client": {
+      "version": "1.3.5",
+      "resolved": "https://registry.npmjs.org/http2-client/-/http2-client-1.3.5.tgz",
+      "integrity": "sha512-EC2utToWl4RKfs5zd36Mxq7nzHHBuomZboI0yYL6Y0RmBgT7Sgkq4rQ0ezFTYoIsSs7Tm9SJe+o2FcAg6GBhGA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/https-proxy-agent": {
+      "version": "7.0.6",
+      "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz",
+      "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "agent-base": "^7.1.2",
+        "debug": "4"
+      },
+      "engines": {
+        "node": ">= 14"
+      }
+    },
+    "node_modules/inflight": {
+      "version": "1.0.6",
+      "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz",
+      "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==",
+      "deprecated": "This module is not supported, and leaks memory. Do not use it. Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "once": "^1.3.0",
+        "wrappy": "1"
+      }
+    },
+    "node_modules/inherits": {
+      "version": "2.0.4",
+      "resolved": "https://registry.npmjs.org/inherits/-/inherits-2.0.4.tgz",
+      "integrity": "sha512-k/vGaX4/Yla3WzyMCvTQOXYeIHvqOKtnqBduzTHpzpQZzAskKMhZ2K+EnBiSM9zGSoIFeMpXKxa4dYeZIQqewQ==",
+      "dev": true,
+      "license": "ISC"
+    },
+    "node_modules/is-binary-path": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/is-binary-path/-/is-binary-path-2.1.0.tgz",
+      "integrity": "sha512-ZMERYes6pDydyuGidse7OsHxtbI7WVeUEozgR/g7rd0xUimYNlvZRE/K2MgZTjWy725IfelLeVcEM97mmtRGXw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "binary-extensions": "^2.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/is-docker": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-docker/-/is-docker-3.0.0.tgz",
+      "integrity": "sha512-eljcgEDlEns/7AXFosB5K/2nCM4P7FQPkGc/DWLy5rmFEWvZayGrik1d9/QIY5nJ4f9YsVvBkA6kJpHn9rISdQ==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "is-docker": "cli.js"
+      },
+      "engines": {
+        "node": "^12.20.0 || ^14.13.1 || >=16.0.0"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/is-extglob": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz",
+      "integrity": "sha512-SbKbANkN603Vi4jEZv49LeVJMn4yGwsbzZworEoyEiutsN3nJYdbO36zfhGJ6QEDpOZIFkDtnq5JRxmvl3jsoQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/is-fullwidth-code-point": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz",
+      "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/is-glob": {
+      "version": "4.0.3",
+      "resolved": "https://registry.npmjs.org/is-glob/-/is-glob-4.0.3.tgz",
+      "integrity": "sha512-xelSayHH36ZgE7ZWhli7pW34hNbNl8Ojv5KVmkJD4hBdD3th8Tfk9vYasLM+mXWOZhFkgZfxhLSnrwRr4elSSg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "is-extglob": "^2.1.1"
+      },
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/is-inside-container": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/is-inside-container/-/is-inside-container-1.0.0.tgz",
+      "integrity": "sha512-KIYLCCJghfHZxqjYBE7rEy0OBuTd5xCHS7tHVgvCLkx7StIoaxwNW3hCALgEUjFfeRk+MG/Qxmp/vtETEF3tRA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "is-docker": "^3.0.0"
+      },
+      "bin": {
+        "is-inside-container": "cli.js"
+      },
+      "engines": {
+        "node": ">=14.16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/is-number": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/is-number/-/is-number-7.0.0.tgz",
+      "integrity": "sha512-41Cifkg6e8TylSpdtTpeLVMqvSBEVzTttHvERD741+pnZ8ANv0004MRL43QKPDlK9cGvNp6NZWZUBlbGXYxxng==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.12.0"
+      }
+    },
+    "node_modules/is-wsl": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/is-wsl/-/is-wsl-3.1.0.tgz",
+      "integrity": "sha512-UcVfVfaK4Sc4m7X3dUSoHoozQGBEFeDC+zVo06t98xe8CzHSZZBekNXH+tu0NalHolcJ/QAGqS46Hef7QXBIMw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "is-inside-container": "^1.0.0"
+      },
+      "engines": {
+        "node": ">=16"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/jest-diff": {
+      "version": "29.7.0",
+      "resolved": "https://registry.npmjs.org/jest-diff/-/jest-diff-29.7.0.tgz",
+      "integrity": "sha512-LMIgiIrhigmPrs03JHpxUh2yISK3vLFPkAodPeo0+BuF7wA2FoQbkEg1u8gBYBThncu7e1oEDUfIXVuTqLRUjw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "chalk": "^4.0.0",
+        "diff-sequences": "^29.6.3",
+        "jest-get-type": "^29.6.3",
+        "pretty-format": "^29.7.0"
+      },
+      "engines": {
+        "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
+      }
+    },
+    "node_modules/jest-get-type": {
+      "version": "29.6.3",
+      "resolved": "https://registry.npmjs.org/jest-get-type/-/jest-get-type-29.6.3.tgz",
+      "integrity": "sha512-zrteXnqYxfQh7l5FHyL38jL39di8H8rHoecLH3JNxH3BwOrBsNeabdap5e0I23lD4HHI8W5VFBZqG4Eaq5LNcw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
+      }
+    },
+    "node_modules/jest-matcher-utils": {
+      "version": "29.7.0",
+      "resolved": "https://registry.npmjs.org/jest-matcher-utils/-/jest-matcher-utils-29.7.0.tgz",
+      "integrity": "sha512-sBkD+Xi9DtcChsI3L3u0+N0opgPYnCRPtGcQYrgXmR+hmt/fYfWAL0xRXYU8eWOdfuLgBe0YCW3AFtnRLagq/g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "chalk": "^4.0.0",
+        "jest-diff": "^29.7.0",
+        "jest-get-type": "^29.6.3",
+        "pretty-format": "^29.7.0"
+      },
+      "engines": {
+        "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
+      }
+    },
+    "node_modules/js-levenshtein": {
+      "version": "1.1.6",
+      "resolved": "https://registry.npmjs.org/js-levenshtein/-/js-levenshtein-1.1.6.tgz",
+      "integrity": "sha512-X2BB11YZtrRqY4EnQcLX5Rh373zbK4alC1FW7D7MBhL2gtcC17cTnr6DmfHZeS0s2rTHjUTMMHfG7gO8SSdw+g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/js-tokens": {
+      "version": "4.0.0",
+      "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz",
+      "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/js-yaml": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.0.tgz",
+      "integrity": "sha512-wpxZs9NoxZaJESJGIZTyDEaYpl0FKSA+FB9aJiyemKhMwkxQg63h4T1KJgUGHpTqPDNRcmmYLugrRjJlBtWvRA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "argparse": "^2.0.1"
+      },
+      "bin": {
+        "js-yaml": "bin/js-yaml.js"
+      }
+    },
+    "node_modules/jsep": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/jsep/-/jsep-1.4.0.tgz",
+      "integrity": "sha512-B7qPcEVE3NVkmSJbaYxvv4cHkVW7DQsZz13pUMrfS8z8Q/BuShN+gcTXrUlPiGqM2/t/EEaI030bpxMqY8gMlw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 10.16.0"
+      }
+    },
+    "node_modules/json-pointer": {
+      "version": "0.6.2",
+      "resolved": "https://registry.npmjs.org/json-pointer/-/json-pointer-0.6.2.tgz",
+      "integrity": "sha512-vLWcKbOaXlO+jvRy4qNd+TI1QUPZzfJj1tpJ3vAXDych5XJf93ftpUKe5pKCrzyIIwgBJcOcCVRUfqQP25afBw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "foreach": "^2.0.4"
+      }
+    },
+    "node_modules/json-schema-traverse": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz",
+      "integrity": "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/jsonpath-plus": {
+      "version": "10.3.0",
+      "resolved": "https://registry.npmjs.org/jsonpath-plus/-/jsonpath-plus-10.3.0.tgz",
+      "integrity": "sha512-8TNmfeTCk2Le33A3vRRwtuworG/L5RrgMvdjhKZxvyShO+mBu2fP50OWUjRLNtvw344DdDarFh9buFAZs5ujeA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@jsep-plugin/assignment": "^1.3.0",
+        "@jsep-plugin/regex": "^1.0.4",
+        "jsep": "^1.4.0"
+      },
+      "bin": {
+        "jsonpath": "bin/jsonpath-cli.js",
+        "jsonpath-plus": "bin/jsonpath-cli.js"
+      },
+      "engines": {
+        "node": ">=18.0.0"
+      }
+    },
+    "node_modules/jsonpointer": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/jsonpointer/-/jsonpointer-5.0.1.tgz",
+      "integrity": "sha512-p/nXbhSEcu3pZRdkW1OfJhpsVtW1gd4Wa1fnQc9YLiTfAjn0312eMKimbdIQzuZl9aa9xUGaRlP9T/CJE/ditQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/leven": {
+      "version": "3.1.0",
+      "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz",
+      "integrity": "sha512-qsda+H8jTaUaN/x5vzW2rzc+8Rw4TAQ/4KjB46IwK5VH+IlVeeeje/EoZRpiXvIqjFgK84QffqPztGI3VBLG1A==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/long": {
+      "version": "5.3.2",
+      "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz",
+      "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==",
+      "dev": true,
+      "license": "Apache-2.0"
+    },
+    "node_modules/loose-envify": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz",
+      "integrity": "sha512-lyuxPGr/Wfhrlem2CL/UcnUc1zcqKAImBDzukY7Y5F/yQiNdko6+fRLevlw1HgMySw7f611UIY408EtxRSoK3Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "js-tokens": "^3.0.0 || ^4.0.0"
+      },
+      "bin": {
+        "loose-envify": "cli.js"
+      }
+    },
+    "node_modules/lunr": {
+      "version": "2.3.9",
+      "resolved": "https://registry.npmjs.org/lunr/-/lunr-2.3.9.tgz",
+      "integrity": "sha512-zTU3DaZaF3Rt9rhN3uBMGQD3dD2/vFQqnvZCDv4dl5iOzq2IZQqTxu90r4E5J+nP70J3ilqVCrbho2eWaeW8Ow==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/mark.js": {
+      "version": "8.11.1",
+      "resolved": "https://registry.npmjs.org/mark.js/-/mark.js-8.11.1.tgz",
+      "integrity": "sha512-1I+1qpDt4idfgLQG+BNWmrqku+7/2bi5nLf4YwF8y8zXvmfiTBY3PV3ZibfrjBueCByROpuBjLLFCajqkgYoLQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/marked": {
+      "version": "4.3.0",
+      "resolved": "https://registry.npmjs.org/marked/-/marked-4.3.0.tgz",
+      "integrity": "sha512-PRsaiG84bK+AMvxziE/lCFss8juXjNaWzVbN5tXAm4XjeaS9NAHhop+PjQxz2A9h8Q4M/xGmzP8vqNwy6JeK0A==",
+      "dev": true,
+      "license": "MIT",
+      "bin": {
+        "marked": "bin/marked.js"
+      },
+      "engines": {
+        "node": ">= 12"
+      }
+    },
+    "node_modules/math-intrinsics": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/math-intrinsics/-/math-intrinsics-1.1.0.tgz",
+      "integrity": "sha512-/IXtbwEk5HTPyEwyKX6hGkYXxM9nbj64B+ilVJnC/R6B0pH5G4V3b0pVbL7DBj4tkhBAppbQUlf6F6Xl9LHu1g==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.4"
+      }
+    },
+    "node_modules/mime-db": {
+      "version": "1.52.0",
+      "resolved": "https://registry.npmjs.org/mime-db/-/mime-db-1.52.0.tgz",
+      "integrity": "sha512-sPU4uV7dYlvtWJxwwxHD0PuihVNiE7TyAbQ5SWxDCB9mUYvOgroQOwYQQOKPJ8CIbE+1ETVlOoK1UC2nU3gYvg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/mime-types": {
+      "version": "2.1.35",
+      "resolved": "https://registry.npmjs.org/mime-types/-/mime-types-2.1.35.tgz",
+      "integrity": "sha512-ZDY+bPm5zTTF+YpCrAU9nK0UgICYPT0QtT1NZWFv4s++TNkcgVaT0g6+4R2uI4MjQjzysHB1zxuWL50hzaeXiw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "mime-db": "1.52.0"
+      },
+      "engines": {
+        "node": ">= 0.6"
+      }
+    },
+    "node_modules/minimatch": {
+      "version": "5.1.6",
+      "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-5.1.6.tgz",
+      "integrity": "sha512-lKwV/1brpG6mBUFHtb7NUmtABCb2WZZmm2wNiOA5hAb8VdCS4B3dtMWyvcoViccwAW/COERjXLt0zP1zXUN26g==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "brace-expansion": "^2.0.1"
+      },
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/minimist": {
+      "version": "1.2.8",
+      "resolved": "https://registry.npmjs.org/minimist/-/minimist-1.2.8.tgz",
+      "integrity": "sha512-2yyAR8qBkN3YuheJanUpWC5U3bb5osDywNB8RzDVlDwDHbocAJveqqj1u8+SVD7jkWT4yvsHCpWqqWqAxb0zCA==",
+      "dev": true,
+      "license": "MIT",
+      "funding": {
+        "url": "https://github.com/sponsors/ljharb"
+      }
+    },
+    "node_modules/mobx": {
+      "version": "6.13.7",
+      "resolved": "https://registry.npmjs.org/mobx/-/mobx-6.13.7.tgz",
+      "integrity": "sha512-aChaVU/DO5aRPmk1GX8L+whocagUUpBQqoPtJk+cm7UOXUk87J4PeWCh6nNmTTIfEhiR9DI/+FnA8dln/hTK7g==",
+      "dev": true,
+      "license": "MIT",
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/mobx"
+      }
+    },
+    "node_modules/mobx-react": {
+      "version": "9.2.0",
+      "resolved": "https://registry.npmjs.org/mobx-react/-/mobx-react-9.2.0.tgz",
+      "integrity": "sha512-dkGWCx+S0/1mfiuFfHRH8D9cplmwhxOV5CkXMp38u6rQGG2Pv3FWYztS0M7ncR6TyPRQKaTG/pnitInoYE9Vrw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "mobx-react-lite": "^4.1.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/mobx"
+      },
+      "peerDependencies": {
+        "mobx": "^6.9.0",
+        "react": "^16.8.0 || ^17 || ^18 || ^19"
+      },
+      "peerDependenciesMeta": {
+        "react-dom": {
+          "optional": true
+        },
+        "react-native": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/mobx-react-lite": {
+      "version": "4.1.0",
+      "resolved": "https://registry.npmjs.org/mobx-react-lite/-/mobx-react-lite-4.1.0.tgz",
+      "integrity": "sha512-QEP10dpHHBeQNv1pks3WnHRCem2Zp636lq54M2nKO2Sarr13pL4u6diQXf65yzXUn0mkk18SyIDCm9UOJYTi1w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "use-sync-external-store": "^1.4.0"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/mobx"
+      },
+      "peerDependencies": {
+        "mobx": "^6.9.0",
+        "react": "^16.8.0 || ^17 || ^18 || ^19"
+      },
+      "peerDependenciesMeta": {
+        "react-dom": {
+          "optional": true
+        },
+        "react-native": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/ms": {
+      "version": "2.1.3",
+      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.3.tgz",
+      "integrity": "sha512-6FlzubTLZG3J2a/NVCAleEhjzq5oxgHyaCU9yYXvcLsvoVaHJq/s5xXI6/XXP6tz7R9xAOtHnSO/tXtF3WRTlA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/nanoid": {
+      "version": "3.3.11",
+      "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz",
+      "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "bin": {
+        "nanoid": "bin/nanoid.cjs"
+      },
+      "engines": {
+        "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1"
+      }
+    },
+    "node_modules/neo-async": {
+      "version": "2.6.2",
+      "resolved": "https://registry.npmjs.org/neo-async/-/neo-async-2.6.2.tgz",
+      "integrity": "sha512-Yd3UES5mWCSqR+qNT93S3UoYUkqAZ9lLg8a7g9rimsWmYGK8cVToA4/sF3RrshdyV3sAGMXVUmpMYOw+dLpOuw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/node-fetch": {
+      "version": "2.7.0",
+      "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz",
+      "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "whatwg-url": "^5.0.0"
+      },
+      "engines": {
+        "node": "4.x || >=6.0.0"
+      },
+      "peerDependencies": {
+        "encoding": "^0.1.0"
+      },
+      "peerDependenciesMeta": {
+        "encoding": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/node-fetch-h2": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/node-fetch-h2/-/node-fetch-h2-2.3.0.tgz",
+      "integrity": "sha512-ofRW94Ab0T4AOh5Fk8t0h8OBWrmjb0SSB20xh1H8YnPV9EJ+f5AMoYSUQ2zgJ4Iq2HAK0I2l5/Nequ8YzFS3Hg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "http2-client": "^1.2.5"
+      },
+      "engines": {
+        "node": "4.x || >=6.0.0"
+      }
+    },
+    "node_modules/node-readfiles": {
+      "version": "0.2.0",
+      "resolved": "https://registry.npmjs.org/node-readfiles/-/node-readfiles-0.2.0.tgz",
+      "integrity": "sha512-SU00ZarexNlE4Rjdm83vglt5Y9yiQ+XI1XpflWlb7q7UTN1JUItm69xMeiQCTxtTfnzt+83T8Cx+vI2ED++VDA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "es6-promise": "^3.2.1"
+      }
+    },
+    "node_modules/normalize-path": {
+      "version": "3.0.0",
+      "resolved": "https://registry.npmjs.org/normalize-path/-/normalize-path-3.0.0.tgz",
+      "integrity": "sha512-6eZs5Ls3WtCisHWp9S2GUy8dqkpGi4BVSz3GaqiE6ezub0512ESztXUwUB6C6IKbQkY2Pnb/mD4WYojCRwcwLA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/oas-kit-common": {
+      "version": "1.0.8",
+      "resolved": "https://registry.npmjs.org/oas-kit-common/-/oas-kit-common-1.0.8.tgz",
+      "integrity": "sha512-pJTS2+T0oGIwgjGpw7sIRU8RQMcUoKCDWFLdBqKB2BNmGpbBMH2sdqAaOXUg8OzonZHU0L7vfJu1mJFEiYDWOQ==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "fast-safe-stringify": "^2.0.7"
+      }
+    },
+    "node_modules/oas-linter": {
+      "version": "3.2.2",
+      "resolved": "https://registry.npmjs.org/oas-linter/-/oas-linter-3.2.2.tgz",
+      "integrity": "sha512-KEGjPDVoU5K6swgo9hJVA/qYGlwfbFx+Kg2QB/kd7rzV5N8N5Mg6PlsoCMohVnQmo+pzJap/F610qTodKzecGQ==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@exodus/schemasafe": "^1.0.0-rc.2",
+        "should": "^13.2.1",
+        "yaml": "^1.10.0"
+      },
+      "funding": {
+        "url": "https://github.com/Mermade/oas-kit?sponsor=1"
+      }
+    },
+    "node_modules/oas-resolver": {
+      "version": "2.5.6",
+      "resolved": "https://registry.npmjs.org/oas-resolver/-/oas-resolver-2.5.6.tgz",
+      "integrity": "sha512-Yx5PWQNZomfEhPPOphFbZKi9W93CocQj18NlD2Pa4GWZzdZpSJvYwoiuurRI7m3SpcChrnO08hkuQDL3FGsVFQ==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "node-fetch-h2": "^2.3.0",
+        "oas-kit-common": "^1.0.8",
+        "reftools": "^1.1.9",
+        "yaml": "^1.10.0",
+        "yargs": "^17.0.1"
+      },
+      "bin": {
+        "resolve": "resolve.js"
+      },
+      "funding": {
+        "url": "https://github.com/Mermade/oas-kit?sponsor=1"
+      }
+    },
+    "node_modules/oas-schema-walker": {
+      "version": "1.1.5",
+      "resolved": "https://registry.npmjs.org/oas-schema-walker/-/oas-schema-walker-1.1.5.tgz",
+      "integrity": "sha512-2yucenq1a9YPmeNExoUa9Qwrt9RFkjqaMAA1X+U7sbb0AqBeTIdMHky9SQQ6iN94bO5NW0W4TRYXerG+BdAvAQ==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "funding": {
+        "url": "https://github.com/Mermade/oas-kit?sponsor=1"
+      }
+    },
+    "node_modules/oas-validator": {
+      "version": "5.0.8",
+      "resolved": "https://registry.npmjs.org/oas-validator/-/oas-validator-5.0.8.tgz",
+      "integrity": "sha512-cu20/HE5N5HKqVygs3dt94eYJfBi0TsZvPVXDhbXQHiEityDN+RROTleefoKRKKJ9dFAF2JBkDHgvWj0sjKGmw==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "call-me-maybe": "^1.0.1",
+        "oas-kit-common": "^1.0.8",
+        "oas-linter": "^3.2.2",
+        "oas-resolver": "^2.5.6",
+        "oas-schema-walker": "^1.1.5",
+        "reftools": "^1.1.9",
+        "should": "^13.2.1",
+        "yaml": "^1.10.0"
+      },
+      "funding": {
+        "url": "https://github.com/Mermade/oas-kit?sponsor=1"
+      }
+    },
+    "node_modules/object-assign": {
+      "version": "4.1.1",
+      "resolved": "https://registry.npmjs.org/object-assign/-/object-assign-4.1.1.tgz",
+      "integrity": "sha512-rJgTQnkUnH1sFw8yT6VSU3zD3sWmu6sZhIseY8VX+GRu3P6F7Fu+JNDoXfklElbLJSnc3FUQHVe4cU5hj+BcUg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/once": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/once/-/once-1.4.0.tgz",
+      "integrity": "sha512-lNaJgI+2Q5URQBkccEKHTQOPaXdUxnZZElQTZY0MFUAuaEqe1E+Nyvgdz/aIyNi6Z9MzO5dv1H8n58/GELp3+w==",
+      "dev": true,
+      "license": "ISC",
+      "dependencies": {
+        "wrappy": "1"
+      }
+    },
+    "node_modules/open": {
+      "version": "10.2.0",
+      "resolved": "https://registry.npmjs.org/open/-/open-10.2.0.tgz",
+      "integrity": "sha512-YgBpdJHPyQ2UE5x+hlSXcnejzAvD0b22U2OuAP+8OnlJT+PjWPxtgmGqKKc+RgTM63U9gN0YzrYc71R2WT/hTA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "default-browser": "^5.2.1",
+        "define-lazy-prop": "^3.0.0",
+        "is-inside-container": "^1.0.0",
+        "wsl-utils": "^0.1.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/openapi-sampler": {
+      "version": "1.6.1",
+      "resolved": "https://registry.npmjs.org/openapi-sampler/-/openapi-sampler-1.6.1.tgz",
+      "integrity": "sha512-s1cIatOqrrhSj2tmJ4abFYZQK6l5v+V4toO5q1Pa0DyN8mtyqy2I+Qrj5W9vOELEtybIMQs/TBZGVO/DtTFK8w==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@types/json-schema": "^7.0.7",
+        "fast-xml-parser": "^4.5.0",
+        "json-pointer": "0.6.2"
+      }
+    },
+    "node_modules/outdent": {
+      "version": "0.8.0",
+      "resolved": "https://registry.npmjs.org/outdent/-/outdent-0.8.0.tgz",
+      "integrity": "sha512-KiOAIsdpUTcAXuykya5fnVVT+/5uS0Q1mrkRHcF89tpieSmY33O/tmc54CqwA+bfhbtEfZUNLHaPUiB9X3jt1A==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/path-browserify": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/path-browserify/-/path-browserify-1.0.1.tgz",
+      "integrity": "sha512-b7uo2UCUOYZcnF/3ID0lulOJi/bafxa1xPe7ZPsammBSpjSWQkjNxlt635YGS2MiR9GjvuXCtz2emr3jbsz98g==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/path-is-absolute": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz",
+      "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/perfect-scrollbar": {
+      "version": "1.5.6",
+      "resolved": "https://registry.npmjs.org/perfect-scrollbar/-/perfect-scrollbar-1.5.6.tgz",
+      "integrity": "sha512-rixgxw3SxyJbCaSpo1n35A/fwI1r2rdwMKOTCg/AcG+xOEyZcE8UHVjpZMFCVImzsFoCZeJTT+M/rdEIQYO2nw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/picocolors": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz",
+      "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==",
+      "dev": true,
+      "license": "ISC"
+    },
+    "node_modules/picomatch": {
+      "version": "2.3.1",
+      "resolved": "https://registry.npmjs.org/picomatch/-/picomatch-2.3.1.tgz",
+      "integrity": "sha512-JU3teHTNjmE2VCGFzuY8EXzCDVwEqB2a8fsIvwaStHhAWJEeVd1o1QD80CU6+ZdEXXSLbSsuLwJjkCBWqRQUVA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.6"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/jonschlinkert"
+      }
+    },
+    "node_modules/pluralize": {
+      "version": "8.0.0",
+      "resolved": "https://registry.npmjs.org/pluralize/-/pluralize-8.0.0.tgz",
+      "integrity": "sha512-Nc3IT5yHzflTfbjgqWcCPpo7DaKy4FnpB0l/zCAW0Tc7jxAiuqSxHasntB3D7887LSrA93kDJ9IXovxJYxyLCA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=4"
+      }
+    },
+    "node_modules/polished": {
+      "version": "4.3.1",
+      "resolved": "https://registry.npmjs.org/polished/-/polished-4.3.1.tgz",
+      "integrity": "sha512-OBatVyC/N7SCW/FaDHrSd+vn0o5cS855TOmYi4OkdWUMSJCET/xip//ch8xGUvtr3i44X9LVyWwQlRMTN3pwSA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@babel/runtime": "^7.17.8"
+      },
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/postcss": {
+      "version": "8.4.49",
+      "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.4.49.tgz",
+      "integrity": "sha512-OCVPnIObs4N29kxTjzLfUryOkvZEq+pf8jTF0lg8E7uETuWHA+v7j3c/xJmiqpX450191LlmZfUKkXxkTry7nA==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "opencollective",
+          "url": "https://opencollective.com/postcss/"
+        },
+        {
+          "type": "tidelift",
+          "url": "https://tidelift.com/funding/github/npm/postcss"
+        },
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/ai"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "nanoid": "^3.3.7",
+        "picocolors": "^1.1.1",
+        "source-map-js": "^1.2.1"
+      },
+      "engines": {
+        "node": "^10 || ^12 || >=14"
+      }
+    },
+    "node_modules/postcss-value-parser": {
+      "version": "4.2.0",
+      "resolved": "https://registry.npmjs.org/postcss-value-parser/-/postcss-value-parser-4.2.0.tgz",
+      "integrity": "sha512-1NNCs6uurfkVbeXG4S8JFT9t19m45ICnif8zWLd5oPSZ50QnwMfK+H3jv408d4jw/7Bttv5axS5IiHoLaVNHeQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/pretty-format": {
+      "version": "29.7.0",
+      "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-29.7.0.tgz",
+      "integrity": "sha512-Pdlw/oPxN+aXdmM9R00JVC9WVFoCLTKJvDVLgmJ+qAffBMxsV85l/Lu7sNx4zSzPyoL2euImuEwHhOXdEgNFZQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@jest/schemas": "^29.6.3",
+        "ansi-styles": "^5.0.0",
+        "react-is": "^18.0.0"
+      },
+      "engines": {
+        "node": "^14.15.0 || ^16.10.0 || >=18.0.0"
+      }
+    },
+    "node_modules/pretty-format/node_modules/ansi-styles": {
+      "version": "5.2.0",
+      "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz",
+      "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/ansi-styles?sponsor=1"
+      }
+    },
+    "node_modules/prismjs": {
+      "version": "1.30.0",
+      "resolved": "https://registry.npmjs.org/prismjs/-/prismjs-1.30.0.tgz",
+      "integrity": "sha512-DEvV2ZF2r2/63V+tK8hQvrR2ZGn10srHbXviTlcv7Kpzw8jWiNTqbVgjO3IY8RxrrOUF8VPMQQFysYYYv0YZxw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=6"
+      }
+    },
+    "node_modules/prop-types": {
+      "version": "15.8.1",
+      "resolved": "https://registry.npmjs.org/prop-types/-/prop-types-15.8.1.tgz",
+      "integrity": "sha512-oj87CgZICdulUohogVAR7AjlC0327U4el4L6eAvOqCeudMDVU0NThNaV+b9Df4dXgSP1gXMTnPdhfe/2qDH5cg==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "loose-envify": "^1.4.0",
+        "object-assign": "^4.1.1",
+        "react-is": "^16.13.1"
+      }
+    },
+    "node_modules/prop-types/node_modules/react-is": {
+      "version": "16.13.1",
+      "resolved": "https://registry.npmjs.org/react-is/-/react-is-16.13.1.tgz",
+      "integrity": "sha512-24e6ynE2H+OKt4kqsOvNd8kBpV65zoxbA4BVsEOB3ARVWQki/DHzaUoC5KuON/BiccDaCCTZBuOcfZs70kR8bQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/protobufjs": {
+      "version": "7.5.3",
+      "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.5.3.tgz",
+      "integrity": "sha512-sildjKwVqOI2kmFDiXQ6aEB0fjYTafpEvIBs8tOR8qI4spuL9OPROLVu2qZqi/xgCfsHIwVqlaF8JBjWFHnKbw==",
+      "dev": true,
+      "hasInstallScript": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "@protobufjs/aspromise": "^1.1.2",
+        "@protobufjs/base64": "^1.1.2",
+        "@protobufjs/codegen": "^2.0.4",
+        "@protobufjs/eventemitter": "^1.1.0",
+        "@protobufjs/fetch": "^1.1.0",
+        "@protobufjs/float": "^1.0.2",
+        "@protobufjs/inquire": "^1.1.0",
+        "@protobufjs/path": "^1.1.2",
+        "@protobufjs/pool": "^1.1.0",
+        "@protobufjs/utf8": "^1.1.0",
+        "@types/node": ">=13.7.0",
+        "long": "^5.0.0"
+      },
+      "engines": {
+        "node": ">=12.0.0"
+      }
+    },
+    "node_modules/queue-microtask": {
+      "version": "1.2.3",
+      "resolved": "https://registry.npmjs.org/queue-microtask/-/queue-microtask-1.2.3.tgz",
+      "integrity": "sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/randombytes": {
+      "version": "2.1.0",
+      "resolved": "https://registry.npmjs.org/randombytes/-/randombytes-2.1.0.tgz",
+      "integrity": "sha512-vYl3iOX+4CKUWuxGi9Ukhie6fsqXqS9FE2Zaic4tNFD2N2QQaXOMFbuKK4QmDHC0JO6B1Zp41J0LpT0oR68amQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "safe-buffer": "^5.1.0"
+      }
+    },
+    "node_modules/react": {
+      "version": "19.1.0",
+      "resolved": "https://registry.npmjs.org/react/-/react-19.1.0.tgz",
+      "integrity": "sha512-FS+XFBNvn3GTAWq26joslQgWNoFu08F4kl0J4CgdNKADkdSGXQyTCnKteIAJy96Br6YbpEU1LSzV5dYtjMkMDg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/react-dom": {
+      "version": "19.1.0",
+      "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-19.1.0.tgz",
+      "integrity": "sha512-Xs1hdnE+DyKgeHJeJznQmYMIBG3TKIHJJT95Q58nHLSrElKlGQqDTR2HQ9fx5CN/Gk6Vh/kupBTDLU11/nDk/g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "scheduler": "^0.26.0"
+      },
+      "peerDependencies": {
+        "react": "^19.1.0"
+      }
+    },
+    "node_modules/react-is": {
+      "version": "18.3.1",
+      "resolved": "https://registry.npmjs.org/react-is/-/react-is-18.3.1.tgz",
+      "integrity": "sha512-/LLMVyas0ljjAtoYiPqYiL8VWXzUUdThrmU5+n20DZv+a+ClRoevUzw5JxU+Ieh5/c87ytoTBV9G1FiKfNJdmg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/react-tabs": {
+      "version": "6.1.0",
+      "resolved": "https://registry.npmjs.org/react-tabs/-/react-tabs-6.1.0.tgz",
+      "integrity": "sha512-6QtbTRDKM+jA/MZTTefvigNxo0zz+gnBTVFw2CFVvq+f2BuH0nF0vDLNClL045nuTAdOoK/IL1vTP0ZLX0DAyQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "clsx": "^2.0.0",
+        "prop-types": "^15.5.0"
+      },
+      "peerDependencies": {
+        "react": "^18.0.0 || ^19.0.0"
+      }
+    },
+    "node_modules/readable-stream": {
+      "version": "3.6.2",
+      "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-3.6.2.tgz",
+      "integrity": "sha512-9u/sniCrY3D5WdsERHzHE4G2YCXqoG5FTHUiCC4SIbr6XcLZBY05ya9EKjYek9O5xOAwjGq+1JdGBAS7Q9ScoA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "inherits": "^2.0.3",
+        "string_decoder": "^1.1.1",
+        "util-deprecate": "^1.0.1"
+      },
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/readdirp": {
+      "version": "3.6.0",
+      "resolved": "https://registry.npmjs.org/readdirp/-/readdirp-3.6.0.tgz",
+      "integrity": "sha512-hOS089on8RduqdbhvQ5Z37A0ESjsqz6qnRcffsMU3495FuTdqSm+7bhJ29JvIOsBDEEnan5DPu9t3To9VRlMzA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "picomatch": "^2.2.1"
+      },
+      "engines": {
+        "node": ">=8.10.0"
+      }
+    },
+    "node_modules/redoc": {
+      "version": "2.5.0",
+      "resolved": "https://registry.npmjs.org/redoc/-/redoc-2.5.0.tgz",
+      "integrity": "sha512-NpYsOZ1PD9qFdjbLVBZJWptqE+4Y6TkUuvEOqPUmoH7AKOmPcE+hYjotLxQNTqVoWL4z0T2uxILmcc8JGDci+Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@redocly/openapi-core": "^1.4.0",
+        "classnames": "^2.3.2",
+        "decko": "^1.2.0",
+        "dompurify": "^3.2.4",
+        "eventemitter3": "^5.0.1",
+        "json-pointer": "^0.6.2",
+        "lunr": "^2.3.9",
+        "mark.js": "^8.11.1",
+        "marked": "^4.3.0",
+        "mobx-react": "^9.1.1",
+        "openapi-sampler": "^1.5.0",
+        "path-browserify": "^1.0.1",
+        "perfect-scrollbar": "^1.5.5",
+        "polished": "^4.2.2",
+        "prismjs": "^1.29.0",
+        "prop-types": "^15.8.1",
+        "react-tabs": "^6.0.2",
+        "slugify": "~1.4.7",
+        "stickyfill": "^1.1.1",
+        "swagger2openapi": "^7.0.8",
+        "url-template": "^2.0.8"
+      },
+      "engines": {
+        "node": ">=6.9",
+        "npm": ">=3.0.0"
+      },
+      "peerDependencies": {
+        "core-js": "^3.1.4",
+        "mobx": "^6.0.4",
+        "react": "^16.8.4 || ^17.0.0 || ^18.0.0 || ^19.0.0",
+        "react-dom": "^16.8.4 || ^17.0.0 || ^18.0.0 || ^19.0.0",
+        "styled-components": "^4.1.1 || ^5.1.1 || ^6.0.5"
+      }
+    },
+    "node_modules/reftools": {
+      "version": "1.1.9",
+      "resolved": "https://registry.npmjs.org/reftools/-/reftools-1.1.9.tgz",
+      "integrity": "sha512-OVede/NQE13xBQ+ob5CKd5KyeJYU2YInb1bmV4nRoOfquZPkAkxuOXicSe1PvqIuZZ4kD13sPKBbR7UFDmli6w==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "funding": {
+        "url": "https://github.com/Mermade/oas-kit?sponsor=1"
+      }
+    },
+    "node_modules/require-directory": {
+      "version": "2.1.1",
+      "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz",
+      "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/require-from-string": {
+      "version": "2.0.2",
+      "resolved": "https://registry.npmjs.org/require-from-string/-/require-from-string-2.0.2.tgz",
+      "integrity": "sha512-Xf0nWe6RseziFMu+Ap9biiUbmplq6S9/p+7w7YXP/JBHhrUDDUhwa+vANyubuqfZWTveU//DYVGsDG7RKL/vEw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/run-applescript": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/run-applescript/-/run-applescript-7.0.0.tgz",
+      "integrity": "sha512-9by4Ij99JUr/MCFBUkDKLWK3G9HVXmabKz9U5MlIAIuvuzkiOicRYs8XJLxX+xahD+mLiiCYDqF9dKAgtzKP1A==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/safe-buffer": {
+      "version": "5.2.1",
+      "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
+      "integrity": "sha512-rp3So07KcdmmKbGvgaNxQSJr7bGVSVk5S9Eq1F+ppbRo70+YeaDxkw5Dd8NPN+GD6bjnYm2VuPuCXmpuYvmCXQ==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/scheduler": {
+      "version": "0.26.0",
+      "resolved": "https://registry.npmjs.org/scheduler/-/scheduler-0.26.0.tgz",
+      "integrity": "sha512-NlHwttCI/l5gCPR3D1nNXtWABUmBwvZpEQiD4IXSbIDq8BzLIK/7Ir5gTFSGZDUu37K5cMNp0hFtzO38sC7gWA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/semver": {
+      "version": "7.7.2",
+      "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.2.tgz",
+      "integrity": "sha512-RF0Fw+rO5AMf9MAyaRXI4AV0Ulj5lMHqVxxdSgiVbixSCXoEmmX/jk0CuJw4+3SqroYO9VoUh+HcuJivvtJemA==",
+      "dev": true,
+      "license": "ISC",
+      "bin": {
+        "semver": "bin/semver.js"
+      },
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/set-cookie-parser": {
+      "version": "2.7.1",
+      "resolved": "https://registry.npmjs.org/set-cookie-parser/-/set-cookie-parser-2.7.1.tgz",
+      "integrity": "sha512-IOc8uWeOZgnb3ptbCURJWNjWUPcO3ZnTTdzsurqERrP6nPyv+paC55vJM0LpOlT2ne+Ix+9+CRG1MNLlyZ4GjQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/shallowequal": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/shallowequal/-/shallowequal-1.1.0.tgz",
+      "integrity": "sha512-y0m1JoUZSlPAjXVtPPW70aZWfIL/dSP7AFkRnniLCrK/8MDKog3TySTBmckD+RObVxH0v4Tox67+F14PdED2oQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/should": {
+      "version": "13.2.3",
+      "resolved": "https://registry.npmjs.org/should/-/should-13.2.3.tgz",
+      "integrity": "sha512-ggLesLtu2xp+ZxI+ysJTmNjh2U0TsC+rQ/pfED9bUZZ4DKefP27D+7YJVVTvKsmjLpIi9jAa7itwDGkDDmt1GQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "should-equal": "^2.0.0",
+        "should-format": "^3.0.3",
+        "should-type": "^1.4.0",
+        "should-type-adaptors": "^1.0.1",
+        "should-util": "^1.0.0"
+      }
+    },
+    "node_modules/should-equal": {
+      "version": "2.0.0",
+      "resolved": "https://registry.npmjs.org/should-equal/-/should-equal-2.0.0.tgz",
+      "integrity": "sha512-ZP36TMrK9euEuWQYBig9W55WPC7uo37qzAEmbjHz4gfyuXrEUgF8cUvQVO+w+d3OMfPvSRQJ22lSm8MQJ43LTA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "should-type": "^1.4.0"
+      }
+    },
+    "node_modules/should-format": {
+      "version": "3.0.3",
+      "resolved": "https://registry.npmjs.org/should-format/-/should-format-3.0.3.tgz",
+      "integrity": "sha512-hZ58adtulAk0gKtua7QxevgUaXTTXxIi8t41L3zo9AHvjXO1/7sdLECuHeIN2SRtYXpNkmhoUP2pdeWgricQ+Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "should-type": "^1.3.0",
+        "should-type-adaptors": "^1.0.1"
+      }
+    },
+    "node_modules/should-type": {
+      "version": "1.4.0",
+      "resolved": "https://registry.npmjs.org/should-type/-/should-type-1.4.0.tgz",
+      "integrity": "sha512-MdAsTu3n25yDbIe1NeN69G4n6mUnJGtSJHygX3+oN0ZbO3DTiATnf7XnYJdGT42JCXurTb1JI0qOBR65shvhPQ==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/should-type-adaptors": {
+      "version": "1.1.0",
+      "resolved": "https://registry.npmjs.org/should-type-adaptors/-/should-type-adaptors-1.1.0.tgz",
+      "integrity": "sha512-JA4hdoLnN+kebEp2Vs8eBe9g7uy0zbRo+RMcU0EsNy+R+k049Ki+N5tT5Jagst2g7EAja+euFuoXFCa8vIklfA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "should-type": "^1.3.0",
+        "should-util": "^1.0.0"
+      }
+    },
+    "node_modules/should-util": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/should-util/-/should-util-1.0.1.tgz",
+      "integrity": "sha512-oXF8tfxx5cDk8r2kYqlkUJzZpDBqVY/II2WhvU0n9Y3XYvAYRmeaf1PvvIvTgPnv4KJ+ES5M0PyDq5Jp+Ygy2g==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/simple-websocket": {
+      "version": "9.1.0",
+      "resolved": "https://registry.npmjs.org/simple-websocket/-/simple-websocket-9.1.0.tgz",
+      "integrity": "sha512-8MJPnjRN6A8UCp1I+H/dSFyjwJhp6wta4hsVRhjf8w9qBHRzxYt14RaOcjvQnhD1N4yKOddEjflwMnQM4VtXjQ==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/feross"
+        },
+        {
+          "type": "patreon",
+          "url": "https://www.patreon.com/feross"
+        },
+        {
+          "type": "consulting",
+          "url": "https://feross.org/support"
+        }
+      ],
+      "license": "MIT",
+      "dependencies": {
+        "debug": "^4.3.1",
+        "queue-microtask": "^1.2.2",
+        "randombytes": "^2.1.0",
+        "readable-stream": "^3.6.0",
+        "ws": "^7.4.2"
+      }
+    },
+    "node_modules/slugify": {
+      "version": "1.4.7",
+      "resolved": "https://registry.npmjs.org/slugify/-/slugify-1.4.7.tgz",
+      "integrity": "sha512-tf+h5W1IrjNm/9rKKj0JU2MDMruiopx0jjVA5zCdBtcGjfp0+c5rHw/zADLC3IeKlGHtVbHtpfzvYA0OYT+HKg==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.0.0"
+      }
+    },
+    "node_modules/source-map": {
+      "version": "0.6.1",
+      "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz",
+      "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/source-map-js": {
+      "version": "1.2.1",
+      "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz",
+      "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "engines": {
+        "node": ">=0.10.0"
+      }
+    },
+    "node_modules/stickyfill": {
+      "version": "1.1.1",
+      "resolved": "https://registry.npmjs.org/stickyfill/-/stickyfill-1.1.1.tgz",
+      "integrity": "sha512-GCp7vHAfpao+Qh/3Flh9DXEJ/qSi0KJwJw6zYlZOtRYXWUIpMM6mC2rIep/dK8RQqwW0KxGJIllmjPIBOGN8AA==",
+      "dev": true
+    },
+    "node_modules/string_decoder": {
+      "version": "1.3.0",
+      "resolved": "https://registry.npmjs.org/string_decoder/-/string_decoder-1.3.0.tgz",
+      "integrity": "sha512-hkRX8U1WjJFd8LsDJ2yQ/wWWxaopEsABU1XfkM8A+j0+85JAGppt16cr1Whg6KIbb4okU6Mql6BOj+uup/wKeA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "safe-buffer": "~5.2.0"
+      }
+    },
+    "node_modules/string-width": {
+      "version": "4.2.3",
+      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
+      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "emoji-regex": "^8.0.0",
+        "is-fullwidth-code-point": "^3.0.0",
+        "strip-ansi": "^6.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/strip-ansi": {
+      "version": "6.0.1",
+      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
+      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ansi-regex": "^5.0.1"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/strnum": {
+      "version": "1.1.2",
+      "resolved": "https://registry.npmjs.org/strnum/-/strnum-1.1.2.tgz",
+      "integrity": "sha512-vrN+B7DBIoTTZjnPNewwhx6cBA/H+IS7rfW68n7XxC1y7uoiGQBxaKzqucGUgavX15dJgiGztLJ8vxuEzwqBdA==",
+      "dev": true,
+      "funding": [
+        {
+          "type": "github",
+          "url": "https://github.com/sponsors/NaturalIntelligence"
+        }
+      ],
+      "license": "MIT"
+    },
+    "node_modules/styled-components": {
+      "version": "6.1.19",
+      "resolved": "https://registry.npmjs.org/styled-components/-/styled-components-6.1.19.tgz",
+      "integrity": "sha512-1v/e3Dl1BknC37cXMhwGomhO8AkYmN41CqyX9xhUDxry1ns3BFQy2lLDRQXJRdVVWB9OHemv/53xaStimvWyuA==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@emotion/is-prop-valid": "1.2.2",
+        "@emotion/unitless": "0.8.1",
+        "@types/stylis": "4.2.5",
+        "css-to-react-native": "3.2.0",
+        "csstype": "3.1.3",
+        "postcss": "8.4.49",
+        "shallowequal": "1.1.0",
+        "stylis": "4.3.2",
+        "tslib": "2.6.2"
+      },
+      "engines": {
+        "node": ">= 16"
+      },
+      "funding": {
+        "type": "opencollective",
+        "url": "https://opencollective.com/styled-components"
+      },
+      "peerDependencies": {
+        "react": ">= 16.8.0",
+        "react-dom": ">= 16.8.0"
+      }
+    },
+    "node_modules/stylis": {
+      "version": "4.3.2",
+      "resolved": "https://registry.npmjs.org/stylis/-/stylis-4.3.2.tgz",
+      "integrity": "sha512-bhtUjWd/z6ltJiQwg0dUfxEJ+W+jdqQd8TbWLWyeIJHlnsqmGLRFFd8e5mA0AZi/zx90smXRlN66YMTcaSFifg==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/supports-color": {
+      "version": "7.2.0",
+      "resolved": "https://registry.npmjs.org/supports-color/-/supports-color-7.2.0.tgz",
+      "integrity": "sha512-qpCAvRl9stuOHveKsn7HncJRvv501qIacKzQlO/+Lwxc9+0q2wLyv4Dfvt80/DPn2pqOBsJdDiogXGR9+OvwRw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "has-flag": "^4.0.0"
+      },
+      "engines": {
+        "node": ">=8"
+      }
+    },
+    "node_modules/swagger2openapi": {
+      "version": "7.0.8",
+      "resolved": "https://registry.npmjs.org/swagger2openapi/-/swagger2openapi-7.0.8.tgz",
+      "integrity": "sha512-upi/0ZGkYgEcLeGieoz8gT74oWHA0E7JivX7aN9mAf+Tc7BQoRBvnIGHoPDw+f9TXTW4s6kGYCZJtauP6OYp7g==",
+      "dev": true,
+      "license": "BSD-3-Clause",
+      "dependencies": {
+        "call-me-maybe": "^1.0.1",
+        "node-fetch": "^2.6.1",
+        "node-fetch-h2": "^2.3.0",
+        "node-readfiles": "^0.2.0",
+        "oas-kit-common": "^1.0.8",
+        "oas-resolver": "^2.5.6",
+        "oas-schema-walker": "^1.1.5",
+        "oas-validator": "^5.0.8",
+        "reftools": "^1.1.9",
+        "yaml": "^1.10.0",
+        "yargs": "^17.0.1"
+      },
+      "bin": {
+        "boast": "boast.js",
+        "oas-validate": "oas-validate.js",
+        "swagger2openapi": "swagger2openapi.js"
+      },
+      "funding": {
+        "url": "https://github.com/Mermade/oas-kit?sponsor=1"
+      }
+    },
+    "node_modules/to-regex-range": {
+      "version": "5.0.1",
+      "resolved": "https://registry.npmjs.org/to-regex-range/-/to-regex-range-5.0.1.tgz",
+      "integrity": "sha512-65P7iz6X5yEr1cwcgvQxbbIw7Uk3gOy5dIdtZ4rDveLqhrdJP+Li/Hx6tyK0NEb+2GCyneCMJiGqrADCSNk8sQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "is-number": "^7.0.0"
+      },
+      "engines": {
+        "node": ">=8.0"
+      }
+    },
+    "node_modules/tr46": {
+      "version": "0.0.3",
+      "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
+      "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/tslib": {
+      "version": "2.6.2",
+      "resolved": "https://registry.npmjs.org/tslib/-/tslib-2.6.2.tgz",
+      "integrity": "sha512-AEYxH93jGFPn/a2iVAwW87VuUIkR1FVUKB77NwMF7nBTDkDrrT/Hpt/IrCJ0QXhW27jTBDcf5ZY7w6RiqTMw2Q==",
+      "dev": true,
+      "license": "0BSD"
+    },
+    "node_modules/typedarray": {
+      "version": "0.0.6",
+      "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz",
+      "integrity": "sha512-/aCDEGatGvZ2BIk+HmLf4ifCJFwvKFNb9/JeZPMulfgFracn9QFcAf5GO8B/mweUjSoblS5In0cWhqpfs/5PQA==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/uglify-js": {
+      "version": "3.19.3",
+      "resolved": "https://registry.npmjs.org/uglify-js/-/uglify-js-3.19.3.tgz",
+      "integrity": "sha512-v3Xu+yuwBXisp6QYTcH4UbH+xYJXqnq2m/LtQVWKWzYc1iehYnLixoQDN9FH6/j9/oybfd6W9Ghwkl8+UMKTKQ==",
+      "dev": true,
+      "license": "BSD-2-Clause",
+      "optional": true,
+      "bin": {
+        "uglifyjs": "bin/uglifyjs"
+      },
+      "engines": {
+        "node": ">=0.8.0"
+      }
+    },
+    "node_modules/undici": {
+      "version": "6.21.3",
+      "resolved": "https://registry.npmjs.org/undici/-/undici-6.21.3.tgz",
+      "integrity": "sha512-gBLkYIlEnSp8pFbT64yFgGE6UIB9tAkhukC23PmMDCe5Nd+cRqKxSjw5y54MK2AZMgZfJWMaNE4nYUHgi1XEOw==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=18.17"
+      }
+    },
+    "node_modules/undici-types": {
+      "version": "7.8.0",
+      "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.8.0.tgz",
+      "integrity": "sha512-9UJ2xGDvQ43tYyVMpuHlsgApydB8ZKfVYTsLDhXkFL/6gfkp+U8xTGdh8pMJv1SpZna0zxG1DwsKZsreLbXBxw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/uri-js-replace": {
+      "version": "1.0.1",
+      "resolved": "https://registry.npmjs.org/uri-js-replace/-/uri-js-replace-1.0.1.tgz",
+      "integrity": "sha512-W+C9NWNLFOoBI2QWDp4UT9pv65r2w5Cx+3sTYFvtMdDBxkKt1syCqsUdSFAChbEe1uK5TfS04wt/nGwmaeIQ0g==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/url-template": {
+      "version": "2.0.8",
+      "resolved": "https://registry.npmjs.org/url-template/-/url-template-2.0.8.tgz",
+      "integrity": "sha512-XdVKMF4SJ0nP/O7XIPB0JwAEuT9lDIYnNsK8yGVe43y0AWoKeJNdv3ZNWh7ksJ6KqQFjOO6ox/VEitLnaVNufw==",
+      "dev": true,
+      "license": "BSD"
+    },
+    "node_modules/use-sync-external-store": {
+      "version": "1.5.0",
+      "resolved": "https://registry.npmjs.org/use-sync-external-store/-/use-sync-external-store-1.5.0.tgz",
+      "integrity": "sha512-Rb46I4cGGVBmjamjphe8L/UnvJD+uPPtTkNvX5mZgqdbavhI4EbgIWJiIHXJ8bc/i9EQGPRh4DwEURJ552Do0A==",
+      "dev": true,
+      "license": "MIT",
+      "peerDependencies": {
+        "react": "^16.8.0 || ^17.0.0 || ^18.0.0 || ^19.0.0"
+      }
+    },
+    "node_modules/util-deprecate": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/util-deprecate/-/util-deprecate-1.0.2.tgz",
+      "integrity": "sha512-EPD5q1uXyFxJpCrLnCc1nHnq3gOa6DZBocAIiI2TaSCA7VCJ1UJDMagCzIkXNsUYfD1daK//LTEQ8xiIbrHtcw==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/webidl-conversions": {
+      "version": "3.0.1",
+      "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
+      "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==",
+      "dev": true,
+      "license": "BSD-2-Clause"
+    },
+    "node_modules/whatwg-url": {
+      "version": "5.0.0",
+      "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
+      "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "tr46": "~0.0.3",
+        "webidl-conversions": "^3.0.0"
+      }
+    },
+    "node_modules/wordwrap": {
+      "version": "1.0.0",
+      "resolved": "https://registry.npmjs.org/wordwrap/-/wordwrap-1.0.0.tgz",
+      "integrity": "sha512-gvVzJFlPycKc5dZN4yPkP8w7Dc37BtP1yczEneOb4uq34pXZcvrtRTmWV8W+Ume+XCxKgbjM+nevkyFPMybd4Q==",
+      "dev": true,
+      "license": "MIT"
+    },
+    "node_modules/wrap-ansi": {
+      "version": "7.0.0",
+      "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz",
+      "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "ansi-styles": "^4.0.0",
+        "string-width": "^4.1.0",
+        "strip-ansi": "^6.0.0"
+      },
+      "engines": {
+        "node": ">=10"
+      },
+      "funding": {
+        "url": "https://github.com/chalk/wrap-ansi?sponsor=1"
+      }
+    },
+    "node_modules/wrappy": {
+      "version": "1.0.2",
+      "resolved": "https://registry.npmjs.org/wrappy/-/wrappy-1.0.2.tgz",
+      "integrity": "sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==",
+      "dev": true,
+      "license": "ISC"
+    },
+    "node_modules/ws": {
+      "version": "7.5.10",
+      "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.10.tgz",
+      "integrity": "sha512-+dbF1tHwZpXcbOJdVOkzLDxZP1ailvSxM6ZweXTegylPny803bFhA+vqBYw4s31NSAk4S2Qz+AKXK9a4wkdjcQ==",
+      "dev": true,
+      "license": "MIT",
+      "engines": {
+        "node": ">=8.3.0"
+      },
+      "peerDependencies": {
+        "bufferutil": "^4.0.1",
+        "utf-8-validate": "^5.0.2"
+      },
+      "peerDependenciesMeta": {
+        "bufferutil": {
+          "optional": true
+        },
+        "utf-8-validate": {
+          "optional": true
+        }
+      }
+    },
+    "node_modules/wsl-utils": {
+      "version": "0.1.0",
+      "resolved": "https://registry.npmjs.org/wsl-utils/-/wsl-utils-0.1.0.tgz",
+      "integrity": "sha512-h3Fbisa2nKGPxCpm89Hk33lBLsnaGBvctQopaBSOW/uIs6FTe1ATyAnKFJrzVs9vpGdsTe73WF3V4lIsk4Gacw==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "is-wsl": "^3.1.0"
+      },
+      "engines": {
+        "node": ">=18"
+      },
+      "funding": {
+        "url": "https://github.com/sponsors/sindresorhus"
+      }
+    },
+    "node_modules/y18n": {
+      "version": "5.0.8",
+      "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz",
+      "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": ">=10"
+      }
+    },
+    "node_modules/yaml": {
+      "version": "1.10.2",
+      "resolved": "https://registry.npmjs.org/yaml/-/yaml-1.10.2.tgz",
+      "integrity": "sha512-r3vXyErRCYJ7wg28yvBY5VSoAF8ZvlcW9/BwUzEtUsjvX/DKs24dIkuwjtuprwJJHsbyUbLApepYTR1BN4uHrg==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": ">= 6"
+      }
+    },
+    "node_modules/yaml-ast-parser": {
+      "version": "0.0.43",
+      "resolved": "https://registry.npmjs.org/yaml-ast-parser/-/yaml-ast-parser-0.0.43.tgz",
+      "integrity": "sha512-2PTINUwsRqSd+s8XxKaJWQlUuEMHJQyEuh2edBbW8KNJz0SJPwUSD2zRWqezFEdN7IzAgeuYHFUCF7o8zRdZ0A==",
+      "dev": true,
+      "license": "Apache-2.0"
+    },
+    "node_modules/yargs": {
+      "version": "17.0.1",
+      "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.0.1.tgz",
+      "integrity": "sha512-xBBulfCc8Y6gLFcrPvtqKz9hz8SO0l1Ni8GgDekvBX2ro0HRQImDGnikfc33cgzcYUSncapnNcZDjVFIH3f6KQ==",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "cliui": "^7.0.2",
+        "escalade": "^3.1.1",
+        "get-caller-file": "^2.0.5",
+        "require-directory": "^2.1.1",
+        "string-width": "^4.2.0",
+        "y18n": "^5.0.5",
+        "yargs-parser": "^20.2.2"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/yargs-parser": {
+      "version": "20.2.9",
+      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-20.2.9.tgz",
+      "integrity": "sha512-y11nGElTIV+CT3Zv9t7VKl+Q3hTQoT9a1Qzezhhl6Rp21gJ/IVTW7Z3y9EWXhuUBC2Shnf+DX0antecpAwSP8w==",
+      "dev": true,
+      "license": "ISC",
+      "engines": {
+        "node": ">=10"
+      }
+    }
+  }
+}
diff --git a/build-tools/package.json b/build-tools/package.json
new file mode 100644
index 0000000000..000969c672
--- /dev/null
+++ b/build-tools/package.json
@@ -0,0 +1,8 @@
+{
+  "name": "build-tools",
+  "private": true,
+  "devDependencies": {
+    "@redocly/cli": "1.34.4",
+    "@sourcemeta/jsonschema": "10.0.0"
+  }
+}
diff --git a/compute/Makefile b/compute/Makefile
index ef2e55f7b1..25bbb30d3a 100644
--- a/compute/Makefile
+++ b/compute/Makefile
@@ -50,9 +50,9 @@ jsonnetfmt-format:
 	jsonnetfmt --in-place $(jsonnet_files)
 
 .PHONY: manifest-schema-validation
-manifest-schema-validation: node_modules
-	node_modules/.bin/jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
+manifest-schema-validation: ../build-tools/node_modules
+	npx --prefix=../build-tools/ jsonschema validate -d https://json-schema.org/draft/2020-12/schema manifest.schema.json manifest.yaml
 
-node_modules: package.json
-	npm install
-	touch node_modules
+../build-tools/node_modules: ../build-tools/package.json
+	cd ../build-tools && $(if $(CI),npm ci,npm install)
+	touch ../build-tools/node_modules
diff --git a/compute/package.json b/compute/package.json
deleted file mode 100644
index 581384dc13..0000000000
--- a/compute/package.json
+++ /dev/null
@@ -1,7 +0,0 @@
-{
-  "name": "neon-compute",
-  "private": true,
-  "dependencies": {
-    "@sourcemeta/jsonschema": "9.3.4"
-  }
-} 
\ No newline at end of file

From 87915df2fa9034c4a7da0555c5bf14d214980d59 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 16 Jul 2025 14:27:00 +0100
Subject: [PATCH 129/163] proxy: replace serde_json with our new json ser crate
 in the logging impl (#12602)

This doesn't solve any particular problem, but it does simplify some of
the code that was forced to round-trip through verbose Serialize impls.
---
 proxy/src/logging.rs | 478 +++++++++++++++----------------------------
 proxy/src/metrics.rs |   3 +
 2 files changed, 164 insertions(+), 317 deletions(-)

diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
index e608300bd2..a87b0f1175 100644
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -6,7 +6,6 @@ use std::{env, io};
 
 use chrono::{DateTime, Utc};
 use opentelemetry::trace::TraceContextExt;
-use serde::ser::{SerializeMap, Serializer};
 use tracing::subscriber::Interest;
 use tracing::{Event, Metadata, Span, Subscriber, callsite, span};
 use tracing_opentelemetry::OpenTelemetrySpanExt;
@@ -16,7 +15,9 @@ use tracing_subscriber::fmt::time::SystemTime;
 use tracing_subscriber::fmt::{FormatEvent, FormatFields};
 use tracing_subscriber::layer::{Context, Layer};
 use tracing_subscriber::prelude::*;
-use tracing_subscriber::registry::{LookupSpan, SpanRef};
+use tracing_subscriber::registry::LookupSpan;
+
+use crate::metrics::Metrics;
 
 /// Initialize logging and OpenTelemetry tracing and exporter.
 ///
@@ -249,7 +250,7 @@ where
         //       early, before OTel machinery, and add as event extension.
         let now = self.clock.now();
 
-        let res: io::Result<()> = EVENT_FORMATTER.with(|f| {
+        EVENT_FORMATTER.with(|f| {
             let mut borrow = f.try_borrow_mut();
             let formatter = match borrow.as_deref_mut() {
                 Ok(formatter) => formatter,
@@ -259,31 +260,19 @@ where
                 Err(_) => &mut EventFormatter::new(),
             };
 
-            formatter.reset();
             formatter.format(
                 now,
                 event,
                 &ctx,
                 &self.skipped_field_indices,
                 self.extract_fields,
-            )?;
-            self.writer.make_writer().write_all(formatter.buffer())
-        });
+            );
 
-        // In case logging fails we generate a simpler JSON object.
-        if let Err(err) = res
-            && let Ok(mut line) = serde_json::to_vec(&serde_json::json!( {
-                "timestamp": now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true),
-                "level": "ERROR",
-                "message": format_args!("cannot log event: {err:?}"),
-                "fields": {
-                    "event": format_args!("{event:?}"),
-                },
-            }))
-        {
-            line.push(b'\n');
-            self.writer.make_writer().write_all(&line).ok();
-        }
+            let mut writer = self.writer.make_writer();
+            if writer.write_all(formatter.buffer()).is_err() {
+                Metrics::get().proxy.logging_errors_count.inc();
+            }
+        });
     }
 
     /// Registers a SpanFields instance as span extension.
@@ -382,9 +371,24 @@ impl CallsiteSpanInfo {
     }
 }
 
+#[derive(Clone)]
+struct RawValue(Box<[u8]>);
+
+impl RawValue {
+    fn new(v: impl json::ValueEncoder) -> Self {
+        Self(json::value_to_vec!(|val| v.encode(val)).into_boxed_slice())
+    }
+}
+
+impl json::ValueEncoder for &RawValue {
+    fn encode(self, v: json::ValueSer<'_>) {
+        v.write_raw_json(&self.0);
+    }
+}
+
 /// Stores span field values recorded during the spans lifetime.
 struct SpanFields {
-    values: [serde_json::Value; MAX_TRACING_FIELDS],
+    values: [Option<RawValue>; MAX_TRACING_FIELDS],
 
     /// cached span info so we can avoid extra hashmap lookups in the hot path.
     span_info: CallsiteSpanInfo,
@@ -394,7 +398,7 @@ impl SpanFields {
     fn new(span_info: CallsiteSpanInfo) -> Self {
         Self {
             span_info,
-            values: [const { serde_json::Value::Null }; MAX_TRACING_FIELDS],
+            values: [const { None }; MAX_TRACING_FIELDS],
         }
     }
 }
@@ -402,55 +406,55 @@ impl SpanFields {
 impl tracing::field::Visit for SpanFields {
     #[inline]
     fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
-        self.values[field.index()] = serde_json::Value::from(value);
+        self.values[field.index()] = Some(RawValue::new(value));
     }
 
     #[inline]
     fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
-        self.values[field.index()] = serde_json::Value::from(value);
+        self.values[field.index()] = Some(RawValue::new(value));
     }
 
     #[inline]
     fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
-        self.values[field.index()] = serde_json::Value::from(value);
+        self.values[field.index()] = Some(RawValue::new(value));
     }
 
     #[inline]
     fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
         if let Ok(value) = i64::try_from(value) {
-            self.values[field.index()] = serde_json::Value::from(value);
+            self.values[field.index()] = Some(RawValue::new(value));
         } else {
-            self.values[field.index()] = serde_json::Value::from(format!("{value}"));
+            self.values[field.index()] = Some(RawValue::new(format_args!("{value}")));
         }
     }
 
     #[inline]
     fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
         if let Ok(value) = u64::try_from(value) {
-            self.values[field.index()] = serde_json::Value::from(value);
+            self.values[field.index()] = Some(RawValue::new(value));
         } else {
-            self.values[field.index()] = serde_json::Value::from(format!("{value}"));
+            self.values[field.index()] = Some(RawValue::new(format_args!("{value}")));
         }
     }
 
     #[inline]
     fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
-        self.values[field.index()] = serde_json::Value::from(value);
+        self.values[field.index()] = Some(RawValue::new(value));
     }
 
     #[inline]
     fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
-        self.values[field.index()] = serde_json::Value::from(value);
+        self.values[field.index()] = Some(RawValue::new(value));
     }
 
     #[inline]
     fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
-        self.values[field.index()] = serde_json::Value::from(value);
+        self.values[field.index()] = Some(RawValue::new(value));
     }
 
     #[inline]
     fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
-        self.values[field.index()] = serde_json::Value::from(format!("{value:?}"));
+        self.values[field.index()] = Some(RawValue::new(format_args!("{value:?}")));
     }
 
     #[inline]
@@ -459,7 +463,7 @@ impl tracing::field::Visit for SpanFields {
         field: &tracing::field::Field,
         value: &(dyn std::error::Error + 'static),
     ) {
-        self.values[field.index()] = serde_json::Value::from(format!("{value}"));
+        self.values[field.index()] = Some(RawValue::new(format_args!("{value}")));
     }
 }
 
@@ -508,11 +512,6 @@ impl EventFormatter {
         &self.logline_buffer
     }
 
-    #[inline]
-    fn reset(&mut self) {
-        self.logline_buffer.clear();
-    }
-
     fn format<S>(
         &mut self,
         now: DateTime<Utc>,
@@ -520,8 +519,7 @@ impl EventFormatter {
         ctx: &Context<'_, S>,
         skipped_field_indices: &CallsiteMap<SkippedFieldIndices>,
         extract_fields: &'static [&'static str],
-    ) -> io::Result<()>
-    where
+    ) where
         S: Subscriber + for<'a> LookupSpan<'a>,
     {
         let timestamp = now.to_rfc3339_opts(chrono::SecondsFormat::Micros, true);
@@ -536,78 +534,99 @@ impl EventFormatter {
             .copied()
             .unwrap_or_default();
 
-        let mut serialize = || {
-            let mut serializer = serde_json::Serializer::new(&mut self.logline_buffer);
-
-            let mut serializer = serializer.serialize_map(None)?;
-
+        self.logline_buffer.clear();
+        let serializer = json::ValueSer::new(&mut self.logline_buffer);
+        json::value_as_object!(|serializer| {
             // Timestamp comes first, so raw lines can be sorted by timestamp.
-            serializer.serialize_entry("timestamp", &timestamp)?;
+            serializer.entry("timestamp", &*timestamp);
 
             // Level next.
-            serializer.serialize_entry("level", &meta.level().as_str())?;
+            serializer.entry("level", meta.level().as_str());
 
             // Message next.
-            serializer.serialize_key("message")?;
             let mut message_extractor =
-                MessageFieldExtractor::new(serializer, skipped_field_indices);
+                MessageFieldExtractor::new(serializer.key("message"), skipped_field_indices);
             event.record(&mut message_extractor);
-            let mut serializer = message_extractor.into_serializer()?;
+            message_extractor.finish();
 
             // Direct message fields.
-            let mut fields_present = FieldsPresent(false, skipped_field_indices);
-            event.record(&mut fields_present);
-            if fields_present.0 {
-                serializer.serialize_entry(
-                    "fields",
-                    &SerializableEventFields(event, skipped_field_indices),
-                )?;
+            {
+                let mut message_skipper = MessageFieldSkipper::new(
+                    serializer.key("fields").object(),
+                    skipped_field_indices,
+                );
+                event.record(&mut message_skipper);
+
+                // rollback if no fields are present.
+                if message_skipper.present {
+                    message_skipper.serializer.finish();
+                }
             }
 
-            let spans = SerializableSpans {
-                // collect all spans from parent to root.
-                spans: ctx
+            let mut extracted = ExtractedSpanFields::new(extract_fields);
+
+            let spans = serializer.key("spans");
+            json::value_as_object!(|spans| {
+                let parent_spans = ctx
                     .event_span(event)
-                    .map_or(vec![], |parent| parent.scope().collect()),
-                extracted: ExtractedSpanFields::new(extract_fields),
-            };
-            serializer.serialize_entry("spans", &spans)?;
+                    .map_or(vec![], |parent| parent.scope().collect());
+
+                for span in parent_spans.iter().rev() {
+                    let ext = span.extensions();
+
+                    // all spans should have this extension.
+                    let Some(fields) = ext.get() else { continue };
+
+                    extracted.layer_span(fields);
+
+                    let SpanFields { values, span_info } = fields;
+
+                    let span_fields = spans.key(&*span_info.normalized_name);
+                    json::value_as_object!(|span_fields| {
+                        for (field, value) in std::iter::zip(span.metadata().fields(), values) {
+                            if let Some(value) = value {
+                                span_fields.entry(field.name(), value);
+                            }
+                        }
+                    });
+                }
+            });
 
             // TODO: thread-local cache?
             let pid = std::process::id();
             // Skip adding pid 1 to reduce noise for services running in containers.
             if pid != 1 {
-                serializer.serialize_entry("process_id", &pid)?;
+                serializer.entry("process_id", pid);
             }
 
-            THREAD_ID.with(|tid| serializer.serialize_entry("thread_id", tid))?;
+            THREAD_ID.with(|tid| serializer.entry("thread_id", tid));
 
             // TODO: tls cache? name could change
             if let Some(thread_name) = std::thread::current().name()
                 && !thread_name.is_empty()
                 && thread_name != "tokio-runtime-worker"
             {
-                serializer.serialize_entry("thread_name", thread_name)?;
+                serializer.entry("thread_name", thread_name);
             }
 
             if let Some(task_id) = tokio::task::try_id() {
-                serializer.serialize_entry("task_id", &format_args!("{task_id}"))?;
+                serializer.entry("task_id", format_args!("{task_id}"));
             }
 
-            serializer.serialize_entry("target", meta.target())?;
+            serializer.entry("target", meta.target());
 
             // Skip adding module if it's the same as target.
             if let Some(module) = meta.module_path()
                 && module != meta.target()
             {
-                serializer.serialize_entry("module", module)?;
+                serializer.entry("module", module);
             }
 
             if let Some(file) = meta.file() {
                 if let Some(line) = meta.line() {
-                    serializer.serialize_entry("src", &format_args!("{file}:{line}"))?;
+                    serializer.entry("src", format_args!("{file}:{line}"));
                 } else {
-                    serializer.serialize_entry("src", file)?;
+                    serializer.entry("src", file);
                 }
             }
 
@@ -616,124 +635,104 @@ impl EventFormatter {
                 let otel_spanref = otel_context.span();
                 let span_context = otel_spanref.span_context();
                 if span_context.is_valid() {
-                    serializer.serialize_entry(
-                        "trace_id",
-                        &format_args!("{}", span_context.trace_id()),
-                    )?;
+                    serializer.entry("trace_id", format_args!("{}", span_context.trace_id()));
                 }
             }
 
-            if spans.extracted.has_values() {
+            if extracted.has_values() {
                 // TODO: add fields from event, too?
-                serializer.serialize_entry("extract", &spans.extracted)?;
+                let extract = serializer.key("extract");
+                json::value_as_object!(|extract| {
+                    for (key, value) in std::iter::zip(extracted.names, extracted.values) {
+                        if let Some(value) = value {
+                            extract.entry(*key, &value);
+                        }
+                    }
+                });
             }
+        });
 
-            serializer.end()
-        };
-
-        serialize().map_err(io::Error::other)?;
         self.logline_buffer.push(b'\n');
-        Ok(())
     }
 }
 
 /// Extracts the message field that's mixed will other fields.
-struct MessageFieldExtractor<S: serde::ser::SerializeMap> {
-    serializer: S,
+struct MessageFieldExtractor<'buf> {
+    serializer: Option<json::ValueSer<'buf>>,
     skipped_field_indices: SkippedFieldIndices,
-    state: Option<Result<(), S::Error>>,
 }
 
-impl<S: serde::ser::SerializeMap> MessageFieldExtractor<S> {
+impl<'buf> MessageFieldExtractor<'buf> {
     #[inline]
-    fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
+    fn new(serializer: json::ValueSer<'buf>, skipped_field_indices: SkippedFieldIndices) -> Self {
         Self {
-            serializer,
+            serializer: Some(serializer),
             skipped_field_indices,
-            state: None,
         }
     }
 
     #[inline]
-    fn into_serializer(mut self) -> Result<S, S::Error> {
-        match self.state {
-            Some(Ok(())) => {}
-            Some(Err(err)) => return Err(err),
-            None => self.serializer.serialize_value("")?,
+    fn finish(self) {
+        if let Some(ser) = self.serializer {
+            ser.value("");
         }
-        Ok(self.serializer)
     }
 
     #[inline]
-    fn accept_field(&self, field: &tracing::field::Field) -> bool {
-        self.state.is_none()
-            && field.name() == MESSAGE_FIELD
+    fn record_field(&mut self, field: &tracing::field::Field, v: impl json::ValueEncoder) {
+        if field.name() == MESSAGE_FIELD
             && !self.skipped_field_indices.contains(field.index())
+            && let Some(ser) = self.serializer.take()
+        {
+            ser.value(v);
+        }
     }
 }
 
-impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtractor<S> {
+impl tracing::field::Visit for MessageFieldExtractor<'_> {
     #[inline]
     fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&format_args!("{value:x?}")));
-        }
+        self.record_field(field, format_args!("{value:x?}"));
     }
 
     #[inline]
     fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&value));
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&format_args!("{value:?}")));
-        }
+        self.record_field(field, format_args!("{value:?}"));
     }
 
     #[inline]
@@ -742,147 +741,83 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldExtracto
         field: &tracing::field::Field,
         value: &(dyn std::error::Error + 'static),
     ) {
-        if self.accept_field(field) {
-            self.state = Some(self.serializer.serialize_value(&format_args!("{value}")));
-        }
-    }
-}
-
-/// Checks if there's any fields and field values present. If not, the JSON subobject
-/// can be skipped.
-// This is entirely optional and only cosmetic, though maybe helps a
-// bit during log parsing in dashboards when there's no field with empty object.
-struct FieldsPresent(pub bool, SkippedFieldIndices);
-
-// Even though some methods have an overhead (error, bytes) it is assumed the
-// compiler won't include this since we ignore the value entirely.
-impl tracing::field::Visit for FieldsPresent {
-    #[inline]
-    fn record_debug(&mut self, field: &tracing::field::Field, _: &dyn std::fmt::Debug) {
-        if !self.1.contains(field.index())
-            && field.name() != MESSAGE_FIELD
-            && !field.name().starts_with("log.")
-        {
-            self.0 |= true;
-        }
-    }
-}
-
-/// Serializes the fields directly supplied with a log event.
-struct SerializableEventFields<'a, 'event>(&'a tracing::Event<'event>, SkippedFieldIndices);
-
-impl serde::ser::Serialize for SerializableEventFields<'_, '_> {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: Serializer,
-    {
-        use serde::ser::SerializeMap;
-        let serializer = serializer.serialize_map(None)?;
-        let mut message_skipper = MessageFieldSkipper::new(serializer, self.1);
-        self.0.record(&mut message_skipper);
-        let serializer = message_skipper.into_serializer()?;
-        serializer.end()
+        self.record_field(field, format_args!("{value}"));
     }
 }
 
 /// A tracing field visitor that skips the message field.
-struct MessageFieldSkipper<S: serde::ser::SerializeMap> {
-    serializer: S,
+struct MessageFieldSkipper<'buf> {
+    serializer: json::ObjectSer<'buf>,
     skipped_field_indices: SkippedFieldIndices,
-    state: Result<(), S::Error>,
+    present: bool,
 }
 
-impl<S: serde::ser::SerializeMap> MessageFieldSkipper<S> {
+impl<'buf> MessageFieldSkipper<'buf> {
     #[inline]
-    fn new(serializer: S, skipped_field_indices: SkippedFieldIndices) -> Self {
+    fn new(serializer: json::ObjectSer<'buf>, skipped_field_indices: SkippedFieldIndices) -> Self {
         Self {
             serializer,
             skipped_field_indices,
-            state: Ok(()),
+            present: false,
         }
     }
 
     #[inline]
-    fn accept_field(&self, field: &tracing::field::Field) -> bool {
-        self.state.is_ok()
-            && field.name() != MESSAGE_FIELD
+    fn record_field(&mut self, field: &tracing::field::Field, v: impl json::ValueEncoder) {
+        if field.name() != MESSAGE_FIELD
             && !field.name().starts_with("log.")
             && !self.skipped_field_indices.contains(field.index())
-    }
-
-    #[inline]
-    fn into_serializer(self) -> Result<S, S::Error> {
-        self.state?;
-        Ok(self.serializer)
+        {
+            self.serializer.entry(field.name(), v);
+            self.present |= true;
+        }
     }
 }
 
-impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<S> {
+impl tracing::field::Visit for MessageFieldSkipper<'_> {
     #[inline]
     fn record_f64(&mut self, field: &tracing::field::Field, value: f64) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_i64(&mut self, field: &tracing::field::Field, value: i64) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_u64(&mut self, field: &tracing::field::Field, value: u64) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_i128(&mut self, field: &tracing::field::Field, value: i128) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_u128(&mut self, field: &tracing::field::Field, value: u128) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_bool(&mut self, field: &tracing::field::Field, value: bool) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_bytes(&mut self, field: &tracing::field::Field, value: &[u8]) {
-        if self.accept_field(field) {
-            self.state = self
-                .serializer
-                .serialize_entry(field.name(), &format_args!("{value:x?}"));
-        }
+        self.record_field(field, format_args!("{value:x?}"));
     }
 
     #[inline]
     fn record_str(&mut self, field: &tracing::field::Field, value: &str) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_entry(field.name(), &value);
-        }
+        self.record_field(field, value);
     }
 
     #[inline]
     fn record_debug(&mut self, field: &tracing::field::Field, value: &dyn std::fmt::Debug) {
-        if self.accept_field(field) {
-            self.state = self
-                .serializer
-                .serialize_entry(field.name(), &format_args!("{value:?}"));
-        }
+        self.record_field(field, format_args!("{value:?}"));
     }
 
     #[inline]
@@ -891,131 +826,40 @@ impl<S: serde::ser::SerializeMap> tracing::field::Visit for MessageFieldSkipper<
         field: &tracing::field::Field,
         value: &(dyn std::error::Error + 'static),
     ) {
-        if self.accept_field(field) {
-            self.state = self.serializer.serialize_value(&format_args!("{value}"));
-        }
-    }
-}
-
-/// Serializes the span stack from root to leaf (parent of event) as object
-/// with the span names as keys. To prevent collision we append a numberic value
-/// to the name. Also, collects any span fields we're interested in. Last one
-/// wins.
-struct SerializableSpans<'ctx, S>
-where
-    S: for<'lookup> LookupSpan<'lookup>,
-{
-    spans: Vec<SpanRef<'ctx, S>>,
-    extracted: ExtractedSpanFields,
-}
-
-impl<S> serde::ser::Serialize for SerializableSpans<'_, S>
-where
-    S: for<'lookup> LookupSpan<'lookup>,
-{
-    fn serialize<Ser>(&self, serializer: Ser) -> Result<Ser::Ok, Ser::Error>
-    where
-        Ser: serde::ser::Serializer,
-    {
-        let mut serializer = serializer.serialize_map(None)?;
-
-        for span in self.spans.iter().rev() {
-            let ext = span.extensions();
-
-            // all spans should have this extension.
-            let Some(fields) = ext.get() else { continue };
-
-            self.extracted.layer_span(fields);
-
-            let SpanFields { values, span_info } = fields;
-            serializer.serialize_entry(
-                &*span_info.normalized_name,
-                &SerializableSpanFields {
-                    fields: span.metadata().fields(),
-                    values,
-                },
-            )?;
-        }
-
-        serializer.end()
-    }
-}
-
-/// Serializes the span fields as object.
-struct SerializableSpanFields<'span> {
-    fields: &'span tracing::field::FieldSet,
-    values: &'span [serde_json::Value; MAX_TRACING_FIELDS],
-}
-
-impl serde::ser::Serialize for SerializableSpanFields<'_> {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::ser::Serializer,
-    {
-        let mut serializer = serializer.serialize_map(None)?;
-
-        for (field, value) in std::iter::zip(self.fields, self.values) {
-            if value.is_null() {
-                continue;
-            }
-            serializer.serialize_entry(field.name(), value)?;
-        }
-
-        serializer.end()
+        self.record_field(field, format_args!("{value}"));
     }
 }
 
 struct ExtractedSpanFields {
     names: &'static [&'static str],
-    values: RefCell<Vec<serde_json::Value>>,
+    values: Vec<Option<RawValue>>,
 }
 
 impl ExtractedSpanFields {
     fn new(names: &'static [&'static str]) -> Self {
         ExtractedSpanFields {
             names,
-            values: RefCell::new(vec![serde_json::Value::Null; names.len()]),
+            values: vec![None; names.len()],
         }
     }
 
-    fn layer_span(&self, fields: &SpanFields) {
-        let mut v = self.values.borrow_mut();
+    fn layer_span(&mut self, fields: &SpanFields) {
         let SpanFields { values, span_info } = fields;
 
         // extract the fields
         for (i, &j) in span_info.extract.iter().enumerate() {
-            let Some(value) = values.get(j) else { continue };
+            let Some(Some(value)) = values.get(j) else {
+                continue;
+            };
 
-            if !value.is_null() {
-                // TODO: replace clone with reference, if possible.
-                v[i] = value.clone();
-            }
+            // TODO: replace clone with reference, if possible.
+            self.values[i] = Some(value.clone());
         }
     }
 
     #[inline]
     fn has_values(&self) -> bool {
-        self.values.borrow().iter().any(|v| !v.is_null())
-    }
-}
-
-impl serde::ser::Serialize for ExtractedSpanFields {
-    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
-    where
-        S: serde::ser::Serializer,
-    {
-        let mut serializer = serializer.serialize_map(None)?;
-
-        let values = self.values.borrow();
-        for (key, value) in std::iter::zip(self.names, &*values) {
-            if value.is_null() {
-                continue;
-            }
-
-            serializer.serialize_entry(key, value)?;
-        }
-
-        serializer.end()
+        self.values.iter().any(|v| v.is_some())
     }
 }
 
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index bf4d5a11eb..916604e2ec 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -112,6 +112,9 @@ pub struct ProxyMetrics {
     /// Number of bytes sent/received between all clients and backends.
     pub io_bytes: CounterVec<StaticLabelSet<Direction>>,
 
+    /// Number of IO errors while logging.
+    pub logging_errors_count: Counter,
+
     /// Number of errors by a given classification.
     pub errors_total: CounterVec<StaticLabelSet<crate::error::ErrorKind>>,
 

From c71aea02238909e4107ef4f750a41b9e35ef4cc3 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Wed, 16 Jul 2025 14:29:18 +0100
Subject: [PATCH 130/163] proxy: for json logging, only use callsite IDs if
 span name is duplicated (#12625)

## Problem

We run multiple proxies, we get logs like

```
... spans={"http_conn#22":{"conn_id": ...
... spans={"http_conn#24":{"conn_id": ...
```

these are the same span, and the difference is confusing.

## Summary of changes

Introduce a counter per span name, rather than a global counter. If the
counter is 0, no change to the span name is made.

To follow up: see which span names are duplicated within the codebase in
different callsites
---
 proxy/src/logging.rs | 58 +++++++++++++++++++++++++++++---------------
 1 file changed, 39 insertions(+), 19 deletions(-)

diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
index a87b0f1175..d4fd826c13 100644
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -1,7 +1,6 @@
 use std::cell::RefCell;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::sync::atomic::{AtomicU32, Ordering};
 use std::{env, io};
 
 use chrono::{DateTime, Utc};
@@ -211,6 +210,9 @@ struct JsonLoggingLayer<C: Clock, W: MakeWriter> {
     /// tracks which fields of each **event** are duplicates
     skipped_field_indices: CallsiteMap<SkippedFieldIndices>,
 
+    /// tracks callsite names to an ID.
+    callsite_name_ids: papaya::HashMap<&'static str, u32, ahash::RandomState>,
+
     span_info: CallsiteMap<CallsiteSpanInfo>,
 
     /// Fields we want to keep track of in a separate json object.
@@ -223,6 +225,7 @@ impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
             clock,
             skipped_field_indices: CallsiteMap::default(),
             span_info: CallsiteMap::default(),
+            callsite_name_ids: papaya::HashMap::default(),
             writer,
             extract_fields,
         }
@@ -233,7 +236,7 @@ impl<C: Clock, W: MakeWriter> JsonLoggingLayer<C, W> {
         self.span_info
             .pin()
             .get_or_insert_with(metadata.callsite(), || {
-                CallsiteSpanInfo::new(metadata, self.extract_fields)
+                CallsiteSpanInfo::new(&self.callsite_name_ids, metadata, self.extract_fields)
             })
             .clone()
     }
@@ -345,10 +348,11 @@ struct CallsiteSpanInfo {
 }
 
 impl CallsiteSpanInfo {
-    fn new(metadata: &'static Metadata<'static>, extract_fields: &[&'static str]) -> Self {
-        // Start at 1 to reserve 0 for default.
-        static COUNTER: AtomicU32 = AtomicU32::new(1);
-
+    fn new(
+        callsite_name_ids: &papaya::HashMap<&'static str, u32, ahash::RandomState>,
+        metadata: &'static Metadata<'static>,
+        extract_fields: &[&'static str],
+    ) -> Self {
         let names: Vec<&'static str> = metadata.fields().iter().map(|f| f.name()).collect();
 
         // get all the indices of span fields we want to focus
@@ -361,8 +365,18 @@ impl CallsiteSpanInfo {
         // normalized_name is unique for each callsite, but it is not
         // unified across separate proxy instances.
         // todo: can we do better here?
-        let cid = COUNTER.fetch_add(1, Ordering::Relaxed);
-        let normalized_name = format!("{}#{cid}", metadata.name()).into();
+        let cid = *callsite_name_ids
+            .pin()
+            .update_or_insert(metadata.name(), |&cid| cid + 1, 0);
+
+        // we hope that most span names are unique, in which case this will always be 0
+        let normalized_name = if cid == 0 {
+            metadata.name().into()
+        } else {
+            // if the span name is not unique, add the numeric ID to span name to distinguish it.
+            // sadly this is non-determinstic, across restarts but we should fix it by disambiguating re-used span names instead.
+            format!("{}#{cid}", metadata.name()).into()
+        };
 
         Self {
             extract,
@@ -914,6 +928,7 @@ mod tests {
             clock: clock.clone(),
             skipped_field_indices: papaya::HashMap::default(),
             span_info: papaya::HashMap::default(),
+            callsite_name_ids: papaya::HashMap::default(),
             writer: buffer.clone(),
             extract_fields: &["x"],
         };
@@ -922,14 +937,16 @@ mod tests {
 
         tracing::subscriber::with_default(registry, || {
             info_span!("some_span", x = 24).in_scope(|| {
-                info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| {
-                    tracing::error!(
-                        a = 1,
-                        a = 2,
-                        a = 3,
-                        message = "explicit message field",
-                        "implicit message field"
-                    );
+                info_span!("some_other_span", y = 30).in_scope(|| {
+                    info_span!("some_span", x = 40, x = 41, x = 42).in_scope(|| {
+                        tracing::error!(
+                            a = 1,
+                            a = 2,
+                            a = 3,
+                            message = "explicit message field",
+                            "implicit message field"
+                        );
+                    });
                 });
             });
         });
@@ -948,12 +965,15 @@ mod tests {
                     "a": 3,
                 },
                 "spans": {
-                    "some_span#1":{
+                    "some_span":{
                         "x": 24,
                     },
-                    "some_span#2": {
+                    "some_other_span": {
+                        "y": 30,
+                    },
+                    "some_span#1": {
                         "x": 42,
-                    }
+                    },
                 },
                 "extract": {
                     "x": 42,

From 3e4cbaed6727f4440dc8711df59d3852c2f2f159 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 16 Jul 2025 15:37:40 +0100
Subject: [PATCH 131/163] storcon: validate intent state before applying
 optimization (#12593)

## Problem

In the gap between picking an optimization and applying it, something
might insert a change to the intent state that makes it incompatible.
If the change is done via the `schedule()` method, we are covered by the
increased sequence number, but otherwise we can panic if we violate the
intent state invariants.

## Summary of Changes

Validate the optimization right before applying it. Since we hold the
service lock at that point, nothing else can sneak in.

Closes LKB-65
---
 storage_controller/src/tenant_shard.rs        | 41 ++++++++++++++++++-
 .../performance/test_sharding_autosplit.py    |  5 +++
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 0bfca5385e..99079c57b0 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1272,7 +1272,9 @@ impl TenantShard {
     }
 
     /// Return true if the optimization was really applied: it will not be applied if the optimization's
-    /// sequence is behind this tenant shard's
+    /// sequence is behind this tenant shard's or if the intent state proposed by the optimization
+    /// is not compatible with the current intent state. The later may happen when the background
+    /// reconcile loops runs concurrently with HTTP driven optimisations.
     pub(crate) fn apply_optimization(
         &mut self,
         scheduler: &mut Scheduler,
@@ -1282,6 +1284,15 @@ impl TenantShard {
             return false;
         }
 
+        if !self.validate_optimization(&optimization) {
+            tracing::info!(
+                "Skipping optimization for {} because it does not match current intent: {:?}",
+                self.tenant_shard_id,
+                optimization,
+            );
+            return false;
+        }
+
         metrics::METRICS_REGISTRY
             .metrics_group
             .storage_controller_schedule_optimization
@@ -1322,6 +1333,34 @@ impl TenantShard {
         true
     }
 
+    /// Check that the desired modifications to the intent state are compatible with
+    /// the current intent state
+    fn validate_optimization(&self, optimization: &ScheduleOptimization) -> bool {
+        match optimization.action {
+            ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id,
+                new_attached_node_id,
+            }) => {
+                self.intent.attached == Some(old_attached_node_id)
+                    && self.intent.secondary.contains(&new_attached_node_id)
+            }
+            ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
+                old_node_id: _,
+                new_node_id,
+            }) => {
+                // It's legal to remove a secondary that is not present in the intent state
+                !self.intent.secondary.contains(&new_node_id)
+            }
+            ScheduleOptimizationAction::CreateSecondary(new_node_id) => {
+                !self.intent.secondary.contains(&new_node_id)
+            }
+            ScheduleOptimizationAction::RemoveSecondary(_) => {
+                // It's legal to remove a secondary that is not present in the intent state
+                true
+            }
+        }
+    }
+
     /// When a shard has several secondary locations, we need to pick one in situations where
     /// we promote one of them to an attached location:
     ///  - When draining a node for restart
diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py
index 0bb210db23..1b77831b75 100644
--- a/test_runner/performance/test_sharding_autosplit.py
+++ b/test_runner/performance/test_sharding_autosplit.py
@@ -73,6 +73,11 @@ def test_sharding_autosplit(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
             ".*Local notification hook failed.*",
             ".*Marking shard.*for notification retry.*",
             ".*Failed to notify compute.*",
+            # As an optimization, the storage controller kicks the downloads on the secondary
+            # after the shard split. However, secondaries are created async, so it's possible
+            # that the intent state was modified, but the actual secondary hasn't been created,
+            # which results in an error.
+            ".*Error calling secondary download after shard split.*",
         ]
     )
 

From 8b18d8b31b608a54ce936f5eb893e2ae11a52a04 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Wed, 16 Jul 2025 15:43:17 +0100
Subject: [PATCH 132/163] safekeeper: add global disk usage utilization limit
 (#12605)

N.B: No-op for the neon-env.

## Problem

We added a per-timeline disk utilization protection circuit breaker,
which will stop the safekeeper from accepting more WAL writes if the
disk utilization by the timeline has exceeded a configured limit. We
mainly designed the mechanism as a guard against WAL upload/backup bugs,
and we assumed that as long as WAL uploads are proceeding as normal we
will not run into disk pressure. This turned out to be not true. In one
of our load tests where we have 500 PGs ingesting data at the same time,
safekeeper disk utilization started to creep up even though WAL uploads
were completely normal (we likely just maxed out our S3 upload bandwidth
from the single SK). This means the per-timeline disk utilization
protection won't be enough if too many timelines are ingesting data at
the same time.

## Summary of changes

Added a global disk utilization protection circuit breaker which will
stop a safekeeper from accepting more WAL writes if the total disk usage
on the safekeeper (across all tenants) exceeds a limit. We implemented
this circuit breaker through two parts:

1. A "global disk usage watcher" background task that runs at a
configured interval (default every minute) to see how much disk space is
being used in the safekeeper's filesystem. This background task also
performs the check against the limit and publishes the result to a
global atomic boolean flag.
2. The `hadron_check_disk_usage()` routine (in `timeline.rs`) now also
checks this global boolean flag published in the step above, and fails
the `WalAcceptor` (triggers the circuit breaker) if the flag was raised.

The disk usage limit is disabled by default.
It can be tuned with the `--max-global-disk-usage-ratio` CLI arg.

## How is this tested?

Added integration test
`test_wal_acceptor.py::test_global_disk_usage_limit`.

Also noticed that I haven't been using the `wait_until(f)` test function
correctly (the `f` passed in is supposed to raise an exception if the
condition is not met, instead of returning `False`...). Fixed it in both
circuit breaker tests.

---------

Co-authored-by: William Huang <william.huang@databricks.com>
---
 Cargo.lock                                    |  1 +
 safekeeper/Cargo.toml                         |  1 +
 safekeeper/src/bin/safekeeper.rs              | 65 +++++++++++++-
 safekeeper/src/hadron.rs                      | 75 +++++++++++++++-
 safekeeper/src/http/routes.rs                 | 22 +++++
 safekeeper/src/lib.rs                         | 18 +++-
 safekeeper/src/metrics.rs                     | 14 +++
 safekeeper/src/timeline.rs                    |  7 ++
 .../tests/walproposer_sim/safekeeper.rs       |  2 +
 test_runner/regress/test_wal_acceptor.py      | 87 ++++++++++++++++++-
 10 files changed, 284 insertions(+), 8 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3474211ac6..e5f39658a7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6204,6 +6204,7 @@ dependencies = [
  "itertools 0.10.5",
  "jsonwebtoken",
  "metrics",
+ "nix 0.30.1",
  "once_cell",
  "pageserver_api",
  "parking_lot 0.12.1",
diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 539e931983..56822b5c25 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -72,6 +72,7 @@ http-utils.workspace = true
 utils.workspace = true
 wal_decoder.workspace = true
 env_logger.workspace = true
+nix.workspace = true
 
 workspace_hack.workspace = true
 
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 79cf2f9149..2ec541b6f0 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -17,8 +17,9 @@ use http_utils::tls_certs::ReloadingCertificateResolver;
 use metrics::set_build_info_metric;
 use remote_storage::RemoteStorageConfig;
 use safekeeper::defaults::{
-    DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT, DEFAULT_HEARTBEAT_TIMEOUT,
-    DEFAULT_HTTP_LISTEN_ADDR, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
+    DEFAULT_CONTROL_FILE_SAVE_INTERVAL, DEFAULT_EVICTION_MIN_RESIDENT,
+    DEFAULT_GLOBAL_DISK_CHECK_INTERVAL, DEFAULT_HEARTBEAT_TIMEOUT, DEFAULT_HTTP_LISTEN_ADDR,
+    DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO, DEFAULT_MAX_OFFLOADER_LAG_BYTES,
     DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES, DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES,
     DEFAULT_PARTIAL_BACKUP_CONCURRENCY, DEFAULT_PARTIAL_BACKUP_TIMEOUT, DEFAULT_PG_LISTEN_ADDR,
     DEFAULT_SSL_CERT_FILE, DEFAULT_SSL_CERT_RELOAD_PERIOD, DEFAULT_SSL_KEY_FILE,
@@ -42,6 +43,12 @@ use utils::metrics_collector::{METRICS_COLLECTION_INTERVAL, METRICS_COLLECTOR};
 use utils::sentry_init::init_sentry;
 use utils::{pid_file, project_build_tag, project_git_version, tcp_listener};
 
+use safekeeper::hadron::{
+    GLOBAL_DISK_LIMIT_EXCEEDED, get_filesystem_capacity, get_filesystem_usage,
+};
+use safekeeper::metrics::GLOBAL_DISK_UTIL_CHECK_SECONDS;
+use std::sync::atomic::Ordering;
+
 #[global_allocator]
 static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
 
@@ -256,6 +263,15 @@ struct Args {
     /* BEGIN_HADRON */
     #[arg(long)]
     enable_pull_timeline_on_startup: bool,
+    /// How often to scan entire data-dir for total disk usage
+    #[arg(long, value_parser=humantime::parse_duration, default_value = DEFAULT_GLOBAL_DISK_CHECK_INTERVAL)]
+    global_disk_check_interval: Duration,
+    /// The portion of the filesystem capacity that can be used by all timelines.
+    /// A circuit breaker will trip and reject all WAL writes if the total usage
+    /// exceeds this ratio.
+    /// Set to 0 to disable the global disk usage limit.
+    #[arg(long, default_value_t = DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO)]
+    max_global_disk_usage_ratio: f64,
     /* END_HADRON */
 }
 
@@ -444,6 +460,8 @@ async fn main() -> anyhow::Result<()> {
         advertise_pg_addr_tenant_only: None,
         enable_pull_timeline_on_startup: args.enable_pull_timeline_on_startup,
         hcc_base_url: None,
+        global_disk_check_interval: args.global_disk_check_interval,
+        max_global_disk_usage_ratio: args.max_global_disk_usage_ratio,
         /* END_HADRON */
     });
 
@@ -618,6 +636,49 @@ async fn start_safekeeper(conf: Arc<SafeKeeperConf>) -> Result<()> {
         .map(|res| ("Timeline map housekeeping".to_owned(), res));
     tasks_handles.push(Box::pin(timeline_housekeeping_handle));
 
+    /* BEGIN_HADRON */
+    // Spawn global disk usage watcher task, if a global disk usage limit is specified.
+    let interval = conf.global_disk_check_interval;
+    let data_dir = conf.workdir.clone();
+    // Use the safekeeper data directory to compute filesystem capacity. This only runs once on startup, so
+    // there is little point to continue if we can't have the proper protections in place.
+    let fs_capacity_bytes = get_filesystem_capacity(data_dir.as_std_path())
+        .expect("Failed to get filesystem capacity for data directory");
+    let limit: u64 = (conf.max_global_disk_usage_ratio * fs_capacity_bytes as f64) as u64;
+    if limit > 0 {
+        let disk_usage_watch_handle = BACKGROUND_RUNTIME
+            .handle()
+            .spawn(async move {
+                // Use Tokio interval to preserve fixed cadence between filesystem utilization checks
+                let mut ticker = tokio::time::interval(interval);
+                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
+
+                loop {
+                    ticker.tick().await;
+                    let data_dir_clone = data_dir.clone();
+                    let check_start = Instant::now();
+
+                    let usage = tokio::task::spawn_blocking(move || {
+                        get_filesystem_usage(data_dir_clone.as_std_path())
+                    })
+                    .await
+                    .unwrap_or(0);
+
+                    let elapsed = check_start.elapsed().as_secs_f64();
+                    GLOBAL_DISK_UTIL_CHECK_SECONDS.observe(elapsed);
+                    if usage > limit {
+                        warn!(
+                            "Global disk usage exceeded limit. Usage: {} bytes, limit: {} bytes",
+                            usage, limit
+                        );
+                    }
+                    GLOBAL_DISK_LIMIT_EXCEEDED.store(usage > limit, Ordering::Relaxed);
+                }
+            })
+            .map(|res| ("Global disk usage watcher".to_string(), res));
+        tasks_handles.push(Box::pin(disk_usage_watch_handle));
+    }
+    /* END_HADRON */
     if let Some(pg_listener_tenant_only) = pg_listener_tenant_only {
         let wal_service_handle = current_thread_rt
             .as_ref()
diff --git a/safekeeper/src/hadron.rs b/safekeeper/src/hadron.rs
index b41bf2c3da..8c6a912166 100644
--- a/safekeeper/src/hadron.rs
+++ b/safekeeper/src/hadron.rs
@@ -1,12 +1,17 @@
+use once_cell::sync::Lazy;
 use pem::Pem;
 use safekeeper_api::models::PullTimelineRequest;
-use std::{collections::HashMap, env::VarError, net::IpAddr, sync::Arc, time::Duration};
+use std::{
+    collections::HashMap, env::VarError, net::IpAddr, sync::Arc, sync::atomic::AtomicBool,
+    time::Duration,
+};
 use tokio::time::sleep;
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{backoff, id::TenantTimelineId, ip_address};
+use utils::{backoff, critical_timeline, id::TenantTimelineId, ip_address};
+
+use anyhow::{Result, anyhow};
 
-use anyhow::Result;
 use pageserver_api::controller_api::{
     AvailabilityZone, NodeRegisterRequest, SafekeeperTimeline, SafekeeperTimelinesResponse,
 };
@@ -346,6 +351,70 @@ pub async fn hcc_pull_timelines(
     Ok(())
 }
 
+/// true if the last background scan found total usage > limit
+pub static GLOBAL_DISK_LIMIT_EXCEEDED: Lazy<AtomicBool> = Lazy::new(|| AtomicBool::new(false));
+
+/// Returns filesystem usage in bytes for the filesystem containing the given path.
+// Need to suppress the clippy::unnecessary_cast warning because the casts on the block count and the
+// block size are required on macOS (they are 32-bit integers on macOS, apparantly).
+#[allow(clippy::unnecessary_cast)]
+pub fn get_filesystem_usage(path: &std::path::Path) -> u64 {
+    // Allow overriding disk usage via failpoint for tests
+    fail::fail_point!("sk-global-disk-usage", |val| {
+        // val is Option<String>; parse payload if present
+        val.and_then(|s| s.parse::<u64>().ok()).unwrap_or(0)
+    });
+
+    // Call statvfs(3) for filesystem usage
+    use nix::sys::statvfs::statvfs;
+    match statvfs(path) {
+        Ok(stat) => {
+            // fragment size (f_frsize) if non-zero else block size (f_bsize)
+            let frsize = stat.fragment_size();
+            let blocksz = if frsize > 0 {
+                frsize
+            } else {
+                stat.block_size()
+            };
+            // used blocks = total blocks - available blocks for unprivileged
+            let used_blocks = stat.blocks().saturating_sub(stat.blocks_available());
+            used_blocks as u64 * blocksz as u64
+        }
+        Err(e) => {
+            // The global disk usage watcher aren't associated with a tenant or timeline, so we just
+            // pass placeholder (all-zero) tenant and timeline IDs to the critical!() macro.
+            let placeholder_ttid = TenantTimelineId::empty();
+            critical_timeline!(
+                placeholder_ttid.tenant_id,
+                placeholder_ttid.timeline_id,
+                "Global disk usage watcher failed to read filesystem usage: {:?}",
+                e
+            );
+            0
+        }
+    }
+}
+
+/// Returns the total capacity of the current working directory's filesystem in bytes.
+#[allow(clippy::unnecessary_cast)]
+pub fn get_filesystem_capacity(path: &std::path::Path) -> Result<u64> {
+    // Call statvfs(3) for filesystem stats
+    use nix::sys::statvfs::statvfs;
+    match statvfs(path) {
+        Ok(stat) => {
+            // fragment size (f_frsize) if non-zero else block size (f_bsize)
+            let frsize = stat.fragment_size();
+            let blocksz = if frsize > 0 {
+                frsize
+            } else {
+                stat.block_size()
+            };
+            Ok(stat.blocks() as u64 * blocksz as u64)
+        }
+        Err(e) => Err(anyhow!("Failed to read filesystem capacity: {:?}", e)),
+    }
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index a0ee2facb5..c9d8e7d3b0 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -33,11 +33,13 @@ use utils::id::{TenantId, TenantTimelineId, TimelineId};
 use utils::lsn::Lsn;
 
 use crate::debug_dump::TimelineDigestRequest;
+use crate::hadron::{get_filesystem_capacity, get_filesystem_usage};
 use crate::safekeeper::TermLsn;
 use crate::timelines_global_map::DeleteOrExclude;
 use crate::{
     GlobalTimelines, SafeKeeperConf, copy_timeline, debug_dump, patch_control_file, pull_timeline,
 };
+use serde_json::json;
 
 /// Healthcheck handler.
 async fn status_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -127,6 +129,21 @@ async fn utilization_handler(request: Request<Body>) -> Result<Response<Body>, A
     json_response(StatusCode::OK, utilization)
 }
 
+/// Returns filesystem capacity and current utilization for the safekeeper data directory.
+async fn filesystem_usage_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
+    check_permission(&request, None)?;
+    let conf = get_conf(&request);
+    let path = conf.workdir.as_std_path();
+    let capacity = get_filesystem_capacity(path).map_err(ApiError::InternalServerError)?;
+    let usage = get_filesystem_usage(path);
+    let resp = json!({
+        "data_dir": path,
+        "capacity_bytes": capacity,
+        "usage_bytes": usage,
+    });
+    json_response(StatusCode::OK, resp)
+}
+
 /// List all (not deleted) timelines.
 /// Note: it is possible to do the same with debug_dump.
 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -730,6 +747,11 @@ pub fn make_router(
             })
         })
         .get("/v1/utilization", |r| request_span(r, utilization_handler))
+        /* BEGIN_HADRON */
+        .get("/v1/debug/filesystem_usage", |r| {
+            request_span(r, filesystem_usage_handler)
+        })
+        /* END_HADRON */
         .delete("/v1/tenant/:tenant_id", |r| {
             request_span(r, tenant_delete_handler)
         })
diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs
index 02533b804d..c6f9cc29e5 100644
--- a/safekeeper/src/lib.rs
+++ b/safekeeper/src/lib.rs
@@ -50,6 +50,7 @@ pub mod wal_storage;
 pub mod test_utils;
 
 mod timelines_global_map;
+
 use std::sync::Arc;
 
 pub use timelines_global_map::GlobalTimelines;
@@ -83,6 +84,10 @@ pub mod defaults {
     pub const DEFAULT_SSL_KEY_FILE: &str = "server.key";
     pub const DEFAULT_SSL_CERT_FILE: &str = "server.crt";
     pub const DEFAULT_SSL_CERT_RELOAD_PERIOD: &str = "60s";
+
+    // Global disk watcher defaults
+    pub const DEFAULT_GLOBAL_DISK_CHECK_INTERVAL: &str = "60s";
+    pub const DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO: f64 = 0.0;
 }
 
 #[derive(Debug, Clone)]
@@ -116,6 +121,10 @@ pub struct SafeKeeperConf {
     /* BEGIN_HADRON */
     pub max_reelect_offloader_lag_bytes: u64,
     pub max_timeline_disk_usage_bytes: u64,
+    /// How often to check the working directory's filesystem for total disk usage.
+    pub global_disk_check_interval: Duration,
+    /// The portion of the filesystem capacity that can be used by all timelines.
+    pub max_global_disk_usage_ratio: f64,
     /* END_HADRON */
     pub backup_parallel_jobs: usize,
     pub wal_backup_enabled: bool,
@@ -173,6 +182,8 @@ impl SafeKeeperConf {
             /* BEGIN_HADRON */
             max_reelect_offloader_lag_bytes: defaults::DEFAULT_MAX_REELECT_OFFLOADER_LAG_BYTES,
             max_timeline_disk_usage_bytes: defaults::DEFAULT_MAX_TIMELINE_DISK_USAGE_BYTES,
+            global_disk_check_interval: Duration::from_secs(60),
+            max_global_disk_usage_ratio: defaults::DEFAULT_MAX_GLOBAL_DISK_USAGE_RATIO,
             /* END_HADRON */
             current_thread_runtime: false,
             walsenders_keep_horizon: false,
@@ -235,10 +246,13 @@ pub static WAL_BACKUP_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
         .expect("Failed to create WAL backup runtime")
 });
 
+/// Hadron: Dedicated runtime for infrequent background tasks.
 pub static BACKGROUND_RUNTIME: Lazy<Runtime> = Lazy::new(|| {
     tokio::runtime::Builder::new_multi_thread()
-        .thread_name("background worker")
-        .worker_threads(1) // there is only one task now (ssl certificate reloading), having more threads doesn't make sense
+        .thread_name("Hadron background worker")
+        // One worker thread is enough, as most of the actual tasks run on blocking threads
+        // which has it own thread pool.
+        .worker_threads(1)
         .enable_all()
         .build()
         .expect("Failed to create background runtime")
diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs
index e1af51c115..b07852aaee 100644
--- a/safekeeper/src/metrics.rs
+++ b/safekeeper/src/metrics.rs
@@ -963,3 +963,17 @@ async fn collect_timeline_metrics(global_timelines: Arc<GlobalTimelines>) -> Vec
     }
     res
 }
+
+/* BEGIN_HADRON */
+// Metrics reporting the time spent to perform each safekeeper filesystem utilization check.
+pub static GLOBAL_DISK_UTIL_CHECK_SECONDS: Lazy<Histogram> = Lazy::new(|| {
+    // Buckets from 1ms up to 10s
+    let buckets = vec![0.001, 0.01, 0.1, 0.5, 1.0, 2.0, 5.0, 10.0];
+    register_histogram!(
+        "safekeeper_global_disk_utilization_check_seconds",
+        "Seconds spent to perform each safekeeper filesystem utilization check",
+        buckets
+    )
+    .expect("Failed to register safekeeper_global_disk_utilization_check_seconds histogram")
+});
+/* END_HADRON */
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index dbe510a019..a1a0aab9fd 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -29,6 +29,8 @@ use utils::sync::gate::Gate;
 use crate::metrics::{
     FullTimelineInfo, MISC_OPERATION_SECONDS, WAL_STORAGE_LIMIT_ERRORS, WalStorageMetrics,
 };
+
+use crate::hadron::GLOBAL_DISK_LIMIT_EXCEEDED;
 use crate::rate_limit::RateLimiter;
 use crate::receive_wal::WalReceivers;
 use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn};
@@ -1081,6 +1083,11 @@ impl WalResidentTimeline {
                 );
             }
         }
+
+        if GLOBAL_DISK_LIMIT_EXCEEDED.load(Ordering::Relaxed) {
+            bail!("Global disk usage exceeded limit");
+        }
+
         Ok(())
     }
     // END HADRON
diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs
index 393df6228e..30d3ab1a87 100644
--- a/safekeeper/tests/walproposer_sim/safekeeper.rs
+++ b/safekeeper/tests/walproposer_sim/safekeeper.rs
@@ -195,6 +195,8 @@ pub fn run_server(os: NodeOs, disk: Arc<SafekeeperDisk>) -> Result<()> {
         enable_pull_timeline_on_startup: false,
         advertise_pg_addr_tenant_only: None,
         hcc_base_url: None,
+        global_disk_check_interval: Duration::from_secs(10),
+        max_global_disk_usage_ratio: 0.0,
         /* END_HADRON */
     };
 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 22e6d2e1c3..c691087259 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -2788,7 +2788,8 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
 
     # Wait for the error message to appear in the compute log
     def error_logged():
-        return endpoint.log_contains("WAL storage utilization exceeds configured limit") is not None
+        if endpoint.log_contains("WAL storage utilization exceeds configured limit") is None:
+            raise Exception("Expected error message not found in compute log yet")
 
     wait_until(error_logged)
     log.info("Found expected error message in compute log, resuming.")
@@ -2822,3 +2823,87 @@ def test_timeline_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
             cur.execute("select count(*) from t")
             # 2000 rows from first insert + 1000 from last insert
             assert cur.fetchone() == (3000,)
+
+
+def test_global_disk_usage_limit(neon_env_builder: NeonEnvBuilder):
+    """
+    Similar to `test_timeline_disk_usage_limit`, but test that the global disk usage circuit breaker
+    also works as expected. The test scenario:
+    1. Create a timeline and endpoint.
+    2. Mock high disk usage via failpoint
+    3. Write data to the timeline so that disk usage exceeds the limit.
+    4. Verify that the writes hang and the expected error message appears in the compute log.
+    5. Mock low disk usage via failpoint
+    6. Verify that the hanging writes unblock and we can continue to write as normal.
+    """
+    neon_env_builder.num_safekeepers = 1
+    remote_storage_kind = s3_storage()
+    neon_env_builder.enable_safekeeper_remote_storage(remote_storage_kind)
+
+    env = neon_env_builder.init_start()
+
+    env.create_branch("test_global_disk_usage_limit")
+    endpoint = env.endpoints.create_start("test_global_disk_usage_limit")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("create table t2(key int, value text)")
+
+    for sk in env.safekeepers:
+        sk.stop().start(
+            extra_opts=["--global-disk-check-interval=1s", "--max-global-disk-usage-ratio=0.8"]
+        )
+
+    # Set the failpoint to have the disk usage check return u64::MAX, which definitely exceeds the practical
+    # limits in the test environment.
+    for sk in env.safekeepers:
+        sk.http_client().configure_failpoints(
+            [("sk-global-disk-usage", "return(18446744073709551615)")]
+        )
+
+    # Wait until the global disk usage limit watcher trips the circuit breaker.
+    def error_logged_in_sk():
+        for sk in env.safekeepers:
+            if sk.log_contains("Global disk usage exceeded limit") is None:
+                raise Exception("Expected error message not found in safekeeper log yet")
+
+    wait_until(error_logged_in_sk)
+
+    def run_hanging_insert_global():
+        with closing(endpoint.connect()) as bg_conn:
+            with bg_conn.cursor() as bg_cur:
+                # This should generate more than 1KiB of WAL
+                bg_cur.execute("insert into t2 select generate_series(1,2000), 'payload'")
+
+    bg_thread_global = threading.Thread(target=run_hanging_insert_global)
+    bg_thread_global.start()
+
+    def error_logged_in_compute():
+        if endpoint.log_contains("Global disk usage exceeded limit") is None:
+            raise Exception("Expected error message not found in compute log yet")
+
+    wait_until(error_logged_in_compute)
+    log.info("Found the expected error message in compute log, resuming.")
+
+    time.sleep(2)
+    assert bg_thread_global.is_alive(), "Global hanging insert unblocked prematurely!"
+
+    # Make the disk usage check always return 0 through the failpoint to simulate the disk pressure easing.
+    # The SKs should resume accepting WAL writes without restarting.
+    for sk in env.safekeepers:
+        sk.http_client().configure_failpoints([("sk-global-disk-usage", "return(0)")])
+
+    bg_thread_global.join(timeout=120)
+    assert not bg_thread_global.is_alive(), "Hanging global insert did not complete after restart"
+    log.info("Global hanging insert unblocked.")
+
+    # Verify that we can continue to write as normal and we don't have obvious data corruption
+    # following the recovery.
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("insert into t2 select generate_series(2001,3000), 'payload'")
+
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("select count(*) from t2")
+            assert cur.fetchone() == (3000,)

From 1178f6fe7c1a7359acda31a499e821c3429bbe65 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Wed, 16 Jul 2025 19:02:01 +0400
Subject: [PATCH 133/163] pageserver: Downgrade log level of 'No broker
 updates' (#12627)

## Problem

The warning message was seen during deployment, but it's actually OK.

## Summary of changes

- Treat `"No broker updates received for a while ..."` as an info
message.

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
---
 .../src/tenant/timeline/walreceiver/connection_manager.rs     | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index aba94244a3..f33f47a956 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -184,7 +184,7 @@ pub(super) async fn connection_manager_loop_step(
 
             // If we've not received any updates from the broker from a while, are waiting for WAL
             // and have no safekeeper connection or connection candidates, then it might be that
-            // the broker subscription is wedged. Drop the currrent subscription and re-subscribe
+            // the broker subscription is wedged. Drop the current subscription and re-subscribe
             // with the goal of unblocking it.
             _ = broker_reset_interval.tick() => {
                 let awaiting_lsn = wait_lsn_status.borrow().is_some();
@@ -192,7 +192,7 @@ pub(super) async fn connection_manager_loop_step(
                 let no_connection = connection_manager_state.wal_connection.is_none();
 
                 if awaiting_lsn && no_candidates && no_connection {
-                    tracing::warn!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
+                    tracing::info!("No broker updates received for a while, but waiting for WAL. Re-setting stream ...");
                     broker_subscription = subscribe_for_timeline_updates(broker_client, id, cancel).await?;
                 }
             },

From 80e5771c675ffcac2025664fef002c9d3332cbf5 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 16 Jul 2025 11:51:20 -0400
Subject: [PATCH 134/163] fix(storcon): passthrough 404 as 503 during
 migrations (#12620)

## Problem

close LKB-270, close LKB-253

We periodically saw pageserver returns 404 -> storcon converts it to 500
to cplane, and causing branch operations fail. This is due to storcon is
migrating tenants across pageservers and the request was forwarded from
the storcon to pageservers while the tenant was not attached yet. Such
operations should be retried from cplane and storcon should return 503
in such cases.

## Summary of changes

- Refactor `tenant_timeline_lsn_lease` to have a single function process
and passthrough such requests: `collect_tenant_shards` for collecting
all shards and checking if they're consistent with the observed state,
`process_result_and_passthrough_errors` to convert 404 into 503 if
necessary.
- `tenant_shard_node` also checks observed state now.

Note that for passthrough shard0, we originally had a check to convert
404 to 503:

```
    // Transform 404 into 503 if we raced with a migration
    if resp.status() == reqwest::StatusCode::NOT_FOUND {
        // Look up node again: if we migrated it will be different
        let new_node = service.tenant_shard_node(tenant_shard_id).await?;
        if new_node.get_id() != node.get_id() {
            // Rather than retry here, send the client a 503 to prompt a retry: this matches
            // the pageserver's use of 503, and all clients calling this API should retry on 503.
            return Err(ApiError::ResourceUnavailable(
                format!("Pageserver {node} returned 404, was migrated to {new_node}").into(),
            ));
        }
    }
```

However, this only checks the intent state. It is possible that the
migration is in progress before/after the request is processed and
intent state is always the same throughout the API call, therefore 404
not being processed by this branch.

Also, not sure about if this new code is correct or not, need second
eyes on that:

```
// As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent.
Ok((node.clone(), false))
```

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_controller/src/http.rs    |  46 ++++---
 storage_controller/src/service.rs | 192 +++++++++++++++++++-----------
 2 files changed, 141 insertions(+), 97 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 62fc212e12..c8227f0219 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -735,15 +735,13 @@ async fn handle_tenant_timeline_passthrough(
     );
 
     // Find the node that holds shard zero
-    let (node, tenant_shard_id) = if tenant_or_shard_id.is_unsharded() {
+    let (node, tenant_shard_id, consistent) = if tenant_or_shard_id.is_unsharded() {
         service
             .tenant_shard0_node(tenant_or_shard_id.tenant_id)
             .await?
     } else {
-        (
-            service.tenant_shard_node(tenant_or_shard_id).await?,
-            tenant_or_shard_id,
-        )
+        let (node, consistent) = service.tenant_shard_node(tenant_or_shard_id).await?;
+        (node, tenant_or_shard_id, consistent)
     };
 
     // Callers will always pass an unsharded tenant ID.  Before proxying, we must
@@ -788,16 +786,12 @@ async fn handle_tenant_timeline_passthrough(
     }
 
     // Transform 404 into 503 if we raced with a migration
-    if resp.status() == reqwest::StatusCode::NOT_FOUND {
-        // Look up node again: if we migrated it will be different
-        let new_node = service.tenant_shard_node(tenant_shard_id).await?;
-        if new_node.get_id() != node.get_id() {
-            // Rather than retry here, send the client a 503 to prompt a retry: this matches
-            // the pageserver's use of 503, and all clients calling this API should retry on 503.
-            return Err(ApiError::ResourceUnavailable(
-                format!("Pageserver {node} returned 404, was migrated to {new_node}").into(),
-            ));
-        }
+    if resp.status() == reqwest::StatusCode::NOT_FOUND && !consistent {
+        // Rather than retry here, send the client a 503 to prompt a retry: this matches
+        // the pageserver's use of 503, and all clients calling this API should retry on 503.
+        return Err(ApiError::ResourceUnavailable(
+            format!("Pageserver {node} returned 404 due to ongoing migration, retry later").into(),
+        ));
     }
 
     // We have a reqest::Response, would like a http::Response
@@ -2597,6 +2591,17 @@ pub fn make_router(
                 )
             },
         )
+        // Tenant timeline mark_invisible passthrough to shard zero
+        .put(
+            "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible",
+            |r| {
+                tenant_service_handler(
+                    r,
+                    handle_tenant_timeline_passthrough,
+                    RequestName("v1_tenant_timeline_mark_invisible_passthrough"),
+                )
+            },
+        )
         // Tenant detail GET passthrough to shard zero:
         .get("/v1/tenant/:tenant_id", |r| {
             tenant_service_handler(
@@ -2615,17 +2620,6 @@ pub fn make_router(
                 RequestName("v1_tenant_passthrough"),
             )
         })
-        // Tenant timeline mark_invisible passthrough to shard zero
-        .put(
-            "/v1/tenant/:tenant_id/timeline/:timeline_id/mark_invisible",
-            |r| {
-                tenant_service_handler(
-                    r,
-                    handle_tenant_timeline_passthrough,
-                    RequestName("v1_tenant_timeline_mark_invisible_passthrough"),
-                )
-            },
-        )
 }
 
 #[cfg(test)]
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 638cb410fa..0c5d7f44d4 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -207,6 +207,27 @@ enum ShardGenerationValidity {
     },
 }
 
+/// We collect the state of attachments for some operations to determine if the operation
+/// needs to be retried when it fails.
+struct TenantShardAttachState {
+    /// The targets of the operation.
+    ///
+    /// Tenant shard ID, node ID, node, is intent node observed primary.
+    targets: Vec<(TenantShardId, NodeId, Node, bool)>,
+
+    /// The targets grouped by node ID.
+    by_node_id: HashMap<NodeId, (TenantShardId, Node, bool)>,
+}
+
+impl TenantShardAttachState {
+    fn for_api_call(&self) -> Vec<(TenantShardId, Node)> {
+        self.targets
+            .iter()
+            .map(|(tenant_shard_id, _, node, _)| (*tenant_shard_id, node.clone()))
+            .collect()
+    }
+}
+
 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
@@ -4752,6 +4773,86 @@ impl Service {
         Ok(())
     }
 
+    fn is_observed_consistent_with_intent(
+        &self,
+        shard: &TenantShard,
+        intent_node_id: NodeId,
+    ) -> bool {
+        if let Some(location) = shard.observed.locations.get(&intent_node_id)
+            && let Some(ref conf) = location.conf
+            && (conf.mode == LocationConfigMode::AttachedSingle
+                || conf.mode == LocationConfigMode::AttachedMulti)
+        {
+            true
+        } else {
+            false
+        }
+    }
+
+    fn collect_tenant_shards(
+        &self,
+        tenant_id: TenantId,
+    ) -> Result<TenantShardAttachState, ApiError> {
+        let locked = self.inner.read().unwrap();
+        let mut targets = Vec::new();
+        let mut by_node_id = HashMap::new();
+
+        // If the request got an unsharded tenant id, then apply
+        // the operation to all shards. Otherwise, apply it to a specific shard.
+        let shards_range = TenantShardId::tenant_range(tenant_id);
+
+        for (tenant_shard_id, shard) in locked.tenants.range(shards_range) {
+            if let Some(node_id) = shard.intent.get_attached() {
+                let node = locked
+                    .nodes
+                    .get(node_id)
+                    .expect("Pageservers may not be deleted while referenced");
+
+                let consistent = self.is_observed_consistent_with_intent(shard, *node_id);
+
+                targets.push((*tenant_shard_id, *node_id, node.clone(), consistent));
+                by_node_id.insert(*node_id, (*tenant_shard_id, node.clone(), consistent));
+            }
+        }
+
+        Ok(TenantShardAttachState {
+            targets,
+            by_node_id,
+        })
+    }
+
+    fn process_result_and_passthrough_errors<T>(
+        &self,
+        results: Vec<(Node, Result<T, mgmt_api::Error>)>,
+        attach_state: TenantShardAttachState,
+    ) -> Result<Vec<(Node, T)>, ApiError> {
+        let mut processed_results: Vec<(Node, T)> = Vec::with_capacity(results.len());
+        debug_assert_eq!(results.len(), attach_state.targets.len());
+        for (node, res) in results {
+            let is_consistent = attach_state
+                .by_node_id
+                .get(&node.get_id())
+                .map(|(_, _, consistent)| *consistent);
+            match res {
+                Ok(res) => processed_results.push((node, res)),
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _))
+                    if is_consistent == Some(false) =>
+                {
+                    // This is expected if the attach is not finished yet. Return 503 so that the client can retry.
+                    return Err(ApiError::ResourceUnavailable(
+                        format!(
+                            "Timeline is not attached to the pageserver {} yet, please retry",
+                            node.get_id()
+                        )
+                        .into(),
+                    ));
+                }
+                Err(e) => return Err(passthrough_api_error(&node, e)),
+            }
+        }
+        Ok(processed_results)
+    }
+
     pub(crate) async fn tenant_timeline_lsn_lease(
         &self,
         tenant_id: TenantId,
@@ -4765,49 +4866,11 @@ impl Service {
         )
         .await;
 
-        let mut retry_if_not_attached = false;
-        let targets = {
-            let locked = self.inner.read().unwrap();
-            let mut targets = Vec::new();
+        let attach_state = self.collect_tenant_shards(tenant_id)?;
 
-            // If the request got an unsharded tenant id, then apply
-            // the operation to all shards. Otherwise, apply it to a specific shard.
-            let shards_range = TenantShardId::tenant_range(tenant_id);
-
-            for (tenant_shard_id, shard) in locked.tenants.range(shards_range) {
-                if let Some(node_id) = shard.intent.get_attached() {
-                    let node = locked
-                        .nodes
-                        .get(node_id)
-                        .expect("Pageservers may not be deleted while referenced");
-
-                    targets.push((*tenant_shard_id, node.clone()));
-
-                    if let Some(location) = shard.observed.locations.get(node_id) {
-                        if let Some(ref conf) = location.conf {
-                            if conf.mode != LocationConfigMode::AttachedSingle
-                                && conf.mode != LocationConfigMode::AttachedMulti
-                            {
-                                // If the shard is attached as secondary, we need to retry if 404.
-                                retry_if_not_attached = true;
-                            }
-                            // If the shard is attached as primary, we should succeed.
-                        } else {
-                            // Location conf is not available yet, retry if 404.
-                            retry_if_not_attached = true;
-                        }
-                    } else {
-                        // The shard is not attached to the intended pageserver yet, retry if 404.
-                        retry_if_not_attached = true;
-                    }
-                }
-            }
-            targets
-        };
-
-        let res = self
+        let results = self
             .tenant_for_shards_api(
-                targets,
+                attach_state.for_api_call(),
                 |tenant_shard_id, client| async move {
                     client
                         .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn)
@@ -4820,31 +4883,13 @@ impl Service {
             )
             .await;
 
+        let leases = self.process_result_and_passthrough_errors(results, attach_state)?;
         let mut valid_until = None;
-        for (node, r) in res {
-            match r {
-                Ok(lease) => {
-                    if let Some(ref mut valid_until) = valid_until {
-                        *valid_until = std::cmp::min(*valid_until, lease.valid_until);
-                    } else {
-                        valid_until = Some(lease.valid_until);
-                    }
-                }
-                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _))
-                    if retry_if_not_attached =>
-                {
-                    // This is expected if the attach is not finished yet. Return 503 so that the client can retry.
-                    return Err(ApiError::ResourceUnavailable(
-                        format!(
-                            "Timeline is not attached to the pageserver {} yet, please retry",
-                            node.get_id()
-                        )
-                        .into(),
-                    ));
-                }
-                Err(e) => {
-                    return Err(passthrough_api_error(&node, e));
-                }
+        for (_, lease) in leases {
+            if let Some(ref mut valid_until) = valid_until {
+                *valid_until = std::cmp::min(*valid_until, lease.valid_until);
+            } else {
+                valid_until = Some(lease.valid_until);
             }
         }
         Ok(LsnLease {
@@ -5267,10 +5312,12 @@ impl Service {
         status_code
     }
     /// When you know the TenantId but not a specific shard, and would like to get the node holding shard 0.
+    ///
+    /// Returns the node, tenant shard id, and whether it is consistent with the observed state.
     pub(crate) async fn tenant_shard0_node(
         &self,
         tenant_id: TenantId,
-    ) -> Result<(Node, TenantShardId), ApiError> {
+    ) -> Result<(Node, TenantShardId, bool), ApiError> {
         let tenant_shard_id = {
             let locked = self.inner.read().unwrap();
             let Some((tenant_shard_id, _shard)) = locked
@@ -5288,15 +5335,17 @@ impl Service {
 
         self.tenant_shard_node(tenant_shard_id)
             .await
-            .map(|node| (node, tenant_shard_id))
+            .map(|(node, consistent)| (node, tenant_shard_id, consistent))
     }
 
     /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this
     /// function looks up and returns node. If the shard isn't found, returns Err(ApiError::NotFound)
+    ///
+    /// Returns the intent node and whether it is consistent with the observed state.
     pub(crate) async fn tenant_shard_node(
         &self,
         tenant_shard_id: TenantShardId,
-    ) -> Result<Node, ApiError> {
+    ) -> Result<(Node, bool), ApiError> {
         // Look up in-memory state and maybe use the node from there.
         {
             let locked = self.inner.read().unwrap();
@@ -5326,7 +5375,8 @@ impl Service {
                         "Shard refers to nonexistent node"
                     )));
                 };
-                return Ok(node.clone());
+                let consistent = self.is_observed_consistent_with_intent(shard, *intent_node_id);
+                return Ok((node.clone(), consistent));
             }
         };
 
@@ -5360,8 +5410,8 @@ impl Service {
                 "Shard refers to nonexistent node"
             )));
         };
-
-        Ok(node.clone())
+        // As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent.
+        Ok((node.clone(), false))
     }
 
     pub(crate) fn tenant_locate(

From 79d72c94e86d0205f98b526e9d51ab723335e094 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= <jc@neon.tech>
Date: Wed, 16 Jul 2025 18:02:07 +0200
Subject: [PATCH 135/163] reformat cargo install invocations in build-tools
 image (#12629)

## Problem
Same change with different formatting happened in multiple branches.

## Summary of changes
Realign formatting with the other branch.
---
 build-tools/Dockerfile | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/build-tools/Dockerfile b/build-tools/Dockerfile
index e02707a5eb..b5fe642e6f 100644
--- a/build-tools/Dockerfile
+++ b/build-tools/Dockerfile
@@ -317,14 +317,14 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
     . "$HOME/.cargo/env" && \
     cargo --version && rustup --version && \
     rustup component add llvm-tools rustfmt clippy && \
-    cargo install rustfilt            --version ${RUSTFILT_VERSION} --locked && \
-    cargo install cargo-hakari        --version ${CARGO_HAKARI_VERSION} --locked && \
-    cargo install cargo-deny          --version ${CARGO_DENY_VERSION} --locked && \
-    cargo install cargo-hack          --version ${CARGO_HACK_VERSION} --locked && \
-    cargo install cargo-nextest       --version ${CARGO_NEXTEST_VERSION} --locked && \
-    cargo install cargo-chef          --version ${CARGO_CHEF_VERSION} --locked && \
-    cargo install diesel_cli          --version ${CARGO_DIESEL_CLI_VERSION} --locked \
-                                      --features postgres-bundled --no-default-features && \
+    cargo install rustfilt      --locked --version ${RUSTFILT_VERSION} && \
+    cargo install cargo-hakari  --locked --version ${CARGO_HAKARI_VERSION} && \
+    cargo install cargo-deny    --locked --version ${CARGO_DENY_VERSION} && \
+    cargo install cargo-hack    --locked --version ${CARGO_HACK_VERSION} && \
+    cargo install cargo-nextest --locked --version ${CARGO_NEXTEST_VERSION} && \
+    cargo install cargo-chef    --locked --version ${CARGO_CHEF_VERSION} && \
+    cargo install diesel_cli    --locked --version ${CARGO_DIESEL_CLI_VERSION} \
+                                --features postgres-bundled --no-default-features && \
     rm -rf /home/nonroot/.cargo/registry && \
     rm -rf /home/nonroot/.cargo/git
 

From 9e154a8130ebd82e042f83d62165291fa9355ccd Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Wed, 16 Jul 2025 10:11:25 -0600
Subject: [PATCH 136/163] PG: smooth max wal rate (#12514)

## Problem
We were only resetting the limit in the wal proposer. If backends are
back pressured, it might take a while for the wal proposer to receive a
new WAL to reset the limit.

## Summary of changes
Backend also checks the time and resets the limit.

## How is this tested?
pgbench has more smooth tps

Signed-off-by: Tristan Partin <tristan.partin@databricks.com>
Co-authored-by: Haoyu Huang <haoyu.huang@databricks.com>
---
 libs/walproposer/src/api_bindings.rs |  2 +-
 pgxn/neon/walproposer.h              | 12 +++++++++++-
 pgxn/neon/walproposer_pg.c           | 26 ++++++++++++++++++++------
 3 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/libs/walproposer/src/api_bindings.rs b/libs/walproposer/src/api_bindings.rs
index 5f856a44d4..825a137d0f 100644
--- a/libs/walproposer/src/api_bindings.rs
+++ b/libs/walproposer/src/api_bindings.rs
@@ -431,7 +431,7 @@ pub fn empty_shmem() -> crate::bindings::WalproposerShmemState {
     let empty_wal_rate_limiter = crate::bindings::WalRateLimiter {
         should_limit: crate::bindings::pg_atomic_uint32 { value: 0 },
         sent_bytes: 0,
-        last_recorded_time_us: 0,
+        last_recorded_time_us: crate::bindings::pg_atomic_uint64 { value: 0 },
     };
 
     crate::bindings::WalproposerShmemState {
diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h
index e3a4022664..19d23925a5 100644
--- a/pgxn/neon/walproposer.h
+++ b/pgxn/neon/walproposer.h
@@ -377,6 +377,16 @@ typedef struct PageserverFeedback
 } PageserverFeedback;
 
 /* BEGIN_HADRON */
+/**
+ * WAL proposer is the only backend that will update `sent_bytes` and `last_recorded_time_us`.
+ * Once the `sent_bytes` reaches the limit, it puts backpressure on PG backends.
+ *
+ * A PG backend checks `should_limit` to see if it should hit backpressure.
+ * - If yes, it also checks the `last_recorded_time_us` to see
+ *   if it's time to push more WALs. This is because the WAL proposer
+ *   only resets `should_limit` to 0 after it is notified about new WALs
+ *   which might take a while.
+ */
 typedef struct WalRateLimiter
 {
 	/* If the value is 1, PG backends will hit backpressure. */
@@ -384,7 +394,7 @@ typedef struct WalRateLimiter
 	/* The number of bytes sent in the current second. */
 	uint64		sent_bytes;
 	/* The last recorded time in microsecond. */
-	TimestampTz last_recorded_time_us;
+	pg_atomic_uint64 last_recorded_time_us;
 } WalRateLimiter;
 /* END_HADRON */
 
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index aaf8f43eeb..18655d4c6c 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -449,8 +449,20 @@ backpressure_lag_impl(void)
 	}
 
 	state = GetWalpropShmemState();
-	if (state != NULL && pg_atomic_read_u32(&state->wal_rate_limiter.should_limit) == 1)
+	if (state != NULL && !!pg_atomic_read_u32(&state->wal_rate_limiter.should_limit))
 	{
+		TimestampTz now = GetCurrentTimestamp();
+		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
+		uint64 last_recorded_time = pg_atomic_read_u64(&limiter->last_recorded_time_us);
+		if (now - last_recorded_time > USECS_PER_SEC)
+		{
+			/*
+			 * The backend has past 1 second since the last recorded time and it's time to push more WALs.
+			 * If the backends are pushing WALs too fast, the wal proposer will rate limit them again.
+			 */
+			uint32 expected = true;
+			pg_atomic_compare_exchange_u32(&state->wal_rate_limiter.should_limit, &expected, false);
+		}
 		return 1;
 	}
 	/* END_HADRON */
@@ -502,6 +514,7 @@ WalproposerShmemInit(void)
 		pg_atomic_init_u64(&walprop_shared->currentClusterSize, 0);
 		/* BEGIN_HADRON */
 		pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+		pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0);
 		/* END_HADRON */
 	}
 	LWLockRelease(AddinShmemInitLock);
@@ -520,6 +533,7 @@ WalproposerShmemInit_SyncSafekeeper(void)
 	pg_atomic_init_u64(&walprop_shared->backpressureThrottlingTime, 0);
 	/* BEGIN_HADRON */
 	pg_atomic_init_u32(&walprop_shared->wal_rate_limiter.should_limit, 0);
+	pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0);
 	/* END_HADRON */
 }
 
@@ -1551,18 +1565,18 @@ XLogBroadcastWalProposer(WalProposer *wp)
 	{
 		uint64 max_wal_bytes = (uint64) databricks_max_wal_mb_per_second * 1024 * 1024;
 		struct WalRateLimiter *limiter = &state->wal_rate_limiter;
-
-		if (now - limiter->last_recorded_time_us > USECS_PER_SEC)
+		uint64 last_recorded_time = pg_atomic_read_u64(&limiter->last_recorded_time_us);
+		if (now - last_recorded_time > USECS_PER_SEC)
 		{
 			/* Reset the rate limiter */
-			limiter->last_recorded_time_us = now;
 			limiter->sent_bytes = 0;
-			pg_atomic_exchange_u32(&limiter->should_limit, 0);
+			pg_atomic_write_u64(&limiter->last_recorded_time_us, now);
+			pg_atomic_write_u32(&limiter->should_limit, false);
 		}
 		limiter->sent_bytes += (endptr - startptr);
 		if (limiter->sent_bytes > max_wal_bytes)
 		{
-			pg_atomic_exchange_u32(&limiter->should_limit, 1);
+			pg_atomic_write_u32(&limiter->should_limit, true);
 		}
 	}
 	/* END_HADRON */

From e2982ed3ecdb8e1e67239ea84953550909c4700b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= <k.p.szafranski@gmail.com>
Date: Wed, 16 Jul 2025 18:23:05 +0200
Subject: [PATCH 137/163] [proxy] Cache node info only for TTL, even if Redis
 is available (#12626)

This PR simplifies our node info cache. Now we'll store entries for at
most the TTL duration, even if Redis notifications are available. This
will allow us to cache intermittent errors later (e.g. due to rate
limits) with more predictable behavior.

Related to https://github.com/neondatabase/cloud/issues/19353
---
 proxy/src/cache/project_info.rs  | 107 +++++--------------------------
 proxy/src/redis/notifications.rs |   7 +-
 2 files changed, 16 insertions(+), 98 deletions(-)

diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index d37c107323..c812779e30 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -1,13 +1,11 @@
 use std::collections::{HashMap, HashSet, hash_map};
 use std::convert::Infallible;
-use std::sync::atomic::AtomicU64;
 use std::time::Duration;
 
 use async_trait::async_trait;
 use clashmap::ClashMap;
 use clashmap::mapref::one::Ref;
 use rand::{Rng, thread_rng};
-use tokio::sync::Mutex;
 use tokio::time::Instant;
 use tracing::{debug, info};
 
@@ -22,31 +20,23 @@ pub(crate) trait ProjectInfoCache {
     fn invalidate_endpoint_access_for_project(&self, project_id: ProjectIdInt);
     fn invalidate_endpoint_access_for_org(&self, account_id: AccountIdInt);
     fn invalidate_role_secret_for_project(&self, project_id: ProjectIdInt, role_name: RoleNameInt);
-    async fn decrement_active_listeners(&self);
-    async fn increment_active_listeners(&self);
 }
 
 struct Entry<T> {
-    created_at: Instant,
+    expires_at: Instant,
     value: T,
 }
 
 impl<T> Entry<T> {
-    pub(crate) fn new(value: T) -> Self {
+    pub(crate) fn new(value: T, ttl: Duration) -> Self {
         Self {
-            created_at: Instant::now(),
+            expires_at: Instant::now() + ttl,
             value,
         }
     }
 
-    pub(crate) fn get(&self, valid_since: Instant) -> Option<&T> {
-        (valid_since < self.created_at).then_some(&self.value)
-    }
-}
-
-impl<T> From<T> for Entry<T> {
-    fn from(value: T) -> Self {
-        Self::new(value)
+    pub(crate) fn get(&self) -> Option<&T> {
+        (self.expires_at > Instant::now()).then_some(&self.value)
     }
 }
 
@@ -56,18 +46,12 @@ struct EndpointInfo {
 }
 
 impl EndpointInfo {
-    pub(crate) fn get_role_secret(
-        &self,
-        role_name: RoleNameInt,
-        valid_since: Instant,
-    ) -> Option<RoleAccessControl> {
-        let controls = self.role_controls.get(&role_name)?;
-        controls.get(valid_since).cloned()
+    pub(crate) fn get_role_secret(&self, role_name: RoleNameInt) -> Option<RoleAccessControl> {
+        self.role_controls.get(&role_name)?.get().cloned()
     }
 
-    pub(crate) fn get_controls(&self, valid_since: Instant) -> Option<EndpointAccessControl> {
-        let controls = self.controls.as_ref()?;
-        controls.get(valid_since).cloned()
+    pub(crate) fn get_controls(&self) -> Option<EndpointAccessControl> {
+        self.controls.as_ref()?.get().cloned()
     }
 
     pub(crate) fn invalidate_endpoint(&mut self) {
@@ -92,11 +76,8 @@ pub struct ProjectInfoCacheImpl {
     project2ep: ClashMap<ProjectIdInt, HashSet<EndpointIdInt>>,
     // FIXME(stefan): we need a way to GC the account2ep map.
     account2ep: ClashMap<AccountIdInt, HashSet<EndpointIdInt>>,
-    config: ProjectInfoCacheOptions,
 
-    start_time: Instant,
-    ttl_disabled_since_us: AtomicU64,
-    active_listeners_lock: Mutex<usize>,
+    config: ProjectInfoCacheOptions,
 }
 
 #[async_trait]
@@ -152,29 +133,6 @@ impl ProjectInfoCache for ProjectInfoCacheImpl {
             }
         }
     }
-
-    async fn decrement_active_listeners(&self) {
-        let mut listeners_guard = self.active_listeners_lock.lock().await;
-        if *listeners_guard == 0 {
-            tracing::error!("active_listeners count is already 0, something is broken");
-            return;
-        }
-        *listeners_guard -= 1;
-        if *listeners_guard == 0 {
-            self.ttl_disabled_since_us
-                .store(u64::MAX, std::sync::atomic::Ordering::SeqCst);
-        }
-    }
-
-    async fn increment_active_listeners(&self) {
-        let mut listeners_guard = self.active_listeners_lock.lock().await;
-        *listeners_guard += 1;
-        if *listeners_guard == 1 {
-            let new_ttl = (self.start_time.elapsed() + self.config.ttl).as_micros() as u64;
-            self.ttl_disabled_since_us
-                .store(new_ttl, std::sync::atomic::Ordering::SeqCst);
-        }
-    }
 }
 
 impl ProjectInfoCacheImpl {
@@ -184,9 +142,6 @@ impl ProjectInfoCacheImpl {
             project2ep: ClashMap::new(),
             account2ep: ClashMap::new(),
             config,
-            ttl_disabled_since_us: AtomicU64::new(u64::MAX),
-            start_time: Instant::now(),
-            active_listeners_lock: Mutex::new(0),
         }
     }
 
@@ -203,19 +158,17 @@ impl ProjectInfoCacheImpl {
         endpoint_id: &EndpointId,
         role_name: &RoleName,
     ) -> Option<RoleAccessControl> {
-        let valid_since = self.get_cache_times();
         let role_name = RoleNameInt::get(role_name)?;
         let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_role_secret(role_name, valid_since)
+        endpoint_info.get_role_secret(role_name)
     }
 
     pub(crate) fn get_endpoint_access(
         &self,
         endpoint_id: &EndpointId,
     ) -> Option<EndpointAccessControl> {
-        let valid_since = self.get_cache_times();
         let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_controls(valid_since)
+        endpoint_info.get_controls()
     }
 
     pub(crate) fn insert_endpoint_access(
@@ -237,8 +190,8 @@ impl ProjectInfoCacheImpl {
             return;
         }
 
-        let controls = Entry::from(controls);
-        let role_controls = Entry::from(role_controls);
+        let controls = Entry::new(controls, self.config.ttl);
+        let role_controls = Entry::new(role_controls, self.config.ttl);
 
         match self.cache.entry(endpoint_id) {
             clashmap::Entry::Vacant(e) => {
@@ -275,27 +228,6 @@ impl ProjectInfoCacheImpl {
         }
     }
 
-    fn ignore_ttl_since(&self) -> Option<Instant> {
-        let ttl_disabled_since_us = self
-            .ttl_disabled_since_us
-            .load(std::sync::atomic::Ordering::Relaxed);
-
-        if ttl_disabled_since_us == u64::MAX {
-            return None;
-        }
-
-        Some(self.start_time + Duration::from_micros(ttl_disabled_since_us))
-    }
-
-    fn get_cache_times(&self) -> Instant {
-        let mut valid_since = Instant::now() - self.config.ttl;
-        if let Some(ignore_ttl_since) = self.ignore_ttl_since() {
-            // We are fine if entry is not older than ttl or was added before we are getting notifications.
-            valid_since = valid_since.min(ignore_ttl_since);
-        }
-        valid_since
-    }
-
     pub fn maybe_invalidate_role_secret(&self, endpoint_id: &EndpointId, role_name: &RoleName) {
         let Some(endpoint_id) = EndpointIdInt::get(endpoint_id) else {
             return;
@@ -313,16 +245,7 @@ impl ProjectInfoCacheImpl {
             return;
         };
 
-        let created_at = role_controls.get().created_at;
-        let expire = match self.ignore_ttl_since() {
-            // if ignoring TTL, we should still try and roll the password if it's old
-            // and we the client gave an incorrect password. There could be some lag on the redis channel.
-            Some(_) => created_at + self.config.ttl < Instant::now(),
-            // edge case: redis is down, let's be generous and invalidate the cache immediately.
-            None => true,
-        };
-
-        if expire {
+        if role_controls.get().expires_at <= Instant::now() {
             role_controls.remove();
         }
     }
diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs
index 973a4c5b02..a6d376562b 100644
--- a/proxy/src/redis/notifications.rs
+++ b/proxy/src/redis/notifications.rs
@@ -265,10 +265,7 @@ async fn handle_messages<C: ProjectInfoCache + Send + Sync + 'static>(
             return Ok(());
         }
         let mut conn = match try_connect(&redis).await {
-            Ok(conn) => {
-                handler.cache.increment_active_listeners().await;
-                conn
-            }
+            Ok(conn) => conn,
             Err(e) => {
                 tracing::error!(
                     "failed to connect to redis: {e}, will try to reconnect in {RECONNECT_TIMEOUT:#?}"
@@ -287,11 +284,9 @@ async fn handle_messages<C: ProjectInfoCache + Send + Sync + 'static>(
                 }
             }
             if cancellation_token.is_cancelled() {
-                handler.cache.decrement_active_listeners().await;
                 return Ok(());
             }
         }
-        handler.cache.decrement_active_listeners().await;
     }
 }
 

From 267fb4990888ef2a325005b21b88cf66fd214c72 Mon Sep 17 00:00:00 2001
From: Dimitri Fontaine <dim@tapoueh.org>
Date: Wed, 16 Jul 2025 20:39:54 +0200
Subject: [PATCH 138/163] Update Postgres branches. (#12628)

## Problem

## Summary of changes
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index af550a80c6..ac3c460e01 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit af550a80c6b86d0fec378ee929e2bb2e591e5cd3
+Subproject commit ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 21cb86b814..24313bf8f3 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 21cb86b81454522870d3634cac3e10b821da09fe
+Subproject commit 24313bf8f3de722968a2fdf764de7ef77ed64f06
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index c148871ead..51194dc5ce 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit c148871eada02c0cf15d553d8ff7c389d01810f2
+Subproject commit 51194dc5ce2e3523068d8607852e6c3125a17e58
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index 8de764e44b..eac5279cd1 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit 8de764e44b56d1cffb3644368d4d689f482b611a
+Subproject commit eac5279cd147d4086e0eb242198aae2f4b766d7b
diff --git a/vendor/revisions.json b/vendor/revisions.json
index 3c8067a23d..e4b6c8e23a 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "8de764e44b56d1cffb3644368d4d689f482b611a"
+    "eac5279cd147d4086e0eb242198aae2f4b766d7b"
   ],
   "v16": [
     "16.9",
-    "c148871eada02c0cf15d553d8ff7c389d01810f2"
+    "51194dc5ce2e3523068d8607852e6c3125a17e58"
   ],
   "v15": [
     "15.13",
-    "21cb86b81454522870d3634cac3e10b821da09fe"
+    "24313bf8f3de722968a2fdf764de7ef77ed64f06"
   ],
   "v14": [
     "14.18",
-    "af550a80c6b86d0fec378ee929e2bb2e591e5cd3"
+    "ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4"
   ]
 }

From fb796229bf16d6e1684b3f498d3fb5a55f13c5ee Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 16 Jul 2025 22:20:44 +0100
Subject: [PATCH 139/163] Fix `make neon-pgindent`  (#12535)

## Problem

`make neon-pgindent` doesn't work:
- there's no `$(BUILD_DIR)/neon-v17` dir
- `make -C ...` along with relative `BUILD_DIR` resolves to a path that
doesn't exist

## Summary of changes
- Fix path for to neon extension for `make neon-pgindent`
- Make `BUILD_DIR` absolute
- Remove trailing slash from `POSTGRES_INSTALL_DIR` to avoid duplicated
slashed in commands (doesn't break anything, it make it look nicer)
---
 Makefile | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 749e527ac3..dc8bacc78e 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ ROOT_PROJECT_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
 
 # Where to install Postgres, default is ./pg_install, maybe useful for package
 # managers.
-POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install/
+POSTGRES_INSTALL_DIR ?= $(ROOT_PROJECT_DIR)/pg_install
 
 # Supported PostgreSQL versions
 POSTGRES_VERSIONS = v17 v16 v15 v14
@@ -14,7 +14,7 @@ POSTGRES_VERSIONS = v17 v16 v15 v14
 # it is derived from BUILD_TYPE.
 
 # All intermediate build artifacts are stored here.
-BUILD_DIR := build
+BUILD_DIR := $(ROOT_PROJECT_DIR)/build
 
 ICU_PREFIX_DIR := /usr/local/icu
 
@@ -212,7 +212,7 @@ neon-pgindent: postgres-v17-pg-bsd-indent neon-pg-ext-v17
 		FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/find_typedef \
 		INDENT=$(BUILD_DIR)/v17/src/tools/pg_bsd_indent/pg_bsd_indent \
 		PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v17/src/tools/pgindent/pgindent \
-		-C $(BUILD_DIR)/neon-v17 \
+		-C $(BUILD_DIR)/pgxn-v17/neon \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
 
 
From f2828bbe198a45c1604e67cad60bdcb96634b64d Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Wed, 16 Jul 2025 17:52:18 -0400
Subject: [PATCH 140/163] fix(pageserver): skip gc-compaction for metadata key
 ranges (#12618)

## Problem

part of https://github.com/neondatabase/neon/issues/11318 ; it is not
entirely safe to run gc-compaction over the metadata key range due to
tombstones and implications of image layers (missing key in image layer
== key not exist). The auto gc-compaction trigger already skips metadata
key ranges (see `schedule_auto_compaction` call in
`trigger_auto_compaction`). In this patch we enforce it directly in
gc_compact_inner so that compactions triggered via HTTP API will also be
subject to this restriction.

## Summary of changes

Ensure gc-compaction only runs on rel key ranges.

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 pageserver/src/http/routes.rs                |   1 +
 pageserver/src/tenant.rs                     | 104 ++++++++++++++-----
 pageserver/src/tenant/timeline.rs            |  15 +++
 pageserver/src/tenant/timeline/compaction.rs |  30 +++++-
 4 files changed, 123 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3e844a375d..3a08244d71 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -2357,6 +2357,7 @@ async fn timeline_compact_handler(
         flags,
         sub_compaction,
         sub_compaction_max_job_size_mb,
+        gc_compaction_do_metadata_compaction: false,
     };
 
     let scheduled = compact_request
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1a3016e7f1..3d66ae4719 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -9216,7 +9216,11 @@ mod tests {
 
         let cancel = CancellationToken::new();
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
 
@@ -9299,7 +9303,11 @@ mod tests {
             guard.cutoffs.space = Lsn(0x40);
         }
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
 
@@ -9836,7 +9844,11 @@ mod tests {
 
         let cancel = CancellationToken::new();
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
 
@@ -9871,7 +9883,11 @@ mod tests {
             guard.cutoffs.space = Lsn(0x40);
         }
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
 
@@ -10446,7 +10462,7 @@ mod tests {
                 &cancel,
                 CompactOptions {
                     flags: dryrun_flags,
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -10457,14 +10473,22 @@ mod tests {
         verify_result().await;
 
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
 
         // compact again
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
@@ -10483,14 +10507,22 @@ mod tests {
             guard.cutoffs.space = Lsn(0x38);
         }
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await; // no wals between 0x30 and 0x38, so we should obtain the same result
 
         // not increasing the GC horizon and compact again
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
@@ -10695,7 +10727,7 @@ mod tests {
                 &cancel,
                 CompactOptions {
                     flags: dryrun_flags,
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -10706,14 +10738,22 @@ mod tests {
         verify_result().await;
 
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
 
         // compact again
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
@@ -10913,7 +10953,11 @@ mod tests {
 
         let cancel = CancellationToken::new();
         branch_tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
 
@@ -10926,7 +10970,7 @@ mod tests {
                 &cancel,
                 CompactOptions {
                     compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x40))),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -11594,7 +11638,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_key_range: Some((get_key(0)..get_key(2)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -11641,7 +11685,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_key_range: Some((get_key(2)..get_key(4)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -11693,7 +11737,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_key_range: Some((get_key(4)..get_key(9)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -11744,7 +11788,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_key_range: Some((get_key(9)..get_key(10)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -11800,7 +11844,7 @@ mod tests {
                 CompactOptions {
                     flags: EnumSet::new(),
                     compact_key_range: Some((get_key(0)..get_key(10)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -12071,7 +12115,7 @@ mod tests {
                 &cancel,
                 CompactOptions {
                     compact_lsn_range: Some(CompactLsnRange::above(Lsn(0x28))),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -12106,7 +12150,11 @@ mod tests {
 
         // compact again
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
@@ -12325,7 +12373,7 @@ mod tests {
                 CompactOptions {
                     compact_key_range: Some((get_key(0)..get_key(2)).into()),
                     compact_lsn_range: Some((Lsn(0x20)..Lsn(0x28)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -12371,7 +12419,7 @@ mod tests {
                 CompactOptions {
                     compact_key_range: Some((get_key(3)..get_key(8)).into()),
                     compact_lsn_range: Some((Lsn(0x28)..Lsn(0x40)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -12419,7 +12467,7 @@ mod tests {
                 CompactOptions {
                     compact_key_range: Some((get_key(0)..get_key(5)).into()),
                     compact_lsn_range: Some((Lsn(0x20)..Lsn(0x50)).into()),
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
@@ -12454,7 +12502,11 @@ mod tests {
 
         // final full compaction
         tline
-            .compact_with_gc(&cancel, CompactOptions::default(), &ctx)
+            .compact_with_gc(
+                &cancel,
+                CompactOptions::default_for_gc_compaction_unit_tests(),
+                &ctx,
+            )
             .await
             .unwrap();
         verify_result().await;
@@ -12564,7 +12616,7 @@ mod tests {
                 CompactOptions {
                     compact_key_range: None,
                     compact_lsn_range: None,
-                    ..Default::default()
+                    ..CompactOptions::default_for_gc_compaction_unit_tests()
                 },
                 &ctx,
             )
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 73d2d72b59..8f25555929 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -939,6 +939,20 @@ pub(crate) struct CompactOptions {
     /// Set job size for the GC compaction.
     /// This option is only used by GC compaction.
     pub sub_compaction_max_job_size_mb: Option<u64>,
+    /// Only for GC compaction.
+    /// If set, the compaction will compact the metadata layers. Should be only set to true in unit tests
+    /// because metadata compaction is not fully supported yet.
+    pub gc_compaction_do_metadata_compaction: bool,
+}
+
+impl CompactOptions {
+    #[cfg(test)]
+    pub fn default_for_gc_compaction_unit_tests() -> Self {
+        Self {
+            gc_compaction_do_metadata_compaction: true,
+            ..Default::default()
+        }
+    }
 }
 
 impl std::fmt::Debug for Timeline {
@@ -2185,6 +2199,7 @@ impl Timeline {
                     compact_lsn_range: None,
                     sub_compaction: false,
                     sub_compaction_max_job_size_mb: None,
+                    gc_compaction_do_metadata_compaction: false,
                 },
                 ctx,
             )
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index aa1aa937b6..f76ef502dc 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -396,6 +396,7 @@ impl GcCompactionQueue {
                     }),
                     compact_lsn_range: None,
                     sub_compaction_max_job_size_mb: None,
+                    gc_compaction_do_metadata_compaction: false,
                 },
                 permit,
             );
@@ -512,6 +513,7 @@ impl GcCompactionQueue {
                     compact_key_range: Some(job.compact_key_range.into()),
                     compact_lsn_range: Some(job.compact_lsn_range.into()),
                     sub_compaction_max_job_size_mb: None,
+                    gc_compaction_do_metadata_compaction: false,
                 };
                 pending_tasks.push(GcCompactionQueueItem::SubCompactionJob {
                     options,
@@ -785,6 +787,8 @@ pub(crate) struct GcCompactJob {
     /// as specified here. The true range being compacted is `min_lsn/max_lsn` in [`GcCompactionJobDescription`].
     /// min_lsn will always <= the lower bound specified here, and max_lsn will always >= the upper bound specified here.
     pub compact_lsn_range: Range<Lsn>,
+    /// See [`CompactOptions::gc_compaction_do_metadata_compaction`].
+    pub do_metadata_compaction: bool,
 }
 
 impl GcCompactJob {
@@ -799,6 +803,7 @@ impl GcCompactJob {
                 .compact_lsn_range
                 .map(|x| x.into())
                 .unwrap_or(Lsn::INVALID..Lsn::MAX),
+            do_metadata_compaction: options.gc_compaction_do_metadata_compaction,
         }
     }
 }
@@ -3174,6 +3179,7 @@ impl Timeline {
                         dry_run: job.dry_run,
                         compact_key_range: start..end,
                         compact_lsn_range: job.compact_lsn_range.start..compact_below_lsn,
+                        do_metadata_compaction: false,
                     });
                     current_start = Some(end);
                 }
@@ -3236,7 +3242,7 @@ impl Timeline {
     async fn compact_with_gc_inner(
         self: &Arc<Self>,
         cancel: &CancellationToken,
-        job: GcCompactJob,
+        mut job: GcCompactJob,
         ctx: &RequestContext,
         yield_for_l0: bool,
     ) -> Result<CompactionOutcome, CompactionError> {
@@ -3244,6 +3250,28 @@ impl Timeline {
         // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
         // Note that we already acquired the compaction lock when the outer `compact` function gets called.
 
+        // If the job is not configured to compact the metadata key range, shrink the key range
+        // to exclude the metadata key range. The check is done by checking if the end of the key range
+        // is larger than the start of the metadata key range. Note that metadata keys cover the entire
+        // second half of the keyspace, so it's enough to only check the end of the key range.
+        if !job.do_metadata_compaction
+            && job.compact_key_range.end > Key::metadata_key_range().start
+        {
+            tracing::info!(
+                "compaction for metadata key range is not supported yet, overriding compact_key_range from {} to {}",
+                job.compact_key_range.end,
+                Key::metadata_key_range().start
+            );
+            // Shrink the key range to exclude the metadata key range.
+            job.compact_key_range.end = Key::metadata_key_range().start;
+
+            // Skip the job if the key range completely lies within the metadata key range.
+            if job.compact_key_range.start >= job.compact_key_range.end {
+                tracing::info!("compact_key_range is empty, skipping compaction");
+                return Ok(CompactionOutcome::Done);
+            }
+        }
+
         let timer = Instant::now();
         let begin_timer = timer;
 

From 5dd24c7ad8ec46669ea474c544c0180b9253acd9 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Thu, 17 Jul 2025 09:57:36 +0100
Subject: [PATCH 141/163] test_total_size_limit: support hosts with up to 256
 GB of RAM (#12617)

## Problem

`test_total_size_limit` fails on runners with 256 GB of RAM

## Summary of changes
- Generate more data in `test_total_size_limit`
---
 test_runner/regress/test_pageserver_layer_rolling.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py
index 91c4ef521c..68f470d962 100644
--- a/test_runner/regress/test_pageserver_layer_rolling.py
+++ b/test_runner/regress/test_pageserver_layer_rolling.py
@@ -246,9 +246,9 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
 
     system_memory = psutil.virtual_memory().total
 
-    # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 128MB on
-    # a system with 128GB of RAM).  We will then write enough data to violate this limit.
-    max_dirty_data = 128 * 1024 * 1024
+    # The smallest total size limit we can configure is 1/1024th of the system memory (e.g. 256MB on
+    # a system with 256GB of RAM).  We will then write enough data to violate this limit.
+    max_dirty_data = 256 * 1024 * 1024
     ephemeral_bytes_per_memory_kb = (max_dirty_data * 1024) // system_memory
     assert ephemeral_bytes_per_memory_kb > 0
 
@@ -272,7 +272,7 @@ def test_total_size_limit(neon_env_builder: NeonEnvBuilder):
     timeline_count = 10
 
     # This is about 2MiB of data per timeline
-    entries_per_timeline = 100_000
+    entries_per_timeline = 200_000
 
     last_flush_lsns = asyncio.run(workload(env, tenant_conf, timeline_count, entries_per_timeline))
     wait_until_pageserver_is_caught_up(env, last_flush_lsns)

From f765bd3677226a1cd845c14ec5d284c0f587512b Mon Sep 17 00:00:00 2001
From: Erik Grinaker <erik@neon.tech>
Date: Thu, 17 Jul 2025 12:34:46 +0200
Subject: [PATCH 142/163] pageserver: improve gRPC cancellation

---
 pageserver/src/page_service.rs      | 73 +++++++++++++++++++----------
 pageserver/src/pgdatadir_mapping.rs |  4 ++
 pageserver/src/tenant/timeline.rs   |  3 ++
 3 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index 6c371cfef6..23146ac40e 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -3218,14 +3218,25 @@ where
 pub struct GrpcPageServiceHandler {
     tenant_manager: Arc<TenantManager>,
     ctx: RequestContext,
+
+    /// Cancelled to shut down the server. Tonic will shut down in response to this, but wait for
+    /// in-flight requests to complete. Any tasks we spawn ourselves must respect this token.
     cancel: CancellationToken,
+
+    /// Any tasks we spawn ourselves should clone this gate guard, so that we can wait for them to
+    /// complete during shutdown. Request handlers implicitly hold this guard already.
     gate_guard: GateGuard,
+
+    /// `get_vectored` concurrency setting.
     get_vectored_concurrent_io: GetVectoredConcurrentIo,
 }
 
 impl GrpcPageServiceHandler {
     /// Spawns a gRPC server for the page service.
     ///
+    /// Returns a `CancellableTask` handle that can be used to shut down the server. It waits for
+    /// any in-flight requests and tasks to complete first.
+    ///
     /// TODO: this doesn't support TLS. We need TLS reloading via ReloadingCertificateResolver, so we
     /// need to reimplement the TCP+TLS accept loop ourselves.
     pub fn spawn(
@@ -3235,12 +3246,15 @@ impl GrpcPageServiceHandler {
         get_vectored_concurrent_io: GetVectoredConcurrentIo,
         listener: std::net::TcpListener,
     ) -> anyhow::Result<CancellableTask> {
+        // Set up a cancellation token for shutting down the server, and a gate to wait for all
+        // requests and spawned tasks to complete.
         let cancel = CancellationToken::new();
+        let gate = Gate::default();
+
         let ctx = RequestContextBuilder::new(TaskKind::PageRequestHandler)
             .download_behavior(DownloadBehavior::Download)
             .perf_span_dispatch(perf_trace_dispatch)
             .detached_child();
-        let gate = Gate::default();
 
         // Set up the TCP socket. We take a preconfigured TcpListener to bind the
         // port early during startup.
@@ -3308,19 +3322,20 @@ impl GrpcPageServiceHandler {
             .build_v1()?;
         let server = server.add_service(reflection_service);
 
-        // Spawn server task.
+        // Spawn server task. It runs until the cancellation token fires and in-flight requests and
+        // tasks complete. The `CancellableTask` will wait for the task's join handle, which
+        // implicitly waits for the gate to close.
         let task_cancel = cancel.clone();
         let task = COMPUTE_REQUEST_RUNTIME.spawn(task_mgr::exit_on_panic_or_error(
-            "grpc listener",
+            "grpc pageservice listener",
             async move {
-                let result = server
+                server
                     .serve_with_incoming_shutdown(incoming, task_cancel.cancelled())
-                    .await;
-                if result.is_ok() {
-                    // TODO: revisit shutdown logic once page service is implemented.
-                    gate.close().await;
-                }
-                result
+                    .await?;
+                // Server exited cleanly. All requests should have completed by now. Wait for any
+                // spawned tasks to complete as well (e.g. IoConcurrency sidecars) via the gate.
+                gate.close().await;
+                anyhow::Ok(())
             },
         ));
 
@@ -3408,8 +3423,6 @@ impl GrpcPageServiceHandler {
     /// TODO: get_vectored() currently enforces a batch limit of 32. Postgres will typically send
     /// batches up to effective_io_concurrency = 100. Either we have to accept large batches, or
     /// split them up in the client or server.
-    ///
-    /// TODO: verify that the given keys belong to this shard.
     #[instrument(skip_all, fields(req_id, rel, blkno, blks, req_lsn, mod_lsn))]
     async fn get_page(
         ctx: &RequestContext,
@@ -3512,7 +3525,10 @@ impl GrpcPageServiceHandler {
 
 /// Implements the gRPC page service.
 ///
-/// TODO: cancellation.
+/// Tonic will drop the request handler futures if the client goes away (e.g. due to a timeout or
+/// cancellation), so the read path must be cancellation-safe. On shutdown, Tonic will wait for
+/// in-flight requests to complete.
+///
 /// TODO: when the libpq impl is removed, remove the Pagestream types and inline the handler code.
 #[tonic::async_trait]
 impl proto::PageService for GrpcPageServiceHandler {
@@ -3597,8 +3613,14 @@ impl proto::PageService for GrpcPageServiceHandler {
 
         // Spawn a task to run the basebackup.
         let span = Span::current();
+        let gate_guard = self
+            .gate_guard
+            .try_clone()
+            .map_err(|_| tonic::Status::unavailable("shutting down"))?;
         let (mut simplex_read, mut simplex_write) = tokio::io::simplex(CHUNK_SIZE);
         let jh = tokio::spawn(async move {
+            let _gate_guard = gate_guard; // keep gate open until task completes
+
             let gzip_level = match req.compression {
                 page_api::BaseBackupCompression::None => None,
                 // NB: using fast compression because it's on the critical path for compute
@@ -3722,13 +3744,14 @@ impl proto::PageService for GrpcPageServiceHandler {
             .await?;
 
         // Spawn an IoConcurrency sidecar, if enabled.
-        let Ok(gate_guard) = self.gate_guard.try_clone() else {
-            return Err(tonic::Status::unavailable("shutting down"));
-        };
+        let gate_guard = self
+            .gate_guard
+            .try_clone()
+            .map_err(|_| tonic::Status::unavailable("shutting down"))?;
         let io_concurrency =
             IoConcurrency::spawn_from_conf(self.get_vectored_concurrent_io, gate_guard);
 
-        // Spawn a task to handle the GetPageRequest stream.
+        // Construct the GetPageRequest stream handler.
         let span = Span::current();
         let ctx = self.ctx.attached_child();
         let cancel = self.cancel.clone();
@@ -3739,18 +3762,18 @@ impl proto::PageService for GrpcPageServiceHandler {
                 .get(ttid.tenant_id, ttid.timeline_id, shard_selector)
                 .await?
                 .downgrade();
-
             loop {
+                // NB: Tonic considers the entire stream to be an in-flight request and will wait
+                // for it to complete before shutting down. React to cancellation between requests.
                 let req = tokio::select! {
-                    req = reqs.message() => req,
-                    _ = cancel.cancelled() => {
-                        tracing::info!("closing getpages stream due to shutdown");
-                        break;
+                    result = reqs.message() => match result {
+                        Ok(Some(req)) => Ok(req),
+                        Ok(None) => break, // client closed the stream
+                        Err(err) => Err(err),
                     },
-                };
-                let Some(req) = req? else { break };
+                    _ = cancel.cancelled() => Err(tonic::Status::unavailable("shutting down")),
+                }?;
                 let req_id = req.request_id.map(page_api::RequestID::from).unwrap_or_default();
-
                 let result = Self::get_page(&ctx, &timeline, req, io_concurrency.clone())
                     .instrument(span.clone()) // propagate request span
                     .await;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 08828ec4eb..17401dbae8 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -286,6 +286,10 @@ impl Timeline {
     /// Like [`Self::get_rel_page_at_lsn`], but returns a batch of pages.
     ///
     /// The ordering of the returned vec corresponds to the ordering of `pages`.
+    ///
+    /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future
+    /// if the client goes away (e.g. due to timeout or cancellation).
+    /// TODO: verify that it actually is cancellation-safe.
     pub(crate) async fn get_rel_page_at_lsn_batched(
         &self,
         pages: impl ExactSizeIterator<Item = (&RelTag, &BlockNumber, LsnRange, RequestContext)>,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 8f25555929..06e02a7386 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1324,6 +1324,9 @@ impl Timeline {
     ///
     /// This naive implementation will be replaced with a more efficient one
     /// which actually vectorizes the read path.
+    ///
+    /// NB: the read path must be cancellation-safe. The Tonic gRPC service will drop the future
+    /// if the client goes away (e.g. due to timeout or cancellation).
     pub(crate) async fn get_vectored(
         &self,
         query: VersionedKeySpaceQuery,

From 10a7d49726c993732880f520aac58afac82b196f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 17 Jul 2025 13:34:52 +0300
Subject: [PATCH 143/163] Use XLogRecPtr for LSNs in C generated code.

This hopefully silences the static assertion Erik is seeing:

```
pgxn/neon/communicator_new.c:1352:9: error: static assertion failed due to requirement '__builtin_types_compatible_p(unsigned long long, unsigned long)': (r->lsn) does not have type XLogRecPtr
 1352 |                                                                 LSN_FORMAT_ARGS(r->lsn));
      |                                                                 ^~~~~~~~~~~~~~~~~~~~~~~
```
---
 pgxn/neon/communicator/src/neon_request.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs
index d68ec24ed9..732c35d6ce 100644
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -1,4 +1,15 @@
-pub type CLsn = u64;
+
+// Definitions of some core PostgreSQL datatypes.
+
+/// XLogRecPtr is defined in "access/xlogdefs.h" as:
+///
+/// ```
+/// typedef uint64 XLogRecPtr;
+/// ```
+/// cbindgen:no-export
+pub type XLogRecPtr = u64;
+
+pub type CLsn = XLogRecPtr;
 pub type COid = u32;
 
 // This conveniently matches PG_IOV_MAX

From 4559ba79b66bb19062de65fd3963543ed1b01fa2 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Thu, 17 Jul 2025 15:51:31 +0400
Subject: [PATCH 144/163] Introduce force flag for new deletion API (#12588)

## Problem

The force deletion API should behave like the graceful deletion API - it
needs to support cancellation, persistence, and be non-blocking.

## Summary of Changes

- Added a `force` flag to the `NodeStartDelete` command.
- Passed the `force` flag through the `start_node_delete` handler in the
storage controller.
- Handled the `force` flag in the `delete_node` function.
- Set the tombstone after removing the node from memory.
- Minor cleanup, like adding a `get_error_on_cancel` closure.

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
---
 control_plane/storcon_cli/src/main.rs         | 19 +++--
 storage_controller/src/http.rs                |  3 +-
 storage_controller/src/service.rs             | 74 ++++++++++---------
 test_runner/fixtures/neon_fixtures.py         |  7 +-
 .../regress/test_storage_controller.py        | 58 +++++++++++++--
 5 files changed, 111 insertions(+), 50 deletions(-)

diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs
index fcc5549beb..a4d1030488 100644
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -76,6 +76,12 @@ enum Command {
     NodeStartDelete {
         #[arg(long)]
         node_id: NodeId,
+        /// When `force` is true, skip waiting for shards to prewarm during migration.
+        /// This can significantly speed up node deletion since prewarming all shards
+        /// can take considerable time, but may result in slower initial access to
+        /// migrated shards until they warm up naturally.
+        #[arg(long)]
+        force: bool,
     },
     /// Cancel deletion of the specified pageserver and wait for `timeout`
     /// for the operation to be canceled. May be retried.
@@ -952,13 +958,14 @@ async fn main() -> anyhow::Result<()> {
                 .dispatch::<(), ()>(Method::DELETE, format!("control/v1/node/{node_id}"), None)
                 .await?;
         }
-        Command::NodeStartDelete { node_id } => {
+        Command::NodeStartDelete { node_id, force } => {
+            let query = if force {
+                format!("control/v1/node/{node_id}/delete?force=true")
+            } else {
+                format!("control/v1/node/{node_id}/delete")
+            };
             storcon_client
-                .dispatch::<(), ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/delete"),
-                    None,
-                )
+                .dispatch::<(), ()>(Method::PUT, query, None)
                 .await?;
             println!("Delete started for {node_id}");
         }
diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index c8227f0219..5f9a1124de 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -1085,9 +1085,10 @@ async fn handle_node_delete(req: Request<Body>) -> Result<Response<Body>, ApiErr
 
     let state = get_state(&req);
     let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    let force: bool = parse_query_param(&req, "force")?.unwrap_or(false);
     json_response(
         StatusCode::OK,
-        state.service.start_node_delete(node_id).await?,
+        state.service.start_node_delete(node_id, force).await?,
     )
 }
 
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 0c5d7f44d4..b315b88fcc 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -7385,6 +7385,7 @@ impl Service {
         self: &Arc<Self>,
         node_id: NodeId,
         policy_on_start: NodeSchedulingPolicy,
+        force: bool,
         cancel: CancellationToken,
     ) -> Result<(), OperationError> {
         let reconciler_config = ReconcilerConfigBuilder::new(ReconcilerPriority::Normal).build();
@@ -7392,23 +7393,27 @@ impl Service {
         let mut waiters: Vec<ReconcilerWaiter> = Vec::new();
         let mut tid_iter = create_shared_shard_iterator(self.clone());
 
+        let reset_node_policy_on_cancel = || async {
+            match self
+                .node_configure(node_id, None, Some(policy_on_start))
+                .await
+            {
+                Ok(()) => OperationError::Cancelled,
+                Err(err) => {
+                    OperationError::FinalizeError(
+                        format!(
+                            "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}",
+                            node_id, String::from(policy_on_start), err
+                        )
+                        .into(),
+                    )
+                }
+            }
+        };
+
         while !tid_iter.finished() {
             if cancel.is_cancelled() {
-                match self
-                    .node_configure(node_id, None, Some(policy_on_start))
-                    .await
-                {
-                    Ok(()) => return Err(OperationError::Cancelled),
-                    Err(err) => {
-                        return Err(OperationError::FinalizeError(
-                            format!(
-                                "Failed to finalise delete cancel of {} by setting scheduling policy to {}: {}",
-                                node_id, String::from(policy_on_start), err
-                            )
-                            .into(),
-                        ));
-                    }
-                }
+                return Err(reset_node_policy_on_cancel().await);
             }
 
             operation_utils::validate_node_state(
@@ -7477,8 +7482,18 @@ impl Service {
                         nodes,
                         reconciler_config,
                     );
-                    if let Some(some) = waiter {
-                        waiters.push(some);
+
+                    if force {
+                        // Here we remove an existing observed location for the node we're removing, and it will
+                        // not be re-added by a reconciler's completion because we filter out removed nodes in
+                        // process_result.
+                        //
+                        // Note that we update the shard's observed state _after_ calling maybe_configured_reconcile_shard:
+                        // that means any reconciles we spawned will know about the node we're deleting,
+                        // enabling them to do live migrations if it's still online.
+                        tenant_shard.observed.locations.remove(&node_id);
+                    } else if let Some(waiter) = waiter {
+                        waiters.push(waiter);
                     }
                 }
             }
@@ -7492,21 +7507,7 @@ impl Service {
 
         while !waiters.is_empty() {
             if cancel.is_cancelled() {
-                match self
-                    .node_configure(node_id, None, Some(policy_on_start))
-                    .await
-                {
-                    Ok(()) => return Err(OperationError::Cancelled),
-                    Err(err) => {
-                        return Err(OperationError::FinalizeError(
-                            format!(
-                                "Failed to finalise drain cancel of {} by setting scheduling policy to {}: {}",
-                                node_id, String::from(policy_on_start), err
-                            )
-                            .into(),
-                        ));
-                    }
-                }
+                return Err(reset_node_policy_on_cancel().await);
             }
 
             tracing::info!("Awaiting {} pending delete reconciliations", waiters.len());
@@ -7516,6 +7517,12 @@ impl Service {
                 .await;
         }
 
+        let pf = pausable_failpoint!("delete-node-after-reconciles-spawned", &cancel);
+        if pf.is_err() {
+            // An error from pausable_failpoint indicates the cancel token was triggered.
+            return Err(reset_node_policy_on_cancel().await);
+        }
+
         self.persistence
             .set_tombstone(node_id)
             .await
@@ -8111,6 +8118,7 @@ impl Service {
     pub(crate) async fn start_node_delete(
         self: &Arc<Self>,
         node_id: NodeId,
+        force: bool,
     ) -> Result<(), ApiError> {
         let (ongoing_op, node_policy, schedulable_nodes_count) = {
             let locked = self.inner.read().unwrap();
@@ -8180,7 +8188,7 @@ impl Service {
 
                             tracing::info!("Delete background operation starting");
                             let res = service
-                                .delete_node(node_id, policy_on_start, cancel)
+                                .delete_node(node_id, policy_on_start, force, cancel)
                                 .await;
                             match res {
                                 Ok(()) => {
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ae73ace9bb..86ffa9e4d4 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2119,11 +2119,14 @@ class NeonStorageController(MetricsGetter, LogUtils):
             headers=self.headers(TokenScope.ADMIN),
         )
 
-    def node_delete(self, node_id):
+    def node_delete(self, node_id, force: bool = False):
         log.info(f"node_delete({node_id})")
+        query = f"{self.api}/control/v1/node/{node_id}/delete"
+        if force:
+            query += "?force=true"
         self.request(
             "PUT",
-            f"{self.api}/control/v1/node/{node_id}/delete",
+            query,
             headers=self.headers(TokenScope.ADMIN),
         )
 
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index 10845ef02e..d1e9bbd7dc 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -72,6 +72,12 @@ def get_node_shard_counts(env: NeonEnv, tenant_ids):
     return counts
 
 
+class DeletionAPIKind(Enum):
+    OLD = "old"
+    FORCE = "force"
+    GRACEFUL = "graceful"
+
+
 @pytest.mark.parametrize(**fixtures.utils.allpairs_versions())
 def test_storage_controller_smoke(
     neon_env_builder: NeonEnvBuilder, compute_reconfigure_listener: ComputeReconfigure, combination
@@ -2572,9 +2578,11 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder):
 
 
 @pytest.mark.parametrize("while_offline", [True, False])
+@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE])
 def test_storage_controller_node_deletion(
     neon_env_builder: NeonEnvBuilder,
     while_offline: bool,
+    deletion_api: DeletionAPIKind,
 ):
     """
     Test that deleting a node works & properly reschedules everything that was on the node.
@@ -2598,6 +2606,8 @@ def test_storage_controller_node_deletion(
     assert env.storage_controller.reconcile_all() == 0
 
     victim = env.pageservers[-1]
+    if deletion_api == DeletionAPIKind.FORCE and not while_offline:
+        victim.allowed_errors.append(".*request was dropped before completing.*")
 
     # The procedure a human would follow is:
     # 1. Mark pageserver scheduling=pause
@@ -2621,7 +2631,12 @@ def test_storage_controller_node_deletion(
         wait_until(assert_shards_migrated)
 
     log.info(f"Deleting pageserver {victim.id}")
-    env.storage_controller.node_delete_old(victim.id)
+    if deletion_api == DeletionAPIKind.FORCE:
+        env.storage_controller.node_delete(victim.id, force=True)
+    elif deletion_api == DeletionAPIKind.OLD:
+        env.storage_controller.node_delete_old(victim.id)
+    else:
+        raise AssertionError(f"Invalid deletion API: {deletion_api}")
 
     if not while_offline:
 
@@ -2634,7 +2649,15 @@ def test_storage_controller_node_deletion(
         wait_until(assert_victim_evacuated)
 
     # The node should be gone from the list API
-    assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
+    def assert_node_is_gone():
+        assert victim.id not in [n["id"] for n in env.storage_controller.node_list()]
+
+    if deletion_api == DeletionAPIKind.FORCE:
+        wait_until(assert_node_is_gone)
+    elif deletion_api == DeletionAPIKind.OLD:
+        assert_node_is_gone()
+    else:
+        raise AssertionError(f"Invalid deletion API: {deletion_api}")
 
     # No tenants should refer to the node in their intent
     for tenant_id in tenant_ids:
@@ -2656,7 +2679,11 @@ def test_storage_controller_node_deletion(
     env.storage_controller.consistency_check()
 
 
-def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL])
+def test_storage_controller_node_delete_cancellation(
+    neon_env_builder: NeonEnvBuilder,
+    deletion_api: DeletionAPIKind,
+):
     neon_env_builder.num_pageservers = 3
     neon_env_builder.num_azs = 3
     env = neon_env_builder.init_configs()
@@ -2680,12 +2707,16 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu
     assert len(nodes) == 3
 
     env.storage_controller.configure_failpoints(("sleepy-delete-loop", "return(10000)"))
+    env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "pause"))
 
     ps_id_to_delete = env.pageservers[0].id
 
     env.storage_controller.warm_up_all_secondaries()
+
+    assert deletion_api in [DeletionAPIKind.FORCE, DeletionAPIKind.GRACEFUL]
+    force = deletion_api == DeletionAPIKind.FORCE
     env.storage_controller.retryable_node_operation(
-        lambda ps_id: env.storage_controller.node_delete(ps_id),
+        lambda ps_id: env.storage_controller.node_delete(ps_id, force),
         ps_id_to_delete,
         max_attempts=3,
         backoff=2,
@@ -2701,6 +2732,8 @@ def test_storage_controller_node_delete_cancellation(neon_env_builder: NeonEnvBu
 
     env.storage_controller.cancel_node_delete(ps_id_to_delete)
 
+    env.storage_controller.configure_failpoints(("delete-node-after-reconciles-spawned", "off"))
+
     env.storage_controller.poll_node_status(
         ps_id_to_delete,
         PageserverAvailability.ACTIVE,
@@ -3252,7 +3285,10 @@ def test_storage_controller_ps_restarted_during_drain(neon_env_builder: NeonEnvB
     wait_until(reconfigure_node_again)
 
 
-def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
+@pytest.mark.parametrize("deletion_api", [DeletionAPIKind.OLD, DeletionAPIKind.FORCE])
+def test_ps_unavailable_after_delete(
+    neon_env_builder: NeonEnvBuilder, deletion_api: DeletionAPIKind
+):
     neon_env_builder.num_pageservers = 3
 
     env = neon_env_builder.init_start()
@@ -3265,10 +3301,16 @@ def test_ps_unavailable_after_delete(neon_env_builder: NeonEnvBuilder):
     assert_nodes_count(3)
 
     ps = env.pageservers[0]
-    env.storage_controller.node_delete_old(ps.id)
 
-    # After deletion, the node count must be reduced
-    assert_nodes_count(2)
+    if deletion_api == DeletionAPIKind.FORCE:
+        ps.allowed_errors.append(".*request was dropped before completing.*")
+        env.storage_controller.node_delete(ps.id, force=True)
+        wait_until(lambda: assert_nodes_count(2))
+    elif deletion_api == DeletionAPIKind.OLD:
+        env.storage_controller.node_delete_old(ps.id)
+        assert_nodes_count(2)
+    else:
+        raise AssertionError(f"Invalid deletion API: {deletion_api}")
 
     # Running pageserver CLI init in a separate thread
     with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:

From b7fc5a2fe0b2a2f0e45fc88e026fa8a4b498cb5a Mon Sep 17 00:00:00 2001
From: HaoyuHuang <haoyu.huang.68@gmail.com>
Date: Thu, 17 Jul 2025 06:14:36 -0700
Subject: [PATCH 145/163] A few SC changes (#12615)

## Summary of changes
A bunch of no-op changes.

---------

Co-authored-by: Vlad Lazar <vlad@neon.tech>
---
 Cargo.lock                             |  4 +-
 libs/utils/Cargo.toml                  |  1 +
 libs/utils/src/auth.rs                 | 38 ++++++++++++++-
 pageserver/src/auth.rs                 |  3 +-
 safekeeper/src/auth.rs                 |  3 +-
 storage_controller/Cargo.toml          |  2 +
 storage_controller/src/auth.rs         | 12 +++++
 storage_controller/src/compute_hook.rs | 30 ++++++++++--
 storage_controller/src/metrics.rs      | 64 ++++++++++++++++++++++++++
 storage_controller/src/node.rs         | 55 ++++++++++++++++++++++
 workspace_hack/Cargo.toml              |  1 -
 11 files changed, 203 insertions(+), 10 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index e5f39658a7..215b3360bc 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1872,6 +1872,7 @@ dependencies = [
  "diesel_derives",
  "itoa",
  "serde_json",
+ "uuid",
 ]
 
 [[package]]
@@ -6933,6 +6934,7 @@ dependencies = [
  "tokio-util",
  "tracing",
  "utils",
+ "uuid",
  "workspace_hack",
 ]
 
@@ -8206,6 +8208,7 @@ dependencies = [
  "tracing-error",
  "tracing-subscriber",
  "tracing-utils",
+ "uuid",
  "walkdir",
 ]
 
@@ -8807,7 +8810,6 @@ dependencies = [
  "tracing-log",
  "tracing-subscriber",
  "url",
- "uuid",
  "zeroize",
  "zstd",
  "zstd-safe",
diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml
index 7b1dc56071..4b326949d7 100644
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -47,6 +47,7 @@ tracing-subscriber = { workspace = true, features = ["json", "registry"] }
 tracing-utils.workspace = true
 rand.workspace = true
 scopeguard.workspace = true
+uuid.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 walkdir.workspace = true
diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs
index de3a964d23..b2aade15de 100644
--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -12,7 +12,8 @@ use jsonwebtoken::{
     Algorithm, DecodingKey, EncodingKey, Header, TokenData, Validation, decode, encode,
 };
 use pem::Pem;
-use serde::{Deserialize, Serialize, de::DeserializeOwned};
+use serde::{Deserialize, Deserializer, Serialize, de::DeserializeOwned};
+use uuid::Uuid;
 
 use crate::id::TenantId;
 
@@ -25,6 +26,11 @@ pub enum Scope {
     /// Provides access to all data for a specific tenant (specified in `struct Claims` below)
     // TODO: join these two?
     Tenant,
+    /// Provides access to all data for a specific tenant, but based on endpoint ID. This token scope
+    /// is only used by compute to fetch the spec for a specific endpoint. The spec contains a Tenant-scoped
+    /// token authorizing access to all data of a tenant, so the spec-fetch API requires a TenantEndpoint
+    /// scope token to ensure that untrusted compute nodes can't fetch spec for arbitrary endpoints.
+    TenantEndpoint,
     /// Provides blanket access to all tenants on the pageserver plus pageserver-wide APIs.
     /// Should only be used e.g. for status check/tenant creation/list.
     PageServerApi,
@@ -51,17 +57,43 @@ pub enum Scope {
     ControllerPeer,
 }
 
+fn deserialize_empty_string_as_none_uuid<'de, D>(deserializer: D) -> Result<Option<Uuid>, D::Error>
+where
+    D: Deserializer<'de>,
+{
+    let opt = Option::<String>::deserialize(deserializer)?;
+    match opt.as_deref() {
+        Some("") => Ok(None),
+        Some(s) => Uuid::parse_str(s)
+            .map(Some)
+            .map_err(serde::de::Error::custom),
+        None => Ok(None),
+    }
+}
+
 /// JWT payload. See docs/authentication.md for the format
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq)]
 pub struct Claims {
     #[serde(default)]
     pub tenant_id: Option<TenantId>,
+    #[serde(
+        default,
+        skip_serializing_if = "Option::is_none",
+        // Neon control plane includes this field as empty in the claims.
+        // Consider it None in those cases.
+        deserialize_with = "deserialize_empty_string_as_none_uuid"
+    )]
+    pub endpoint_id: Option<Uuid>,
     pub scope: Scope,
 }
 
 impl Claims {
     pub fn new(tenant_id: Option<TenantId>, scope: Scope) -> Self {
-        Self { tenant_id, scope }
+        Self {
+            tenant_id,
+            scope,
+            endpoint_id: None,
+        }
     }
 }
 
@@ -212,6 +244,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
         let expected_claims = Claims {
             tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
             scope: Scope::Tenant,
+            endpoint_id: None,
         };
 
         // A test token containing the following payload, signed using TEST_PRIV_KEY_ED25519:
@@ -240,6 +273,7 @@ MC4CAQAwBQYDK2VwBCIEID/Drmc1AA6U/znNRWpF3zEGegOATQxfkdWxitcOMsIH
         let claims = Claims {
             tenant_id: Some(TenantId::from_str("3d1f7595b468230304e0b73cecbcb081").unwrap()),
             scope: Scope::Tenant,
+            endpoint_id: None,
         };
 
         let pem = pem::parse(TEST_PRIV_KEY_ED25519).unwrap();
diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs
index 4075427ab4..9e97fdaba8 100644
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -20,7 +20,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
             | Scope::GenerationsApi
             | Scope::Infra
             | Scope::Scrubber
-            | Scope::ControllerPeer,
+            | Scope::ControllerPeer
+            | Scope::TenantEndpoint,
             _,
         ) => Err(AuthError(
             format!(
diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs
index 81c79fae30..008f903a89 100644
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -21,7 +21,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
             | Scope::GenerationsApi
             | Scope::Infra
             | Scope::Scrubber
-            | Scope::ControllerPeer,
+            | Scope::ControllerPeer
+            | Scope::TenantEndpoint,
             _,
         ) => Err(AuthError(
             format!(
diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml
index 143f4241f4..d67be6d469 100644
--- a/storage_controller/Cargo.toml
+++ b/storage_controller/Cargo.toml
@@ -52,6 +52,7 @@ tokio-rustls.workspace = true
 tokio-util.workspace = true
 tokio.workspace = true
 tracing.workspace = true
+uuid.workspace = true
 measured.workspace = true
 rustls.workspace = true
 scopeguard.workspace = true
@@ -63,6 +64,7 @@ tokio-postgres-rustls.workspace = true
 diesel = { version = "2.2.6", features = [
     "serde_json",
     "chrono",
+    "uuid",
 ] }
 diesel-async = { version = "0.5.2", features = ["postgres", "bb8", "async-connection-wrapper"] }
 diesel_migrations = { version = "2.2.0" }
diff --git a/storage_controller/src/auth.rs b/storage_controller/src/auth.rs
index ef47abf8c7..8f15f0f072 100644
--- a/storage_controller/src/auth.rs
+++ b/storage_controller/src/auth.rs
@@ -1,4 +1,5 @@
 use utils::auth::{AuthError, Claims, Scope};
+use uuid::Uuid;
 
 pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), AuthError> {
     if claims.scope != required_scope {
@@ -7,3 +8,14 @@ pub fn check_permission(claims: &Claims, required_scope: Scope) -> Result<(), Au
 
     Ok(())
 }
+
+#[allow(dead_code)]
+pub fn check_endpoint_permission(claims: &Claims, endpoint_id: Uuid) -> Result<(), AuthError> {
+    if claims.scope != Scope::TenantEndpoint {
+        return Err(AuthError("Scope mismatch. Permission denied".into()));
+    }
+    if claims.endpoint_id != Some(endpoint_id) {
+        return Err(AuthError("Endpoint id mismatch. Permission denied".into()));
+    }
+    Ok(())
+}
diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs
index ab37a207e4..fb03412f3c 100644
--- a/storage_controller/src/compute_hook.rs
+++ b/storage_controller/src/compute_hook.rs
@@ -810,6 +810,7 @@ impl ComputeHook {
                 let send_locked = tokio::select! {
                     guard = send_lock.lock_owned() => {guard},
                     _ = cancel.cancelled() => {
+                        tracing::info!("Notification cancelled while waiting for lock");
                         return Err(NotifyError::ShuttingDown)
                     }
                 };
@@ -851,11 +852,32 @@ impl ComputeHook {
             let notify_url = compute_hook_url.as_ref().unwrap();
             self.do_notify(notify_url, &request, cancel).await
         } else {
-            self.do_notify_local::<M>(&request).await.map_err(|e| {
+            match self.do_notify_local::<M>(&request).await.map_err(|e| {
                 // This path is for testing only, so munge the error into our prod-style error type.
-                tracing::error!("neon_local notification hook failed: {e}");
-                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
-            })
+                if e.to_string().contains("refresh-configuration-pending") {
+                    // If the error message mentions "refresh-configuration-pending", it means the compute node
+                    // rejected our notification request because it already trying to reconfigure itself. We
+                    // can proceed with the rest of the reconcliation process as the compute node already
+                    // discovers the need to reconfigure and will eventually update its configuration once
+                    // we update the pageserver mappings. In fact, it is important that we continue with
+                    // reconcliation to make sure we update the pageserver mappings to unblock the compute node.
+                    tracing::info!("neon_local notification hook failed: {e}");
+                    tracing::info!("Notification failed likely due to compute node self-reconfiguration, will retry.");
+                    Ok(())
+                } else {
+                    tracing::error!("neon_local notification hook failed: {e}");
+                    Err(NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR))
+                }
+            }) {
+                // Compute node accepted the notification request. Ok to proceed.
+                Ok(_) => Ok(()),
+                // Compute node rejected our request but it is already self-reconfiguring. Ok to proceed.
+                Err(Ok(_)) => Ok(()),
+                // Fail the reconciliation attempt in all other cases. Recall that this whole code path involving
+                // neon_local is for testing only. In production we always retry failed reconcliations so we
+                // don't have any deadends here.
+                Err(Err(e)) => Err(e),
+            }
         };
 
         match result {
diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 8738386968..0c923e742e 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -151,6 +151,29 @@ pub(crate) struct StorageControllerMetricGroup {
     /// Indicator of completed safekeeper reconciles, broken down by safekeeper.
     pub(crate) storage_controller_safekeeper_reconciles_complete:
         measured::CounterVec<SafekeeperReconcilerLabelGroupSet>,
+
+    /* BEGIN HADRON */
+    /// Hadron `config_watcher` reconciliation runs completed, broken down by success/failure.
+    pub(crate) storage_controller_config_watcher_complete:
+        measured::CounterVec<ConfigWatcherCompleteLabelGroupSet>,
+
+    /// Hadron long waits for node state changes during drain and fill.
+    pub(crate) storage_controller_drain_and_fill_long_waits: measured::Counter,
+
+    /// Set to 1 if we detect any page server pods with pending node pool rotation annotations.
+    /// Requires manual reset after oncall investigation.
+    pub(crate) storage_controller_ps_node_pool_rotation_pending: measured::Gauge,
+
+    /// Hadron storage scrubber status.
+    pub(crate) storage_controller_storage_scrub_status:
+        measured::CounterVec<StorageScrubberLabelGroupSet>,
+
+    /// Desired number of pageservers managed by the storage controller
+    pub(crate) storage_controller_num_pageservers_desired: measured::Gauge,
+
+    /// Desired number of safekeepers managed by the storage controller
+    pub(crate) storage_controller_num_safekeeper_desired: measured::Gauge,
+    /* END HADRON */
 }
 
 impl StorageControllerMetrics {
@@ -173,6 +196,10 @@ impl Default for StorageControllerMetrics {
             .storage_controller_reconcile_complete
             .init_all_dense();
 
+        metrics_group
+            .storage_controller_config_watcher_complete
+            .init_all_dense();
+
         Self {
             metrics_group,
             encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
@@ -262,11 +289,48 @@ pub(crate) struct ReconcileLongRunningLabelGroup<'a> {
     pub(crate) sequence: &'a str,
 }
 
+#[derive(measured::LabelGroup, Clone)]
+#[label(set = StorageScrubberLabelGroupSet)]
+pub(crate) struct StorageScrubberLabelGroup<'a> {
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) tenant_id: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) shard_number: &'a str,
+    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    pub(crate) timeline_id: &'a str,
+    pub(crate) outcome: StorageScrubberOutcome,
+}
+
+#[derive(FixedCardinalityLabel, Clone, Copy)]
+pub(crate) enum StorageScrubberOutcome {
+    PSOk,
+    PSWarning,
+    PSError,
+    PSOrphan,
+    SKOk,
+    SKError,
+}
+
+#[derive(measured::LabelGroup)]
+#[label(set = ConfigWatcherCompleteLabelGroupSet)]
+pub(crate) struct ConfigWatcherCompleteLabelGroup {
+    // Reuse the ReconcileOutcome from the SC's reconciliation metrics.
+    pub(crate) status: ReconcileOutcome,
+}
+
 #[derive(FixedCardinalityLabel, Clone, Copy)]
 pub(crate) enum ReconcileOutcome {
+    // Successfully reconciled everything.
     #[label(rename = "ok")]
     Success,
+    // Used by tenant-shard reconciler only. Reconciled pageserver state successfully,
+    // but failed to delivery the compute notificiation. This error is typically transient
+    // but if its occurance keeps increasing, it should be investigated.
+    #[label(rename = "ok_no_notify")]
+    SuccessNoNotify,
+    // We failed to reconcile some state and the reconcilation will be retried.
     Error,
+    // Reconciliation was cancelled.
     Cancel,
 }
 
diff --git a/storage_controller/src/node.rs b/storage_controller/src/node.rs
index 6642c72f3c..63c82b5682 100644
--- a/storage_controller/src/node.rs
+++ b/storage_controller/src/node.rs
@@ -51,6 +51,39 @@ pub(crate) struct Node {
     cancel: CancellationToken,
 }
 
+#[allow(dead_code)]
+const ONE_MILLION: i64 = 1000000;
+
+// Converts a pool ID to a large number that can be used to assign unique IDs to pods in StatefulSets.
+/// For example, if pool_id is 1, then the pods have NodeIds 1000000, 1000001, 1000002, etc.
+/// If pool_id is None, then the pods have NodeIds 0, 1, 2, etc.
+#[allow(dead_code)]
+pub fn transform_pool_id(pool_id: Option<i32>) -> i64 {
+    match pool_id {
+        Some(id) => (id as i64) * ONE_MILLION,
+        None => 0,
+    }
+}
+
+#[allow(dead_code)]
+pub fn get_pool_id_from_node_id(node_id: i64) -> i32 {
+    (node_id / ONE_MILLION) as i32
+}
+
+/// Example pod name: page-server-0-1, safe-keeper-1-0
+#[allow(dead_code)]
+pub fn get_node_id_from_pod_name(pod_name: &str) -> anyhow::Result<NodeId> {
+    let parts: Vec<&str> = pod_name.split('-').collect();
+    if parts.len() != 4 {
+        return Err(anyhow::anyhow!("Invalid pod name: {}", pod_name));
+    }
+    let pool_id = parts[2].parse::<i32>()?;
+    let node_offset = parts[3].parse::<i64>()?;
+    let node_id = transform_pool_id(Some(pool_id)) + node_offset;
+
+    Ok(NodeId(node_id as u64))
+}
+
 /// When updating [`Node::availability`] we use this type to indicate to the caller
 /// whether/how they changed it.
 pub(crate) enum AvailabilityTransition {
@@ -403,3 +436,25 @@ impl std::fmt::Debug for Node {
         write!(f, "{} ({})", self.id, self.listen_http_addr)
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use utils::id::NodeId;
+
+    use crate::node::get_node_id_from_pod_name;
+
+    #[test]
+    fn test_get_node_id_from_pod_name() {
+        let pod_name = "page-server-3-12";
+        let node_id = get_node_id_from_pod_name(pod_name).unwrap();
+        assert_eq!(node_id, NodeId(3000012));
+
+        let pod_name = "safe-keeper-1-0";
+        let node_id = get_node_id_from_pod_name(pod_name).unwrap();
+        assert_eq!(node_id, NodeId(1000000));
+
+        let pod_name = "invalid-pod-name";
+        let result = get_node_id_from_pod_name(pod_name);
+        assert!(result.is_err());
+    }
+}
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index c61598cdf6..d6d64a2045 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -107,7 +107,6 @@ tracing-core = { version = "0.1" }
 tracing-log = { version = "0.2" }
 tracing-subscriber = { version = "0.3", features = ["env-filter", "json"] }
 url = { version = "2", features = ["serde"] }
-uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zeroize = { version = "1", features = ["derive", "serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }

From 8862e7c4bf16a77bd9c354f4f94e5625c86b302f Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 17 Jul 2025 14:20:40 +0100
Subject: [PATCH 146/163] tests: use new snapshot in test_forward_compat
 (#12637)

## Problem

The forward compatibility test is erroneously
using the downloaded (old) compatibility data. This test is meant to
test that old binaries can work with **new** data. Using the old
compatibility data renders this test useless.

## Summary of changes

Use new snapshot in test_forward_compat

Closes LKB-666

Co-authored-by: William Huang <william.huang@databricks.com>
---
 test_runner/regress/test_compatibility.py | 24 +++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index a4d2bf8d9b..a3a20cdc62 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -187,19 +187,21 @@ def test_create_snapshot(
     env.pageserver.stop()
     env.storage_controller.stop()
 
-    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
-    compatibility_snapshot_dir = (
+    # Directory `new_compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
+    new_compatibility_snapshot_dir = (
         top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
     )
-    if compatibility_snapshot_dir.exists():
-        shutil.rmtree(compatibility_snapshot_dir)
+    if new_compatibility_snapshot_dir.exists():
+        shutil.rmtree(new_compatibility_snapshot_dir)
 
     shutil.copytree(
         test_output_dir,
-        compatibility_snapshot_dir,
+        new_compatibility_snapshot_dir,
         ignore=shutil.ignore_patterns("pg_dynshmem"),
     )
 
+    log.info(f"Copied new compatibility snapshot dir to: {new_compatibility_snapshot_dir}")
+
 
 # check_neon_works does recovery from WAL => the compatibility snapshot's WAL is old => will log this warning
 ingest_lag_log_line = ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*"
@@ -218,6 +220,7 @@ def test_backward_compatibility(
     """
     Test that the new binaries can read old data
     """
+    log.info(f"Using snapshot dir at {compatibility_snapshot_dir}")
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.from_repo_dir(compatibility_snapshot_dir / "repo")
     env.pageserver.allowed_errors.append(ingest_lag_log_line)
@@ -242,7 +245,6 @@ def test_forward_compatibility(
     test_output_dir: Path,
     top_output_dir: Path,
     pg_version: PgVersion,
-    compatibility_snapshot_dir: Path,
     compute_reconfigure_listener: ComputeReconfigure,
 ):
     """
@@ -266,8 +268,14 @@ def test_forward_compatibility(
     neon_env_builder.neon_binpath = neon_env_builder.compatibility_neon_binpath
     neon_env_builder.pg_distrib_dir = neon_env_builder.compatibility_pg_distrib_dir
 
+    # Note that we are testing with new data, so we should use `new_compatibility_snapshot_dir`, which is created by test_create_snapshot.
+    new_compatibility_snapshot_dir = (
+        top_output_dir / f"compatibility_snapshot_pg{pg_version.v_prefixed}"
+    )
+
+    log.info(f"Using snapshot dir at {new_compatibility_snapshot_dir}")
     env = neon_env_builder.from_repo_dir(
-        compatibility_snapshot_dir / "repo",
+        new_compatibility_snapshot_dir / "repo",
     )
     # there may be an arbitrary number of unrelated tests run between create_snapshot and here
     env.pageserver.allowed_errors.append(ingest_lag_log_line)
@@ -296,7 +304,7 @@ def test_forward_compatibility(
     check_neon_works(
         env,
         test_output_dir=test_output_dir,
-        sql_dump_path=compatibility_snapshot_dir / "dump.sql",
+        sql_dump_path=new_compatibility_snapshot_dir / "dump.sql",
         repo_dir=env.repo_dir,
     )
 

From f0c0733a64889e0e9291b08fcc471ecb502540c1 Mon Sep 17 00:00:00 2001
From: Aleksandr Sarantsev <99037063+ephemeralsad@users.noreply.github.com>
Date: Thu, 17 Jul 2025 18:52:57 +0400
Subject: [PATCH 147/163] storcon: Ignore stuck reconciles when considering
 optimizations (#12589)

## Problem

The `keep_failing_reconciles` counter was introduced in #12391, but
there is a special case:

> if a reconciliation loop claims to have succeeded, but maybe_reconcile
still thinks the tenant is in need of reconciliation, then that's a
probable bug and we should activate a similar backoff to prevent
flapping.

This PR redefines "flapping" to include not just repeated failures, but
also consecutive reconciliations of any kind (success or failure).

## Summary of Changes

- Replace `keep_failing_reconciles` with a new `stuck_reconciles` metric
- Replace `MAX_CONSECUTIVE_RECONCILIATION_ERRORS` with
`MAX_CONSECUTIVE_RECONCILES`, and increasing that from 5 to 10
- Increment the consecutive reconciles counter for all reconciles, not
just failures
- Reset the counter in `reconcile_all` when no reconcile is needed for a
shard
- Improve and fix the related test

---------

Co-authored-by: Aleksandr Sarantsev <aleksandr.sarantsev@databricks.com>
---
 storage_controller/src/metrics.rs             |  4 +-
 storage_controller/src/service.rs             | 61 +++++++++----------
 storage_controller/src/tenant_shard.rs        | 18 +++---
 .../regress/test_storage_controller.py        |  8 ++-
 4 files changed, 46 insertions(+), 45 deletions(-)

diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs
index 0c923e742e..9c34b34044 100644
--- a/storage_controller/src/metrics.rs
+++ b/storage_controller/src/metrics.rs
@@ -76,8 +76,8 @@ pub(crate) struct StorageControllerMetricGroup {
     /// How many shards would like to reconcile but were blocked by concurrency limits
     pub(crate) storage_controller_pending_reconciles: measured::Gauge,
 
-    /// How many shards are keep-failing and will be ignored when considering to run optimizations
-    pub(crate) storage_controller_keep_failing_reconciles: measured::Gauge,
+    /// How many shards are stuck and will be ignored when considering to run optimizations
+    pub(crate) storage_controller_stuck_reconciles: measured::Gauge,
 
     /// HTTP request status counters for handled requests
     pub(crate) storage_controller_http_request_status:
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index b315b88fcc..ec3b419437 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -232,9 +232,9 @@ pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
 
-// Number of consecutive reconciliation errors, occured for one shard,
+// Number of consecutive reconciliations that have occurred for one shard,
 // after which the shard is ignored when considering to run optimizations.
-const MAX_CONSECUTIVE_RECONCILIATION_ERRORS: usize = 5;
+const MAX_CONSECUTIVE_RECONCILES: usize = 10;
 
 // Depth of the channel used to enqueue shards for reconciliation when they can't do it immediately.
 // This channel is finite-size to avoid using excessive memory if we get into a state where reconciles are finishing more slowly
@@ -735,31 +735,31 @@ struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
 
 struct ReconcileAllResult {
     spawned_reconciles: usize,
-    keep_failing_reconciles: usize,
+    stuck_reconciles: usize,
     has_delayed_reconciles: bool,
 }
 
 impl ReconcileAllResult {
     fn new(
         spawned_reconciles: usize,
-        keep_failing_reconciles: usize,
+        stuck_reconciles: usize,
         has_delayed_reconciles: bool,
     ) -> Self {
         assert!(
-            spawned_reconciles >= keep_failing_reconciles,
-            "It is impossible to have more keep-failing reconciles than spawned reconciles"
+            spawned_reconciles >= stuck_reconciles,
+            "It is impossible to have less spawned reconciles than stuck reconciles"
         );
         Self {
             spawned_reconciles,
-            keep_failing_reconciles,
+            stuck_reconciles,
             has_delayed_reconciles,
         }
     }
 
     /// We can run optimizations only if we don't have any delayed reconciles and
-    /// all spawned reconciles are also keep-failing reconciles.
+    /// all spawned reconciles are also stuck reconciles.
     fn can_run_optimizations(&self) -> bool {
-        !self.has_delayed_reconciles && self.spawned_reconciles == self.keep_failing_reconciles
+        !self.has_delayed_reconciles && self.spawned_reconciles == self.stuck_reconciles
     }
 }
 
@@ -1503,7 +1503,6 @@ impl Service {
 
         match result.result {
             Ok(()) => {
-                tenant.consecutive_errors_count = 0;
                 tenant.apply_observed_deltas(deltas);
                 tenant.waiter.advance(result.sequence);
             }
@@ -1522,8 +1521,6 @@ impl Service {
                     }
                 }
 
-                tenant.consecutive_errors_count = tenant.consecutive_errors_count.saturating_add(1);
-
                 // Ordering: populate last_error before advancing error_seq,
                 // so that waiters will see the correct error after waiting.
                 tenant.set_last_error(result.sequence, e);
@@ -1535,6 +1532,8 @@ impl Service {
             }
         }
 
+        tenant.consecutive_reconciles_count = tenant.consecutive_reconciles_count.saturating_add(1);
+
         // If we just finished detaching all shards for a tenant, it might be time to drop it from memory.
         if tenant.policy == PlacementPolicy::Detached {
             // We may only drop a tenant from memory while holding the exclusive lock on the tenant ID: this protects us
@@ -8640,7 +8639,7 @@ impl Service {
         // This function is an efficient place to update lazy statistics, since we are walking
         // all tenants.
         let mut pending_reconciles = 0;
-        let mut keep_failing_reconciles = 0;
+        let mut stuck_reconciles = 0;
         let mut az_violations = 0;
 
         // If we find any tenants to drop from memory, stash them to offload after
@@ -8676,30 +8675,32 @@ impl Service {
 
             // Eventual consistency: if an earlier reconcile job failed, and the shard is still
             // dirty, spawn another one
-            let consecutive_errors_count = shard.consecutive_errors_count;
             if self
                 .maybe_reconcile_shard(shard, &pageservers, ReconcilerPriority::Normal)
                 .is_some()
             {
                 spawned_reconciles += 1;
 
-                // Count shards that are keep-failing. We still want to reconcile them
-                // to avoid a situation where a shard is stuck.
-                // But we don't want to consider them when deciding to run optimizations.
-                if consecutive_errors_count >= MAX_CONSECUTIVE_RECONCILIATION_ERRORS {
+                if shard.consecutive_reconciles_count >= MAX_CONSECUTIVE_RECONCILES {
+                    // Count shards that are stuck, butwe still want to reconcile them.
+                    // We don't want to consider them when deciding to run optimizations.
                     tracing::warn!(
                         tenant_id=%shard.tenant_shard_id.tenant_id,
                         shard_id=%shard.tenant_shard_id.shard_slug(),
-                        "Shard reconciliation is keep-failing: {} errors",
-                        consecutive_errors_count
+                        "Shard reconciliation is stuck: {} consecutive launches",
+                        shard.consecutive_reconciles_count
                     );
-                    keep_failing_reconciles += 1;
+                    stuck_reconciles += 1;
+                }
+            } else {
+                if shard.delayed_reconcile {
+                    // Shard wanted to reconcile but for some reason couldn't.
+                    pending_reconciles += 1;
                 }
-            } else if shard.delayed_reconcile {
-                // Shard wanted to reconcile but for some reason couldn't.
-                pending_reconciles += 1;
-            }
 
+                // Reset the counter when we don't need to launch a reconcile.
+                shard.consecutive_reconciles_count = 0;
+            }
             // If this tenant is detached, try dropping it from memory. This is usually done
             // proactively in [`Self::process_results`], but we do it here to handle the edge
             // case where a reconcile completes while someone else is holding an op lock for the tenant.
@@ -8735,14 +8736,10 @@ impl Service {
 
         metrics::METRICS_REGISTRY
             .metrics_group
-            .storage_controller_keep_failing_reconciles
-            .set(keep_failing_reconciles as i64);
+            .storage_controller_stuck_reconciles
+            .set(stuck_reconciles as i64);
 
-        ReconcileAllResult::new(
-            spawned_reconciles,
-            keep_failing_reconciles,
-            has_delayed_reconciles,
-        )
+        ReconcileAllResult::new(spawned_reconciles, stuck_reconciles, has_delayed_reconciles)
     }
 
     /// `optimize` in this context means identifying shards which have valid scheduled locations, but
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 99079c57b0..05de155963 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -131,14 +131,16 @@ pub(crate) struct TenantShard {
     #[serde(serialize_with = "read_last_error")]
     pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
 
-    /// Number of consecutive reconciliation errors that have occurred for this shard.
+    /// Amount of consecutive [`crate::service::Service::reconcile_all`] iterations that have been
+    /// scheduled a reconciliation for this shard.
     ///
-    /// When this count reaches MAX_CONSECUTIVE_RECONCILIATION_ERRORS, the tenant shard
-    /// will be countered as keep-failing in `reconcile_all` calculations. This will lead to
-    /// allowing optimizations to run even with some failing shards.
+    /// If this reaches `MAX_CONSECUTIVE_RECONCILES`, the shard is considered "stuck" and will be
+    /// ignored when deciding whether optimizations can run. This includes both successful and failed
+    /// reconciliations.
     ///
-    /// The counter is reset to 0 after a successful reconciliation.
-    pub(crate) consecutive_errors_count: usize,
+    /// Incremented in [`crate::service::Service::process_result`], and reset to 0 when
+    /// [`crate::service::Service::reconcile_all`] determines no reconciliation is needed for this shard.
+    pub(crate) consecutive_reconciles_count: usize,
 
     /// If we have a pending compute notification that for some reason we weren't able to send,
     /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
@@ -603,7 +605,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence(0))),
             error_waiter: Arc::new(SeqWait::new(Sequence(0))),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             scheduling_policy: ShardSchedulingPolicy::default(),
             preferred_node: None,
@@ -1908,7 +1910,7 @@ impl TenantShard {
             waiter: Arc::new(SeqWait::new(Sequence::initial())),
             error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
             last_error: Arc::default(),
-            consecutive_errors_count: 0,
+            consecutive_reconciles_count: 0,
             pending_compute_notification: false,
             delayed_reconcile: false,
             scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index d1e9bbd7dc..fbdb14b6bb 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -996,7 +996,7 @@ def test_storage_controller_compute_hook_retry(
 
 
 @run_only_on_default_postgres("postgres behavior is not relevant")
-def test_storage_controller_compute_hook_keep_failing(
+def test_storage_controller_compute_hook_stuck_reconciles(
     httpserver: HTTPServer,
     neon_env_builder: NeonEnvBuilder,
     httpserver_listen_address: ListenAddress,
@@ -1046,7 +1046,7 @@ def test_storage_controller_compute_hook_keep_failing(
     env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG)
     env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS)
     env.storage_controller.allowed_errors.append(".*Keeping extra secondaries.*")
-    env.storage_controller.allowed_errors.append(".*Shard reconciliation is keep-failing.*")
+    env.storage_controller.allowed_errors.append(".*Shard reconciliation is stuck.*")
     env.storage_controller.node_configure(banned_tenant_ps.id, {"availability": "Offline"})
 
     # Migrate all allowed tenant shards to the first alive pageserver
@@ -1061,7 +1061,7 @@ def test_storage_controller_compute_hook_keep_failing(
 
     # Make some reconcile_all calls to trigger optimizations
     # RECONCILE_COUNT must be greater than storcon's MAX_CONSECUTIVE_RECONCILIATION_ERRORS
-    RECONCILE_COUNT = 12
+    RECONCILE_COUNT = 20
     for i in range(RECONCILE_COUNT):
         try:
             n = env.storage_controller.reconcile_all()
@@ -1074,6 +1074,8 @@ def test_storage_controller_compute_hook_keep_failing(
         assert banned_descr["shards"][0]["is_pending_compute_notification"] is True
         time.sleep(2)
 
+    env.storage_controller.assert_log_contains(".*Shard reconciliation is stuck.*")
+
     # Check that the allowed tenant shards are optimized due to affinity rules
     locations = alive_pageservers[0].http_client().tenant_list_locations()["tenant_shards"]
     not_optimized_shard_count = 0

From b309cbc6e9d2cf540f5c081969bbc62f34351f73 Mon Sep 17 00:00:00 2001
From: quantumish <freifeld.david@gmail.com>
Date: Thu, 17 Jul 2025 10:40:53 -0700
Subject: [PATCH 148/163] Add resizable hashmap and RwLock implementations to
 `neon-shmem` (#12596)

Second PR for the hashmap behind the updated LFC implementation ([see
first here](https://github.com/neondatabase/neon/pull/12595)). This only
adds the raw code for the hashmap/lock implementations and doesn't plug
it into the crate (that's dependent on the previous PR and should
probably be done when the full integration into the new communicator is
merged alongside `communicator-rewrite` changes?).

Some high level details: the communicator codebase expects to be able to
store references to entries within this hashmap for arbitrary periods of
time and so the hashmap cannot be allowed to move them during a rehash.
As a result, this implementation has a slightly unusual structure where
key-value pairs (and hash chains) are allocated in a separate region
with a freelist. The core hashmap structure is then an array of
"dictionary entries" that are just indexes into this region of key-value
pairs.

Concurrency support is very naive at the moment with the entire map
guarded by one big `RwLock` (which is implemented on top of a
`pthread_rwlock_t` since Rust doesn't guarantee that a
`std::sync::RwLock` is safe to use in shared memory). This (along with a
lot of other things) is being changed on the
`quantumish/lfc-resizable-map` branch.
---
 Cargo.lock                        |  90 ++++-
 Cargo.toml                        |   3 +-
 libs/neon-shmem/Cargo.toml        |   7 +
 libs/neon-shmem/src/hash.rs       | 583 ++++++++++++++++++++++++++++++
 libs/neon-shmem/src/hash/core.rs  | 174 +++++++++
 libs/neon-shmem/src/hash/entry.rs | 130 +++++++
 libs/neon-shmem/src/hash/tests.rs | 428 ++++++++++++++++++++++
 libs/neon-shmem/src/lib.rs        |   2 +
 libs/neon-shmem/src/sync.rs       | 111 ++++++
 9 files changed, 1522 insertions(+), 6 deletions(-)
 create mode 100644 libs/neon-shmem/src/hash.rs
 create mode 100644 libs/neon-shmem/src/hash/core.rs
 create mode 100644 libs/neon-shmem/src/hash/entry.rs
 create mode 100644 libs/neon-shmem/src/hash/tests.rs
 create mode 100644 libs/neon-shmem/src/sync.rs

diff --git a/Cargo.lock b/Cargo.lock
index 215b3360bc..137b883a6d 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2534,6 +2534,18 @@ dependencies = [
  "wasm-bindgen",
 ]
 
+[[package]]
+name = "getrandom"
+version = "0.3.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26145e563e54f2cadc477553f1ec5ee650b00862f0a58bcd12cbdc5f0ea2d2f4"
+dependencies = [
+ "cfg-if",
+ "libc",
+ "r-efi",
+ "wasi 0.14.2+wasi-0.2.4",
+]
+
 [[package]]
 name = "gettid"
 version = "0.1.3"
@@ -3607,9 +3619,9 @@ checksum = "4ee93343901ab17bd981295f2cf0026d4ad018c7c31ba84549a4ddbb47a45104"
 
 [[package]]
 name = "lock_api"
-version = "0.4.10"
+version = "0.4.13"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c1cc9717a20b1bb222f333e6a92fd32f7d8a18ddc5a3191a11af45dcbf4dcd16"
+checksum = "96936507f153605bddfcda068dd804796c84324ed2510809e5b2a624c81da765"
 dependencies = [
  "autocfg",
  "scopeguard",
@@ -3759,7 +3771,7 @@ dependencies = [
  "procfs",
  "prometheus",
  "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
  "twox-hash",
 ]
 
@@ -3847,7 +3859,12 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
 name = "neon-shmem"
 version = "0.1.0"
 dependencies = [
+ "libc",
+ "lock_api",
  "nix 0.30.1",
+ "rand 0.9.1",
+ "rand_distr 0.5.1",
+ "rustc-hash 2.1.1",
  "tempfile",
  "thiserror 1.0.69",
  "workspace_hack",
@@ -5348,7 +5365,7 @@ dependencies = [
  "postgres_backend",
  "pq_proto",
  "rand 0.8.5",
- "rand_distr",
+ "rand_distr 0.4.3",
  "rcgen",
  "redis",
  "regex",
@@ -5359,7 +5376,7 @@ dependencies = [
  "reqwest-tracing",
  "rsa",
  "rstest",
- "rustc-hash 1.1.0",
+ "rustc-hash 2.1.1",
  "rustls 0.23.27",
  "rustls-native-certs 0.8.0",
  "rustls-pemfile 2.1.1",
@@ -5452,6 +5469,12 @@ dependencies = [
  "proc-macro2",
 ]
 
+[[package]]
+name = "r-efi"
+version = "5.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
+
 [[package]]
 name = "rand"
 version = "0.7.3"
@@ -5476,6 +5499,16 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
+dependencies = [
+ "rand_chacha 0.9.0",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_chacha"
 version = "0.2.2"
@@ -5496,6 +5529,16 @@ dependencies = [
  "rand_core 0.6.4",
 ]
 
+[[package]]
+name = "rand_chacha"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb"
+dependencies = [
+ "ppv-lite86",
+ "rand_core 0.9.3",
+]
+
 [[package]]
 name = "rand_core"
 version = "0.5.1"
@@ -5514,6 +5557,15 @@ dependencies = [
  "getrandom 0.2.11",
 ]
 
+[[package]]
+name = "rand_core"
+version = "0.9.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "99d9a13982dcf210057a8a78572b2217b667c3beacbf3a0d8b454f6f82837d38"
+dependencies = [
+ "getrandom 0.3.3",
+]
+
 [[package]]
 name = "rand_distr"
 version = "0.4.3"
@@ -5524,6 +5576,16 @@ dependencies = [
  "rand 0.8.5",
 ]
 
+[[package]]
+name = "rand_distr"
+version = "0.5.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a8615d50dcf34fa31f7ab52692afec947c4dd0ab803cc87cb3b0b4570ff7463"
+dependencies = [
+ "num-traits",
+ "rand 0.9.1",
+]
+
 [[package]]
 name = "rand_hc"
 version = "0.2.0"
@@ -8351,6 +8413,15 @@ version = "0.11.0+wasi-snapshot-preview1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
 
+[[package]]
+name = "wasi"
+version = "0.14.2+wasi-0.2.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "9683f9a5a998d873c0d21fcbe3c083009670149a8fab228644b8bd36b2c48cb3"
+dependencies = [
+ "wit-bindgen-rt",
+]
+
 [[package]]
 name = "wasite"
 version = "0.1.0"
@@ -8708,6 +8779,15 @@ dependencies = [
  "windows-sys 0.48.0",
 ]
 
+[[package]]
+name = "wit-bindgen-rt"
+version = "0.39.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6f42320e61fe2cfd34354ecb597f86f413484a798ba44a8ca1165c58d42da6c1"
+dependencies = [
+ "bitflags 2.8.0",
+]
+
 [[package]]
 name = "workspace_hack"
 version = "0.1.0"
diff --git a/Cargo.toml b/Cargo.toml
index df2064a4a7..6d91262882 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -130,6 +130,7 @@ jemalloc_pprof = { version = "0.7", features = ["symbolize", "flamegraph"] }
 jsonwebtoken = "9"
 lasso = "0.7"
 libc = "0.2"
+lock_api = "0.4.13"
 md5 = "0.7.0"
 measured = { version = "0.0.22", features=["lasso"] }
 measured-process = { version = "0.0.22" }
@@ -165,7 +166,7 @@ reqwest-middleware = "0.4"
 reqwest-retry = "0.7"
 routerify = "3"
 rpds = "0.13"
-rustc-hash = "1.1.0"
+rustc-hash = "2.1.1"
 rustls = { version = "0.23.16", default-features = false }
 rustls-pemfile = "2"
 rustls-pki-types = "1.11"
diff --git a/libs/neon-shmem/Cargo.toml b/libs/neon-shmem/Cargo.toml
index 2a636bec40..7ed991502e 100644
--- a/libs/neon-shmem/Cargo.toml
+++ b/libs/neon-shmem/Cargo.toml
@@ -8,6 +8,13 @@ license.workspace = true
 thiserror.workspace = true
 nix.workspace=true
 workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+libc.workspace = true
+lock_api.workspace = true
+rustc-hash.workspace = true
 
 [target.'cfg(target_os = "macos")'.dependencies]
 tempfile = "3.14.0"
+
+[dev-dependencies]
+rand = "0.9"
+rand_distr = "0.5.1"
diff --git a/libs/neon-shmem/src/hash.rs b/libs/neon-shmem/src/hash.rs
new file mode 100644
index 0000000000..58726b9ba3
--- /dev/null
+++ b/libs/neon-shmem/src/hash.rs
@@ -0,0 +1,583 @@
+//! Resizable hash table implementation on top of byte-level storage (either a [`ShmemHandle`] or a fixed byte array).
+//!
+//! This hash table has two major components: the bucket array and the dictionary. Each bucket within the
+//! bucket array contains a `Option<(K, V)>` and an index of another bucket. In this way there is both an
+//! implicit freelist within the bucket array (`None` buckets point to other `None` entries) and various hash
+//! chains within the bucket array (a Some bucket will point to other Some buckets that had the same hash).
+//!
+//! Buckets are never moved unless they are within a region that is being shrunk, and so the actual hash-
+//! dependent component is done with the dictionary. When a new key is inserted into the map, a position
+//! within the dictionary is decided based on its hash, the data is inserted into an empty bucket based
+//! off of the freelist, and then the index of said bucket is placed in the dictionary.
+//!
+//! This map is resizable (if initialized on top of a [`ShmemHandle`]). Both growing and shrinking happen
+//! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
+//! dictionary by rehashing all keys.
+//!
+//! Concurrency is managed very simply: the entire map is guarded by one shared-memory RwLock.
+
+use std::hash::{BuildHasher, Hash};
+use std::mem::MaybeUninit;
+
+use crate::shmem::ShmemHandle;
+use crate::{shmem, sync::*};
+
+mod core;
+pub mod entry;
+
+#[cfg(test)]
+mod tests;
+
+use core::{Bucket, CoreHashMap, INVALID_POS};
+use entry::{Entry, OccupiedEntry, PrevPos, VacantEntry};
+
+use thiserror::Error;
+
+/// Error type for a hashmap shrink operation.
+#[derive(Error, Debug)]
+pub enum HashMapShrinkError {
+    /// There was an error encountered while resizing the memory area.
+    #[error("shmem resize failed: {0}")]
+    ResizeError(shmem::Error),
+    /// Occupied entries in to-be-shrunk space were encountered beginning at the given index.
+    #[error("occupied entry in deallocated space found at {0}")]
+    RemainingEntries(usize),
+}
+
+/// This represents a hash table that (possibly) lives in shared memory.
+/// If a new process is launched with fork(), the child process inherits
+/// this struct.
+#[must_use]
+pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+    shared_size: usize,
+    hasher: S,
+    num_buckets: u32,
+}
+
+/// This is a per-process handle to a hash table that (possibly) lives in shared memory.
+/// If a child process is launched with fork(), the child process should
+/// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
+///
+/// XXX: We're not making use of it at the moment, but this struct could
+/// hold process-local information in the future.
+pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
+    shmem_handle: Option<ShmemHandle>,
+    shared_ptr: *mut HashMapShared<'a, K, V>,
+    hasher: S,
+}
+
+unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
+unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
+
+impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
+    /// Change the 'hasher' used by the hash table.
+    ///
+    /// NOTE: This must be called right after creating the hash table,
+    /// before inserting any entries and before calling attach_writer/reader.
+    /// Otherwise different accessors could be using different hash function,
+    /// with confusing results.
+    pub fn with_hasher<T: BuildHasher>(self, hasher: T) -> HashMapInit<'a, K, V, T> {
+        HashMapInit {
+            hasher,
+            shmem_handle: self.shmem_handle,
+            shared_ptr: self.shared_ptr,
+            shared_size: self.shared_size,
+            num_buckets: self.num_buckets,
+        }
+    }
+
+    /// Loosely (over)estimate the size needed to store a hash table with `num_buckets` buckets.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        // add some margin to cover alignment etc.
+        CoreHashMap::<K, V>::estimate_size(num_buckets) + size_of::<HashMapShared<K, V>>() + 1000
+    }
+
+    fn new(
+        num_buckets: u32,
+        shmem_handle: Option<ShmemHandle>,
+        area_ptr: *mut u8,
+        area_size: usize,
+        hasher: S,
+    ) -> Self {
+        let mut ptr: *mut u8 = area_ptr;
+        let end_ptr: *mut u8 = unsafe { ptr.add(area_size) };
+
+        // carve out area for the One Big Lock (TM) and the HashMapShared.
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<libc::pthread_rwlock_t>())) };
+        let raw_lock_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<libc::pthread_rwlock_t>()) };
+        ptr = unsafe { ptr.add(ptr.align_offset(align_of::<HashMapShared<K, V>>())) };
+        let shared_ptr: *mut HashMapShared<K, V> = ptr.cast();
+        ptr = unsafe { ptr.add(size_of::<HashMapShared<K, V>>()) };
+
+        // carve out the buckets
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<core::Bucket<K, V>>())) };
+        let buckets_ptr = ptr;
+        ptr = unsafe { ptr.add(size_of::<core::Bucket<K, V>>() * num_buckets as usize) };
+
+        // use remaining space for the dictionary
+        ptr = unsafe { ptr.byte_add(ptr.align_offset(align_of::<u32>())) };
+        assert!(ptr.addr() < end_ptr.addr());
+        let dictionary_ptr = ptr;
+        let dictionary_size = unsafe { end_ptr.byte_offset_from(ptr) / size_of::<u32>() as isize };
+        assert!(dictionary_size > 0);
+
+        let buckets =
+            unsafe { std::slice::from_raw_parts_mut(buckets_ptr.cast(), num_buckets as usize) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary_ptr.cast(), dictionary_size as usize)
+        };
+
+        let hashmap = CoreHashMap::new(buckets, dictionary);
+        unsafe {
+            let lock = RwLock::from_raw(PthreadRwLock::new(raw_lock_ptr.cast()), hashmap);
+            std::ptr::write(shared_ptr, lock);
+        }
+
+        Self {
+            num_buckets,
+            shmem_handle,
+            shared_ptr,
+            shared_size: area_size,
+            hasher,
+        }
+    }
+
+    /// Attach to a hash table for writing.
+    pub fn attach_writer(self) -> HashMapAccess<'a, K, V, S> {
+        HashMapAccess {
+            shmem_handle: self.shmem_handle,
+            shared_ptr: self.shared_ptr,
+            hasher: self.hasher,
+        }
+    }
+
+    /// Initialize a table for reading. Currently identical to [`HashMapInit::attach_writer`].
+    ///
+    /// This is a holdover from a previous implementation and is being kept around for
+    /// backwards compatibility reasons.
+    pub fn attach_reader(self) -> HashMapAccess<'a, K, V, S> {
+        self.attach_writer()
+    }
+}
+
+/// Hash table data that is actually stored in the shared memory area.
+///
+/// NOTE: We carve out the parts from a contiguous chunk. Growing and shrinking the hash table
+/// relies on the memory layout! The data structures are laid out in the contiguous shared memory
+/// area as follows:
+///
+/// [`libc::pthread_rwlock_t`]
+/// [`HashMapShared`]
+/// buckets
+/// dictionary
+///
+/// In between the above parts, there can be padding bytes to align the parts correctly.
+type HashMapShared<'a, K, V> = RwLock<CoreHashMap<'a, K, V>>;
+
+impl<'a, K, V> HashMapInit<'a, K, V, rustc_hash::FxBuildHasher>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Place the hash table within a user-supplied fixed memory area.
+    pub fn with_fixed(num_buckets: u32, area: &'a mut [MaybeUninit<u8>]) -> Self {
+        Self::new(
+            num_buckets,
+            None,
+            area.as_mut_ptr().cast(),
+            area.len(),
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Place a new hash map in the given shared memory area
+    ///
+    /// # Panics
+    /// Will panic on failure to resize area to expected map size.
+    pub fn with_shmem(num_buckets: u32, shmem: ShmemHandle) -> Self {
+        let size = Self::estimate_size(num_buckets);
+        shmem
+            .set_size(size)
+            .expect("could not resize shared memory area");
+        let ptr = shmem.data_ptr.as_ptr().cast();
+        Self::new(
+            num_buckets,
+            Some(shmem),
+            ptr,
+            size,
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Make a resizable hash map within a new shared memory area with the given name.
+    pub fn new_resizeable_named(num_buckets: u32, max_buckets: u32, name: &str) -> Self {
+        let size = Self::estimate_size(num_buckets);
+        let max_size = Self::estimate_size(max_buckets);
+        let shmem =
+            ShmemHandle::new(name, size, max_size).expect("failed to make shared memory area");
+        let ptr = shmem.data_ptr.as_ptr().cast();
+
+        Self::new(
+            num_buckets,
+            Some(shmem),
+            ptr,
+            size,
+            rustc_hash::FxBuildHasher,
+        )
+    }
+
+    /// Make a resizable hash map within a new anonymous shared memory area.
+    pub fn new_resizeable(num_buckets: u32, max_buckets: u32) -> Self {
+        use std::sync::atomic::{AtomicUsize, Ordering};
+        static COUNTER: AtomicUsize = AtomicUsize::new(0);
+        let val = COUNTER.fetch_add(1, Ordering::Relaxed);
+        let name = format!("neon_shmem_hmap{val}");
+        Self::new_resizeable_named(num_buckets, max_buckets, &name)
+    }
+}
+
+impl<'a, K, V, S: BuildHasher> HashMapAccess<'a, K, V, S>
+where
+    K: Clone + Hash + Eq,
+{
+    /// Hash a key using the map's hasher.
+    #[inline]
+    fn get_hash_value(&self, key: &K) -> u64 {
+        self.hasher.hash_one(key)
+    }
+
+    fn entry_with_hash(&self, key: K, hash: u64) -> Entry<'a, '_, K, V> {
+        let mut map = unsafe { self.shared_ptr.as_ref() }.unwrap().write();
+        let dict_pos = hash as usize % map.dictionary.len();
+        let first = map.dictionary[dict_pos];
+        if first == INVALID_POS {
+            // no existing entry
+            return Entry::Vacant(VacantEntry {
+                map,
+                key,
+                dict_pos: dict_pos as u32,
+            });
+        }
+
+        let mut prev_pos = PrevPos::First(dict_pos as u32);
+        let mut next = first;
+        loop {
+            let bucket = &mut map.buckets[next as usize];
+            let (bucket_key, _bucket_value) = bucket.inner.as_mut().expect("entry is in use");
+            if *bucket_key == key {
+                // found existing entry
+                return Entry::Occupied(OccupiedEntry {
+                    map,
+                    _key: key,
+                    prev_pos,
+                    bucket_pos: next,
+                });
+            }
+
+            if bucket.next == INVALID_POS {
+                // No existing entry
+                return Entry::Vacant(VacantEntry {
+                    map,
+                    key,
+                    dict_pos: dict_pos as u32,
+                });
+            }
+            prev_pos = PrevPos::Chained(next);
+            next = bucket.next;
+        }
+    }
+
+    /// Get a reference to the corresponding value for a key.
+    pub fn get<'e>(&'e self, key: &K) -> Option<ValueReadGuard<'e, V>> {
+        let hash = self.get_hash_value(key);
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        RwLockReadGuard::try_map(map, |m| m.get_with_hash(key, hash)).ok()
+    }
+
+    /// Get a reference to the entry containing a key.
+    ///
+    /// NB: THis takes a write lock as there's no way to distinguish whether the intention
+    /// is to use the entry for reading or for writing in advance.
+    pub fn entry(&self, key: K) -> Entry<'a, '_, K, V> {
+        let hash = self.get_hash_value(&key);
+        self.entry_with_hash(key, hash)
+    }
+
+    /// Remove a key given its hash. Returns the associated value if it existed.
+    pub fn remove(&self, key: &K) -> Option<V> {
+        let hash = self.get_hash_value(key);
+        match self.entry_with_hash(key.clone(), hash) {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        }
+    }
+
+    /// Insert/update a key. Returns the previous associated value if it existed.
+    ///
+    /// # Errors
+    /// Will return [`core::FullError`] if there is no more space left in the map.
+    pub fn insert(&self, key: K, value: V) -> Result<Option<V>, core::FullError> {
+        let hash = self.get_hash_value(&key);
+        match self.entry_with_hash(key.clone(), hash) {
+            Entry::Occupied(mut e) => Ok(Some(e.insert(value))),
+            Entry::Vacant(e) => {
+                _ = e.insert(value)?;
+                Ok(None)
+            }
+        }
+    }
+
+    /// Optionally return the entry for a bucket at a given index if it exists.
+    ///
+    /// Has more overhead than one would intuitively expect: performs both a clone of the key
+    /// due to the [`OccupiedEntry`] type owning the key and also a hash of the key in order
+    /// to enable repairing the hash chain if the entry is removed.
+    pub fn entry_at_bucket(&self, pos: usize) -> Option<OccupiedEntry<'a, '_, K, V>> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        if pos >= map.buckets.len() {
+            return None;
+        }
+
+        let entry = map.buckets[pos].inner.as_ref();
+        match entry {
+            Some((key, _)) => Some(OccupiedEntry {
+                _key: key.clone(),
+                bucket_pos: pos as u32,
+                prev_pos: entry::PrevPos::Unknown(self.get_hash_value(key)),
+                map,
+            }),
+            _ => None,
+        }
+    }
+
+    /// Returns the number of buckets in the table.
+    pub fn get_num_buckets(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        map.get_num_buckets()
+    }
+
+    /// Return the key and value stored in bucket with given index. This can be used to
+    /// iterate through the hash map.
+    // TODO: An Iterator might be nicer. The communicator's clock algorithm needs to
+    // _slowly_ iterate through all buckets with its clock hand,  without holding a lock.
+    // If we switch to an Iterator, it must not hold the lock.
+    pub fn get_at_bucket(&self, pos: usize) -> Option<ValueReadGuard<(K, V)>> {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        if pos >= map.buckets.len() {
+            return None;
+        }
+        RwLockReadGuard::try_map(map, |m| m.buckets[pos].inner.as_ref()).ok()
+    }
+
+    /// Returns the index of the bucket a given value corresponds to.
+    pub fn get_bucket_for_value(&self, val_ptr: *const V) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+
+        let origin = map.buckets.as_ptr();
+        let idx = (val_ptr as usize - origin as usize) / size_of::<Bucket<K, V>>();
+        assert!(idx < map.buckets.len());
+
+        idx
+    }
+
+    /// Returns the number of occupied buckets in the table.
+    pub fn get_num_buckets_in_use(&self) -> usize {
+        let map = unsafe { self.shared_ptr.as_ref() }.unwrap().read();
+        map.buckets_in_use as usize
+    }
+
+    /// Clears all entries in a table. Does not reset any shrinking operations.
+    pub fn clear(&self) {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        map.clear();
+    }
+
+    /// Perform an in-place rehash of some region (0..`rehash_buckets`) of the table and reset
+    /// the `buckets` and `dictionary` slices to be as long as `num_buckets`. Resets the freelist
+    /// in the process.
+    fn rehash_dict(
+        &self,
+        inner: &mut CoreHashMap<'a, K, V>,
+        buckets_ptr: *mut core::Bucket<K, V>,
+        end_ptr: *mut u8,
+        num_buckets: u32,
+        rehash_buckets: u32,
+    ) {
+        inner.free_head = INVALID_POS;
+
+        let buckets;
+        let dictionary;
+        unsafe {
+            let buckets_end_ptr = buckets_ptr.add(num_buckets as usize);
+            let dictionary_ptr: *mut u32 = buckets_end_ptr
+                .byte_add(buckets_end_ptr.align_offset(align_of::<u32>()))
+                .cast();
+            let dictionary_size: usize =
+                end_ptr.byte_offset_from(buckets_end_ptr) as usize / size_of::<u32>();
+
+            buckets = std::slice::from_raw_parts_mut(buckets_ptr, num_buckets as usize);
+            dictionary = std::slice::from_raw_parts_mut(dictionary_ptr, dictionary_size);
+        }
+        for e in dictionary.iter_mut() {
+            *e = INVALID_POS;
+        }
+
+        for (i, bucket) in buckets.iter_mut().enumerate().take(rehash_buckets as usize) {
+            if bucket.inner.is_none() {
+                bucket.next = inner.free_head;
+                inner.free_head = i as u32;
+                continue;
+            }
+
+            let hash = self.hasher.hash_one(&bucket.inner.as_ref().unwrap().0);
+            let pos: usize = (hash % dictionary.len() as u64) as usize;
+            bucket.next = dictionary[pos];
+            dictionary[pos] = i as u32;
+        }
+
+        inner.dictionary = dictionary;
+        inner.buckets = buckets;
+    }
+
+    /// Rehash the map without growing or shrinking.
+    pub fn shuffle(&self) {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        let num_buckets = map.get_num_buckets() as u32;
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        let end_ptr: *mut u8 = unsafe { self.shared_ptr.byte_add(size_bytes).cast() };
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+    }
+
+    /// Grow the number of buckets within the table.
+    ///
+    /// 1. Grows the underlying shared memory area
+    /// 2. Initializes new buckets and overwrites the current dictionary
+    /// 3. Rehashes the dictionary
+    ///
+    /// # Panics
+    /// Panics if called on a map initialized with [`HashMapInit::with_fixed`].
+    ///
+    /// # Errors
+    /// Returns an [`shmem::Error`] if any errors occur resizing the memory region.
+    pub fn grow(&self, num_buckets: u32) -> Result<(), shmem::Error> {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        let old_num_buckets = map.buckets.len() as u32;
+
+        assert!(
+            num_buckets >= old_num_buckets,
+            "grow called with a smaller number of buckets"
+        );
+        if num_buckets == old_num_buckets {
+            return Ok(());
+        }
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("grow called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        shmem_handle.set_size(size_bytes)?;
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+
+        // Initialize new buckets. The new buckets are linked to the free list.
+        // NB: This overwrites the dictionary!
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        unsafe {
+            for i in old_num_buckets..num_buckets {
+                let bucket = buckets_ptr.add(i as usize);
+                bucket.write(core::Bucket {
+                    next: if i < num_buckets - 1 {
+                        i + 1
+                    } else {
+                        map.free_head
+                    },
+                    inner: None,
+                });
+            }
+        }
+
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, old_num_buckets);
+        map.free_head = old_num_buckets;
+
+        Ok(())
+    }
+
+    /// Begin a shrink, limiting all new allocations to be in buckets with index below `num_buckets`.
+    ///
+    /// # Panics
+    /// Panics if called on a map initialized with [`HashMapInit::with_fixed`] or if `num_buckets` is
+    /// greater than the number of buckets in the map.
+    pub fn begin_shrink(&mut self, num_buckets: u32) {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        assert!(
+            num_buckets <= map.get_num_buckets() as u32,
+            "shrink called with a larger number of buckets"
+        );
+        _ = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+        map.alloc_limit = num_buckets;
+    }
+
+    /// If a shrink operation is underway, returns the target size of the map. Otherwise, returns None.
+    pub fn shrink_goal(&self) -> Option<usize> {
+        let map = unsafe { self.shared_ptr.as_mut() }.unwrap().read();
+        let goal = map.alloc_limit;
+        if goal == INVALID_POS {
+            None
+        } else {
+            Some(goal as usize)
+        }
+    }
+
+    /// Complete a shrink after caller has evicted entries, removing the unused buckets and rehashing.
+    ///
+    /// # Panics
+    /// The following cases result in a panic:
+    /// - Calling this function on a map initialized with [`HashMapInit::with_fixed`].
+    /// - Calling this function on a map when no shrink operation is in progress.
+    pub fn finish_shrink(&self) -> Result<(), HashMapShrinkError> {
+        let mut map = unsafe { self.shared_ptr.as_mut() }.unwrap().write();
+        assert!(
+            map.alloc_limit != INVALID_POS,
+            "called finish_shrink when no shrink is in progress"
+        );
+
+        let num_buckets = map.alloc_limit;
+
+        if map.get_num_buckets() == num_buckets as usize {
+            return Ok(());
+        }
+
+        assert!(
+            map.buckets_in_use <= num_buckets,
+            "called finish_shrink before enough entries were removed"
+        );
+
+        for i in (num_buckets as usize)..map.buckets.len() {
+            if map.buckets[i].inner.is_some() {
+                return Err(HashMapShrinkError::RemainingEntries(i));
+            }
+        }
+
+        let shmem_handle = self
+            .shmem_handle
+            .as_ref()
+            .expect("shrink called on a fixed-size hash table");
+
+        let size_bytes = HashMapInit::<K, V, S>::estimate_size(num_buckets);
+        if let Err(e) = shmem_handle.set_size(size_bytes) {
+            return Err(HashMapShrinkError::ResizeError(e));
+        }
+        let end_ptr: *mut u8 = unsafe { shmem_handle.data_ptr.as_ptr().add(size_bytes) };
+        let buckets_ptr = map.buckets.as_mut_ptr();
+        self.rehash_dict(&mut map, buckets_ptr, end_ptr, num_buckets, num_buckets);
+        map.alloc_limit = INVALID_POS;
+
+        Ok(())
+    }
+}
diff --git a/libs/neon-shmem/src/hash/core.rs b/libs/neon-shmem/src/hash/core.rs
new file mode 100644
index 0000000000..4665c36adb
--- /dev/null
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -0,0 +1,174 @@
+//! Simple hash table with chaining.
+
+use std::hash::Hash;
+use std::mem::MaybeUninit;
+
+use crate::hash::entry::*;
+
+/// Invalid position within the map (either within the dictionary or bucket array).
+pub(crate) const INVALID_POS: u32 = u32::MAX;
+
+/// Fundamental storage unit within the hash table. Either empty or contains a key-value pair.
+/// Always part of a chain of some kind (either a freelist if empty or a hash chain if full).
+pub(crate) struct Bucket<K, V> {
+    /// Index of next bucket in the chain.
+    pub(crate) next: u32,
+    /// Key-value pair contained within bucket.
+    pub(crate) inner: Option<(K, V)>,
+}
+
+/// Core hash table implementation.
+pub(crate) struct CoreHashMap<'a, K, V> {
+    /// Dictionary used to map hashes to bucket indices.
+    pub(crate) dictionary: &'a mut [u32],
+    /// Buckets containing key-value pairs.
+    pub(crate) buckets: &'a mut [Bucket<K, V>],
+    /// Head of the freelist.
+    pub(crate) free_head: u32,
+    /// Maximum index of a bucket allowed to be allocated. [`INVALID_POS`] if no limit.
+    pub(crate) alloc_limit: u32,
+    /// The number of currently occupied buckets.
+    pub(crate) buckets_in_use: u32,
+}
+
+/// Error for when there are no empty buckets left but one is needed.
+#[derive(Debug, PartialEq)]
+pub struct FullError;
+
+impl<'a, K: Clone + Hash + Eq, V> CoreHashMap<'a, K, V> {
+    const FILL_FACTOR: f32 = 0.60;
+
+    /// Estimate the size of data contained within the the hash map.
+    pub fn estimate_size(num_buckets: u32) -> usize {
+        let mut size = 0;
+
+        // buckets
+        size += size_of::<Bucket<K, V>>() * num_buckets as usize;
+
+        // dictionary
+        size += (f32::ceil((size_of::<u32>() * num_buckets as usize) as f32 / Self::FILL_FACTOR))
+            as usize;
+
+        size
+    }
+
+    pub fn new(
+        buckets: &'a mut [MaybeUninit<Bucket<K, V>>],
+        dictionary: &'a mut [MaybeUninit<u32>],
+    ) -> Self {
+        // Initialize the buckets
+        for i in 0..buckets.len() {
+            buckets[i].write(Bucket {
+                next: if i < buckets.len() - 1 {
+                    i as u32 + 1
+                } else {
+                    INVALID_POS
+                },
+                inner: None,
+            });
+        }
+
+        // Initialize the dictionary
+        for e in dictionary.iter_mut() {
+            e.write(INVALID_POS);
+        }
+
+        // TODO: use std::slice::assume_init_mut() once it stabilizes
+        let buckets =
+            unsafe { std::slice::from_raw_parts_mut(buckets.as_mut_ptr().cast(), buckets.len()) };
+        let dictionary = unsafe {
+            std::slice::from_raw_parts_mut(dictionary.as_mut_ptr().cast(), dictionary.len())
+        };
+
+        Self {
+            dictionary,
+            buckets,
+            free_head: 0,
+            buckets_in_use: 0,
+            alloc_limit: INVALID_POS,
+        }
+    }
+
+    /// Get the value associated with a key (if it exists) given its hash.
+    pub fn get_with_hash(&self, key: &K, hash: u64) -> Option<&V> {
+        let mut next = self.dictionary[hash as usize % self.dictionary.len()];
+        loop {
+            if next == INVALID_POS {
+                return None;
+            }
+
+            let bucket = &self.buckets[next as usize];
+            let (bucket_key, bucket_value) = bucket.inner.as_ref().expect("entry is in use");
+            if bucket_key == key {
+                return Some(bucket_value);
+            }
+            next = bucket.next;
+        }
+    }
+
+    /// Get number of buckets in map.
+    pub fn get_num_buckets(&self) -> usize {
+        self.buckets.len()
+    }
+
+    /// Clears all entries from the hashmap.
+    ///
+    /// Does not reset any allocation limits, but does clear any entries beyond them.
+    pub fn clear(&mut self) {
+        for i in 0..self.buckets.len() {
+            self.buckets[i] = Bucket {
+                next: if i < self.buckets.len() - 1 {
+                    i as u32 + 1
+                } else {
+                    INVALID_POS
+                },
+                inner: None,
+            }
+        }
+        for i in 0..self.dictionary.len() {
+            self.dictionary[i] = INVALID_POS;
+        }
+
+        self.free_head = 0;
+        self.buckets_in_use = 0;
+    }
+
+    /// Find the position of an unused bucket via the freelist and initialize it.
+    pub(crate) fn alloc_bucket(&mut self, key: K, value: V) -> Result<u32, FullError> {
+        let mut pos = self.free_head;
+
+        // Find the first bucket we're *allowed* to use.
+        let mut prev = PrevPos::First(self.free_head);
+        while pos != INVALID_POS && pos >= self.alloc_limit {
+            let bucket = &mut self.buckets[pos as usize];
+            prev = PrevPos::Chained(pos);
+            pos = bucket.next;
+        }
+        if pos == INVALID_POS {
+            return Err(FullError);
+        }
+
+        // Repair the freelist.
+        match prev {
+            PrevPos::First(_) => {
+                let next_pos = self.buckets[pos as usize].next;
+                self.free_head = next_pos;
+            }
+            PrevPos::Chained(p) => {
+                if p != INVALID_POS {
+                    let next_pos = self.buckets[pos as usize].next;
+                    self.buckets[p as usize].next = next_pos;
+                }
+            }
+            _ => unreachable!(),
+        }
+
+        // Initialize the bucket.
+        let bucket = &mut self.buckets[pos as usize];
+        self.buckets_in_use += 1;
+        bucket.next = INVALID_POS;
+        bucket.inner = Some((key, value));
+
+        Ok(pos)
+    }
+}
diff --git a/libs/neon-shmem/src/hash/entry.rs b/libs/neon-shmem/src/hash/entry.rs
new file mode 100644
index 0000000000..560a20db1d
--- /dev/null
+++ b/libs/neon-shmem/src/hash/entry.rs
@@ -0,0 +1,130 @@
+//! Equivalent of [`std::collections::hash_map::Entry`] for this hashmap.
+
+use crate::hash::core::{CoreHashMap, FullError, INVALID_POS};
+use crate::sync::{RwLockWriteGuard, ValueWriteGuard};
+
+use std::hash::Hash;
+use std::mem;
+
+pub enum Entry<'a, 'b, K, V> {
+    Occupied(OccupiedEntry<'a, 'b, K, V>),
+    Vacant(VacantEntry<'a, 'b, K, V>),
+}
+
+/// Enum representing the previous position within a chain.
+#[derive(Clone, Copy)]
+pub(crate) enum PrevPos {
+    /// Starting index within the dictionary.  
+    First(u32),
+    /// Regular index within the buckets.
+    Chained(u32),
+    /// Unknown - e.g. the associated entry was retrieved by index instead of chain.
+    Unknown(u64),
+}
+
+pub struct OccupiedEntry<'a, 'b, K, V> {
+    /// Mutable reference to the map containing this entry.
+    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+    /// The key of the occupied entry
+    pub(crate) _key: K,
+    /// The index of the previous entry in the chain.
+    pub(crate) prev_pos: PrevPos,
+    /// The position of the bucket in the [`CoreHashMap`] bucket array.
+    pub(crate) bucket_pos: u32,
+}
+
+impl<K, V> OccupiedEntry<'_, '_, K, V> {
+    pub fn get(&self) -> &V {
+        &self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_ref()
+            .unwrap()
+            .1
+    }
+
+    pub fn get_mut(&mut self) -> &mut V {
+        &mut self.map.buckets[self.bucket_pos as usize]
+            .inner
+            .as_mut()
+            .unwrap()
+            .1
+    }
+
+    /// Inserts a value into the entry, replacing (and returning) the existing value.
+    pub fn insert(&mut self, value: V) -> V {
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        // This assumes inner is Some, which it must be for an OccupiedEntry
+        mem::replace(&mut bucket.inner.as_mut().unwrap().1, value)
+    }
+
+    /// Removes the entry from the hash map, returning the value originally stored within it.
+    ///
+    /// This may result in multiple bucket accesses if the entry was obtained by index as the
+    /// previous chain entry needs to be discovered in this case.
+    pub fn remove(mut self) -> V {
+        // If this bucket was queried by index, go ahead and follow its chain from the start.
+        let prev = if let PrevPos::Unknown(hash) = self.prev_pos {
+            let dict_idx = hash as usize % self.map.dictionary.len();
+            let mut prev = PrevPos::First(dict_idx as u32);
+            let mut curr = self.map.dictionary[dict_idx];
+            while curr != self.bucket_pos {
+                assert!(curr != INVALID_POS);
+                prev = PrevPos::Chained(curr);
+                curr = self.map.buckets[curr as usize].next;
+            }
+            prev
+        } else {
+            self.prev_pos
+        };
+
+        // CoreHashMap::remove returns Option<(K, V)>. We know it's Some for an OccupiedEntry.
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+
+        // unlink it from the chain
+        match prev {
+            PrevPos::First(dict_pos) => {
+                self.map.dictionary[dict_pos as usize] = bucket.next;
+            }
+            PrevPos::Chained(bucket_pos) => {
+                self.map.buckets[bucket_pos as usize].next = bucket.next;
+            }
+            _ => unreachable!(),
+        }
+
+        // and add it to the freelist
+        let free = self.map.free_head;
+        let bucket = &mut self.map.buckets[self.bucket_pos as usize];
+        let old_value = bucket.inner.take();
+        bucket.next = free;
+        self.map.free_head = self.bucket_pos;
+        self.map.buckets_in_use -= 1;
+
+        old_value.unwrap().1
+    }
+}
+
+/// An abstract view into a vacant entry within the map.
+pub struct VacantEntry<'a, 'b, K, V> {
+    /// Mutable reference to the map containing this entry.
+    pub(crate) map: RwLockWriteGuard<'b, CoreHashMap<'a, K, V>>,
+    /// The key to be inserted into this entry.
+    pub(crate) key: K,
+    /// The position within the dictionary corresponding to the key's hash.
+    pub(crate) dict_pos: u32,
+}
+
+impl<'b, K: Clone + Hash + Eq, V> VacantEntry<'_, 'b, K, V> {
+    /// Insert a value into the vacant entry, finding and populating an empty bucket in the process.
+    ///
+    /// # Errors
+    /// Will return [`FullError`] if there are no unoccupied buckets in the map.
+    pub fn insert(mut self, value: V) -> Result<ValueWriteGuard<'b, V>, FullError> {
+        let pos = self.map.alloc_bucket(self.key, value)?;
+        self.map.buckets[pos as usize].next = self.map.dictionary[self.dict_pos as usize];
+        self.map.dictionary[self.dict_pos as usize] = pos;
+
+        Ok(RwLockWriteGuard::map(self.map, |m| {
+            &mut m.buckets[pos as usize].inner.as_mut().unwrap().1
+        }))
+    }
+}
diff --git a/libs/neon-shmem/src/hash/tests.rs b/libs/neon-shmem/src/hash/tests.rs
new file mode 100644
index 0000000000..92233e8140
--- /dev/null
+++ b/libs/neon-shmem/src/hash/tests.rs
@@ -0,0 +1,428 @@
+use std::collections::BTreeMap;
+use std::collections::HashSet;
+use std::fmt::Debug;
+use std::mem::MaybeUninit;
+
+use crate::hash::Entry;
+use crate::hash::HashMapAccess;
+use crate::hash::HashMapInit;
+use crate::hash::core::FullError;
+
+use rand::seq::SliceRandom;
+use rand::{Rng, RngCore};
+use rand_distr::Zipf;
+
+const TEST_KEY_LEN: usize = 16;
+
+#[derive(Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord)]
+struct TestKey([u8; TEST_KEY_LEN]);
+
+impl From<&TestKey> for u128 {
+    fn from(val: &TestKey) -> u128 {
+        u128::from_be_bytes(val.0)
+    }
+}
+
+impl From<u128> for TestKey {
+    fn from(val: u128) -> TestKey {
+        TestKey(val.to_be_bytes())
+    }
+}
+
+impl<'a> From<&'a [u8]> for TestKey {
+    fn from(bytes: &'a [u8]) -> TestKey {
+        TestKey(bytes.try_into().unwrap())
+    }
+}
+
+fn test_inserts<K: Into<TestKey> + Copy>(keys: &[K]) {
+    let w = HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_inserts")
+        .attach_writer();
+
+    for (idx, k) in keys.iter().enumerate() {
+        let res = w.entry((*k).into());
+        match res {
+            Entry::Occupied(mut e) => {
+                e.insert(idx);
+            }
+            Entry::Vacant(e) => {
+                let res = e.insert(idx);
+                assert!(res.is_ok());
+            }
+        };
+    }
+
+    for (idx, k) in keys.iter().enumerate() {
+        let x = w.get(&(*k).into());
+        let value = x.as_deref().copied();
+        assert_eq!(value, Some(idx));
+    }
+}
+
+#[test]
+fn dense() {
+    // This exercises splitting a node with prefix
+    let keys: &[u128] = &[0, 1, 2, 3, 256];
+    test_inserts(keys);
+
+    // Dense keys
+    let mut keys: Vec<u128> = (0..10000).collect();
+    test_inserts(&keys);
+
+    // Do the same in random orders
+    for _ in 1..10 {
+        keys.shuffle(&mut rand::rng());
+        test_inserts(&keys);
+    }
+}
+
+#[test]
+fn sparse() {
+    // sparse keys
+    let mut keys: Vec<TestKey> = Vec::new();
+    let mut used_keys = HashSet::new();
+    for _ in 0..10000 {
+        loop {
+            let key = rand::random::<u128>();
+            if used_keys.contains(&key) {
+                continue;
+            }
+            used_keys.insert(key);
+            keys.push(key.into());
+            break;
+        }
+    }
+    test_inserts(&keys);
+}
+
+#[derive(Clone, Debug)]
+struct TestOp(TestKey, Option<usize>);
+
+fn apply_op(
+    op: &TestOp,
+    map: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    // apply the change to the shadow tree first
+    let shadow_existing = if let Some(v) = op.1 {
+        shadow.insert(op.0, v)
+    } else {
+        shadow.remove(&op.0)
+    };
+
+    let entry = map.entry(op.0);
+    let hash_existing = match op.1 {
+        Some(new) => match entry {
+            Entry::Occupied(mut e) => Some(e.insert(new)),
+            Entry::Vacant(e) => {
+                _ = e.insert(new).unwrap();
+                None
+            }
+        },
+        None => match entry {
+            Entry::Occupied(e) => Some(e.remove()),
+            Entry::Vacant(_) => None,
+        },
+    };
+
+    assert_eq!(shadow_existing, hash_existing);
+}
+
+fn do_random_ops(
+    num_ops: usize,
+    size: u32,
+    del_prob: f64,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    rng: &mut rand::rngs::ThreadRng,
+) {
+    for i in 0..num_ops {
+        let key: TestKey = ((rng.next_u32() % size) as u128).into();
+        let op = TestOp(
+            key,
+            if rng.random_bool(del_prob) {
+                Some(i)
+            } else {
+                None
+            },
+        );
+        apply_op(&op, writer, shadow);
+    }
+}
+
+fn do_deletes(
+    num_ops: usize,
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+) {
+    for _ in 0..num_ops {
+        let (k, _) = shadow.pop_first().unwrap();
+        writer.remove(&k);
+    }
+}
+
+fn do_shrink(
+    writer: &mut HashMapAccess<TestKey, usize>,
+    shadow: &mut BTreeMap<TestKey, usize>,
+    from: u32,
+    to: u32,
+) {
+    assert!(writer.shrink_goal().is_none());
+    writer.begin_shrink(to);
+    assert_eq!(writer.shrink_goal(), Some(to as usize));
+    for i in to..from {
+        if let Some(entry) = writer.entry_at_bucket(i as usize) {
+            shadow.remove(&entry._key);
+            entry.remove();
+        }
+    }
+    let old_usage = writer.get_num_buckets_in_use();
+    writer.finish_shrink().unwrap();
+    assert!(writer.shrink_goal().is_none());
+    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+}
+
+#[test]
+fn random_ops() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(100000, 120000, "test_random")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+
+    let distribution = Zipf::new(u128::MAX as f64, 1.1).unwrap();
+    let mut rng = rand::rng();
+    for i in 0..100000 {
+        let key: TestKey = (rng.sample(distribution) as u128).into();
+
+        let op = TestOp(key, if rng.random_bool(0.75) { Some(i) } else { None });
+
+        apply_op(&op, &mut writer, &mut shadow);
+    }
+}
+
+#[test]
+fn test_shuffle() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_shuf")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.shuffle();
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+#[test]
+fn test_grow() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 2000, "test_grow")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1000, 0.75, &mut writer, &mut shadow, &mut rng);
+    let old_usage = writer.get_num_buckets_in_use();
+    writer.grow(1500).unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), old_usage);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+}
+
+#[test]
+fn test_clear() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    writer.clear();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+    assert_eq!(writer.get_num_buckets(), 1500);
+    while let Some((key, _)) = shadow.pop_first() {
+        assert!(writer.get(&key).is_none());
+    }
+    do_random_ops(2000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    for i in 0..(1500 - writer.get_num_buckets_in_use()) {
+        writer.insert((1500 + i as u128).into(), 0).unwrap();
+    }
+    assert_eq!(writer.insert(5000.into(), 0), Err(FullError {}));
+    writer.clear();
+    assert!(writer.insert(5000.into(), 0).is_ok());
+}
+
+#[test]
+fn test_idx_remove() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(e) = writer.entry_at_bucket(idx) {
+            shadow.remove(&e._key);
+            e.remove();
+        }
+    }
+    while let Some((key, val)) = shadow.pop_first() {
+        assert_eq!(*writer.get(&key).unwrap(), val);
+    }
+}
+
+#[test]
+fn test_idx_get() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_clear")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+    do_random_ops(2000, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    for _ in 0..100 {
+        let idx = (rng.next_u32() % 1500) as usize;
+        if let Some(pair) = writer.get_at_bucket(idx) {
+            {
+                let v: *const usize = &pair.1;
+                assert_eq!(writer.get_bucket_for_value(v), idx);
+            }
+            {
+                let v: *const usize = &pair.1;
+                assert_eq!(writer.get_bucket_for_value(v), idx);
+            }
+        }
+    }
+}
+
+#[test]
+fn test_shrink() {
+    let mut writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink")
+        .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(10000, 1500, 0.75, &mut writer, &mut shadow, &mut rng);
+    do_shrink(&mut writer, &mut shadow, 1500, 1000);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    do_deletes(500, &mut writer, &mut shadow);
+    do_random_ops(10000, 500, 0.75, &mut writer, &mut shadow, &mut rng);
+    assert!(writer.get_num_buckets_in_use() <= 1000);
+}
+
+#[test]
+fn test_shrink_grow_seq() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 20000, "test_grow_seq")
+            .attach_writer();
+    let mut shadow: std::collections::BTreeMap<TestKey, usize> = BTreeMap::new();
+    let mut rng = rand::rng();
+
+    do_random_ops(500, 1000, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 750");
+    do_shrink(&mut writer, &mut shadow, 1000, 750);
+    do_random_ops(200, 1000, 0.5, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 1500");
+    writer.grow(1500).unwrap();
+    do_random_ops(600, 1500, 0.1, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Shrinking to 200");
+    while shadow.len() > 100 {
+        do_deletes(1, &mut writer, &mut shadow);
+    }
+    do_shrink(&mut writer, &mut shadow, 1500, 200);
+    do_random_ops(50, 1500, 0.25, &mut writer, &mut shadow, &mut rng);
+    eprintln!("Growing to 10k");
+    writer.grow(10000).unwrap();
+    do_random_ops(10000, 5000, 0.25, &mut writer, &mut shadow, &mut rng);
+}
+
+#[test]
+fn test_bucket_ops() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1000, 1200, "test_bucket_ops")
+        .attach_writer();
+    match writer.entry(1.into()) {
+        Entry::Occupied(mut e) => {
+            e.insert(2);
+        }
+        Entry::Vacant(e) => {
+            _ = e.insert(2).unwrap();
+        }
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+    assert_eq!(writer.get_num_buckets(), 1000);
+    assert_eq!(*writer.get(&1.into()).unwrap(), 2);
+    let pos = match writer.entry(1.into()) {
+        Entry::Occupied(e) => {
+            assert_eq!(e._key, 1.into());
+            e.bucket_pos as usize
+        }
+        Entry::Vacant(_) => {
+            panic!("Insert didn't affect entry");
+        }
+    };
+    assert_eq!(writer.entry_at_bucket(pos).unwrap()._key, 1.into());
+    assert_eq!(*writer.get_at_bucket(pos).unwrap(), (1.into(), 2));
+    {
+        let ptr: *const usize = &*writer.get(&1.into()).unwrap();
+        assert_eq!(writer.get_bucket_for_value(ptr), pos);
+    }
+    writer.remove(&1.into());
+    assert!(writer.get(&1.into()).is_none());
+}
+
+#[test]
+fn test_shrink_zero() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_shrink_zero")
+            .attach_writer();
+    writer.begin_shrink(0);
+    for i in 0..1500 {
+        writer.entry_at_bucket(i).map(|x| x.remove());
+    }
+    writer.finish_shrink().unwrap();
+    assert_eq!(writer.get_num_buckets_in_use(), 0);
+    let entry = writer.entry(1.into());
+    if let Entry::Vacant(v) = entry {
+        assert!(v.insert(2).is_err());
+    } else {
+        panic!("Somehow got non-vacant entry in empty map.")
+    }
+    writer.grow(50).unwrap();
+    let entry = writer.entry(1.into());
+    if let Entry::Vacant(v) = entry {
+        assert!(v.insert(2).is_ok());
+    } else {
+        panic!("Somehow got non-vacant entry in empty map.")
+    }
+    assert_eq!(writer.get_num_buckets_in_use(), 1);
+}
+
+#[test]
+#[should_panic]
+fn test_grow_oom() {
+    let writer = HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2000, "test_grow_oom")
+        .attach_writer();
+    writer.grow(20000).unwrap();
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_bigger() {
+    let mut writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_bigger")
+            .attach_writer();
+    writer.begin_shrink(2000);
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_early_finish() {
+    let writer =
+        HashMapInit::<TestKey, usize>::new_resizeable_named(1500, 2500, "test_shrink_early_finish")
+            .attach_writer();
+    writer.finish_shrink().unwrap();
+}
+
+#[test]
+#[should_panic]
+fn test_shrink_fixed_size() {
+    let mut area = [MaybeUninit::uninit(); 10000];
+    let init_struct = HashMapInit::<TestKey, usize>::with_fixed(3, &mut area);
+    let mut writer = init_struct.attach_writer();
+    writer.begin_shrink(1);
+}
diff --git a/libs/neon-shmem/src/lib.rs b/libs/neon-shmem/src/lib.rs
index 50d3fbb3cf..226cc0c22d 100644
--- a/libs/neon-shmem/src/lib.rs
+++ b/libs/neon-shmem/src/lib.rs
@@ -1 +1,3 @@
+pub mod hash;
 pub mod shmem;
+pub mod sync;
diff --git a/libs/neon-shmem/src/sync.rs b/libs/neon-shmem/src/sync.rs
new file mode 100644
index 0000000000..95719778ba
--- /dev/null
+++ b/libs/neon-shmem/src/sync.rs
@@ -0,0 +1,111 @@
+//! Simple utilities akin to what's in [`std::sync`] but designed to work with shared memory.
+
+use std::mem::MaybeUninit;
+use std::ptr::NonNull;
+
+use nix::errno::Errno;
+
+pub type RwLock<T> = lock_api::RwLock<PthreadRwLock, T>;
+pub type RwLockReadGuard<'a, T> = lock_api::RwLockReadGuard<'a, PthreadRwLock, T>;
+pub type RwLockWriteGuard<'a, T> = lock_api::RwLockWriteGuard<'a, PthreadRwLock, T>;
+pub type ValueReadGuard<'a, T> = lock_api::MappedRwLockReadGuard<'a, PthreadRwLock, T>;
+pub type ValueWriteGuard<'a, T> = lock_api::MappedRwLockWriteGuard<'a, PthreadRwLock, T>;
+
+/// Shared memory read-write lock.
+pub struct PthreadRwLock(Option<NonNull<libc::pthread_rwlock_t>>);
+
+/// Simple macro that calls a function in the libc namespace and panics if return value is nonzero.
+macro_rules! libc_checked {
+    ($fn_name:ident ( $($arg:expr),* )) => {{
+        let res = libc::$fn_name($($arg),*);
+        if res != 0 {
+            panic!("{} failed with {}", stringify!($fn_name), Errno::from_raw(res));
+        }
+    }};
+}
+
+impl PthreadRwLock {
+    /// Creates a new `PthreadRwLock` on top of a pointer to a pthread rwlock.
+    ///
+    /// # Safety
+    /// `lock` must be non-null. Every unsafe operation will panic in the event of an error.
+    pub unsafe fn new(lock: *mut libc::pthread_rwlock_t) -> Self {
+        unsafe {
+            let mut attrs = MaybeUninit::uninit();
+            libc_checked!(pthread_rwlockattr_init(attrs.as_mut_ptr()));
+            libc_checked!(pthread_rwlockattr_setpshared(
+                attrs.as_mut_ptr(),
+                libc::PTHREAD_PROCESS_SHARED
+            ));
+            libc_checked!(pthread_rwlock_init(lock, attrs.as_mut_ptr()));
+            // Safety: POSIX specifies that "any function affecting the attributes
+            // object (including destruction) shall not affect any previously
+            // initialized read-write locks".
+            libc_checked!(pthread_rwlockattr_destroy(attrs.as_mut_ptr()));
+            Self(Some(NonNull::new_unchecked(lock)))
+        }
+    }
+
+    fn inner(&self) -> NonNull<libc::pthread_rwlock_t> {
+        match self.0 {
+            None => {
+                panic!("PthreadRwLock constructed badly - something likely used RawRwLock::INIT")
+            }
+            Some(x) => x,
+        }
+    }
+}
+
+unsafe impl lock_api::RawRwLock for PthreadRwLock {
+    type GuardMarker = lock_api::GuardSend;
+    const INIT: Self = Self(None);
+
+    fn try_lock_shared(&self) -> bool {
+        unsafe {
+            let res = libc::pthread_rwlock_tryrdlock(self.inner().as_ptr());
+            match res {
+                0 => true,
+                libc::EAGAIN => false,
+                _ => panic!(
+                    "pthread_rwlock_tryrdlock failed with {}",
+                    Errno::from_raw(res)
+                ),
+            }
+        }
+    }
+
+    fn try_lock_exclusive(&self) -> bool {
+        unsafe {
+            let res = libc::pthread_rwlock_trywrlock(self.inner().as_ptr());
+            match res {
+                0 => true,
+                libc::EAGAIN => false,
+                _ => panic!("try_wrlock failed with {}", Errno::from_raw(res)),
+            }
+        }
+    }
+
+    fn lock_shared(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_rdlock(self.inner().as_ptr()));
+        }
+    }
+
+    fn lock_exclusive(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_wrlock(self.inner().as_ptr()));
+        }
+    }
+
+    unsafe fn unlock_exclusive(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
+        }
+    }
+
+    unsafe fn unlock_shared(&self) {
+        unsafe {
+            libc_checked!(pthread_rwlock_unlock(self.inner().as_ptr()));
+        }
+    }
+}

From 8b0f2efa573834a11f9bd01a673fac87970023fb Mon Sep 17 00:00:00 2001
From: Conrad Ludgate <conrad@neon.tech>
Date: Thu, 17 Jul 2025 18:58:47 +0100
Subject: [PATCH 149/163] experiment with an InfoMetrics metric family (#12612)

Putting this in the neon codebase for now, to experiment. Can be lifted
into measured at a later date.

This metric family is like a MetricVec, but it only supports 1 label
being set at a time. It is useful for reporting info, rather than
reporting metrics.
https://www.robustperception.io/exposing-the-software-version-to-prometheus/
---
 libs/metrics/src/lib.rs | 68 ++++++++++++++++++++++++++++-------------
 1 file changed, 46 insertions(+), 22 deletions(-)

diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs
index 5d028ee041..41873cdcd6 100644
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -4,12 +4,14 @@
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]
 
+use std::sync::RwLock;
+
 use measured::label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels};
 use measured::metric::counter::CounterState;
 use measured::metric::gauge::GaugeState;
 use measured::metric::group::Encoding;
 use measured::metric::name::{MetricName, MetricNameEncoder};
-use measured::metric::{MetricEncoding, MetricFamilyEncoding};
+use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType};
 use measured::{FixedCardinalityLabel, LabelGroup, MetricGroup};
 use once_cell::sync::Lazy;
 use prometheus::Registry;
@@ -116,12 +118,52 @@ pub fn pow2_buckets(start: usize, end: usize) -> Vec<f64> {
         .collect()
 }
 
+pub struct InfoMetric<L: LabelGroup, M: MetricType = GaugeState> {
+    label: RwLock<L>,
+    metric: M,
+}
+
+impl<L: LabelGroup> InfoMetric<L> {
+    pub fn new(label: L) -> Self {
+        Self::with_metric(label, GaugeState::new(1))
+    }
+}
+
+impl<L: LabelGroup, M: MetricType<Metadata = ()>> InfoMetric<L, M> {
+    pub fn with_metric(label: L, metric: M) -> Self {
+        Self {
+            label: RwLock::new(label),
+            metric,
+        }
+    }
+
+    pub fn set_label(&self, label: L) {
+        *self.label.write().unwrap() = label;
+    }
+}
+
+impl<L, M, E> MetricFamilyEncoding<E> for InfoMetric<L, M>
+where
+    L: LabelGroup,
+    M: MetricEncoding<E, Metadata = ()>,
+    E: Encoding,
+{
+    fn collect_family_into(
+        &self,
+        name: impl measured::metric::name::MetricNameEncoder,
+        enc: &mut E,
+    ) -> Result<(), E::Err> {
+        M::write_type(&name, enc)?;
+        self.metric
+            .collect_into(&(), &*self.label.read().unwrap(), name, enc)
+    }
+}
+
 pub struct BuildInfo {
     pub revision: &'static str,
     pub build_tag: &'static str,
 }
 
-// todo: allow label group without the set
 impl LabelGroup for BuildInfo {
     fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
         const REVISION: &LabelName = LabelName::from_str("revision");
@@ -131,24 +173,6 @@ impl LabelGroup for BuildInfo {
     }
 }
 
-impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
-where
-    GaugeState: MetricEncoding<T>,
-{
-    fn collect_family_into(
-        &self,
-        name: impl measured::metric::name::MetricNameEncoder,
-        enc: &mut T,
-    ) -> Result<(), T::Err> {
-        enc.write_help(&name, "Build/version information")?;
-        GaugeState::write_type(&name, enc)?;
-        GaugeState {
-            count: std::sync::atomic::AtomicI64::new(1),
-        }
-        .collect_into(&(), self, name, enc)
-    }
-}
-
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct NeonMetrics {
@@ -165,8 +189,8 @@ pub struct NeonMetrics {
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct LibMetrics {
-    #[metric(init = build_info)]
-    build_info: BuildInfo,
+    #[metric(init = InfoMetric::new(build_info))]
+    build_info: InfoMetric<BuildInfo>,
 
     #[metric(flatten)]
     rusage: Rusage,

From 29ee273d780e70471286ac9238c70894eba7b6e2 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 17 Jul 2025 15:42:48 -0400
Subject: [PATCH 150/163] fix(storcon): correctly converts 404 for tenant
 passthrough requests (#12631)

## Problem

Follow up of https://github.com/neondatabase/neon/pull/12620

Discussions:
https://databricks.slack.com/archives/C09254R641L/p1752677940697529

The original code and after the patch above we converts 404s to 503s
regardless of the type of 404. We should only do that for tenant not
found errors. For other 404s like timeline not found, we should not
prompt clients to retry.

## Summary of changes

- Inspect the response body to figure out the type of 404. If it's a
tenant not found error, return 503.
- Otherwise, fallthrough and return 404 as-is.
- Add `tenant_shard_remote_mutation` that manipulates a single shard.
- Use `Service::tenant_shard_remote_mutation` for tenant shard
passthrough requests. This prevents us from another race that the attach
state changes within the request. (This patch mainly addresses the case
that the tenant is "not yet attached").
- TODO: lease API is still using the old code path. We should refactor
it to use `tenant_remote_mutation`.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_controller/src/http.rs                | 148 +++++++++++-------
 storage_controller/src/service.rs             | 110 ++++++++++---
 test_runner/fixtures/pageserver/http.py       |   3 +-
 .../regress/test_storage_controller.py        | 103 +++++++++++-
 4 files changed, 284 insertions(+), 80 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 5f9a1124de..6b6d081dcd 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -48,7 +48,10 @@ use crate::metrics::{
 };
 use crate::persistence::SafekeeperUpsert;
 use crate::reconciler::ReconcileError;
-use crate::service::{LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service};
+use crate::service::{
+    LeadershipStatus, RECONCILE_TIMEOUT, STARTUP_RECONCILE_TIMEOUT, Service,
+    TenantMutationLocations,
+};
 
 /// State available to HTTP request handlers
 pub struct HttpState {
@@ -734,77 +737,104 @@ async fn handle_tenant_timeline_passthrough(
         path
     );
 
-    // Find the node that holds shard zero
-    let (node, tenant_shard_id, consistent) = if tenant_or_shard_id.is_unsharded() {
-        service
+    let tenant_shard_id = if tenant_or_shard_id.is_unsharded() {
+        // If the request contains only tenant ID, find the node that holds shard zero
+        let (_, shard_id) = service
             .tenant_shard0_node(tenant_or_shard_id.tenant_id)
-            .await?
+            .await?;
+        shard_id
     } else {
-        let (node, consistent) = service.tenant_shard_node(tenant_or_shard_id).await?;
-        (node, tenant_or_shard_id, consistent)
+        tenant_or_shard_id
     };
 
-    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
-    // rewrite this to a shard-aware shard zero ID.
-    let path = format!("{path}");
-    let tenant_str = tenant_or_shard_id.tenant_id.to_string();
-    let tenant_shard_str = format!("{tenant_shard_id}");
-    let path = path.replace(&tenant_str, &tenant_shard_str);
+    let service_inner = service.clone();
 
-    let latency = &METRICS_REGISTRY
-        .metrics_group
-        .storage_controller_passthrough_request_latency;
+    service.tenant_shard_remote_mutation(tenant_shard_id, |locations| async move {
+        let TenantMutationLocations(locations) = locations;
+        if locations.is_empty() {
+            return Err(ApiError::NotFound(anyhow::anyhow!("Tenant {} not found", tenant_or_shard_id.tenant_id).into()));
+        }
 
-    let path_label = path_without_ids(&path)
-        .split('/')
-        .filter(|token| !token.is_empty())
-        .collect::<Vec<_>>()
-        .join("_");
-    let labels = PageserverRequestLabelGroup {
-        pageserver_id: &node.get_id().to_string(),
-        path: &path_label,
-        method: crate::metrics::Method::Get,
-    };
+        let (tenant_or_shard_id, locations) = locations.into_iter().next().unwrap();
+        let node = locations.latest.node;
 
-    let _timer = latency.start_timer(labels.clone());
+        // Callers will always pass an unsharded tenant ID.  Before proxying, we must
+        // rewrite this to a shard-aware shard zero ID.
+        let path = format!("{path}");
+        let tenant_str = tenant_or_shard_id.tenant_id.to_string();
+        let tenant_shard_str = format!("{tenant_shard_id}");
+        let path = path.replace(&tenant_str, &tenant_shard_str);
 
-    let client = mgmt_api::Client::new(
-        service.get_http_client().clone(),
-        node.base_url(),
-        service.get_config().pageserver_jwt_token.as_deref(),
-    );
-    let resp = client.op_raw(method, path).await.map_err(|e|
-        // We return 503 here because if we can't successfully send a request to the pageserver,
-        // either we aren't available or the pageserver is unavailable.
-        ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
-
-    if !resp.status().is_success() {
-        let error_counter = &METRICS_REGISTRY
+        let latency = &METRICS_REGISTRY
             .metrics_group
-            .storage_controller_passthrough_request_error;
-        error_counter.inc(labels);
-    }
+            .storage_controller_passthrough_request_latency;
 
-    // Transform 404 into 503 if we raced with a migration
-    if resp.status() == reqwest::StatusCode::NOT_FOUND && !consistent {
-        // Rather than retry here, send the client a 503 to prompt a retry: this matches
-        // the pageserver's use of 503, and all clients calling this API should retry on 503.
-        return Err(ApiError::ResourceUnavailable(
-            format!("Pageserver {node} returned 404 due to ongoing migration, retry later").into(),
-        ));
-    }
+        let path_label = path_without_ids(&path)
+            .split('/')
+            .filter(|token| !token.is_empty())
+            .collect::<Vec<_>>()
+            .join("_");
+        let labels = PageserverRequestLabelGroup {
+            pageserver_id: &node.get_id().to_string(),
+            path: &path_label,
+            method: crate::metrics::Method::Get,
+        };
 
-    // We have a reqest::Response, would like a http::Response
-    let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?);
-    for (k, v) in resp.headers() {
-        builder = builder.header(k.as_str(), v.as_bytes());
-    }
+        let _timer = latency.start_timer(labels.clone());
 
-    let response = builder
-        .body(Body::wrap_stream(resp.bytes_stream()))
-        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        let client = mgmt_api::Client::new(
+            service_inner.get_http_client().clone(),
+            node.base_url(),
+            service_inner.get_config().pageserver_jwt_token.as_deref(),
+        );
+        let resp = client.op_raw(method, path).await.map_err(|e|
+            // We return 503 here because if we can't successfully send a request to the pageserver,
+            // either we aren't available or the pageserver is unavailable.
+            ApiError::ResourceUnavailable(format!("Error sending pageserver API request to {node}: {e}").into()))?;
 
-    Ok(response)
+        if !resp.status().is_success() {
+            let error_counter = &METRICS_REGISTRY
+                .metrics_group
+                .storage_controller_passthrough_request_error;
+            error_counter.inc(labels);
+        }
+        let resp_staus = resp.status();
+
+        // We have a reqest::Response, would like a http::Response
+        let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp_staus)?);
+        for (k, v) in resp.headers() {
+            builder = builder.header(k.as_str(), v.as_bytes());
+        }
+        let resp_bytes = resp
+            .bytes()
+            .await
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        // Inspect 404 errors: at this point, we know that the tenant exists, but the pageserver we route
+        // the request to might not yet be ready. Therefore, if it is a _tenant_ not found error, we can
+        // convert it into a 503. TODO: we should make this part of the check in `tenant_shard_remote_mutation`.
+        // However, `tenant_shard_remote_mutation` currently cannot inspect the HTTP error response body,
+        // so we have to do it here instead.
+        if resp_staus == reqwest::StatusCode::NOT_FOUND {
+            let resp_str = std::str::from_utf8(&resp_bytes)
+                .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            // We only handle "tenant not found" errors; other 404s like timeline not found should
+            // be forwarded as-is.
+            if resp_str.contains(&format!("tenant {tenant_or_shard_id}")) {
+                // Rather than retry here, send the client a 503 to prompt a retry: this matches
+                // the pageserver's use of 503, and all clients calling this API should retry on 503.
+                return Err(ApiError::ResourceUnavailable(
+                    format!(
+                        "Pageserver {node} returned tenant 404 due to ongoing migration, retry later"
+                    )
+                    .into(),
+                ));
+            }
+        }
+        let response = builder
+            .body(Body::from(resp_bytes))
+            .map_err(|e| ApiError::InternalServerError(e.into()))?;
+        Ok(response)
+    }).await?
 }
 
 async fn handle_tenant_locate(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index ec3b419437..a1ff9b3c61 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -719,19 +719,19 @@ pub(crate) enum ReconcileResultRequest {
 }
 
 #[derive(Clone)]
-struct MutationLocation {
-    node: Node,
-    generation: Generation,
+pub(crate) struct MutationLocation {
+    pub(crate) node: Node,
+    pub(crate) generation: Generation,
 }
 
 #[derive(Clone)]
-struct ShardMutationLocations {
-    latest: MutationLocation,
-    other: Vec<MutationLocation>,
+pub(crate) struct ShardMutationLocations {
+    pub(crate) latest: MutationLocation,
+    pub(crate) other: Vec<MutationLocation>,
 }
 
 #[derive(Default, Clone)]
-struct TenantMutationLocations(BTreeMap<TenantShardId, ShardMutationLocations>);
+pub(crate) struct TenantMutationLocations(pub BTreeMap<TenantShardId, ShardMutationLocations>);
 
 struct ReconcileAllResult {
     spawned_reconciles: usize,
@@ -763,6 +763,29 @@ impl ReconcileAllResult {
     }
 }
 
+enum TenantIdOrShardId {
+    TenantId(TenantId),
+    TenantShardId(TenantShardId),
+}
+
+impl TenantIdOrShardId {
+    fn tenant_id(&self) -> TenantId {
+        match self {
+            TenantIdOrShardId::TenantId(tenant_id) => *tenant_id,
+            TenantIdOrShardId::TenantShardId(tenant_shard_id) => tenant_shard_id.tenant_id,
+        }
+    }
+
+    fn matches(&self, tenant_shard_id: &TenantShardId) -> bool {
+        match self {
+            TenantIdOrShardId::TenantId(tenant_id) => tenant_shard_id.tenant_id == *tenant_id,
+            TenantIdOrShardId::TenantShardId(this_tenant_shard_id) => {
+                this_tenant_shard_id == tenant_shard_id
+            }
+        }
+    }
+}
+
 impl Service {
     pub fn get_config(&self) -> &Config {
         &self.config
@@ -4814,6 +4837,12 @@ impl Service {
             }
         }
 
+        if targets.is_empty() {
+            return Err(ApiError::NotFound(
+                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
+            ));
+        }
+
         Ok(TenantShardAttachState {
             targets,
             by_node_id,
@@ -5040,11 +5069,37 @@ impl Service {
     /// - Looks up the shards and the nodes where they were most recently attached
     /// - Guarantees that after the inner function returns, the shards' generations haven't moved on: this
     ///   ensures that the remote operation acted on the most recent generation, and is therefore durable.
-    async fn tenant_remote_mutation<R, O, F>(
+    pub(crate) async fn tenant_remote_mutation<R, O, F>(
         &self,
         tenant_id: TenantId,
         op: O,
     ) -> Result<R, ApiError>
+    where
+        O: FnOnce(TenantMutationLocations) -> F,
+        F: std::future::Future<Output = R>,
+    {
+        self.tenant_remote_mutation_inner(TenantIdOrShardId::TenantId(tenant_id), op)
+            .await
+    }
+
+    pub(crate) async fn tenant_shard_remote_mutation<R, O, F>(
+        &self,
+        tenant_shard_id: TenantShardId,
+        op: O,
+    ) -> Result<R, ApiError>
+    where
+        O: FnOnce(TenantMutationLocations) -> F,
+        F: std::future::Future<Output = R>,
+    {
+        self.tenant_remote_mutation_inner(TenantIdOrShardId::TenantShardId(tenant_shard_id), op)
+            .await
+    }
+
+    async fn tenant_remote_mutation_inner<R, O, F>(
+        &self,
+        tenant_id_or_shard_id: TenantIdOrShardId,
+        op: O,
+    ) -> Result<R, ApiError>
     where
         O: FnOnce(TenantMutationLocations) -> F,
         F: std::future::Future<Output = R>,
@@ -5056,7 +5111,13 @@ impl Service {
             // run concurrently with reconciliations, and it is not guaranteed that the node we find here
             // will still be the latest when we're done: we will check generations again at the end of
             // this function to handle that.
-            let generations = self.persistence.tenant_generations(tenant_id).await?;
+            let generations = self
+                .persistence
+                .tenant_generations(tenant_id_or_shard_id.tenant_id())
+                .await?
+                .into_iter()
+                .filter(|i| tenant_id_or_shard_id.matches(&i.tenant_shard_id))
+                .collect::<Vec<_>>();
 
             if generations
                 .iter()
@@ -5070,9 +5131,14 @@ impl Service {
                 // One or more shards has not been attached to a pageserver.  Check if this is because it's configured
                 // to be detached (409: caller should give up), or because it's meant to be attached but isn't yet (503: caller should retry)
                 let locked = self.inner.read().unwrap();
-                for (shard_id, shard) in
-                    locked.tenants.range(TenantShardId::tenant_range(tenant_id))
-                {
+                let tenant_shards = locked
+                    .tenants
+                    .range(TenantShardId::tenant_range(
+                        tenant_id_or_shard_id.tenant_id(),
+                    ))
+                    .filter(|(shard_id, _)| tenant_id_or_shard_id.matches(shard_id))
+                    .collect::<Vec<_>>();
+                for (shard_id, shard) in tenant_shards {
                     match shard.policy {
                         PlacementPolicy::Attached(_) => {
                             // This shard is meant to be attached: the caller is not wrong to try and
@@ -5182,7 +5248,14 @@ impl Service {
         // Post-check: are all the generations of all the shards the same as they were initially?  This proves that
         // our remote operation executed on the latest generation and is therefore persistent.
         {
-            let latest_generations = self.persistence.tenant_generations(tenant_id).await?;
+            let latest_generations = self
+                .persistence
+                .tenant_generations(tenant_id_or_shard_id.tenant_id())
+                .await?
+                .into_iter()
+                .filter(|i| tenant_id_or_shard_id.matches(&i.tenant_shard_id))
+                .collect::<Vec<_>>();
+
             if latest_generations
                 .into_iter()
                 .map(
@@ -5316,7 +5389,7 @@ impl Service {
     pub(crate) async fn tenant_shard0_node(
         &self,
         tenant_id: TenantId,
-    ) -> Result<(Node, TenantShardId, bool), ApiError> {
+    ) -> Result<(Node, TenantShardId), ApiError> {
         let tenant_shard_id = {
             let locked = self.inner.read().unwrap();
             let Some((tenant_shard_id, _shard)) = locked
@@ -5334,7 +5407,7 @@ impl Service {
 
         self.tenant_shard_node(tenant_shard_id)
             .await
-            .map(|(node, consistent)| (node, tenant_shard_id, consistent))
+            .map(|node| (node, tenant_shard_id))
     }
 
     /// When you need to send an HTTP request to the pageserver that holds a shard of a tenant, this
@@ -5344,7 +5417,7 @@ impl Service {
     pub(crate) async fn tenant_shard_node(
         &self,
         tenant_shard_id: TenantShardId,
-    ) -> Result<(Node, bool), ApiError> {
+    ) -> Result<Node, ApiError> {
         // Look up in-memory state and maybe use the node from there.
         {
             let locked = self.inner.read().unwrap();
@@ -5374,8 +5447,7 @@ impl Service {
                         "Shard refers to nonexistent node"
                     )));
                 };
-                let consistent = self.is_observed_consistent_with_intent(shard, *intent_node_id);
-                return Ok((node.clone(), consistent));
+                return Ok(node.clone());
             }
         };
 
@@ -5410,7 +5482,7 @@ impl Service {
             )));
         };
         // As a reconciliation is in flight, we do not have the observed state yet, and therefore we assume it is always inconsistent.
-        Ok((node.clone(), false))
+        Ok(node.clone())
     }
 
     pub(crate) fn tenant_locate(
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 23b9d1c8c9..f95b0ee4d1 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -847,7 +847,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         return res_json
 
     def timeline_lsn_lease(
-        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn
+        self, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, lsn: Lsn, **kwargs
     ):
         data = {
             "lsn": str(lsn),
@@ -857,6 +857,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter):
         res = self.post(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/lsn_lease",
             json=data,
+            **kwargs,
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py
index fbdb14b6bb..9986c1f24a 100644
--- a/test_runner/regress/test_storage_controller.py
+++ b/test_runner/regress/test_storage_controller.py
@@ -12,7 +12,7 @@ from typing import TYPE_CHECKING
 import fixtures.utils
 import pytest
 from fixtures.auth_tokens import TokenScope
-from fixtures.common_types import TenantId, TenantShardId, TimelineId
+from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     DEFAULT_AZ_ID,
@@ -47,6 +47,7 @@ from fixtures.utils import (
     wait_until,
 )
 from fixtures.workload import Workload
+from requests.adapters import HTTPAdapter
 from urllib3 import Retry
 from werkzeug.wrappers.response import Response
 
@@ -4858,3 +4859,103 @@ def test_storage_controller_migrate_with_pageserver_restart(
         "shards": [{"node_id": int(secondary.id), "shard_number": 0}],
         "preferred_az": DEFAULT_AZ_ID,
     }
+
+
+@run_only_on_default_postgres("PG version is not important for this test")
+def test_storage_controller_forward_404(neon_env_builder: NeonEnvBuilder):
+    """
+    Ensures that the storage controller correctly forwards 404s and converts some of them
+    into 503s before forwarding to the client.
+    """
+    neon_env_builder.num_pageservers = 2
+    neon_env_builder.num_azs = 2
+
+    env = neon_env_builder.init_start()
+    env.storage_controller.allowed_errors.append(".*Reconcile error.*")
+    env.storage_controller.allowed_errors.append(".*Timed out.*")
+
+    env.storage_controller.tenant_policy_update(env.initial_tenant, {"placement": {"Attached": 1}})
+    env.storage_controller.reconcile_until_idle()
+
+    # 404s on tenants and timelines are forwarded as-is when reconciler is not running.
+
+    # Access a non-existing timeline -> 404
+    with pytest.raises(PageserverApiException) as e:
+        env.storage_controller.pageserver_api().timeline_detail(
+            env.initial_tenant, TimelineId.generate()
+        )
+    assert e.value.status_code == 404
+    with pytest.raises(PageserverApiException) as e:
+        env.storage_controller.pageserver_api().timeline_lsn_lease(
+            env.initial_tenant, TimelineId.generate(), Lsn(0)
+        )
+    assert e.value.status_code == 404
+
+    # Access a non-existing tenant when reconciler is not running -> 404
+    with pytest.raises(PageserverApiException) as e:
+        env.storage_controller.pageserver_api().timeline_detail(
+            TenantId.generate(), env.initial_timeline
+        )
+    assert e.value.status_code == 404
+    with pytest.raises(PageserverApiException) as e:
+        env.storage_controller.pageserver_api().timeline_lsn_lease(
+            TenantId.generate(), env.initial_timeline, Lsn(0)
+        )
+    assert e.value.status_code == 404
+
+    # Normal requests should succeed
+    detail = env.storage_controller.pageserver_api().timeline_detail(
+        env.initial_tenant, env.initial_timeline
+    )
+    last_record_lsn = Lsn(detail["last_record_lsn"])
+    env.storage_controller.pageserver_api().timeline_lsn_lease(
+        env.initial_tenant, env.initial_timeline, last_record_lsn
+    )
+
+    # Get into a situation where the intent state is not the same as the observed state.
+    describe = env.storage_controller.tenant_describe(env.initial_tenant)["shards"][0]
+    current_primary = describe["node_attached"]
+    current_secondary = describe["node_secondary"][0]
+    assert current_primary != current_secondary
+
+    # Pause the reconciler so that the generation number won't be updated.
+    env.storage_controller.configure_failpoints(
+        ("reconciler-live-migrate-post-generation-inc", "pause")
+    )
+
+    # Do the migration in another thread; the request will be dropped as we don't wait.
+    shard_zero = TenantShardId(env.initial_tenant, 0, 0)
+    concurrent.futures.ThreadPoolExecutor(max_workers=1).submit(
+        env.storage_controller.tenant_shard_migrate,
+        shard_zero,
+        current_secondary,
+        StorageControllerMigrationConfig(override_scheduler=True),
+    )
+    # Not the best way to do this, we should wait until the migration gets started.
+    time.sleep(1)
+    placement = env.storage_controller.get_tenants_placement()[str(shard_zero)]
+    assert placement["observed"] != placement["intent"]
+    assert placement["observed"]["attached"] == current_primary
+    assert placement["intent"]["attached"] == current_secondary
+
+    # Now we issue requests that would cause 404 again
+    retry_strategy = Retry(total=0)
+    adapter = HTTPAdapter(max_retries=retry_strategy)
+
+    no_retry_api = env.storage_controller.pageserver_api()
+    no_retry_api.mount("http://", adapter)
+    no_retry_api.mount("https://", adapter)
+
+    # As intent state != observed state, tenant not found error should return 503,
+    # so that the client can retry once we've successfully migrated.
+    with pytest.raises(PageserverApiException) as e:
+        no_retry_api.timeline_detail(env.initial_tenant, TimelineId.generate())
+    assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
+    with pytest.raises(PageserverApiException) as e:
+        no_retry_api.timeline_lsn_lease(env.initial_tenant, TimelineId.generate(), Lsn(0))
+    assert e.value.status_code == 503, f"unexpected status code and error: {e.value}"
+
+    # Unblock reconcile operations
+    env.storage_controller.configure_failpoints(
+        ("reconciler-live-migrate-post-generation-inc", "off")
+    )

From 43fd5b218b28267aa500de9907a2bcfc325f3eb1 Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 17 Jul 2025 23:20:38 +0300
Subject: [PATCH 151/163] Refactor shmem initialization in Neon extension
 (#12630)

## Problem

Initializing of shared memory in extension is complex and non-portable.
In neon extension this boilerplate code is duplicated in several files.

## Summary of changes

Perform all initialization in one place - neon.c
All other module procvide *ShmemRequest() and *ShmemInit() fuinction
which are called from neon.c

---------

Co-authored-by: Kosntantin Knizhnik <konstantin.knizhnik@databricks.com>
Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pgxn/neon/file_cache.c         |  45 ++++-----------
 pgxn/neon/libpagestore.c       |  54 ++----------------
 pgxn/neon/neon.c               | 101 ++++++++++++++++++++++++++++++---
 pgxn/neon/neon.h               |  15 +++++
 pgxn/neon/neon_lwlsncache.c    |  37 +++---------
 pgxn/neon/neon_perf_counters.c |  26 ++++++---
 pgxn/neon/relsize_cache.c      |  48 ++++------------
 pgxn/neon/walproposer_pg.c     |  48 ++--------------
 8 files changed, 164 insertions(+), 210 deletions(-)

diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 2c87f139af..7cfa769959 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -219,10 +219,6 @@ static char *lfc_path;
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
 static bool lfc_do_prewarm;
-static shmem_startup_hook_type prev_shmem_startup_hook;
-#if PG_VERSION_NUM>=150000
-static shmem_request_hook_type prev_shmem_request_hook;
-#endif
 
 bool lfc_store_prefetch_result;
 bool lfc_prewarm_update_ws_estimation;
@@ -342,18 +338,14 @@ lfc_ensure_opened(void)
 	return true;
 }
 
-static void
-lfc_shmem_startup(void)
+void
+LfcShmemInit(void)
 {
 	bool		found;
 	static HASHCTL info;
 
-	if (prev_shmem_startup_hook)
-	{
-		prev_shmem_startup_hook();
-	}
-
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+	if (lfc_max_size <= 0)
+		return;
 
 	lfc_ctl = (FileCacheControl *) ShmemInitStruct("lfc", sizeof(FileCacheControl), &found);
 	if (!found)
@@ -398,19 +390,16 @@ lfc_shmem_startup(void)
 			ConditionVariableInit(&lfc_ctl->cv[i]);
 
 	}
-	LWLockRelease(AddinShmemInitLock);
 }
 
-static void
-lfc_shmem_request(void)
+void
+LfcShmemRequest(void)
 {
-#if PG_VERSION_NUM>=150000
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
-#endif
-
-	RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
-	RequestNamedLWLockTranche("lfc_lock", 1);
+	if (lfc_max_size > 0)
+	{
+		RequestAddinShmemSpace(sizeof(FileCacheControl) + hash_estimate_size(SIZE_MB_TO_CHUNKS(lfc_max_size) + 1, FILE_CACHE_ENRTY_SIZE));
+		RequestNamedLWLockTranche("lfc_lock", 1);
+	}
 }
 
 static bool
@@ -642,18 +631,6 @@ lfc_init(void)
 							NULL,
 							NULL,
 							NULL);
-
-	if (lfc_max_size == 0)
-		return;
-
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = lfc_shmem_startup;
-#if PG_VERSION_NUM>=150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = lfc_shmem_request;
-#else
-	lfc_shmem_request();
-#endif
 }
 
 FileCacheState*
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 05ba6da663..596258007a 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -118,10 +118,6 @@ typedef struct
 	ShardMap	shard_map;
 } PagestoreShmemState;
 
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-#endif
-static shmem_startup_hook_type prev_shmem_startup_hook;
 static PagestoreShmemState *pagestore_shared;
 static uint64 pagestore_local_counter = 0;
 
@@ -1284,18 +1280,12 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }
 
-static Size
-PagestoreShmemSize(void)
-{
-	return add_size(sizeof(PagestoreShmemState), NeonPerfCountersShmemSize());
-}
 
-static bool
+void
 PagestoreShmemInit(void)
 {
 	bool		found;
 
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 	pagestore_shared = ShmemInitStruct("libpagestore shared state",
 									   sizeof(PagestoreShmemState),
 									   &found);
@@ -1306,44 +1296,12 @@ PagestoreShmemInit(void)
 		memset(&pagestore_shared->shard_map, 0, sizeof(ShardMap));
 		AssignPageserverConnstring(page_server_connstring, NULL);
 	}
-
-	NeonPerfCountersShmemInit();
-
-	LWLockRelease(AddinShmemInitLock);
-	return found;
 }
 
-static void
-pagestore_shmem_startup_hook(void)
+void
+PagestoreShmemRequest(void)
 {
-	if (prev_shmem_startup_hook)
-		prev_shmem_startup_hook();
-
-	PagestoreShmemInit();
-}
-
-static void
-pagestore_shmem_request(void)
-{
-#if PG_VERSION_NUM >= 150000
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
-#endif
-
-	RequestAddinShmemSpace(PagestoreShmemSize());
-}
-
-static void
-pagestore_prepare_shmem(void)
-{
-#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = pagestore_shmem_request;
-#else
-	pagestore_shmem_request();
-#endif
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = pagestore_shmem_startup_hook;
+	RequestAddinShmemSpace(sizeof(PagestoreShmemState));
 }
 
 /*
@@ -1352,8 +1310,6 @@ pagestore_prepare_shmem(void)
 void
 pg_init_libpagestore(void)
 {
-	pagestore_prepare_shmem();
-
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -1504,8 +1460,6 @@ pg_init_libpagestore(void)
 							0,
 							NULL, NULL, NULL);
 
-	relsize_hash_init();
-
 	if (page_server != NULL)
 		neon_log(ERROR, "libpagestore already loaded");
 
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index df5dcf5334..4e4320e498 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -22,6 +22,7 @@
 #include "replication/slot.h"
 #include "replication/walsender.h"
 #include "storage/proc.h"
+#include "storage/ipc.h"
 #include "funcapi.h"
 #include "access/htup_details.h"
 #include "utils/builtins.h"
@@ -59,11 +60,15 @@ static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
 static void neon_ExecutorStart(QueryDesc *queryDesc, int eflags);
 static void neon_ExecutorEnd(QueryDesc *queryDesc);
 
-#if PG_MAJORVERSION_NUM >= 16
 static shmem_startup_hook_type prev_shmem_startup_hook;
-
 static void neon_shmem_startup_hook(void);
+static void neon_shmem_request_hook(void);
+
+#if PG_MAJORVERSION_NUM >= 15
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
 #endif
+
+
 #if PG_MAJORVERSION_NUM >= 17
 uint32		WAIT_EVENT_NEON_LFC_MAINTENANCE;
 uint32		WAIT_EVENT_NEON_LFC_READ;
@@ -450,15 +455,44 @@ _PG_init(void)
 	 */
 #if PG_VERSION_NUM >= 160000
 	load_file("$libdir/neon_rmgr", false);
-
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = neon_shmem_startup_hook;
 #endif
 
 	/* dummy call to a Rust function in the communicator library, to check that it works */
 	(void) communicator_dummy(123);
 
+	/*
+	 * Initializing a pre-loaded Postgres extension happens in three stages:
+	 *
+	 * 1. _PG_init() is called early at postmaster startup. In this stage, no
+	 *    shared memory has been allocated yet. Core Postgres GUCs have been
+	 *    initialized from the config files, but notably, MaxBackends has not
+	 *    calculated yet. In this stage, we must register any extension GUCs
+	 *    and can do other early initialization that doesn't depend on shared
+	 *    memory. In this stage we must also register "shmem request" and
+	 *    "shmem starutup" hooks, to be called in stages 2 and 3.
+	 *
+	 * 2. After MaxBackends have been calculated, the "shmem request" hooks
+	 *    are called. The hooks can reserve shared memory by calling
+	 *    RequestAddinShmemSpace and RequestNamedLWLockTranche().  The "shmem
+	 *    request hooks" are a new mechanism in Postgres v15. In v14 and
+	 *    below, you had to make those Requests in stage 1 already, which
+	 *    means they could not depend on MaxBackends. (See hack in
+	 *    NeonPerfCountersShmemRequest())
+	 *
+	 * 3. After some more runtime-computed GUCs that affect the amount of
+	 *    shared memory needed have been calculated, the "shmem startup" hooks
+	 *    are called. In this stage, we allocate any shared memory, LWLocks
+	 *    and other shared resources.
+	 *
+	 * Here, in the 'neon' extension, we register just one shmem request hook
+	 * and one startup hook, which call into functions in all the subsystems
+	 * that are part of the extension. On v14, the ShmemRequest functions are
+	 * called in stage 1, and on v15 onwards they are called in stage 2.
+	 */
+
+	/* Stage 1: Define GUCs, and other early intialization */
 	pg_init_libpagestore();
+	relsize_hash_init();
 	lfc_init();
 	pg_init_walproposer();
 	init_lwlsncache();
@@ -561,6 +595,22 @@ _PG_init(void)
 
 	ReportSearchPath();
 
+	/*
+	 * Register initialization hooks for stage 2. (On v14, there's no "shmem
+	 * request" hooks, so call the ShmemRequest functions immediately.)
+	 */
+#if PG_VERSION_NUM >= 150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = neon_shmem_request_hook;
+#else
+	neon_shmem_request_hook();
+#endif
+
+	/* Register hooks for stage 3 */
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = neon_shmem_startup_hook;
+
+	/* Other misc initialization */
 	prev_ExecutorStart = ExecutorStart_hook;
 	ExecutorStart_hook = neon_ExecutorStart;
 	prev_ExecutorEnd = ExecutorEnd_hook;
@@ -646,7 +696,34 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
 		PG_RETURN_INT32(dc);
 }
 
-#if PG_MAJORVERSION_NUM >= 16
+/*
+ * Initialization stage 2: make requests for the amount of shared memory we
+ * will need.
+ *
+ * For a high-level explanation of the initialization process, see _PG_init().
+ */
+static void
+neon_shmem_request_hook(void)
+{
+#if PG_VERSION_NUM >= 150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+#endif
+
+	LfcShmemRequest();
+	NeonPerfCountersShmemRequest();
+	PagestoreShmemRequest();
+	RelsizeCacheShmemRequest();
+	WalproposerShmemRequest();
+	LwLsnCacheShmemRequest();
+}
+
+
+/*
+ * Initialization stage 3: Initialize shared memory.
+ *
+ * For a high-level explanation of the initialization process, see _PG_init().
+ */
 static void
 neon_shmem_startup_hook(void)
 {
@@ -654,6 +731,15 @@ neon_shmem_startup_hook(void)
 	if (prev_shmem_startup_hook)
 		prev_shmem_startup_hook();
 
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	LfcShmemInit();
+	NeonPerfCountersShmemInit();
+	PagestoreShmemInit();
+	RelsizeCacheShmemInit();
+	WalproposerShmemInit();
+	LwLsnCacheShmemInit();
+
 #if PG_MAJORVERSION_NUM >= 17
 	WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance");
 	WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read");
@@ -666,8 +752,9 @@ neon_shmem_startup_hook(void)
 	WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO");
 	WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download");
 #endif
+
+	LWLockRelease(AddinShmemInitLock);
 }
-#endif
 
 /*
  * ExecutorStart hook: start up tracking if needed
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index 215396ef7a..20c850864a 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -70,4 +70,19 @@ extern PGDLLEXPORT void WalProposerSync(int argc, char *argv[]);
 extern PGDLLEXPORT void WalProposerMain(Datum main_arg);
 extern PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg);
 
+extern void LfcShmemRequest(void);
+extern void PagestoreShmemRequest(void);
+extern void RelsizeCacheShmemRequest(void);
+extern void WalproposerShmemRequest(void);
+extern void LwLsnCacheShmemRequest(void);
+extern void NeonPerfCountersShmemRequest(void);
+
+extern void LfcShmemInit(void);
+extern void PagestoreShmemInit(void);
+extern void RelsizeCacheShmemInit(void);
+extern void WalproposerShmemInit(void);
+extern void LwLsnCacheShmemInit(void);
+extern void NeonPerfCountersShmemInit(void);
+
+
 #endif							/* NEON_H */
diff --git a/pgxn/neon/neon_lwlsncache.c b/pgxn/neon/neon_lwlsncache.c
index a8cfa0f825..5887c02c36 100644
--- a/pgxn/neon/neon_lwlsncache.c
+++ b/pgxn/neon/neon_lwlsncache.c
@@ -1,5 +1,6 @@
 #include "postgres.h"
 
+#include "neon.h"
 #include "neon_lwlsncache.h"
 
 #include "miscadmin.h"
@@ -81,14 +82,6 @@ static set_max_lwlsn_hook_type prev_set_max_lwlsn_hook = NULL;
 static set_lwlsn_relation_hook_type prev_set_lwlsn_relation_hook = NULL;
 static set_lwlsn_db_hook_type prev_set_lwlsn_db_hook = NULL;
 
-static shmem_startup_hook_type prev_shmem_startup_hook;
-
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook;
-#endif
-
-static void shmemrequest(void);
-static void shmeminit(void);
 static void neon_set_max_lwlsn(XLogRecPtr lsn);
 
 void
@@ -99,16 +92,6 @@ init_lwlsncache(void)
 	
 	lwlc_register_gucs();
 
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = shmeminit;
-
-	#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = shmemrequest;
-	#else
-	shmemrequest();
-	#endif
-	
 	prev_set_lwlsn_block_range_hook = set_lwlsn_block_range_hook;
 	set_lwlsn_block_range_hook = neon_set_lwlsn_block_range;
 	prev_set_lwlsn_block_v_hook = set_lwlsn_block_v_hook;
@@ -124,20 +107,19 @@ init_lwlsncache(void)
 }
 
 
-static void shmemrequest(void) {
+void
+LwLsnCacheShmemRequest(void)
+{
 	Size requested_size = sizeof(LwLsnCacheCtl);
-	
+
 	requested_size += hash_estimate_size(lwlsn_cache_size, sizeof(LastWrittenLsnCacheEntry));
 
 	RequestAddinShmemSpace(requested_size);
-
-	#if PG_VERSION_NUM >= 150000
-	if (prev_shmem_request_hook)
-			prev_shmem_request_hook();
-	#endif
 }
 
-static void shmeminit(void) {
+void
+LwLsnCacheShmemInit(void)
+{
 	static HASHCTL info;
 	bool found;
 	if (lwlsn_cache_size > 0)
@@ -157,9 +139,6 @@ static void shmeminit(void) {
 	}
 	dlist_init(&LwLsnCache->lastWrittenLsnLRU);
     LwLsnCache->maxLastWrittenLsn = GetRedoRecPtr();
-	if (prev_shmem_startup_hook) {
-		prev_shmem_startup_hook();
-	}
 }
 
 /*
diff --git a/pgxn/neon/neon_perf_counters.c b/pgxn/neon/neon_perf_counters.c
index d0a3d15108..dd576e4e73 100644
--- a/pgxn/neon/neon_perf_counters.c
+++ b/pgxn/neon/neon_perf_counters.c
@@ -17,22 +17,32 @@
 #include "storage/shmem.h"
 #include "utils/builtins.h"
 
+#include "neon.h"
 #include "neon_perf_counters.h"
 #include "neon_pgversioncompat.h"
 
 neon_per_backend_counters *neon_per_backend_counters_shared;
 
-Size
-NeonPerfCountersShmemSize(void)
+void
+NeonPerfCountersShmemRequest(void)
 {
-	Size		size = 0;
-
-	size = add_size(size, mul_size(NUM_NEON_PERF_COUNTER_SLOTS,
-								   sizeof(neon_per_backend_counters)));
-
-	return size;
+	Size size;
+#if PG_MAJORVERSION_NUM < 15
+	/* Hack: in PG14 MaxBackends is not initialized at the time of calling NeonPerfCountersShmemRequest function.
+	 * Do it ourselves and then undo to prevent assertion failure
+	 */
+	Assert(MaxBackends == 0); /* not initialized yet */
+	InitializeMaxBackends();
+	size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters));
+	MaxBackends = 0;
+#else
+	size = mul_size(NUM_NEON_PERF_COUNTER_SLOTS, sizeof(neon_per_backend_counters));
+#endif
+	RequestAddinShmemSpace(size);
 }
 
+
+
 void
 NeonPerfCountersShmemInit(void)
 {
diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
index 60ca1675d9..bf7961574a 100644
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -10,6 +10,7 @@
  */
 #include "postgres.h"
 
+#include "neon.h"
 #include "neon_pgversioncompat.h"
 
 #include "pagestore_client.h"
@@ -49,32 +50,23 @@ typedef struct
 								 * algorithm */
 } RelSizeHashControl;
 
-static HTAB *relsize_hash;
-static LWLockId relsize_lock;
-static int	relsize_hash_size;
-static RelSizeHashControl* relsize_ctl;
-static shmem_startup_hook_type prev_shmem_startup_hook = NULL;
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-static void relsize_shmem_request(void);
-#endif
-
 /*
  * Size of a cache entry is 36 bytes. So this default will take about 2.3 MB,
  * which seems reasonable.
  */
 #define DEFAULT_RELSIZE_HASH_SIZE (64 * 1024)
 
-static void
-neon_smgr_shmem_startup(void)
+static HTAB *relsize_hash;
+static LWLockId relsize_lock;
+static int	relsize_hash_size = DEFAULT_RELSIZE_HASH_SIZE;
+static RelSizeHashControl* relsize_ctl;
+
+void
+RelsizeCacheShmemInit(void)
 {
 	static HASHCTL info;
 	bool found;
 
-	if (prev_shmem_startup_hook)
-		prev_shmem_startup_hook();
-
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 	relsize_ctl = (RelSizeHashControl *) ShmemInitStruct("relsize_hash", sizeof(RelSizeHashControl), &found);
 	if (!found)
 	{
@@ -85,7 +77,6 @@ neon_smgr_shmem_startup(void)
 									 relsize_hash_size, relsize_hash_size,
 									 &info,
 									 HASH_ELEM | HASH_BLOBS);
-		LWLockRelease(AddinShmemInitLock);
 		relsize_ctl->size = 0;
 		relsize_ctl->hits = 0;
 		relsize_ctl->misses = 0;
@@ -242,34 +233,15 @@ relsize_hash_init(void)
 							PGC_POSTMASTER,
 							0,
 							NULL, NULL, NULL);
-
-	if (relsize_hash_size > 0)
-	{
-#if PG_VERSION_NUM >= 150000
-		prev_shmem_request_hook = shmem_request_hook;
-		shmem_request_hook = relsize_shmem_request;
-#else
-		RequestAddinShmemSpace(hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
-		RequestNamedLWLockTranche("neon_relsize", 1);
-#endif
-
-		prev_shmem_startup_hook = shmem_startup_hook;
-		shmem_startup_hook = neon_smgr_shmem_startup;
-	}
 }
 
-#if PG_VERSION_NUM >= 150000
 /*
  * shmem_request hook: request additional shared resources.  We'll allocate or
  * attach to the shared resources in neon_smgr_shmem_startup().
  */
-static void
-relsize_shmem_request(void)
+void
+RelsizeCacheShmemRequest(void)
 {
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
-
 	RequestAddinShmemSpace(sizeof(RelSizeHashControl) + hash_estimate_size(relsize_hash_size, sizeof(RelSizeEntry)));
 	RequestNamedLWLockTranche("neon_relsize", 1);
 }
-#endif
diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c
index 18655d4c6c..9ed8d0d2d2 100644
--- a/pgxn/neon/walproposer_pg.c
+++ b/pgxn/neon/walproposer_pg.c
@@ -83,10 +83,8 @@ static XLogRecPtr standby_flush_lsn = InvalidXLogRecPtr;
 static XLogRecPtr standby_apply_lsn = InvalidXLogRecPtr;
 static HotStandbyFeedback agg_hs_feedback;
 
-static void nwp_shmem_startup_hook(void);
 static void nwp_register_gucs(void);
 static void assign_neon_safekeepers(const char *newval, void *extra);
-static void nwp_prepare_shmem(void);
 static uint64 backpressure_lag_impl(void);
 static uint64 startup_backpressure_wrap(void);
 static bool backpressure_throttling_impl(void);
@@ -99,11 +97,6 @@ static TimestampTz walprop_pg_get_current_timestamp(WalProposer *wp);
 static void walprop_pg_load_libpqwalreceiver(void);
 
 static process_interrupts_callback_t PrevProcessInterruptsCallback = NULL;
-static shmem_startup_hook_type prev_shmem_startup_hook_type;
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-static void walproposer_shmem_request(void);
-#endif
 static void WalproposerShmemInit_SyncSafekeeper(void);
 
 
@@ -193,8 +186,6 @@ pg_init_walproposer(void)
 
 	nwp_register_gucs();
 
-	nwp_prepare_shmem();
-
 	delay_backend_us = &startup_backpressure_wrap;
 	PrevProcessInterruptsCallback = ProcessInterruptsCallback;
 	ProcessInterruptsCallback = backpressure_throttling_impl;
@@ -494,12 +485,11 @@ WalproposerShmemSize(void)
 	return sizeof(WalproposerShmemState);
 }
 
-static bool
+void
 WalproposerShmemInit(void)
 {
 	bool		found;
 
-	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 	walprop_shared = ShmemInitStruct("Walproposer shared state",
 									 sizeof(WalproposerShmemState),
 									 &found);
@@ -517,9 +507,6 @@ WalproposerShmemInit(void)
 		pg_atomic_init_u64(&walprop_shared->wal_rate_limiter.last_recorded_time_us, 0);
 		/* END_HADRON */
 	}
-	LWLockRelease(AddinShmemInitLock);
-
-	return found;
 }
 
 static void
@@ -623,42 +610,15 @@ walprop_register_bgworker(void)
 
 /* shmem handling */
 
-static void
-nwp_prepare_shmem(void)
-{
-#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = walproposer_shmem_request;
-#else
-	RequestAddinShmemSpace(WalproposerShmemSize());
-#endif
-	prev_shmem_startup_hook_type = shmem_startup_hook;
-	shmem_startup_hook = nwp_shmem_startup_hook;
-}
-
-#if PG_VERSION_NUM >= 150000
 /*
  * shmem_request hook: request additional shared resources.  We'll allocate or
- * attach to the shared resources in nwp_shmem_startup_hook().
+ * attach to the shared resources in WalproposerShmemInit().
  */
-static void
-walproposer_shmem_request(void)
+void
+WalproposerShmemRequest(void)
 {
-	if (prev_shmem_request_hook)
-		prev_shmem_request_hook();
-
 	RequestAddinShmemSpace(WalproposerShmemSize());
 }
-#endif
-
-static void
-nwp_shmem_startup_hook(void)
-{
-	if (prev_shmem_startup_hook_type)
-		prev_shmem_startup_hook_type();
-
-	WalproposerShmemInit();
-}
 
 WalproposerShmemState *
 GetWalpropShmemState(void)

From 7fef4435c19c053f89af6f27cbb9750d3c7bbadc Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Thu, 17 Jul 2025 23:32:34 +0300
Subject: [PATCH 152/163] Store stripe_size in shared memory (#12560)

## Problem

See https://databricks.slack.com/archives/C09254R641L/p1752004515032899

stripe_size GUC update may be delayed at different backends and so cause
inconsistency with connection strings (shard map).

## Summary of changes

Postmaster should store stripe_size in shared memory as well as
connection strings.
It should be also enforced that stripe size is defined prior to
connection strings in postgresql.conf

---------

Co-authored-by: Konstantin Knizhnik <knizhnik@neon.tech>
Co-authored-by: Kosntantin Knizhnik <konstantin.knizhnik@databricks.com>
---
 compute_tools/src/config.rs |  7 ++++---
 pgxn/neon/libpagestore.c    | 20 +++++++++++++++++---
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs
index f6487d33b3..dd46353343 100644
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -56,14 +56,15 @@ pub fn write_postgres_conf(
         writeln!(file, "{conf}")?;
     }
 
+    // Stripe size GUC should be defined prior to connection string
+    if let Some(stripe_size) = spec.shard_stripe_size {
+        writeln!(file, "neon.stripe_size={stripe_size}")?;
+    }
     // Add options for connecting to storage
     writeln!(file, "# Neon storage settings")?;
     if let Some(s) = &spec.pageserver_connstring {
         writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
     }
-    if let Some(stripe_size) = spec.shard_stripe_size {
-        writeln!(file, "neon.stripe_size={stripe_size}")?;
-    }
     if !spec.safekeeper_connstrings.is_empty() {
         let mut neon_safekeepers_value = String::new();
         tracing::info!(
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 596258007a..acb8092990 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -90,6 +90,7 @@ typedef struct
 {
 	char		connstring[MAX_SHARDS][MAX_PAGESERVER_CONNSTRING_SIZE];
 	size_t		num_shards;
+	size_t		stripe_size;
 } ShardMap;
 
 /*
@@ -110,6 +111,11 @@ typedef struct
  * has changed since last access, and to detect and retry copying the value if
  * the postmaster changes the value concurrently. (Postmaster doesn't have a
  * PGPROC entry and therefore cannot use LWLocks.)
+ *
+ * stripe_size is now also part of ShardMap, although it is defined by separate GUC.
+ * Postgres doesn't provide any mechanism to enforce dependencies between GUCs,
+ * that it we we have to rely on order of GUC definition in config file.
+ * "neon.stripe_size" should be defined prior to "neon.pageserver_connstring"
  */
 typedef struct
 {
@@ -230,7 +236,10 @@ ParseShardMap(const char *connstr, ShardMap *result)
 		p = sep + 1;
 	}
 	if (result)
+	{
 		result->num_shards = nshards;
+		result->stripe_size = stripe_size;
+	}
 
 	return true;
 }
@@ -291,12 +300,13 @@ AssignPageserverConnstring(const char *newval, void *extra)
  * last call, terminates all existing connections to all pageservers.
  */
 static void
-load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
+load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p, size_t* stripe_size_p)
 {
 	uint64		begin_update_counter;
 	uint64		end_update_counter;
 	ShardMap   *shard_map = &pagestore_shared->shard_map;
 	shardno_t	num_shards;
+	size_t		stripe_size;
 
 	/*
 	 * Postmaster can update the shared memory values concurrently, in which
@@ -311,6 +321,7 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 		end_update_counter = pg_atomic_read_u64(&pagestore_shared->end_update_counter);
 
 		num_shards = shard_map->num_shards;
+		stripe_size = shard_map->stripe_size;
 		if (connstr_p && shard_no < MAX_SHARDS)
 			strlcpy(connstr_p, shard_map->connstring[shard_no], MAX_PAGESERVER_CONNSTRING_SIZE);
 		pg_memory_barrier();
@@ -345,6 +356,8 @@ load_shard_map(shardno_t shard_no, char *connstr_p, shardno_t *num_shards_p)
 
 	if (num_shards_p)
 		*num_shards_p = num_shards;
+	if (stripe_size_p)
+		*stripe_size_p = stripe_size;
 }
 
 #define MB (1024*1024)
@@ -353,9 +366,10 @@ shardno_t
 get_shard_number(BufferTag *tag)
 {
 	shardno_t	n_shards;
+	size_t		stripe_size;
 	uint32		hash;
 
-	load_shard_map(0, NULL, &n_shards);
+	load_shard_map(0, NULL, &n_shards, &stripe_size);
 
 #if PG_MAJORVERSION_NUM < 16
 	hash = murmurhash32(tag->rnode.relNode);
@@ -408,7 +422,7 @@ pageserver_connect(shardno_t shard_no, int elevel)
 	 * Note that connstr is used both during connection start, and when we
 	 * log the successful connection.
 	 */
-	load_shard_map(shard_no, connstr, NULL);
+	load_shard_map(shard_no, connstr, NULL, NULL);
 
 	switch (shard->state)
 	{

From 62c0152e6bbb00b6fdd1061516317383a2e0ad82 Mon Sep 17 00:00:00 2001
From: Vlad Lazar <vlad@neon.tech>
Date: Thu, 17 Jul 2025 22:03:55 +0100
Subject: [PATCH 153/163] pageserver: shut down compute connections at libpq
 level (#12642)

## Problem

Previously, if a get page failure was cause by timeline shutdown, the
pageserver would attempt to tear down the connection gracefully:
`shutdown(SHUT_WR)` followed by `close()`.

This triggers a code path on the compute where it has to tell apart
between an idle connection and a closed one. That code is bug prone, so
we can just side-step the issue by shutting down the connection via a
libpq error message.

This surfaced as instability in test_shard_resolve_during_split_abort.
It's a new test, but the issue existed for ages.

## Summary of Changes

Send a libpq error message instead of doing graceful TCP connection
shutdown.

Closes LKB-648
---
 libs/postgres_backend/src/lib.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
index 851d824291..20afa8bb46 100644
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -749,7 +749,18 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                 trace!("got query {query_string:?}");
                 if let Err(e) = handler.process_query(self, query_string).await {
                     match e {
-                        QueryError::Shutdown => return Ok(ProcessMsgResult::Break),
+                        err @ QueryError::Shutdown => {
+                            // Notify postgres of the connection shutdown at the libpq
+                            // protocol level. This avoids postgres having to tell apart
+                            // from an idle connection and a stale one, which is bug prone.
+                            let shutdown_error = short_error(&err);
+                            self.write_message_noflush(&BeMessage::ErrorResponse(
+                                &shutdown_error,
+                                Some(err.pg_error_code()),
+                            ))?;
+
+                            return Ok(ProcessMsgResult::Break);
+                        }
                         QueryError::SimulatedConnectionError => {
                             return Err(QueryError::SimulatedConnectionError);
                         }

From 53a05e8ccbb8b17a5eec07d96c0a1182cf717ffd Mon Sep 17 00:00:00 2001
From: Alexey Kondratov <kondratov.aleksey@gmail.com>
Date: Thu, 17 Jul 2025 23:43:43 +0200
Subject: [PATCH 154/163] fix(compute_ctl): Only offload LFC state if no
 prewarming is in progress (#12645)

## Problem

We currently offload LFC state unconditionally, which can cause
problems. Imagine a situation:
1. Endpoint started with `autoprewarm: true`.
2. While prewarming is not completed, we upload the new incomplete
state.
3. Compute gets interrupted and restarts.
4. We start again and try to prewarm with the state from 2. instead of
the previous complete state.

During the orchestrated prewarming, it's probably not a big issue, but
it's still better to do not interfere with the prewarm process.

## Summary of changes

Do not offload LFC state if we are currently prewarming or any issue
occurred. While on it, also introduce `Skipped` LFC prewarm status,
which is used when the corresponding LFC state is not present in the
endpoint storage. It's primarily needed to distinguish the first compute
start for particular endpoint, as it's completely valid to do not have
LFC state yet.
---
 compute_tools/src/compute.rs             | 21 +++++++-
 compute_tools/src/compute_prewarm.rs     | 61 +++++++++++++++++-------
 compute_tools/src/http/openapi_spec.yaml | 10 ++--
 libs/compute_api/src/responses.rs        | 24 ++++++++--
 4 files changed, 88 insertions(+), 28 deletions(-)

diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs
index 941a21806f..3ae946c10e 100644
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -2450,14 +2450,31 @@ LIMIT 100",
     pub fn spawn_lfc_offload_task(self: &Arc<Self>, interval: Duration) {
         self.terminate_lfc_offload_task();
         let secs = interval.as_secs();
-        info!("spawning lfc offload worker with {secs}s interval");
         let this = self.clone();
+
+        info!("spawning LFC offload worker with {secs}s interval");
         let handle = spawn(async move {
             let mut interval = time::interval(interval);
             interval.tick().await; // returns immediately
             loop {
                 interval.tick().await;
-                this.offload_lfc_async().await;
+
+                let prewarm_state = this.state.lock().unwrap().lfc_prewarm_state.clone();
+                // Do not offload LFC state if we are currently prewarming or any issue occurred.
+                // If we'd do that, we might override the LFC state in endpoint storage with some
+                // incomplete state. Imagine a situation:
+                // 1. Endpoint started with `autoprewarm: true`
+                // 2. While prewarming is not completed, we upload the new incomplete state
+                // 3. Compute gets interrupted and restarts
+                // 4. We start again and try to prewarm with the state from 2. instead of the previous complete state
+                if matches!(
+                    prewarm_state,
+                    LfcPrewarmState::Completed
+                        | LfcPrewarmState::NotPrewarmed
+                        | LfcPrewarmState::Skipped
+                ) {
+                    this.offload_lfc_async().await;
+                }
             }
         });
         *self.lfc_offload_task.lock().unwrap() = Some(handle);
diff --git a/compute_tools/src/compute_prewarm.rs b/compute_tools/src/compute_prewarm.rs
index d014a5bb72..07b4a596cc 100644
--- a/compute_tools/src/compute_prewarm.rs
+++ b/compute_tools/src/compute_prewarm.rs
@@ -89,7 +89,7 @@ impl ComputeNode {
         self.state.lock().unwrap().lfc_offload_state.clone()
     }
 
-    /// If there is a prewarm request ongoing, return false, true otherwise
+    /// If there is a prewarm request ongoing, return `false`, `true` otherwise.
     pub fn prewarm_lfc(self: &Arc<Self>, from_endpoint: Option<String>) -> bool {
         {
             let state = &mut self.state.lock().unwrap().lfc_prewarm_state;
@@ -101,15 +101,25 @@ impl ComputeNode {
 
         let cloned = self.clone();
         spawn(async move {
-            let Err(err) = cloned.prewarm_impl(from_endpoint).await else {
-                cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Completed;
-                return;
-            };
-            crate::metrics::LFC_PREWARM_ERRORS.inc();
-            error!(%err, "prewarming lfc");
-            cloned.state.lock().unwrap().lfc_prewarm_state = LfcPrewarmState::Failed {
-                error: err.to_string(),
+            let state = match cloned.prewarm_impl(from_endpoint).await {
+                Ok(true) => LfcPrewarmState::Completed,
+                Ok(false) => {
+                    info!(
+                        "skipping LFC prewarm because LFC state is not found in endpoint storage"
+                    );
+                    LfcPrewarmState::Skipped
+                }
+                Err(err) => {
+                    crate::metrics::LFC_PREWARM_ERRORS.inc();
+                    error!(%err, "could not prewarm LFC");
+
+                    LfcPrewarmState::Failed {
+                        error: err.to_string(),
+                    }
+                }
             };
+
+            cloned.state.lock().unwrap().lfc_prewarm_state = state;
         });
         true
     }
@@ -120,15 +130,21 @@ impl ComputeNode {
         EndpointStoragePair::from_spec_and_endpoint(state.pspec.as_ref().unwrap(), from_endpoint)
     }
 
-    async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<()> {
+    /// Request LFC state from endpoint storage and load corresponding pages into Postgres.
+    /// Returns a result with `false` if the LFC state is not found in endpoint storage.
+    async fn prewarm_impl(&self, from_endpoint: Option<String>) -> Result<bool> {
         let EndpointStoragePair { url, token } = self.endpoint_storage_pair(from_endpoint)?;
-        info!(%url, "requesting LFC state from endpoint storage");
 
+        info!(%url, "requesting LFC state from endpoint storage");
         let request = Client::new().get(&url).bearer_auth(token);
         let res = request.send().await.context("querying endpoint storage")?;
         let status = res.status();
-        if status != StatusCode::OK {
-            bail!("{status} querying endpoint storage")
+        match status {
+            StatusCode::OK => (),
+            StatusCode::NOT_FOUND => {
+                return Ok(false);
+            }
+            _ => bail!("{status} querying endpoint storage"),
         }
 
         let mut uncompressed = Vec::new();
@@ -141,7 +157,8 @@ impl ComputeNode {
             .await
             .context("decoding LFC state")?;
         let uncompressed_len = uncompressed.len();
-        info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into postgres");
+
+        info!(%url, "downloaded LFC state, uncompressed size {uncompressed_len}, loading into Postgres");
 
         ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
             .await
@@ -149,7 +166,9 @@ impl ComputeNode {
             .query_one("select neon.prewarm_local_cache($1)", &[&uncompressed])
             .await
             .context("loading LFC state into postgres")
-            .map(|_| ())
+            .map(|_| ())?;
+
+        Ok(true)
     }
 
     /// If offload request is ongoing, return false, true otherwise
@@ -177,12 +196,14 @@ impl ComputeNode {
 
     async fn offload_lfc_with_state_update(&self) {
         crate::metrics::LFC_OFFLOADS.inc();
+
         let Err(err) = self.offload_lfc_impl().await else {
             self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Completed;
             return;
         };
+
         crate::metrics::LFC_OFFLOAD_ERRORS.inc();
-        error!(%err, "offloading lfc");
+        error!(%err, "could not offload LFC state to endpoint storage");
         self.state.lock().unwrap().lfc_offload_state = LfcOffloadState::Failed {
             error: err.to_string(),
         };
@@ -190,7 +211,7 @@ impl ComputeNode {
 
     async fn offload_lfc_impl(&self) -> Result<()> {
         let EndpointStoragePair { url, token } = self.endpoint_storage_pair(None)?;
-        info!(%url, "requesting LFC state from postgres");
+        info!(%url, "requesting LFC state from Postgres");
 
         let mut compressed = Vec::new();
         ComputeNode::get_maintenance_client(&self.tokio_conn_conf)
@@ -205,13 +226,17 @@ impl ComputeNode {
             .read_to_end(&mut compressed)
             .await
             .context("compressing LFC state")?;
+
         let compressed_len = compressed.len();
         info!(%url, "downloaded LFC state, compressed size {compressed_len}, writing to endpoint storage");
 
         let request = Client::new().put(url).bearer_auth(token).body(compressed);
         match request.send().await {
             Ok(res) if res.status() == StatusCode::OK => Ok(()),
-            Ok(res) => bail!("Error writing to endpoint storage: {}", res.status()),
+            Ok(res) => bail!(
+                "Request to endpoint storage failed with status: {}",
+                res.status()
+            ),
             Err(err) => Err(err).context("writing to endpoint storage"),
         }
     }
diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml
index 93a357e160..3cf5ea7c51 100644
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -613,11 +613,11 @@ components:
         - skipped
       properties:
         status:
-          description: Lfc prewarm status
-          enum: [not_prewarmed, prewarming, completed, failed]
+          description: LFC prewarm status
+          enum: [not_prewarmed, prewarming, completed, failed, skipped]
           type: string
         error:
-          description: Lfc prewarm error, if any
+          description: LFC prewarm error, if any
           type: string
         total:
           description: Total pages processed
@@ -635,11 +635,11 @@ components:
         - status
       properties:
         status:
-          description: Lfc offload status
+          description: LFC offload status
           enum: [not_offloaded, offloading, completed, failed]
           type: string
         error:
-          description: Lfc offload error, if any
+          description: LFC offload error, if any
           type: string
 
     PromoteState:
diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs
index 2fe233214a..5b8fc49750 100644
--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -46,16 +46,33 @@ pub struct ExtensionInstallResponse {
     pub version: ExtVersion,
 }
 
+/// Status of the LFC prewarm process. The same state machine is reused for
+/// both autoprewarm (prewarm after compute/Postgres start using the previously
+/// stored LFC state) and explicit prewarming via API.
 #[derive(Serialize, Default, Debug, Clone, PartialEq)]
 #[serde(tag = "status", rename_all = "snake_case")]
 pub enum LfcPrewarmState {
+    /// Default value when compute boots up.
     #[default]
     NotPrewarmed,
+    /// Prewarming thread is active and loading pages into LFC.
     Prewarming,
+    /// We found requested LFC state in the endpoint storage and
+    /// completed prewarming successfully.
     Completed,
-    Failed {
-        error: String,
-    },
+    /// Unexpected error happened during prewarming. Note, `Not Found 404`
+    /// response from the endpoint storage is explicitly excluded here
+    /// because it can normally happen on the first compute start,
+    /// since LFC state is not available yet.
+    Failed { error: String },
+    /// We tried to fetch the corresponding LFC state from the endpoint storage,
+    /// but received `Not Found 404`. This should normally happen only during the
+    /// first endpoint start after creation with `autoprewarm: true`.
+    ///
+    /// During the orchestrated prewarm via API, when a caller explicitly
+    /// provides the LFC state key to prewarm from, it's the caller responsibility
+    /// to handle this status as an error state in this case.
+    Skipped,
 }
 
 impl Display for LfcPrewarmState {
@@ -64,6 +81,7 @@ impl Display for LfcPrewarmState {
             LfcPrewarmState::NotPrewarmed => f.write_str("NotPrewarmed"),
             LfcPrewarmState::Prewarming => f.write_str("Prewarming"),
             LfcPrewarmState::Completed => f.write_str("Completed"),
+            LfcPrewarmState::Skipped => f.write_str("Skipped"),
             LfcPrewarmState::Failed { error } => write!(f, "Error({error})"),
         }
     }

From 64d0008389849f11c31b6253ea00e86c224caaaf Mon Sep 17 00:00:00 2001
From: Folke Behrens <folke@neon.tech>
Date: Thu, 17 Jul 2025 23:52:20 +0200
Subject: [PATCH 155/163] proxy: Shorten the initial TTL of cancel keys
 (#12647)

## Problem

A high rate of short-lived connections means that there a lot of cancel
keys in Redis with TTL=10min that could be avoided by having a much
shorter initial TTL.

## Summary of changes

* Introduce an initial TTL of 1min used with the SET command.
* Fix: don't delay repushing cancel data when expired.
* Prepare for exponentially increasing TTLs.

## Alternatives

A best-effort UNLINK command on connection termination would clean up
cancel keys right away. This needs a bigger refactor due to how batching
is handled.
---
 proxy/src/cancellation.rs | 82 ++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 31 deletions(-)

diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs
index 77062d3bb4..f25121331f 100644
--- a/proxy/src/cancellation.rs
+++ b/proxy/src/cancellation.rs
@@ -32,8 +32,11 @@ use crate::util::run_until;
 
 type IpSubnetKey = IpNet;
 
-const CANCEL_KEY_TTL: Duration = Duration::from_secs(600);
-const CANCEL_KEY_REFRESH: Duration = Duration::from_secs(570);
+/// Initial period and TTL is shorter to clear keys of short-lived connections faster.
+const CANCEL_KEY_INITIAL_PERIOD: Duration = Duration::from_secs(60);
+const CANCEL_KEY_REFRESH_PERIOD: Duration = Duration::from_secs(10 * 60);
+/// `CANCEL_KEY_TTL_SLACK` is added to the periods to determine the actual TTL.
+const CANCEL_KEY_TTL_SLACK: Duration = Duration::from_secs(30);
 
 // Message types for sending through mpsc channel
 pub enum CancelKeyOp {
@@ -54,6 +57,24 @@ pub enum CancelKeyOp {
     },
 }
 
+impl CancelKeyOp {
+    const fn redis_msg_kind(&self) -> RedisMsgKind {
+        match self {
+            CancelKeyOp::Store { .. } => RedisMsgKind::Set,
+            CancelKeyOp::Refresh { .. } => RedisMsgKind::Expire,
+            CancelKeyOp::Get { .. } => RedisMsgKind::Get,
+            CancelKeyOp::GetOld { .. } => RedisMsgKind::HGet,
+        }
+    }
+
+    fn cancel_channel_metric_guard(&self) -> CancelChannelSizeGuard<'static> {
+        Metrics::get()
+            .proxy
+            .cancel_channel_size
+            .guard(self.redis_msg_kind())
+    }
+}
+
 #[derive(thiserror::Error, Debug, Clone)]
 pub enum PipelineError {
     #[error("could not send cmd to redis: {0}")]
@@ -483,50 +504,49 @@ impl Session {
         let mut cancel = pin!(cancel);
 
         enum State {
-            Set,
+            Init,
             Refresh,
         }
-        let mut state = State::Set;
 
+        let mut state = State::Init;
         loop {
-            let guard_op = match state {
-                State::Set => {
-                    let guard = Metrics::get()
-                        .proxy
-                        .cancel_channel_size
-                        .guard(RedisMsgKind::Set);
-                    let op = CancelKeyOp::Store {
-                        key: self.key,
-                        value: closure_json.clone(),
-                        expire: CANCEL_KEY_TTL,
-                    };
+            let (op, mut wait_interval) = match state {
+                State::Init => {
                     tracing::debug!(
                         src=%self.key,
                         dest=?cancel_closure.cancel_token,
                         "registering cancellation key"
                     );
-                    (guard, op)
+                    (
+                        CancelKeyOp::Store {
+                            key: self.key,
+                            value: closure_json.clone(),
+                            expire: CANCEL_KEY_INITIAL_PERIOD + CANCEL_KEY_TTL_SLACK,
+                        },
+                        CANCEL_KEY_INITIAL_PERIOD,
+                    )
                 }
 
                 State::Refresh => {
-                    let guard = Metrics::get()
-                        .proxy
-                        .cancel_channel_size
-                        .guard(RedisMsgKind::Expire);
-                    let op = CancelKeyOp::Refresh {
-                        key: self.key,
-                        expire: CANCEL_KEY_TTL,
-                    };
                     tracing::debug!(
                         src=%self.key,
                         dest=?cancel_closure.cancel_token,
                         "refreshing cancellation key"
                     );
-                    (guard, op)
+                    (
+                        CancelKeyOp::Refresh {
+                            key: self.key,
+                            expire: CANCEL_KEY_REFRESH_PERIOD + CANCEL_KEY_TTL_SLACK,
+                        },
+                        CANCEL_KEY_REFRESH_PERIOD,
+                    )
                 }
             };
 
-            match tx.call(guard_op, cancel.as_mut()).await {
+            match tx
+                .call((op.cancel_channel_metric_guard(), op), cancel.as_mut())
+                .await
+            {
                 // SET returns OK
                 Ok(Value::Okay) => {
                     tracing::debug!(
@@ -549,23 +569,23 @@ impl Session {
                 Ok(_) => {
                     // Any other response likely means the key expired.
                     tracing::warn!(src=%self.key, "refreshing cancellation key failed");
-                    // Re-enter the SET loop to repush full data.
-                    state = State::Set;
+                    // Re-enter the SET loop quickly to repush full data.
+                    state = State::Init;
+                    wait_interval = Duration::ZERO;
                 }
 
                 // retry immediately.
                 Err(BatchQueueError::Result(error)) => {
                     tracing::warn!(?error, "error refreshing cancellation key");
                     // Small delay to prevent busy loop with high cpu and logging.
-                    tokio::time::sleep(Duration::from_millis(10)).await;
-                    continue;
+                    wait_interval = Duration::from_millis(10);
                 }
 
                 Err(BatchQueueError::Cancelled(Err(_cancelled))) => break,
             }
 
             // wait before continuing. break immediately if cancelled.
-            if run_until(tokio::time::sleep(CANCEL_KEY_REFRESH), cancel.as_mut())
+            if run_until(tokio::time::sleep(wait_interval), cancel.as_mut())
                 .await
                 .is_err()
             {

From 6a353c33e3fe074f2083b315646cc6602a05350a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Arpad=20M=C3=BCller?= <arpad-m@users.noreply.github.com>
Date: Fri, 18 Jul 2025 00:13:21 +0200
Subject: [PATCH 156/163] print more timestamps in find_lsn_for_timestamp
 (#12641)

Observability of `find_lsn_for_timestamp` is lacking, as well as how and
when we update gc space and time cutoffs. Log them.
---
 pageserver/src/pgdatadir_mapping.rs | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 08828ec4eb..cda08f2cc4 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -813,6 +813,7 @@ impl Timeline {
         let gc_cutoff_lsn_guard = self.get_applied_gc_cutoff_lsn();
         let gc_cutoff_planned = {
             let gc_info = self.gc_info.read().unwrap();
+            info!(cutoffs=?gc_info.cutoffs, applied_cutoff=%*gc_cutoff_lsn_guard, "starting find_lsn_for_timestamp");
             gc_info.min_cutoff()
         };
         // Usually the planned cutoff is newer than the cutoff of the last gc run,

From 8f627ea0abbe8079052061bd25f9cec321a775bd Mon Sep 17 00:00:00 2001
From: HaoyuHuang <haoyu.huang.68@gmail.com>
Date: Thu, 17 Jul 2025 16:17:01 -0700
Subject: [PATCH 157/163] A few more SC changes (#12649)

## Problem

## Summary of changes
---
 pageserver/client/src/mgmt_api.rs             | 16 +++++++
 .../down.sql                                  |  2 +
 .../up.sql                                    | 17 +++++++
 storage_controller/src/hadron_utils.rs        | 44 +++++++++++++++++
 storage_controller/src/lib.rs                 |  1 +
 storage_controller/src/pageserver_client.rs   | 48 +++++++++++++++++++
 storage_controller/src/reconciler.rs          | 13 +++--
 storage_controller/src/schema.rs              | 20 ++++++++
 storage_controller/src/tenant_shard.rs        |  8 +++-
 9 files changed, 163 insertions(+), 6 deletions(-)
 create mode 100644 storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql
 create mode 100644 storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql
 create mode 100644 storage_controller/src/hadron_utils.rs

diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs
index fe1ddc2e7d..3867e536f4 100644
--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -873,6 +873,22 @@ impl Client {
             .map_err(Error::ReceiveBody)
     }
 
+    pub async fn reset_alert_gauges(&self) -> Result<()> {
+        let uri = format!(
+            "{}/hadron-internal/reset_alert_gauges",
+            self.mgmt_api_endpoint
+        );
+        self.start_request(Method::POST, uri)
+            .send()
+            .await
+            .map_err(Error::SendRequest)?
+            .error_from_body()
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
     pub async fn wait_lsn(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql
new file mode 100644
index 0000000000..b45b45e438
--- /dev/null
+++ b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/down.sql
@@ -0,0 +1,2 @@
+DROP TABLE hadron_safekeepers;
+DROP TABLE hadron_timeline_safekeepers;
diff --git a/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql
new file mode 100644
index 0000000000..6cee981efc
--- /dev/null
+++ b/storage_controller/migrations/2025-07-17-000001_hadron_safekeepers/up.sql
@@ -0,0 +1,17 @@
+-- hadron_safekeepers keep track of all Safe Keeper nodes that exist in the system.
+-- Upon startup, each Safe Keeper reaches out to the hadron cluster coordinator to register its node ID and listen addresses.
+
+CREATE TABLE hadron_safekeepers (
+  sk_node_id BIGINT PRIMARY KEY NOT NULL,
+  listen_http_addr VARCHAR NOT NULL,
+  listen_http_port INTEGER NOT NULL,
+  listen_pg_addr VARCHAR NOT NULL,
+  listen_pg_port INTEGER NOT NULL
+);
+
+CREATE TABLE hadron_timeline_safekeepers (
+  timeline_id VARCHAR NOT NULL,
+  sk_node_id BIGINT NOT NULL,
+  legacy_endpoint_id UUID DEFAULT NULL,
+  PRIMARY KEY(timeline_id, sk_node_id)
+);
diff --git a/storage_controller/src/hadron_utils.rs b/storage_controller/src/hadron_utils.rs
new file mode 100644
index 0000000000..871e21c367
--- /dev/null
+++ b/storage_controller/src/hadron_utils.rs
@@ -0,0 +1,44 @@
+use std::collections::BTreeMap;
+
+use rand::Rng;
+use utils::shard::TenantShardId;
+
+static CHARSET: &[u8] = b"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!@#$%^&*()";
+
+/// Generate a random string of `length` that can be used as a password. The generated string
+/// contains alphanumeric characters and special characters (!@#$%^&*())
+pub fn generate_random_password(length: usize) -> String {
+    let mut rng = rand::thread_rng();
+    (0..length)
+        .map(|_| {
+            let idx = rng.gen_range(0..CHARSET.len());
+            CHARSET[idx] as char
+        })
+        .collect()
+}
+
+pub(crate) struct TenantShardSizeMap {
+    #[expect(dead_code)]
+    pub map: BTreeMap<TenantShardId, u64>,
+}
+
+impl TenantShardSizeMap {
+    pub fn new(map: BTreeMap<TenantShardId, u64>) -> Self {
+        Self { map }
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn test_generate_random_password() {
+        let pwd1 = generate_random_password(10);
+        assert_eq!(pwd1.len(), 10);
+        let pwd2 = generate_random_password(10);
+        assert_ne!(pwd1, pwd2);
+        assert!(pwd1.chars().all(|c| CHARSET.contains(&(c as u8))));
+        assert!(pwd2.chars().all(|c| CHARSET.contains(&(c as u8))));
+    }
+}
diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs
index 36e3c5dc6c..24b06da83a 100644
--- a/storage_controller/src/lib.rs
+++ b/storage_controller/src/lib.rs
@@ -6,6 +6,7 @@ extern crate hyper0 as hyper;
 mod auth;
 mod background_node_operations;
 mod compute_hook;
+pub mod hadron_utils;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
diff --git a/storage_controller/src/pageserver_client.rs b/storage_controller/src/pageserver_client.rs
index da0687895a..9e829e252d 100644
--- a/storage_controller/src/pageserver_client.rs
+++ b/storage_controller/src/pageserver_client.rs
@@ -14,6 +14,8 @@ use reqwest::StatusCode;
 use utils::id::{NodeId, TenantId, TimelineId};
 use utils::lsn::Lsn;
 
+use crate::hadron_utils::TenantShardSizeMap;
+
 /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
 #[derive(Debug, Clone)]
@@ -86,6 +88,31 @@ impl PageserverClient {
         )
     }
 
+    #[expect(dead_code)]
+    pub(crate) async fn tenant_timeline_compact(
+        &self,
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        force_image_layer_creation: bool,
+        wait_until_done: bool,
+    ) -> Result<()> {
+        measured_request!(
+            "tenant_timeline_compact",
+            crate::metrics::Method::Put,
+            &self.node_id_label,
+            self.inner
+                .tenant_timeline_compact(
+                    tenant_shard_id,
+                    timeline_id,
+                    force_image_layer_creation,
+                    true,
+                    false,
+                    wait_until_done,
+                )
+                .await
+        )
+    }
+
     /* BEGIN_HADRON */
     pub(crate) async fn tenant_timeline_describe(
         &self,
@@ -101,6 +128,17 @@ impl PageserverClient {
                 .await
         )
     }
+
+    #[expect(dead_code)]
+    pub(crate) async fn list_tenant_visible_size(&self) -> Result<TenantShardSizeMap> {
+        measured_request!(
+            "list_tenant_visible_size",
+            crate::metrics::Method::Get,
+            &self.node_id_label,
+            self.inner.list_tenant_visible_size().await
+        )
+        .map(TenantShardSizeMap::new)
+    }
     /* END_HADRON */
 
     pub(crate) async fn tenant_scan_remote_storage(
@@ -365,6 +403,16 @@ impl PageserverClient {
         )
     }
 
+    #[expect(dead_code)]
+    pub(crate) async fn reset_alert_gauges(&self) -> Result<()> {
+        measured_request!(
+            "reset_alert_gauges",
+            crate::metrics::Method::Post,
+            &self.node_id_label,
+            self.inner.reset_alert_gauges().await
+        )
+    }
+
     pub(crate) async fn wait_lsn(
         &self,
         tenant_shard_id: TenantShardId,
diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs
index a2fba0fa56..d1590ec75e 100644
--- a/storage_controller/src/reconciler.rs
+++ b/storage_controller/src/reconciler.rs
@@ -862,11 +862,11 @@ impl Reconciler {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     if refreshed {
                         tracing::info!(
-                            node_id=%node.get_id(), "Observed configuration correct after refresh. Notifying compute.");
+                            node_id=%node.get_id(), "[Attached] Observed configuration correct after refresh. Notifying compute.");
                         self.compute_notify().await?;
                     } else {
                         // Nothing to do
-                        tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.");
+                        tracing::info!(node_id=%node.get_id(), "[Attached] Observed configuration already correct.");
                     }
                 }
                 observed => {
@@ -945,17 +945,17 @@ impl Reconciler {
             match self.observed.locations.get(&node.get_id()) {
                 Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                     // Nothing to do
-                    tracing::info!(node_id=%node.get_id(), "Observed configuration already correct.")
+                    tracing::info!(node_id=%node.get_id(), "[Secondary] Observed configuration already correct.")
                 }
                 _ => {
                     // Only try and configure secondary locations on nodes that are available.  This
                     // allows the reconciler to "succeed" while some secondaries are offline (e.g. after
                     // a node failure, where the failed node will have a secondary intent)
                     if node.is_available() {
-                        tracing::info!(node_id=%node.get_id(), "Observed configuration requires update.");
+                        tracing::info!(node_id=%node.get_id(), "[Secondary] Observed configuration requires update.");
                         changes.push((node.clone(), wanted_conf))
                     } else {
-                        tracing::info!(node_id=%node.get_id(), "Skipping configuration as secondary, node is unavailable");
+                        tracing::info!(node_id=%node.get_id(), "[Secondary] Skipping configuration as secondary, node is unavailable");
                         self.observed
                             .locations
                             .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -1066,6 +1066,9 @@ impl Reconciler {
             }
             result
         } else {
+            tracing::info!(
+                "Compute notification is skipped because the tenant shard does not have an attached (primary) location"
+            );
             Ok(())
         }
     }
diff --git a/storage_controller/src/schema.rs b/storage_controller/src/schema.rs
index 312f7e0b0e..f3dcdaf798 100644
--- a/storage_controller/src/schema.rs
+++ b/storage_controller/src/schema.rs
@@ -13,6 +13,24 @@ diesel::table! {
     }
 }
 
+diesel::table! {
+    hadron_safekeepers (sk_node_id) {
+        sk_node_id -> Int8,
+        listen_http_addr -> Varchar,
+        listen_http_port -> Int4,
+        listen_pg_addr -> Varchar,
+        listen_pg_port -> Int4,
+    }
+}
+
+diesel::table! {
+    hadron_timeline_safekeepers (timeline_id, sk_node_id) {
+        timeline_id -> Varchar,
+        sk_node_id -> Int8,
+        legacy_endpoint_id -> Nullable<Uuid>,
+    }
+}
+
 diesel::table! {
     metadata_health (tenant_id, shard_number, shard_count) {
         tenant_id -> Varchar,
@@ -105,6 +123,8 @@ diesel::table! {
 
 diesel::allow_tables_to_appear_in_same_query!(
     controllers,
+    hadron_safekeepers,
+    hadron_timeline_safekeepers,
     metadata_health,
     nodes,
     safekeeper_timeline_pending_ops,
diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs
index 05de155963..f60378470e 100644
--- a/storage_controller/src/tenant_shard.rs
+++ b/storage_controller/src/tenant_shard.rs
@@ -1611,7 +1611,13 @@ impl TenantShard {
 
         // Update result counter
         let outcome_label = match &result {
-            Ok(_) => ReconcileOutcome::Success,
+            Ok(_) => {
+                if reconciler.compute_notify_failure {
+                    ReconcileOutcome::SuccessNoNotify
+                } else {
+                    ReconcileOutcome::Success
+                }
+            }
             Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
             Err(_) => ReconcileOutcome::Error,
         };

From f3ef60d236300ed15d72b26215092052ed253895 Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Fri, 18 Jul 2025 00:40:35 -0400
Subject: [PATCH 158/163] fix(storcon): use unified interface to handle 404 lsn
 lease (#12650)

## Problem

Close LKB-270. This is part of our series of efforts to make sure
lsn_lease API prompts clients to retry. Follow up of
https://github.com/neondatabase/neon/pull/12631.

Slack thread w/ Vlad:
https://databricks.slack.com/archives/C09254R641L/p1752677940697529

## Summary of changes

- Use `tenant_remote_mutation` API for LSN leases. Makes it consistent
with new APIs added to storcon.
- For 404, we now always retry because we know the tenant is
to-be-attached and will eventually reach a point that we can find that
tenant on the intent pageserver.
- Using the `tenant_remote_mutation` API also prevents us from the case
where the intent pageserver changes within the lease request. The
wrapper function will error with 503 if such things happen.

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
---
 storage_controller/src/http.rs    |   2 +-
 storage_controller/src/service.rs | 156 +++++++++---------------------
 2 files changed, 48 insertions(+), 110 deletions(-)

diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs
index 6b6d081dcd..ff73719adb 100644
--- a/storage_controller/src/http.rs
+++ b/storage_controller/src/http.rs
@@ -819,7 +819,7 @@ async fn handle_tenant_timeline_passthrough(
                 .map_err(|e| ApiError::InternalServerError(e.into()))?;
             // We only handle "tenant not found" errors; other 404s like timeline not found should
             // be forwarded as-is.
-            if resp_str.contains(&format!("tenant {tenant_or_shard_id}")) {
+            if Service::is_tenant_not_found_error(resp_str, tenant_or_shard_id.tenant_id) {
                 // Rather than retry here, send the client a 503 to prompt a retry: this matches
                 // the pageserver's use of 503, and all clients calling this API should retry on 503.
                 return Err(ApiError::ResourceUnavailable(
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index a1ff9b3c61..71186076ec 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -207,27 +207,6 @@ enum ShardGenerationValidity {
     },
 }
 
-/// We collect the state of attachments for some operations to determine if the operation
-/// needs to be retried when it fails.
-struct TenantShardAttachState {
-    /// The targets of the operation.
-    ///
-    /// Tenant shard ID, node ID, node, is intent node observed primary.
-    targets: Vec<(TenantShardId, NodeId, Node, bool)>,
-
-    /// The targets grouped by node ID.
-    by_node_id: HashMap<NodeId, (TenantShardId, Node, bool)>,
-}
-
-impl TenantShardAttachState {
-    fn for_api_call(&self) -> Vec<(TenantShardId, Node)> {
-        self.targets
-            .iter()
-            .map(|(tenant_shard_id, _, node, _)| (*tenant_shard_id, node.clone()))
-            .collect()
-    }
-}
-
 pub const RECONCILER_CONCURRENCY_DEFAULT: usize = 128;
 pub const PRIORITY_RECONCILER_CONCURRENCY_DEFAULT: usize = 256;
 pub const SAFEKEEPER_RECONCILER_CONCURRENCY_DEFAULT: usize = 32;
@@ -4795,78 +4774,24 @@ impl Service {
         Ok(())
     }
 
-    fn is_observed_consistent_with_intent(
-        &self,
-        shard: &TenantShard,
-        intent_node_id: NodeId,
-    ) -> bool {
-        if let Some(location) = shard.observed.locations.get(&intent_node_id)
-            && let Some(ref conf) = location.conf
-            && (conf.mode == LocationConfigMode::AttachedSingle
-                || conf.mode == LocationConfigMode::AttachedMulti)
-        {
-            true
-        } else {
-            false
-        }
-    }
-
-    fn collect_tenant_shards(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantShardAttachState, ApiError> {
-        let locked = self.inner.read().unwrap();
-        let mut targets = Vec::new();
-        let mut by_node_id = HashMap::new();
-
-        // If the request got an unsharded tenant id, then apply
-        // the operation to all shards. Otherwise, apply it to a specific shard.
-        let shards_range = TenantShardId::tenant_range(tenant_id);
-
-        for (tenant_shard_id, shard) in locked.tenants.range(shards_range) {
-            if let Some(node_id) = shard.intent.get_attached() {
-                let node = locked
-                    .nodes
-                    .get(node_id)
-                    .expect("Pageservers may not be deleted while referenced");
-
-                let consistent = self.is_observed_consistent_with_intent(shard, *node_id);
-
-                targets.push((*tenant_shard_id, *node_id, node.clone(), consistent));
-                by_node_id.insert(*node_id, (*tenant_shard_id, node.clone(), consistent));
-            }
-        }
-
-        if targets.is_empty() {
-            return Err(ApiError::NotFound(
-                anyhow::anyhow!("Tenant {tenant_id} not found").into(),
-            ));
-        }
-
-        Ok(TenantShardAttachState {
-            targets,
-            by_node_id,
-        })
+    pub(crate) fn is_tenant_not_found_error(body: &str, tenant_id: TenantId) -> bool {
+        body.contains(&format!("tenant {tenant_id}"))
     }
 
     fn process_result_and_passthrough_errors<T>(
         &self,
+        tenant_id: TenantId,
         results: Vec<(Node, Result<T, mgmt_api::Error>)>,
-        attach_state: TenantShardAttachState,
     ) -> Result<Vec<(Node, T)>, ApiError> {
         let mut processed_results: Vec<(Node, T)> = Vec::with_capacity(results.len());
-        debug_assert_eq!(results.len(), attach_state.targets.len());
         for (node, res) in results {
-            let is_consistent = attach_state
-                .by_node_id
-                .get(&node.get_id())
-                .map(|(_, _, consistent)| *consistent);
             match res {
                 Ok(res) => processed_results.push((node, res)),
-                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _))
-                    if is_consistent == Some(false) =>
+                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, body))
+                    if Self::is_tenant_not_found_error(&body, tenant_id) =>
                 {
-                    // This is expected if the attach is not finished yet. Return 503 so that the client can retry.
+                    // If there's a tenant not found, we are still in the process of attaching the tenant.
+                    // Return 503 so that the client can retry.
                     return Err(ApiError::ResourceUnavailable(
                         format!(
                             "Timeline is not attached to the pageserver {} yet, please retry",
@@ -4894,35 +4819,48 @@ impl Service {
         )
         .await;
 
-        let attach_state = self.collect_tenant_shards(tenant_id)?;
-
-        let results = self
-            .tenant_for_shards_api(
-                attach_state.for_api_call(),
-                |tenant_shard_id, client| async move {
-                    client
-                        .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn)
-                        .await
-                },
-                1,
-                1,
-                SHORT_RECONCILE_TIMEOUT,
-                &self.cancel,
-            )
-            .await;
-
-        let leases = self.process_result_and_passthrough_errors(results, attach_state)?;
-        let mut valid_until = None;
-        for (_, lease) in leases {
-            if let Some(ref mut valid_until) = valid_until {
-                *valid_until = std::cmp::min(*valid_until, lease.valid_until);
-            } else {
-                valid_until = Some(lease.valid_until);
+        self.tenant_remote_mutation(tenant_id, |locations| async move {
+            if locations.0.is_empty() {
+                return Err(ApiError::NotFound(
+                    anyhow::anyhow!("Tenant not found").into(),
+                ));
             }
-        }
-        Ok(LsnLease {
-            valid_until: valid_until.unwrap_or_else(SystemTime::now),
+
+            let results = self
+                .tenant_for_shards_api(
+                    locations
+                        .0
+                        .iter()
+                        .map(|(tenant_shard_id, ShardMutationLocations { latest, .. })| {
+                            (*tenant_shard_id, latest.node.clone())
+                        })
+                        .collect(),
+                    |tenant_shard_id, client| async move {
+                        client
+                            .timeline_lease_lsn(tenant_shard_id, timeline_id, lsn)
+                            .await
+                    },
+                    1,
+                    1,
+                    SHORT_RECONCILE_TIMEOUT,
+                    &self.cancel,
+                )
+                .await;
+
+            let leases = self.process_result_and_passthrough_errors(tenant_id, results)?;
+            let mut valid_until = None;
+            for (_, lease) in leases {
+                if let Some(ref mut valid_until) = valid_until {
+                    *valid_until = std::cmp::min(*valid_until, lease.valid_until);
+                } else {
+                    valid_until = Some(lease.valid_until);
+                }
+            }
+            Ok(LsnLease {
+                valid_until: valid_until.unwrap_or_else(SystemTime::now),
+            })
         })
+        .await?
     }
 
     pub(crate) async fn tenant_timeline_download_heatmap_layers(

From cb50291dcd68ba7b4240cab78671ead96e191e58 Mon Sep 17 00:00:00 2001
From: Victor Polevoy <fx@thefx.co>
Date: Thu, 10 Jul 2025 11:02:32 +0200
Subject: [PATCH 159/163] Fetches the SLRU segment via the new communicator.

The fetch is done not into a buffer as earlier, but directly into the
file.
---
 libs/neon-shmem/src/hash.rs                   | 31 +++++++++
 libs/neon-shmem/src/hash/core.rs              | 30 +++++++++
 libs/neon-shmem/src/shmem.rs                  |  2 +
 pgxn/neon/communicator/src/file_cache.rs      |  2 +
 .../neon/communicator/src/integrated_cache.rs |  1 +
 pgxn/neon/communicator/src/lib.rs             |  2 +-
 pgxn/neon/communicator/src/neon_request.rs    | 31 ++++++++-
 .../src/worker_process/main_loop.rs           | 38 ++++++++++-
 pgxn/neon/communicator_new.c                  | 66 ++++++++++++++++++-
 pgxn/neon/communicator_new.h                  |  8 ++-
 pgxn/neon/file_cache.c                        | 30 ++++-----
 pgxn/neon/libpagestore.c                      |  8 +--
 pgxn/neon/neon.c                              | 13 ++--
 pgxn/neon/neon.h                              |  1 -
 pgxn/neon/pagestore_smgr.c                    | 60 ++++++++---------
 pgxn/neon/relsize_cache.c                     | 10 ++-
 test_runner/fixtures/neon_fixtures.py         |  4 +-
 test_runner/regress/test_normal_work.py       |  4 +-
 vendor/postgres-v14                           |  2 +-
 vendor/postgres-v15                           |  2 +-
 vendor/postgres-v16                           |  2 +-
 vendor/postgres-v17                           |  2 +-
 vendor/revisions.json                         |  8 +--
 23 files changed, 275 insertions(+), 82 deletions(-)

diff --git a/libs/neon-shmem/src/hash.rs b/libs/neon-shmem/src/hash.rs
index 84c2be3637..6fc7baefcc 100644
--- a/libs/neon-shmem/src/hash.rs
+++ b/libs/neon-shmem/src/hash.rs
@@ -14,6 +14,7 @@
 //! in-place and are at a high level achieved by expanding/reducing the bucket array and rebuilding the
 //! dictionary by rehashing all keys.
 
+use std::fmt::Debug;
 use std::hash::{BuildHasher, Hash};
 use std::mem::MaybeUninit;
 
@@ -41,6 +42,22 @@ pub struct HashMapInit<'a, K, V, S = rustc_hash::FxBuildHasher> {
     num_buckets: u32,
 }
 
+impl<'a, K, V, S> Debug for HashMapInit<'a, K, V, S>
+where
+    K: Debug,
+    V: Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("HashMapInit")
+            .field("shmem_handle", &self.shmem_handle)
+            .field("shared_ptr", &self.shared_ptr)
+            .field("shared_size", &self.shared_size)
+            // .field("hasher", &self.hasher)
+            .field("num_buckets", &self.num_buckets)
+            .finish()
+    }
+}
+
 /// This is a per-process handle to a hash table that (possibly) lives in shared memory.
 /// If a child process is launched with fork(), the child process should
 /// get its own HashMapAccess by calling HashMapInit::attach_writer/reader().
@@ -56,6 +73,20 @@ pub struct HashMapAccess<'a, K, V, S = rustc_hash::FxBuildHasher> {
 unsafe impl<K: Sync, V: Sync, S> Sync for HashMapAccess<'_, K, V, S> {}
 unsafe impl<K: Send, V: Send, S> Send for HashMapAccess<'_, K, V, S> {}
 
+impl<'a, K, V, S> Debug for HashMapAccess<'a, K, V, S>
+where
+    K: Debug,
+    V: Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("HashMapAccess")
+            .field("shmem_handle", &self.shmem_handle)
+            .field("shared_ptr", &self.shared_ptr)
+            // .field("hasher", &self.hasher)
+            .finish()
+    }
+}
+
 impl<'a, K: Clone + Hash + Eq, V, S> HashMapInit<'a, K, V, S> {
     /// Change the 'hasher' used by the hash table.
     ///
diff --git a/libs/neon-shmem/src/hash/core.rs b/libs/neon-shmem/src/hash/core.rs
index 013eb9a09c..67be7672d1 100644
--- a/libs/neon-shmem/src/hash/core.rs
+++ b/libs/neon-shmem/src/hash/core.rs
@@ -1,5 +1,6 @@
 //! Simple hash table with chaining.
 
+use std::fmt::Debug;
 use std::hash::Hash;
 use std::mem::MaybeUninit;
 
@@ -17,6 +18,19 @@ pub(crate) struct Bucket<K, V> {
     pub(crate) inner: Option<(K, V)>,
 }
 
+impl<K, V> Debug for Bucket<K, V>
+where
+    K: Debug,
+    V: Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("Bucket")
+            .field("next", &self.next)
+            .field("inner", &self.inner)
+            .finish()
+    }
+}
+
 /// Core hash table implementation.
 pub(crate) struct CoreHashMap<'a, K, V> {
     /// Dictionary used to map hashes to bucket indices.
@@ -34,6 +48,22 @@ pub(crate) struct CoreHashMap<'a, K, V> {
     pub(crate) _user_list_head: u32,
 }
 
+impl<'a, K, V> Debug for CoreHashMap<'a, K, V>
+where
+    K: Debug,
+    V: Debug,
+{
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("CoreHashMap")
+            .field("dictionary", &self.dictionary)
+            .field("buckets", &self.buckets)
+            .field("free_head", &self.free_head)
+            .field("alloc_limit", &self.alloc_limit)
+            .field("buckets_in_use", &self.buckets_in_use)
+            .finish()
+    }
+}
+
 /// Error for when there are no empty buckets left but one is needed.
 #[derive(Debug, PartialEq)]
 pub struct FullError();
diff --git a/libs/neon-shmem/src/shmem.rs b/libs/neon-shmem/src/shmem.rs
index f19f402859..9c304d6540 100644
--- a/libs/neon-shmem/src/shmem.rs
+++ b/libs/neon-shmem/src/shmem.rs
@@ -21,6 +21,7 @@ use nix::unistd::ftruncate as nix_ftruncate;
 /// the underlying file is resized. Do not access the area beyond the current size. Currently, that
 /// will cause the file to be expanded, but we might use `mprotect()` etc. to enforce that in the
 /// future.
+#[derive(Debug)]
 pub struct ShmemHandle {
     /// memfd file descriptor
     fd: OwnedFd,
@@ -35,6 +36,7 @@ pub struct ShmemHandle {
 }
 
 /// This is stored at the beginning in the shared memory area.
+#[derive(Debug)]
 struct SharedStruct {
     max_size: usize,
 
diff --git a/pgxn/neon/communicator/src/file_cache.rs b/pgxn/neon/communicator/src/file_cache.rs
index 1f60c97f2c..f153174c6b 100644
--- a/pgxn/neon/communicator/src/file_cache.rs
+++ b/pgxn/neon/communicator/src/file_cache.rs
@@ -22,6 +22,7 @@ pub type CacheBlock = u64;
 
 pub const INVALID_CACHE_BLOCK: CacheBlock = u64::MAX;
 
+#[derive(Debug)]
 pub struct FileCache {
     file: Arc<File>,
 
@@ -35,6 +36,7 @@ pub struct FileCache {
 // TODO: We keep track of all free blocks in this vec. That doesn't really scale.
 // Idea: when free_blocks fills up with more than 1024 entries, write them all to
 // one block on disk.
+#[derive(Debug)]
 struct FreeList {
     next_free_block: CacheBlock,
     max_blocks: u64,
diff --git a/pgxn/neon/communicator/src/integrated_cache.rs b/pgxn/neon/communicator/src/integrated_cache.rs
index e00e49bf3d..a69af44492 100644
--- a/pgxn/neon/communicator/src/integrated_cache.rs
+++ b/pgxn/neon/communicator/src/integrated_cache.rs
@@ -46,6 +46,7 @@ pub struct IntegratedCacheInitStruct<'t> {
 }
 
 /// Represents write-access to the integrated cache. This is used by the communicator process.
+#[derive(Debug)]
 pub struct IntegratedCacheWriteAccess<'t> {
     relsize_cache: neon_shmem::hash::HashMapAccess<'t, RelKey, RelEntry>,
     block_map: neon_shmem::hash::HashMapAccess<'t, BlockKey, BlockEntry>,
diff --git a/pgxn/neon/communicator/src/lib.rs b/pgxn/neon/communicator/src/lib.rs
index 734e89a89a..d0c5b758da 100644
--- a/pgxn/neon/communicator/src/lib.rs
+++ b/pgxn/neon/communicator/src/lib.rs
@@ -21,5 +21,5 @@ mod worker_process;
 
 mod global_allocator;
 
-// FIXME get this from postgres headers somehow
+// FIXME: get this from postgres headers somehow
 pub const BLCKSZ: usize = 8192;
diff --git a/pgxn/neon/communicator/src/neon_request.rs b/pgxn/neon/communicator/src/neon_request.rs
index 732c35d6ce..70ceaf8744 100644
--- a/pgxn/neon/communicator/src/neon_request.rs
+++ b/pgxn/neon/communicator/src/neon_request.rs
@@ -15,7 +15,9 @@ pub type COid = u32;
 // This conveniently matches PG_IOV_MAX
 pub const MAX_GETPAGEV_PAGES: usize = 32;
 
-use pageserver_page_api as page_api;
+use std::ffi::CStr;
+
+use pageserver_page_api::{self as page_api, SlruKind};
 
 /// Request from a Postgres backend to the communicator process
 #[allow(clippy::large_enum_variant)]
@@ -29,6 +31,7 @@ pub enum NeonIORequest {
     RelExists(CRelExistsRequest),
     RelSize(CRelSizeRequest),
     GetPageV(CGetPageVRequest),
+    ReadSlruSegment(CReadSlruSegmentRequest),
     PrefetchV(CPrefetchVRequest),
     DbSize(CDbSizeRequest),
 
@@ -54,6 +57,9 @@ pub enum NeonIOResult {
 
     /// the result pages are written to the shared memory addresses given in the request
     GetPageV,
+    /// The result is written to the file, path to which is provided
+    /// in the request. The [`u64`] value here is the number of blocks.
+    ReadSlruSegment(u64),
 
     /// A prefetch request returns as soon as the request has been received by the communicator.
     /// It is processed in the background.
@@ -83,6 +89,7 @@ impl NeonIORequest {
             RelExists(req) => req.request_id,
             RelSize(req) => req.request_id,
             GetPageV(req) => req.request_id,
+            ReadSlruSegment(req) => req.request_id,
             PrefetchV(req) => req.request_id,
             DbSize(req) => req.request_id,
             WritePage(req) => req.request_id,
@@ -193,6 +200,28 @@ pub struct CGetPageVRequest {
     pub dest: [ShmemBuf; MAX_GETPAGEV_PAGES],
 }
 
+#[repr(C)]
+#[derive(Copy, Clone, Debug)]
+pub struct CReadSlruSegmentRequest {
+    pub request_id: u64,
+    pub slru_kind: SlruKind,
+    pub segment_number: u32,
+    pub request_lsn: CLsn,
+    /// Must be a null-terminated C string containing the file path
+    /// where the communicator will write the SLRU segment.
+    pub destination_file_path: ShmemBuf,
+}
+
+impl CReadSlruSegmentRequest {
+    /// Returns the file path where the communicator will write the
+    /// SLRU segment.
+    pub(crate) fn destination_file_path(&self) -> String {
+        unsafe { CStr::from_ptr(self.destination_file_path.as_mut_ptr() as *const _) }
+            .to_string_lossy()
+            .into_owned()
+    }
+}
+
 #[repr(C)]
 #[derive(Copy, Clone, Debug)]
 pub struct CPrefetchVRequest {
diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs
index c207132753..0b2f9da366 100644
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -24,7 +24,7 @@ use utils::id::{TenantId, TimelineId};
 
 use super::callbacks::{get_request_lsn, notify_proc};
 
-use tracing::{error, info, info_span, trace};
+use tracing::{debug, error, info, info_span, trace};
 
 use utils::lsn::Lsn;
 
@@ -58,6 +58,7 @@ pub struct CommunicatorWorkerProcessStruct<'a> {
     request_rel_exists_counter: IntCounter,
     request_rel_size_counter: IntCounter,
     request_get_pagev_counter: IntCounter,
+    request_read_slru_segment_counter: IntCounter,
     request_prefetchv_counter: IntCounter,
     request_db_size_counter: IntCounter,
     request_write_page_counter: IntCounter,
@@ -106,6 +107,8 @@ pub(super) async fn init(
         .integrated_cache_init_struct
         .worker_process_init(last_lsn, file_cache);
 
+    debug!("Initialised integrated cache: {cache:?}");
+
     let tenant_id = TenantId::from_str(&tenant_id).expect("invalid tenant ID");
     let timeline_id = TimelineId::from_str(&timeline_id).expect("invalid timeline ID");
     let shard_spec = ShardSpec::new(shard_map, stripe_size).expect("invalid shard spec");
@@ -123,6 +126,8 @@ pub(super) async fn init(
     let request_rel_exists_counter = request_counters.with_label_values(&["rel_exists"]);
     let request_rel_size_counter = request_counters.with_label_values(&["rel_size"]);
     let request_get_pagev_counter = request_counters.with_label_values(&["get_pagev"]);
+    let request_read_slru_segment_counter =
+        request_counters.with_label_values(&["read_slru_segment"]);
     let request_prefetchv_counter = request_counters.with_label_values(&["prefetchv"]);
     let request_db_size_counter = request_counters.with_label_values(&["db_size"]);
     let request_write_page_counter = request_counters.with_label_values(&["write_page"]);
@@ -173,6 +178,7 @@ pub(super) async fn init(
         request_rel_exists_counter,
         request_rel_size_counter,
         request_get_pagev_counter,
+        request_read_slru_segment_counter,
         request_prefetchv_counter,
         request_db_size_counter,
         request_write_page_counter,
@@ -418,6 +424,36 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
                     Err(errno) => NeonIOResult::Error(errno),
                 }
             }
+            NeonIORequest::ReadSlruSegment(req) => {
+                self.request_read_slru_segment_counter.inc();
+                let lsn = Lsn(req.request_lsn);
+                let file_path = req.destination_file_path();
+
+                match self
+                    .client
+                    .get_slru_segment(page_api::GetSlruSegmentRequest {
+                        read_lsn: self.request_lsns(lsn),
+                        kind: req.slru_kind,
+                        segno: req.segment_number,
+                    })
+                    .await
+                {
+                    Ok(slru_bytes) => {
+                        if let Err(e) = tokio::fs::write(&file_path, &slru_bytes).await {
+                            info!("could not write slru segment to file {file_path}: {e}");
+                            return NeonIOResult::Error(e.raw_os_error().unwrap_or(libc::EIO));
+                        }
+
+                        let blocks_count = slru_bytes.len() / crate::BLCKSZ;
+
+                        NeonIOResult::ReadSlruSegment(blocks_count as _)
+                    }
+                    Err(err) => {
+                        info!("tonic error: {err:?}");
+                        NeonIOResult::Error(0)
+                    }
+                }
+            }
             NeonIORequest::PrefetchV(req) => {
                 self.request_prefetchv_counter.inc();
                 self.request_prefetchv_nblocks_counter
diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index bdd5a75d62..87c25af8e5 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -997,10 +997,58 @@ communicator_new_dbsize(Oid dbNode)
 }
 
 int
-communicator_new_read_slru_segment(SlruKind kind, int64 segno, void *buffer)
+communicator_new_read_slru_segment(
+	SlruKind kind,
+	uint32_t segno,
+	neon_request_lsns *request_lsns,
+	const char* path)
 {
-	/* TODO */
-	elog(ERROR, "not implemented");
+	NeonIOResult result = {};
+	NeonIORequest request = {
+		.tag = NeonIORequest_ReadSlruSegment,
+		.read_slru_segment = {
+			.request_id = assign_request_id(),
+			.slru_kind = kind,
+			.segment_number = segno,
+			.request_lsn = request_lsns->request_lsn,
+		}
+	};
+	int nblocks = -1;
+	char *temp_path = bounce_buf();
+
+	if (path == NULL) {
+		elog(ERROR, "read_slru_segment called with NULL path");
+		return -1;
+	}
+
+	strlcpy(temp_path, path, BLCKSZ);
+	request.read_slru_segment.destination_file_path.ptr = (uint8_t *) temp_path;
+
+	elog(DEBUG5, "readslrusegment called for kind=%u, segno=%u, file_path=\"%s\"",
+		kind, segno, request.read_slru_segment.destination_file_path.ptr);
+
+	/* FIXME: see `request_lsns` in main_loop.rs for why this is needed */
+	XLogSetAsyncXactLSN(request_lsns->request_lsn);
+
+	perform_request(&request, &result);
+
+	switch (result.tag)
+	{
+		case NeonIOResult_ReadSlruSegment:
+			nblocks = result.read_slru_segment;
+			break;
+		case NeonIOResult_Error:
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not read slru segment, kind=%u, segno=%u: %s",
+							kind, segno, pg_strerror(result.error))));
+			break;
+		default:
+			elog(ERROR, "unexpected result for read SLRU operation: %d", result.tag);
+			break;
+	}
+
+	return nblocks;
 }
 
 /* Write requests */
@@ -1305,6 +1353,18 @@ print_neon_io_request(NeonIORequest *request)
 								r->spc_oid, r->db_oid, r->rel_number, r->fork_number, r->block_number, r->block_number + r->nblocks);
 				return buf;
 			}
+		case NeonIORequest_ReadSlruSegment:
+			{
+				CReadSlruSegmentRequest *r = &request->read_slru_segment;
+
+				snprintf(buf, sizeof(buf), "ReadSlruSegment: req " UINT64_FORMAT " slrukind=%u, segno=%u, lsn=%X/%X, file_path=\"%s\"",
+								r->request_id,
+								r->slru_kind,
+								r->segment_number,
+								LSN_FORMAT_ARGS(r->request_lsn),
+								r->destination_file_path.ptr);
+				return buf;
+			}
 		case NeonIORequest_PrefetchV:
 			{
 				CPrefetchVRequest *r = &request->prefetch_v;
diff --git a/pgxn/neon/communicator_new.h b/pgxn/neon/communicator_new.h
index 1323c48e15..a19feaaac6 100644
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -38,8 +38,12 @@ extern void communicator_new_prefetch_register_bufferv(NRelFileInfo rinfo, ForkN
 													   BlockNumber nblocks);
 extern bool communicator_new_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
 											BlockNumber blockno);
-extern int	communicator_new_read_slru_segment(SlruKind kind, int64 segno,
-											   void *buffer);
+extern int communicator_new_read_slru_segment(
+	SlruKind kind,
+	uint32_t segno,
+	neon_request_lsns *request_lsns,
+	const char *path
+);
 
 /* Write requests, to keep the caches up-to-date */
 extern void communicator_new_write_page(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blockno,
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 909fd6fa36..754b1ca033 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -258,7 +258,7 @@ lfc_switch_off(void)
 {
 	int			fd;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (LFC_ENABLED())
 	{
@@ -325,7 +325,7 @@ lfc_maybe_disabled(void)
 static bool
 lfc_ensure_opened(void)
 {
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_generation != lfc_ctl->generation)
 	{
@@ -352,7 +352,7 @@ lfc_shmem_startup(void)
 	bool		found;
 	static HASHCTL info;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (prev_shmem_startup_hook)
 	{
@@ -652,7 +652,7 @@ lfc_init(void)
 	if (lfc_max_size == 0)
 		return;
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		return;
 
 	prev_shmem_startup_hook = shmem_startup_hook;
@@ -730,7 +730,7 @@ lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
 	dsm_segment *seg;
 	BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (!lfc_ensure_opened())
 		return;
@@ -885,7 +885,7 @@ lfc_prewarm_main(Datum main_arg)
 	PrewarmWorkerState* ws;
 	uint32 worker_id = DatumGetInt32(main_arg);
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	AmPrewarmWorker = true;
 
@@ -987,7 +987,7 @@ lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
 	FileCacheEntry *entry;
 	uint32		hash;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
@@ -1034,7 +1034,7 @@ lfc_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno)
 	bool		found = false;
 	uint32		hash;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;
@@ -1071,7 +1071,7 @@ lfc_cache_containsv(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	uint32		hash;
 	int			i = 0;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return 0;
@@ -1180,7 +1180,7 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	int			blocks_read = 0;
 	int			buf_offset = 0;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return -1;
@@ -1547,7 +1547,7 @@ lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 
 	int		chunk_offs = BLOCK_TO_CHUNK_OFF(blkno);
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return false;
@@ -1694,7 +1694,7 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 	uint32		entry_offset;
 	int			buf_offset = 0;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (lfc_maybe_disabled())	/* fast exit if file cache is disabled */
 		return;
@@ -2211,7 +2211,7 @@ get_local_cache_state(PG_FUNCTION_ARGS)
 	size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
 	FileCacheState* fcs;
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		elog(ERROR, "TODO: not implemented");
 
 	fcs = lfc_get_state(max_entries);
@@ -2231,7 +2231,7 @@ prewarm_local_cache(PG_FUNCTION_ARGS)
 	uint32 n_workers =  PG_GETARG_INT32(1);
 	FileCacheState* fcs;
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		elog(ERROR, "TODO: not implemented");
 
 	fcs = (FileCacheState*)state;
@@ -2254,7 +2254,7 @@ get_prewarm_info(PG_FUNCTION_ARGS)
 	uint32 total_pages;
 	size_t n_workers;
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		elog(ERROR, "TODO: not implemented");
 
 	if (lfc_size_limit == 0)
diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 7dfc08e54a..1e41527fb5 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -123,7 +123,7 @@ static uint64 pagestore_local_counter = 0;
 typedef enum PSConnectionState {
 	PS_Disconnected,			/* no connection yet */
 	PS_Connecting_Startup,		/* connection starting up */
-	PS_Connecting_PageStream,	/* negotiating pagestream */ 
+	PS_Connecting_PageStream,	/* negotiating pagestream */
 	PS_Connected,				/* connected, pagestream established */
 } PSConnectionState;
 
@@ -253,7 +253,7 @@ AssignPageserverConnstring(const char *newval, void *extra)
 	 * In that case, the shard map is loaded from 'neon.pageserver_grpc_urls'
 	 * instead, and that happens in the communicator process only.
 	 */
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		return;
 
 	/*
@@ -395,7 +395,7 @@ get_shard_number(BufferTag *tag)
 }
 
 static inline void
-CLEANUP_AND_DISCONNECT(PageServer *shard) 
+CLEANUP_AND_DISCONNECT(PageServer *shard)
 {
 	if (shard->wes_read)
 	{
@@ -417,7 +417,7 @@ CLEANUP_AND_DISCONNECT(PageServer *shard)
  * complete the connection (e.g. due to receiving an earlier cancellation
  * during connection start).
  * Returns true if successfully connected; false if the connection failed.
- * 
+ *
  * Throws errors in unrecoverable situations, or when this backend's query
  * is canceled.
  */
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 68f00de761..10785f748f 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -52,7 +52,6 @@ PG_MODULE_MAGIC;
 void		_PG_init(void);
 
 
-bool neon_enable_new_communicator;
 static int  running_xacts_overflow_policy;
 static bool monitor_query_exec_time = false;
 
@@ -468,10 +467,10 @@ _PG_init(void)
 #endif
 
 	DefineCustomBoolVariable(
-							"neon.enable_new_communicator",
-							"Enables new communicator implementation",
+							"neon.use_communicator_worker",
+							"Uses the communicator worker implementation",
 							NULL,
-							&neon_enable_new_communicator,
+							&neon_use_communicator_worker,
 							true,
 							PGC_POSTMASTER,
 							0,
@@ -483,7 +482,7 @@ _PG_init(void)
 	init_lwlsncache();
 
 	pg_init_communicator();
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		pg_init_communicator_new();
 
 	Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines;
@@ -648,7 +647,7 @@ approximate_working_set_size_seconds(PG_FUNCTION_ARGS)
 
 	duration = PG_ARGISNULL(0) ? (time_t) -1 : PG_GETARG_INT32(0);
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		dc = communicator_new_approximate_working_set_size_seconds(duration, false);
 	else
 		dc = lfc_approximate_working_set_size_seconds(duration, false);
@@ -664,7 +663,7 @@ approximate_working_set_size(PG_FUNCTION_ARGS)
 	bool		reset = PG_GETARG_BOOL(0);
 	int32		dc;
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		dc = communicator_new_approximate_working_set_size_seconds(-1, reset);
 	else
 		dc = lfc_approximate_working_set_size_seconds(-1, reset);
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h
index f781b08aa0..215396ef7a 100644
--- a/pgxn/neon/neon.h
+++ b/pgxn/neon/neon.h
@@ -13,7 +13,6 @@
 #include "utils/wait_event.h"
 
 /* GUCs */
-extern bool neon_enable_new_communicator;
 extern char *neon_auth_token;
 extern char *neon_timeline;
 extern char *neon_tenant;
diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c
index 9340d49f5a..06ce61d2e5 100644
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -822,7 +822,7 @@ neon_exists(SMgrRelation reln, ForkNumber forkNum)
 		return false;
 	}
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		return communicator_new_rel_exists(InfoFromSMgrRel(reln), forkNum);
 	else
 	{
@@ -900,7 +900,7 @@ neon_create(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 	 * that's being replayed, so we should not have the correctness issue
 	 * mentioned in previous paragraph.
 	 */
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		XLogRecPtr	lsn = neon_get_write_lsn();
 
@@ -961,7 +961,7 @@ neon_unlink(NRelFileInfoBackend rinfo, ForkNumber forkNum, bool isRedo)
 
 	if (!NRelFileInfoBackendIsTemp(rinfo))
 	{
-		if (neon_enable_new_communicator)
+		if (neon_use_communicator_worker)
 		{
 			XLogRecPtr	lsn = neon_get_write_lsn();
 
@@ -1055,7 +1055,7 @@ neon_extend(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno,
 		 forkNum, blkno,
 		 (uint32) (lsn >> 32), (uint32) lsn);
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		// FIXME: this can pass lsn == invalid. Is that ok?
 		communicator_new_rel_extend(InfoFromSMgrRel(reln), forkNum, blkno, (const void *) buffer, lsn);
@@ -1182,7 +1182,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
 
 		lsn = XLogInsert(RM_XLOG_ID, XLOG_FPI);
 
-		if (!neon_enable_new_communicator)
+		if (!neon_use_communicator_worker)
 		{
 			for (int i = 0; i < count; i++)
 			{
@@ -1198,7 +1198,7 @@ neon_zeroextend(SMgrRelation reln, ForkNumber forkNum, BlockNumber start_block,
 
 	Assert(lsn != 0);
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		communicator_new_rel_zeroextend(InfoFromSMgrRel(reln), forkNum, start_block, nblocks, lsn);
 	}
@@ -1266,7 +1266,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, nblocks);
 		return false;
@@ -1276,7 +1276,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	tag.dbOid = reln->smgr_rlocator.locator.dbOid;
 	tag.relNumber = reln->smgr_rlocator.locator.relNumber;
 	tag.forkNum = forknum;
-	
+
 	while (nblocks > 0)
 	{
 		int		iterblocks = Min(nblocks, PG_IOV_MAX);
@@ -1298,7 +1298,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 		blocknum += iterblocks;
 	}
 
-	if (!neon_enable_new_communicator)
+	if (!neon_use_communicator_worker)
 		communicator_prefetch_pump_state();
 
 	return false;
@@ -1326,7 +1326,7 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		communicator_new_prefetch_register_bufferv(InfoFromSMgrRel(reln), forknum, blocknum, 1);
 	}
@@ -1388,7 +1388,7 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum,
 	 */
 	neon_log(SmgrTrace, "writeback noop");
 
-	if (!neon_enable_new_communicator)
+	if (!neon_use_communicator_worker)
 		communicator_prefetch_pump_state();
 
 	if (debug_compare_local)
@@ -1406,7 +1406,7 @@ void
 neon_read_at_lsn(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno,
 				 neon_request_lsns request_lsns, void *buffer)
 {
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		// FIXME: request_lsns is ignored. That affects the neon_test_utils callers.
 		// Add the capability to specify the LSNs explicitly, for the sake of neon_test_utils ?
@@ -1539,7 +1539,7 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forkNum, blkno,
 									  (void *) &buffer, 1);
@@ -1650,12 +1650,12 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 				 nblocks, PG_IOV_MAX);
 
 	/* Try to read PS results if they are available */
-	if (!neon_enable_new_communicator)
+	if (!neon_use_communicator_worker)
 		communicator_prefetch_pump_state();
 
 	memset(read_pages, 0, sizeof(read_pages));
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		communicator_new_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum,
 									  buffers, nblocks);
@@ -1664,7 +1664,7 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
 	{
 		neon_get_request_lsns(InfoFromSMgrRel(reln), forknum, blocknum,
 							  request_lsns, nblocks);
-		
+
 		prefetch_result = communicator_prefetch_lookupv(InfoFromSMgrRel(reln), forknum,
 														blocknum, request_lsns, nblocks,
 														buffers, read_pages);
@@ -1811,7 +1811,7 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo
 		 forknum, blocknum,
 		 (uint32) (lsn >> 32), (uint32) lsn);
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		communicator_new_write_page(InfoFromSMgrRel(reln), forknum, blocknum, buffer, lsn);
 	}
@@ -1881,7 +1881,7 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 
 	neon_wallog_pagev(reln, forknum, blkno, nblocks, (const char **) buffers, false);
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		for (int i = 0; i < nblocks; i++)
 		{
@@ -1936,7 +1936,7 @@ neon_nblocks(SMgrRelation reln, ForkNumber forknum)
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		n_blocks = communicator_new_rel_nblocks(InfoFromSMgrRel(reln), forknum);
 	}
@@ -1976,7 +1976,7 @@ neon_dbsize(Oid dbNode)
 	neon_request_lsns request_lsns;
 	NRelFileInfo dummy_node = {0};
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		db_size = communicator_new_dbsize(dbNode);
 	}
@@ -2023,7 +2023,7 @@ neon_truncate(SMgrRelation reln, ForkNumber forknum, BlockNumber old_blocks, Blo
 			neon_log(ERROR, "unknown relpersistence '%c'", reln->smgr_relpersistence);
 	}
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		XLogRecPtr	lsn = neon_get_write_lsn();
 
@@ -2104,7 +2104,7 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum)
 
 	neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop");
 
-	if (!neon_enable_new_communicator)
+	if (!neon_use_communicator_worker)
 		communicator_prefetch_pump_state();
 
 	if (debug_compare_local)
@@ -2291,7 +2291,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 		nblocks = mdnblocks(reln, MAIN_FORKNUM);
 		recptr = GetXLogInsertRecPtr();
 
-		if (!neon_enable_new_communicator)
+		if (!neon_use_communicator_worker)
 		{
 			neon_set_lwlsn_block_range(recptr,
 									   InfoFromNInfoB(rinfob),
@@ -2308,7 +2308,7 @@ neon_end_unlogged_build(SMgrRelation reln)
 				 RelFileInfoFmt(InfoFromNInfoB(rinfob)),
 				 forknum);
 
-			if (neon_enable_new_communicator)
+			if (neon_use_communicator_worker)
 			{
 				communicator_new_update_cached_rel_size(InfoFromSMgrRel(reln), forknum, nblocks, recptr);
 			}
@@ -2384,8 +2384,8 @@ neon_read_slru_segment(SMgrRelation reln, const char* path, int segno, void* buf
 	request_lsns.not_modified_since = not_modified_since;
 	request_lsns.effective_request_lsn = request_lsn;
 
-	if (neon_enable_new_communicator)
-		n_blocks = communicator_new_read_slru_segment(kind, segno, buffer);
+	if (neon_use_communicator_worker)
+		n_blocks = communicator_new_read_slru_segment(kind, (uint32_t)segno, &request_lsns, path);
 	else
 		n_blocks = communicator_read_slru_segment(kind, segno, &request_lsns, buffer);
 
@@ -2424,7 +2424,7 @@ AtEOXact_neon(XactEvent event, void *arg)
 			}
 			break;
 	}
-	if (!neon_enable_new_communicator)
+	if (!neon_use_communicator_worker)
 		communicator_reconfigure_timeout_if_needed();
 }
 
@@ -2483,7 +2483,7 @@ smgr_init_neon(void)
 
 	smgr_init_standard();
 	neon_init();
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 		communicator_new_init();
 	else
 		communicator_init();
@@ -2498,7 +2498,7 @@ neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 	/* This is only used in WAL replay */
 	Assert(RecoveryInProgress());
 
-	if (neon_enable_new_communicator)
+	if (neon_use_communicator_worker)
 	{
 		relsize = communicator_new_rel_nblocks(rinfo, forknum);
 
@@ -2677,7 +2677,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 		 * We should perform this check after assigning LwLSN to prevent
 		 * prefetching of some older version of the page by some other backend.
 		 */
-		if (neon_enable_new_communicator)
+		if (neon_use_communicator_worker)
 			no_redo_needed = communicator_new_cache_contains(rinfo, forknum, blkno);
 		else
 			no_redo_needed = !lfc_cache_contains(rinfo, forknum, blkno);
diff --git a/pgxn/neon/relsize_cache.c b/pgxn/neon/relsize_cache.c
index 4ea303f996..f3ceec78cc 100644
--- a/pgxn/neon/relsize_cache.c
+++ b/pgxn/neon/relsize_cache.c
@@ -23,9 +23,7 @@
 #include "utils/dynahash.h"
 #include "utils/guc.h"
 
-#if PG_VERSION_NUM >= 150000
 #include "miscadmin.h"
-#endif
 
 typedef struct
 {
@@ -100,7 +98,7 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 {
 	bool		found = false;
 
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (relsize_hash_size > 0)
 	{
@@ -133,7 +131,7 @@ get_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber *size)
 void
 set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (relsize_hash_size > 0)
 	{
@@ -183,7 +181,7 @@ set_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 void
 update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 {
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (relsize_hash_size > 0)
 	{
@@ -219,7 +217,7 @@ update_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber size)
 void
 forget_cached_relsize(NRelFileInfo rinfo, ForkNumber forknum)
 {
-	Assert(!neon_enable_new_communicator);
+	Assert(!neon_use_communicator_worker);
 
 	if (relsize_hash_size > 0)
 	{
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index e5c646468f..7c90c0162c 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -4369,9 +4369,9 @@ class Endpoint(PgProtocol, LogUtils):
         # XXX: By checking for None, we enable the new communicator for all tests
         # by default
         if grpc or grpc is None:
-            config_lines += [f"neon.enable_new_communicator=on"]
+            config_lines += ["neon.use_communicator_worker=on"]
         else:
-            config_lines += [f"neon.enable_new_communicator=off"]
+            config_lines += ["neon.use_communicator_worker=off"]
 
         # Delete file cache if it exists (and we're recreating the endpoint)
         if USE_LFC:
diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py
index b815fee702..ae545664d2 100644
--- a/test_runner/regress/test_normal_work.py
+++ b/test_runner/regress/test_normal_work.py
@@ -17,7 +17,9 @@ def check_tenant(
     config_lines = [
         f"neon.safekeeper_proto_version = {safekeeper_proto_version}",
     ]
-    endpoint = env.endpoints.create_start("main", tenant_id=tenant_id, config_lines=config_lines, grpc=True)
+    endpoint = env.endpoints.create_start(
+        "main", tenant_id=tenant_id, config_lines=config_lines, grpc=True
+    )
     # we rely upon autocommit after each statement
     res_1 = endpoint.safe_psql_many(
         queries=[
diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index ac3c460e01..1cb207d1c9 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4
+Subproject commit 1cb207d1c9efb1f6c6f864a47bf45e992a7f0eb0
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 24313bf8f3..9d19780350 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 24313bf8f3de722968a2fdf764de7ef77ed64f06
+Subproject commit 9d19780350c0c7b536312dc3b891ade55628bc7b
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 51194dc5ce..1486f919d4 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 51194dc5ce2e3523068d8607852e6c3125a17e58
+Subproject commit 1486f919d4dc21637407ee7ed203497bb5bd516a
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index eac5279cd1..160d0b52d6 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit eac5279cd147d4086e0eb242198aae2f4b766d7b
+Subproject commit 160d0b52d66f4a5d21251a2912a50561bf600333
diff --git a/vendor/revisions.json b/vendor/revisions.json
index e4b6c8e23a..69e7559c67 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "eac5279cd147d4086e0eb242198aae2f4b766d7b"
+    "160d0b52d66f4a5d21251a2912a50561bf600333"
   ],
   "v16": [
     "16.9",
-    "51194dc5ce2e3523068d8607852e6c3125a17e58"
+    "1486f919d4dc21637407ee7ed203497bb5bd516a"
   ],
   "v15": [
     "15.13",
-    "24313bf8f3de722968a2fdf764de7ef77ed64f06"
+    "9d19780350c0c7b536312dc3b891ade55628bc7b"
   ],
   "v14": [
     "14.18",
-    "ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4"
+    "1cb207d1c9efb1f6c6f864a47bf45e992a7f0eb0"
   ]
 }

From 8e95455aef9e18d8b9df5af2388828832c50ec82 Mon Sep 17 00:00:00 2001
From: Shockingly Good <fx@thefx.co>
Date: Fri, 18 Jul 2025 10:21:22 +0200
Subject: [PATCH 160/163] Update the postgres submodules (#12636)

Synchronises the main branch's postgres submodules with the
`neondatabase/postgres` repository state.
---
 vendor/postgres-v14   | 2 +-
 vendor/postgres-v15   | 2 +-
 vendor/postgres-v16   | 2 +-
 vendor/postgres-v17   | 2 +-
 vendor/revisions.json | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/vendor/postgres-v14 b/vendor/postgres-v14
index ac3c460e01..47304b9215 160000
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
@@ -1 +1 @@
-Subproject commit ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4
+Subproject commit 47304b921555b3f33eb3b49daada3078e774cfd7
diff --git a/vendor/postgres-v15 b/vendor/postgres-v15
index 24313bf8f3..cef72d5308 160000
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
@@ -1 +1 @@
-Subproject commit 24313bf8f3de722968a2fdf764de7ef77ed64f06
+Subproject commit cef72d5308ddce3795a9043fcd94f8849f7f4800
diff --git a/vendor/postgres-v16 b/vendor/postgres-v16
index 51194dc5ce..e9db1ff5a6 160000
--- a/vendor/postgres-v16
+++ b/vendor/postgres-v16
@@ -1 +1 @@
-Subproject commit 51194dc5ce2e3523068d8607852e6c3125a17e58
+Subproject commit e9db1ff5a6f3ca18f626ba3d62ab475e6c688a96
diff --git a/vendor/postgres-v17 b/vendor/postgres-v17
index eac5279cd1..a50d80c750 160000
--- a/vendor/postgres-v17
+++ b/vendor/postgres-v17
@@ -1 +1 @@
-Subproject commit eac5279cd147d4086e0eb242198aae2f4b766d7b
+Subproject commit a50d80c7507e8ae9fc37bf1869051cf2d51370ab
diff --git a/vendor/revisions.json b/vendor/revisions.json
index e4b6c8e23a..24a33dec42 100644
--- a/vendor/revisions.json
+++ b/vendor/revisions.json
@@ -1,18 +1,18 @@
 {
   "v17": [
     "17.5",
-    "eac5279cd147d4086e0eb242198aae2f4b766d7b"
+    "a50d80c7507e8ae9fc37bf1869051cf2d51370ab"
   ],
   "v16": [
     "16.9",
-    "51194dc5ce2e3523068d8607852e6c3125a17e58"
+    "e9db1ff5a6f3ca18f626ba3d62ab475e6c688a96"
   ],
   "v15": [
     "15.13",
-    "24313bf8f3de722968a2fdf764de7ef77ed64f06"
+    "cef72d5308ddce3795a9043fcd94f8849f7f4800"
   ],
   "v14": [
     "14.18",
-    "ac3c460e01a31f11fb52fd8d8e88e60f0e1069b4"
+    "47304b921555b3f33eb3b49daada3078e774cfd7"
   ]
 }

From 96bcfba79e4919a7a5b8fddd2149231b42059883 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Krzysztof=20Szafra=C5=84ski?= <k.p.szafranski@gmail.com>
Date: Fri, 18 Jul 2025 12:17:58 +0200
Subject: [PATCH 161/163] [proxy] Cache GetEndpointAccessControl errors
 (#12571)

Related to https://github.com/neondatabase/cloud/issues/19353
---
 proxy/src/cache/project_info.rs               | 286 +++++++++++++++---
 .../control_plane/client/cplane_proxy_v1.rs   | 185 ++++++-----
 proxy/src/control_plane/errors.rs             |   2 +-
 proxy/src/control_plane/messages.rs           |  16 +-
 proxy/src/control_plane/mod.rs                |   6 +-
 5 files changed, 376 insertions(+), 119 deletions(-)

diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs
index c812779e30..0ef09a8a9a 100644
--- a/proxy/src/cache/project_info.rs
+++ b/proxy/src/cache/project_info.rs
@@ -10,6 +10,7 @@ use tokio::time::Instant;
 use tracing::{debug, info};
 
 use crate::config::ProjectInfoCacheOptions;
+use crate::control_plane::messages::{ControlPlaneErrorMessage, Reason};
 use crate::control_plane::{EndpointAccessControl, RoleAccessControl};
 use crate::intern::{AccountIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt};
 use crate::types::{EndpointId, RoleName};
@@ -36,22 +37,37 @@ impl<T> Entry<T> {
     }
 
     pub(crate) fn get(&self) -> Option<&T> {
-        (self.expires_at > Instant::now()).then_some(&self.value)
+        (!self.is_expired()).then_some(&self.value)
+    }
+
+    fn is_expired(&self) -> bool {
+        self.expires_at <= Instant::now()
     }
 }
 
 struct EndpointInfo {
-    role_controls: HashMap<RoleNameInt, Entry<RoleAccessControl>>,
-    controls: Option<Entry<EndpointAccessControl>>,
+    role_controls: HashMap<RoleNameInt, Entry<ControlPlaneResult<RoleAccessControl>>>,
+    controls: Option<Entry<ControlPlaneResult<EndpointAccessControl>>>,
 }
 
+type ControlPlaneResult<T> = Result<T, Box<ControlPlaneErrorMessage>>;
+
 impl EndpointInfo {
-    pub(crate) fn get_role_secret(&self, role_name: RoleNameInt) -> Option<RoleAccessControl> {
-        self.role_controls.get(&role_name)?.get().cloned()
+    pub(crate) fn get_role_secret_with_ttl(
+        &self,
+        role_name: RoleNameInt,
+    ) -> Option<(ControlPlaneResult<RoleAccessControl>, Duration)> {
+        let entry = self.role_controls.get(&role_name)?;
+        let ttl = entry.expires_at - Instant::now();
+        Some((entry.get()?.clone(), ttl))
     }
 
-    pub(crate) fn get_controls(&self) -> Option<EndpointAccessControl> {
-        self.controls.as_ref()?.get().cloned()
+    pub(crate) fn get_controls_with_ttl(
+        &self,
+    ) -> Option<(ControlPlaneResult<EndpointAccessControl>, Duration)> {
+        let entry = self.controls.as_ref()?;
+        let ttl = entry.expires_at - Instant::now();
+        Some((entry.get()?.clone(), ttl))
     }
 
     pub(crate) fn invalidate_endpoint(&mut self) {
@@ -153,28 +169,28 @@ impl ProjectInfoCacheImpl {
         self.cache.get(&endpoint_id)
     }
 
-    pub(crate) fn get_role_secret(
+    pub(crate) fn get_role_secret_with_ttl(
         &self,
         endpoint_id: &EndpointId,
         role_name: &RoleName,
-    ) -> Option<RoleAccessControl> {
+    ) -> Option<(ControlPlaneResult<RoleAccessControl>, Duration)> {
         let role_name = RoleNameInt::get(role_name)?;
         let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_role_secret(role_name)
+        endpoint_info.get_role_secret_with_ttl(role_name)
     }
 
-    pub(crate) fn get_endpoint_access(
+    pub(crate) fn get_endpoint_access_with_ttl(
         &self,
         endpoint_id: &EndpointId,
-    ) -> Option<EndpointAccessControl> {
+    ) -> Option<(ControlPlaneResult<EndpointAccessControl>, Duration)> {
         let endpoint_info = self.get_endpoint_cache(endpoint_id)?;
-        endpoint_info.get_controls()
+        endpoint_info.get_controls_with_ttl()
     }
 
     pub(crate) fn insert_endpoint_access(
         &self,
         account_id: Option<AccountIdInt>,
-        project_id: ProjectIdInt,
+        project_id: Option<ProjectIdInt>,
         endpoint_id: EndpointIdInt,
         role_name: RoleNameInt,
         controls: EndpointAccessControl,
@@ -183,26 +199,89 @@ impl ProjectInfoCacheImpl {
         if let Some(account_id) = account_id {
             self.insert_account2endpoint(account_id, endpoint_id);
         }
-        self.insert_project2endpoint(project_id, endpoint_id);
+        if let Some(project_id) = project_id {
+            self.insert_project2endpoint(project_id, endpoint_id);
+        }
 
         if self.cache.len() >= self.config.size {
             // If there are too many entries, wait until the next gc cycle.
             return;
         }
 
-        let controls = Entry::new(controls, self.config.ttl);
-        let role_controls = Entry::new(role_controls, self.config.ttl);
+        debug!(
+            key = &*endpoint_id,
+            "created a cache entry for endpoint access"
+        );
+
+        let controls = Some(Entry::new(Ok(controls), self.config.ttl));
+        let role_controls = Entry::new(Ok(role_controls), self.config.ttl);
 
         match self.cache.entry(endpoint_id) {
             clashmap::Entry::Vacant(e) => {
                 e.insert(EndpointInfo {
                     role_controls: HashMap::from_iter([(role_name, role_controls)]),
-                    controls: Some(controls),
+                    controls,
                 });
             }
             clashmap::Entry::Occupied(mut e) => {
                 let ep = e.get_mut();
-                ep.controls = Some(controls);
+                ep.controls = controls;
+                if ep.role_controls.len() < self.config.max_roles {
+                    ep.role_controls.insert(role_name, role_controls);
+                }
+            }
+        }
+    }
+
+    pub(crate) fn insert_endpoint_access_err(
+        &self,
+        endpoint_id: EndpointIdInt,
+        role_name: RoleNameInt,
+        msg: Box<ControlPlaneErrorMessage>,
+        ttl: Option<Duration>,
+    ) {
+        if self.cache.len() >= self.config.size {
+            // If there are too many entries, wait until the next gc cycle.
+            return;
+        }
+
+        debug!(
+            key = &*endpoint_id,
+            "created a cache entry for an endpoint access error"
+        );
+
+        let ttl = ttl.unwrap_or(self.config.ttl);
+
+        let controls = if msg.get_reason() == Reason::RoleProtected {
+            // RoleProtected is the only role-specific error that control plane can give us.
+            // If a given role name does not exist, it still returns a successful response,
+            // just with an empty secret.
+            None
+        } else {
+            // We can cache all the other errors in EndpointInfo.controls,
+            // because they don't depend on what role name we pass to control plane.
+            Some(Entry::new(Err(msg.clone()), ttl))
+        };
+
+        let role_controls = Entry::new(Err(msg), ttl);
+
+        match self.cache.entry(endpoint_id) {
+            clashmap::Entry::Vacant(e) => {
+                e.insert(EndpointInfo {
+                    role_controls: HashMap::from_iter([(role_name, role_controls)]),
+                    controls,
+                });
+            }
+            clashmap::Entry::Occupied(mut e) => {
+                let ep = e.get_mut();
+                if let Some(entry) = &ep.controls
+                    && !entry.is_expired()
+                    && entry.value.is_ok()
+                {
+                    // If we have cached non-expired, non-error controls, keep them.
+                } else {
+                    ep.controls = controls;
+                }
                 if ep.role_controls.len() < self.config.max_roles {
                     ep.role_controls.insert(role_name, role_controls);
                 }
@@ -245,7 +324,7 @@ impl ProjectInfoCacheImpl {
             return;
         };
 
-        if role_controls.get().expires_at <= Instant::now() {
+        if role_controls.get().is_expired() {
             role_controls.remove();
         }
     }
@@ -284,13 +363,11 @@ impl ProjectInfoCacheImpl {
 
 #[cfg(test)]
 mod tests {
-    use std::sync::Arc;
-
     use super::*;
-    use crate::control_plane::messages::EndpointRateLimitConfig;
+    use crate::control_plane::messages::{Details, EndpointRateLimitConfig, ErrorInfo, Status};
     use crate::control_plane::{AccessBlockerFlags, AuthSecret};
     use crate::scram::ServerSecret;
-    use crate::types::ProjectId;
+    use std::sync::Arc;
 
     #[tokio::test]
     async fn test_project_info_cache_settings() {
@@ -301,9 +378,9 @@ mod tests {
             ttl: Duration::from_secs(1),
             gc_interval: Duration::from_secs(600),
         });
-        let project_id: ProjectId = "project".into();
+        let project_id: Option<ProjectIdInt> = Some(ProjectIdInt::from(&"project".into()));
         let endpoint_id: EndpointId = "endpoint".into();
-        let account_id: Option<AccountIdInt> = None;
+        let account_id = None;
 
         let user1: RoleName = "user1".into();
         let user2: RoleName = "user2".into();
@@ -316,7 +393,7 @@ mod tests {
 
         cache.insert_endpoint_access(
             account_id,
-            (&project_id).into(),
+            project_id,
             (&endpoint_id).into(),
             (&user1).into(),
             EndpointAccessControl {
@@ -332,7 +409,7 @@ mod tests {
 
         cache.insert_endpoint_access(
             account_id,
-            (&project_id).into(),
+            project_id,
             (&endpoint_id).into(),
             (&user2).into(),
             EndpointAccessControl {
@@ -346,11 +423,17 @@ mod tests {
             },
         );
 
-        let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
-        assert_eq!(cached.secret, secret1);
+        let (cached, ttl) = cache
+            .get_role_secret_with_ttl(&endpoint_id, &user1)
+            .unwrap();
+        assert_eq!(cached.unwrap().secret, secret1);
+        assert_eq!(ttl, cache.config.ttl);
 
-        let cached = cache.get_role_secret(&endpoint_id, &user2).unwrap();
-        assert_eq!(cached.secret, secret2);
+        let (cached, ttl) = cache
+            .get_role_secret_with_ttl(&endpoint_id, &user2)
+            .unwrap();
+        assert_eq!(cached.unwrap().secret, secret2);
+        assert_eq!(ttl, cache.config.ttl);
 
         // Shouldn't add more than 2 roles.
         let user3: RoleName = "user3".into();
@@ -358,7 +441,7 @@ mod tests {
 
         cache.insert_endpoint_access(
             account_id,
-            (&project_id).into(),
+            project_id,
             (&endpoint_id).into(),
             (&user3).into(),
             EndpointAccessControl {
@@ -372,17 +455,144 @@ mod tests {
             },
         );
 
-        assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
+        assert!(
+            cache
+                .get_role_secret_with_ttl(&endpoint_id, &user3)
+                .is_none()
+        );
 
-        let cached = cache.get_endpoint_access(&endpoint_id).unwrap();
+        let cached = cache
+            .get_endpoint_access_with_ttl(&endpoint_id)
+            .unwrap()
+            .0
+            .unwrap();
         assert_eq!(cached.allowed_ips, allowed_ips);
 
         tokio::time::advance(Duration::from_secs(2)).await;
-        let cached = cache.get_role_secret(&endpoint_id, &user1);
+        let cached = cache.get_role_secret_with_ttl(&endpoint_id, &user1);
         assert!(cached.is_none());
-        let cached = cache.get_role_secret(&endpoint_id, &user2);
+        let cached = cache.get_role_secret_with_ttl(&endpoint_id, &user2);
         assert!(cached.is_none());
-        let cached = cache.get_endpoint_access(&endpoint_id);
+        let cached = cache.get_endpoint_access_with_ttl(&endpoint_id);
         assert!(cached.is_none());
     }
+
+    #[tokio::test]
+    async fn test_caching_project_info_errors() {
+        let cache = ProjectInfoCacheImpl::new(ProjectInfoCacheOptions {
+            size: 10,
+            max_roles: 10,
+            ttl: Duration::from_secs(1),
+            gc_interval: Duration::from_secs(600),
+        });
+        let project_id = Some(ProjectIdInt::from(&"project".into()));
+        let endpoint_id: EndpointId = "endpoint".into();
+        let account_id = None;
+
+        let user1: RoleName = "user1".into();
+        let user2: RoleName = "user2".into();
+        let secret = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
+
+        let role_msg = Box::new(ControlPlaneErrorMessage {
+            error: "role is protected and cannot be used for password-based authentication"
+                .to_owned()
+                .into_boxed_str(),
+            http_status_code: http::StatusCode::NOT_FOUND,
+            status: Some(Status {
+                code: "PERMISSION_DENIED".to_owned().into_boxed_str(),
+                message: "role is protected and cannot be used for password-based authentication"
+                    .to_owned()
+                    .into_boxed_str(),
+                details: Details {
+                    error_info: Some(ErrorInfo {
+                        reason: Reason::RoleProtected,
+                    }),
+                    retry_info: None,
+                    user_facing_message: None,
+                },
+            }),
+        });
+
+        let generic_msg = Box::new(ControlPlaneErrorMessage {
+            error: "oh noes".to_owned().into_boxed_str(),
+            http_status_code: http::StatusCode::NOT_FOUND,
+            status: None,
+        });
+
+        let get_role_secret = |endpoint_id, role_name| {
+            cache
+                .get_role_secret_with_ttl(endpoint_id, role_name)
+                .unwrap()
+                .0
+        };
+        let get_endpoint_access =
+            |endpoint_id| cache.get_endpoint_access_with_ttl(endpoint_id).unwrap().0;
+
+        // stores role-specific errors only for get_role_secret
+        cache.insert_endpoint_access_err(
+            (&endpoint_id).into(),
+            (&user1).into(),
+            role_msg.clone(),
+            None,
+        );
+        assert_eq!(
+            get_role_secret(&endpoint_id, &user1).unwrap_err().error,
+            role_msg.error
+        );
+        assert!(cache.get_endpoint_access_with_ttl(&endpoint_id).is_none());
+
+        // stores non-role specific errors for both get_role_secret and get_endpoint_access
+        cache.insert_endpoint_access_err(
+            (&endpoint_id).into(),
+            (&user1).into(),
+            generic_msg.clone(),
+            None,
+        );
+        assert_eq!(
+            get_role_secret(&endpoint_id, &user1).unwrap_err().error,
+            generic_msg.error
+        );
+        assert_eq!(
+            get_endpoint_access(&endpoint_id).unwrap_err().error,
+            generic_msg.error
+        );
+
+        // error isn't returned for other roles in the same endpoint
+        assert!(
+            cache
+                .get_role_secret_with_ttl(&endpoint_id, &user2)
+                .is_none()
+        );
+
+        // success for a role does not overwrite errors for other roles
+        cache.insert_endpoint_access(
+            account_id,
+            project_id,
+            (&endpoint_id).into(),
+            (&user2).into(),
+            EndpointAccessControl {
+                allowed_ips: Arc::new(vec![]),
+                allowed_vpce: Arc::new(vec![]),
+                flags: AccessBlockerFlags::default(),
+                rate_limits: EndpointRateLimitConfig::default(),
+            },
+            RoleAccessControl {
+                secret: secret.clone(),
+            },
+        );
+        assert!(get_role_secret(&endpoint_id, &user1).is_err());
+        assert!(get_role_secret(&endpoint_id, &user2).is_ok());
+        // ...but does clear the access control error
+        assert!(get_endpoint_access(&endpoint_id).is_ok());
+
+        // storing an error does not overwrite successful access control response
+        cache.insert_endpoint_access_err(
+            (&endpoint_id).into(),
+            (&user2).into(),
+            generic_msg.clone(),
+            None,
+        );
+        assert!(get_role_secret(&endpoint_id, &user2).is_err());
+        assert!(get_endpoint_access(&endpoint_id).is_ok());
+    }
 }
diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs
index bb785b8b0c..8a0403c0b0 100644
--- a/proxy/src/control_plane/client/cplane_proxy_v1.rs
+++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs
@@ -68,6 +68,66 @@ impl NeonControlPlaneClient {
         self.endpoint.url().as_str()
     }
 
+    async fn get_and_cache_auth_info<T>(
+        &self,
+        ctx: &RequestContext,
+        endpoint: &EndpointId,
+        role: &RoleName,
+        cache_key: &EndpointId,
+        extract: impl FnOnce(&EndpointAccessControl, &RoleAccessControl) -> T,
+    ) -> Result<T, GetAuthInfoError> {
+        match self.do_get_auth_req(ctx, endpoint, role).await {
+            Ok(auth_info) => {
+                let control = EndpointAccessControl {
+                    allowed_ips: Arc::new(auth_info.allowed_ips),
+                    allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
+                    flags: auth_info.access_blocker_flags,
+                    rate_limits: auth_info.rate_limits,
+                };
+                let role_control = RoleAccessControl {
+                    secret: auth_info.secret,
+                };
+                let res = extract(&control, &role_control);
+
+                self.caches.project_info.insert_endpoint_access(
+                    auth_info.account_id,
+                    auth_info.project_id,
+                    cache_key.into(),
+                    role.into(),
+                    control,
+                    role_control,
+                );
+
+                if let Some(project_id) = auth_info.project_id {
+                    ctx.set_project_id(project_id);
+                }
+
+                Ok(res)
+            }
+            Err(err) => match err {
+                GetAuthInfoError::ApiError(ControlPlaneError::Message(ref msg)) => {
+                    let retry_info = msg.status.as_ref().and_then(|s| s.details.retry_info);
+
+                    // If we can retry this error, do not cache it,
+                    // unless we were given a retry delay.
+                    if msg.could_retry() && retry_info.is_none() {
+                        return Err(err);
+                    }
+
+                    self.caches.project_info.insert_endpoint_access_err(
+                        cache_key.into(),
+                        role.into(),
+                        msg.clone(),
+                        retry_info.map(|r| Duration::from_millis(r.retry_delay_ms)),
+                    );
+
+                    Err(err)
+                }
+                err => Err(err),
+            },
+        }
+    }
+
     async fn do_get_auth_req(
         &self,
         ctx: &RequestContext,
@@ -284,43 +344,34 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
         ctx: &RequestContext,
         endpoint: &EndpointId,
         role: &RoleName,
-    ) -> Result<RoleAccessControl, crate::control_plane::errors::GetAuthInfoError> {
-        let normalized_ep = &endpoint.normalize();
-        if let Some(secret) = self
+    ) -> Result<RoleAccessControl, GetAuthInfoError> {
+        let key = endpoint.normalize();
+
+        if let Some((role_control, ttl)) = self
             .caches
             .project_info
-            .get_role_secret(normalized_ep, role)
+            .get_role_secret_with_ttl(&key, role)
         {
-            return Ok(secret);
+            return match role_control {
+                Err(mut msg) => {
+                    info!(key = &*key, "found cached get_role_access_control error");
+
+                    // if retry_delay_ms is set change it to the remaining TTL
+                    replace_retry_delay_ms(&mut msg, |_| ttl.as_millis() as u64);
+
+                    Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg)))
+                }
+                Ok(role_control) => {
+                    debug!(key = &*key, "found cached role access control");
+                    Ok(role_control)
+                }
+            };
         }
 
-        let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
-
-        let control = EndpointAccessControl {
-            allowed_ips: Arc::new(auth_info.allowed_ips),
-            allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
-            flags: auth_info.access_blocker_flags,
-            rate_limits: auth_info.rate_limits,
-        };
-        let role_control = RoleAccessControl {
-            secret: auth_info.secret,
-        };
-
-        if let Some(project_id) = auth_info.project_id {
-            let normalized_ep_int = normalized_ep.into();
-
-            self.caches.project_info.insert_endpoint_access(
-                auth_info.account_id,
-                project_id,
-                normalized_ep_int,
-                role.into(),
-                control,
-                role_control.clone(),
-            );
-            ctx.set_project_id(project_id);
-        }
-
-        Ok(role_control)
+        self.get_and_cache_auth_info(ctx, endpoint, role, &key, |_, role_control| {
+            role_control.clone()
+        })
+        .await
     }
 
     #[tracing::instrument(skip_all)]
@@ -330,38 +381,30 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
         endpoint: &EndpointId,
         role: &RoleName,
     ) -> Result<EndpointAccessControl, GetAuthInfoError> {
-        let normalized_ep = &endpoint.normalize();
-        if let Some(control) = self.caches.project_info.get_endpoint_access(normalized_ep) {
-            return Ok(control);
+        let key = endpoint.normalize();
+
+        if let Some((control, ttl)) = self.caches.project_info.get_endpoint_access_with_ttl(&key) {
+            return match control {
+                Err(mut msg) => {
+                    info!(
+                        key = &*key,
+                        "found cached get_endpoint_access_control error"
+                    );
+
+                    // if retry_delay_ms is set change it to the remaining TTL
+                    replace_retry_delay_ms(&mut msg, |_| ttl.as_millis() as u64);
+
+                    Err(GetAuthInfoError::ApiError(ControlPlaneError::Message(msg)))
+                }
+                Ok(control) => {
+                    debug!(key = &*key, "found cached endpoint access control");
+                    Ok(control)
+                }
+            };
         }
 
-        let auth_info = self.do_get_auth_req(ctx, endpoint, role).await?;
-
-        let control = EndpointAccessControl {
-            allowed_ips: Arc::new(auth_info.allowed_ips),
-            allowed_vpce: Arc::new(auth_info.allowed_vpc_endpoint_ids),
-            flags: auth_info.access_blocker_flags,
-            rate_limits: auth_info.rate_limits,
-        };
-        let role_control = RoleAccessControl {
-            secret: auth_info.secret,
-        };
-
-        if let Some(project_id) = auth_info.project_id {
-            let normalized_ep_int = normalized_ep.into();
-
-            self.caches.project_info.insert_endpoint_access(
-                auth_info.account_id,
-                project_id,
-                normalized_ep_int,
-                role.into(),
-                control.clone(),
-                role_control,
-            );
-            ctx.set_project_id(project_id);
-        }
-
-        Ok(control)
+        self.get_and_cache_auth_info(ctx, endpoint, role, &key, |control, _| control.clone())
+            .await
     }
 
     #[tracing::instrument(skip_all)]
@@ -390,13 +433,9 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
                             info!(key = &*key, "found cached wake_compute error");
 
                             // if retry_delay_ms is set, reduce it by the amount of time it spent in cache
-                            if let Some(status) = &mut msg.status {
-                                if let Some(retry_info) = &mut status.details.retry_info {
-                                    retry_info.retry_delay_ms = retry_info
-                                        .retry_delay_ms
-                                        .saturating_sub(created_at.elapsed().as_millis() as u64)
-                                }
-                            }
+                            replace_retry_delay_ms(&mut msg, |delay| {
+                                delay.saturating_sub(created_at.elapsed().as_millis() as u64)
+                            });
 
                             Err(WakeComputeError::ControlPlane(ControlPlaneError::Message(
                                 msg,
@@ -478,6 +517,14 @@ impl super::ControlPlaneApi for NeonControlPlaneClient {
     }
 }
 
+fn replace_retry_delay_ms(msg: &mut ControlPlaneErrorMessage, f: impl FnOnce(u64) -> u64) {
+    if let Some(status) = &mut msg.status
+        && let Some(retry_info) = &mut status.details.retry_info
+    {
+        retry_info.retry_delay_ms = f(retry_info.retry_delay_ms);
+    }
+}
+
 /// Parse http response body, taking status code into account.
 fn parse_body<T: for<'a> serde::Deserialize<'a>>(
     status: StatusCode,
diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs
index 12843e48c7..1e43010957 100644
--- a/proxy/src/control_plane/errors.rs
+++ b/proxy/src/control_plane/errors.rs
@@ -52,7 +52,7 @@ impl ReportableError for ControlPlaneError {
                 | Reason::EndpointNotFound
                 | Reason::EndpointDisabled
                 | Reason::BranchNotFound
-                | Reason::InvalidEphemeralEndpointOptions => ErrorKind::User,
+                | Reason::WrongLsnOrTimestamp => ErrorKind::User,
 
                 Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit,
 
diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs
index cf193ed268..d44d7efcc3 100644
--- a/proxy/src/control_plane/messages.rs
+++ b/proxy/src/control_plane/messages.rs
@@ -107,7 +107,7 @@ pub(crate) struct ErrorInfo {
     // Schema could also have `metadata` field, but it's not structured. Skip it for now.
 }
 
-#[derive(Clone, Copy, Debug, Deserialize, Default)]
+#[derive(Clone, Copy, Debug, Deserialize, Default, PartialEq, Eq)]
 pub(crate) enum Reason {
     /// RoleProtected indicates that the role is protected and the attempted operation is not permitted on protected roles.
     #[serde(rename = "ROLE_PROTECTED")]
@@ -133,9 +133,9 @@ pub(crate) enum Reason {
     /// or that the subject doesn't have enough permissions to access the requested branch.
     #[serde(rename = "BRANCH_NOT_FOUND")]
     BranchNotFound,
-    /// InvalidEphemeralEndpointOptions indicates that the specified LSN or timestamp are wrong.
-    #[serde(rename = "INVALID_EPHEMERAL_OPTIONS")]
-    InvalidEphemeralEndpointOptions,
+    /// WrongLsnOrTimestamp indicates that the specified LSN or timestamp are wrong.
+    #[serde(rename = "WRONG_LSN_OR_TIMESTAMP")]
+    WrongLsnOrTimestamp,
     /// RateLimitExceeded indicates that the rate limit for the operation has been exceeded.
     #[serde(rename = "RATE_LIMIT_EXCEEDED")]
     RateLimitExceeded,
@@ -205,7 +205,7 @@ impl Reason {
             | Reason::EndpointNotFound
             | Reason::EndpointDisabled
             | Reason::BranchNotFound
-            | Reason::InvalidEphemeralEndpointOptions => false,
+            | Reason::WrongLsnOrTimestamp => false,
             // we were asked to go away
             Reason::RateLimitExceeded
             | Reason::NonDefaultBranchComputeTimeExceeded
@@ -257,19 +257,19 @@ pub(crate) struct GetEndpointAccessControl {
     pub(crate) rate_limits: EndpointRateLimitConfig,
 }
 
-#[derive(Copy, Clone, Deserialize, Default)]
+#[derive(Copy, Clone, Deserialize, Default, Debug)]
 pub struct EndpointRateLimitConfig {
     pub connection_attempts: ConnectionAttemptsLimit,
 }
 
-#[derive(Copy, Clone, Deserialize, Default)]
+#[derive(Copy, Clone, Deserialize, Default, Debug)]
 pub struct ConnectionAttemptsLimit {
     pub tcp: Option<LeakyBucketSetting>,
     pub ws: Option<LeakyBucketSetting>,
     pub http: Option<LeakyBucketSetting>,
 }
 
-#[derive(Copy, Clone, Deserialize)]
+#[derive(Copy, Clone, Deserialize, Debug)]
 pub struct LeakyBucketSetting {
     pub rps: f64,
     pub burst: f64,
diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs
index a8c59dad0c..9bbd3f4fb7 100644
--- a/proxy/src/control_plane/mod.rs
+++ b/proxy/src/control_plane/mod.rs
@@ -82,7 +82,7 @@ impl NodeInfo {
     }
 }
 
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, Default, Debug)]
 pub(crate) struct AccessBlockerFlags {
     pub public_access_blocked: bool,
     pub vpc_access_blocked: bool,
@@ -92,12 +92,12 @@ pub(crate) type NodeInfoCache =
     TimedLru<EndpointCacheKey, Result<NodeInfo, Box<ControlPlaneErrorMessage>>>;
 pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>;
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct RoleAccessControl {
     pub secret: Option<AuthSecret>,
 }
 
-#[derive(Clone)]
+#[derive(Clone, Debug)]
 pub struct EndpointAccessControl {
     pub allowed_ips: Arc<Vec<IpPattern>>,
     pub allowed_vpce: Arc<Vec<String>>,

From 791b5d736b921d54aed868a944f522d551ad0a8e Mon Sep 17 00:00:00 2001
From: Paul Banks <banks@banksco.de>
Date: Fri, 18 Jul 2025 18:09:20 +0100
Subject: [PATCH 162/163] Fixes #10441: control_plane README incorrect neon
 init args (#12646)

## Problem

As reported in #10441 the `control_plane/README/md` incorrectly
specified that `--pg-version` should be specified in the `cargo neon
init` command. This is not the case and causes an invalid argument
error.

## Summary of changes

Fix the README

## Test Plan

I verified that the steps in the README now work locally. I connected to
the started postgres endpoint and executed some basic metadata queries.
---
 control_plane/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/control_plane/README.md b/control_plane/README.md
index aa6f935e27..60c6120d82 100644
--- a/control_plane/README.md
+++ b/control_plane/README.md
@@ -8,10 +8,10 @@ code changes locally, but not suitable for running production systems.
 
 ## Example: Start with Postgres 16
 
-To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 3 of the start-up commands.
+To create and start a local development environment with Postgres 16, you will need to provide `--pg-version` flag to 2 of the start-up commands.
 
 ```shell
-cargo neon init --pg-version 16
+cargo neon init
 cargo neon start
 cargo neon tenant create --set-default --pg-version 16
 cargo neon endpoint create main --pg-version 16

From dc35bda074db534ecd728835070d693c3967fe2e Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Fri, 18 Jul 2025 01:11:29 +0300
Subject: [PATCH 163/163] WIP: Implement LFC prewarming

This doesn't pass the tests yet, immediate issue is that we'r emissing
some stats that the tests depend on. And there's a lot more cleanup,
commenting etc. to do. But this is roughly how it should look like.
---
 pgxn/neon/Makefile                            |   1 +
 .../communicator/src/backend_interface.rs     |  48 ++
 .../neon/communicator/src/integrated_cache.rs |  21 +
 pgxn/neon/communicator_new.c                  |  40 ++
 pgxn/neon/communicator_new.h                  |   3 +
 pgxn/neon/file_cache.c                        | 424 +-----------
 pgxn/neon/file_cache.h                        |  17 +-
 pgxn/neon/lfc_prewarm.c                       | 654 ++++++++++++++++++
 pgxn/neon/lfc_prewarm.h                       |  39 ++
 pgxn/neon/neon.c                              |   3 +
 pgxn/neon/neon_pgversioncompat.h              |  24 +-
 11 files changed, 835 insertions(+), 439 deletions(-)
 create mode 100644 pgxn/neon/lfc_prewarm.c
 create mode 100644 pgxn/neon/lfc_prewarm.h

diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile
index 322ab039f5..3ea7a946cf 100644
--- a/pgxn/neon/Makefile
+++ b/pgxn/neon/Makefile
@@ -9,6 +9,7 @@ OBJS = \
 	extension_server.o \
 	file_cache.o \
 	hll.o \
+	lfc_prewarm.o \
 	libpagestore.o \
 	logical_replication_monitor.o \
 	neon.o \
diff --git a/pgxn/neon/communicator/src/backend_interface.rs b/pgxn/neon/communicator/src/backend_interface.rs
index 45715abee5..abc982193e 100644
--- a/pgxn/neon/communicator/src/backend_interface.rs
+++ b/pgxn/neon/communicator/src/backend_interface.rs
@@ -201,6 +201,54 @@ pub extern "C" fn bcomm_cache_contains(
     )
 }
 
+#[repr(C)]
+#[derive(Clone, Debug)]
+pub struct FileCacheIterator {
+    next_bucket: u64,
+
+    pub spc_oid: COid,
+    pub db_oid: COid,
+    pub rel_number: u32,
+    pub fork_number: u8,
+    pub block_number: u32,
+}
+
+/// Iterate over LFC contents
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_cache_iterate_begin(_bs: &mut CommunicatorBackendStruct, iter: *mut FileCacheIterator) {
+    unsafe { (*iter).next_bucket = 0 };
+}
+#[unsafe(no_mangle)]
+pub extern "C" fn bcomm_cache_iterate_next(bs: &mut CommunicatorBackendStruct, iter: *mut FileCacheIterator) -> bool {
+    use crate::integrated_cache::GetBucketResult;
+    loop {
+        let next_bucket = unsafe { (*iter).next_bucket } as usize;
+        match bs.integrated_cache.get_bucket(next_bucket) {
+            GetBucketResult::Occupied(rel, blk) => {
+                unsafe {
+                    (*iter).spc_oid = rel.spcnode;
+                    (*iter).db_oid = rel.dbnode;
+                    (*iter).rel_number = rel.relnode;
+                    (*iter).fork_number = rel.forknum;
+                    (*iter).block_number = blk;
+
+                    (*iter).next_bucket += 1;
+                }
+                break true;
+            },
+            GetBucketResult::Vacant => {
+                unsafe {
+                    (*iter).next_bucket += 1;
+                }
+                continue;
+            }
+            GetBucketResult::OutOfBounds => {
+                break false;
+            }
+        }
+    }
+}
+
 impl<'t> CommunicatorBackendStruct<'t> {
     /// The slot must be free, or this panics.
     pub(crate) fn start_neon_io_request(&mut self, request_slot_idx: i32, request: &NeonIORequest) {
diff --git a/pgxn/neon/communicator/src/integrated_cache.rs b/pgxn/neon/communicator/src/integrated_cache.rs
index a69af44492..e43e76b1b5 100644
--- a/pgxn/neon/communicator/src/integrated_cache.rs
+++ b/pgxn/neon/communicator/src/integrated_cache.rs
@@ -717,6 +717,12 @@ fn get_rel_size(
     }
 }
 
+pub enum GetBucketResult {
+    Occupied(RelTag, u32),
+    Vacant,
+    OutOfBounds,
+}
+
 /// Accessor for other backends
 ///
 /// This allows backends to read pages from the cache directly, on their own, without making a
@@ -739,6 +745,21 @@ impl<'t> IntegratedCacheReadAccess<'t> {
             .get(&BlockKey::from((rel, block_number)))
             .is_some()
     }
+
+    pub fn get_bucket(&self, bucket_no: usize) -> GetBucketResult {
+        match self.block_map.get_at_bucket(bucket_no).as_deref() {
+            None => {
+                // free bucket, or out of bounds
+                if bucket_no >= self.block_map.get_num_buckets() {
+                    GetBucketResult::OutOfBounds
+                } else {
+                    GetBucketResult::Vacant
+                }
+            }
+            Some((key, _)) => GetBucketResult::Occupied(key.rel, key.block_number),
+        }
+    }
+
 }
 
 pub struct BackendCacheReadOp<'t> {
diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c
index 68501f4ca2..cb0bbc5ee0 100644
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -40,6 +40,7 @@
 #include "storage/spin.h"
 #include "tcop/tcopprot.h"
 
+#include "bitmap.h"
 #include "communicator_new.h"
 #include "hll.h"
 #include "neon.h"
@@ -670,6 +671,45 @@ communicator_new_cache_contains(NRelFileInfo rinfo, ForkNumber forkNum,
 								blockno);
 }
 
+/* Dump a list of blocks in the LFC, for use in prewarming later */
+FileCacheState *
+communicator_new_get_lfc_state(size_t max_entries)
+{
+	struct FileCacheIterator iter;
+	FileCacheState* fcs;
+	uint8	   *bitmap;
+	/* TODO: Max(max_entries, <current # of entries in cache>) */
+	size_t		n_entries = max_entries;
+	size_t		state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries, 1);
+	size_t		n_pages = 0;
+
+	fcs = (FileCacheState *) palloc0(state_size);
+	SET_VARSIZE(fcs, state_size);
+	fcs->magic = FILE_CACHE_STATE_MAGIC;
+	fcs->chunk_size_log = 0;
+	fcs->n_chunks = n_entries;
+	bitmap = FILE_CACHE_STATE_BITMAP(fcs);
+
+	bcomm_cache_iterate_begin(my_bs, &iter);
+	while (n_pages < max_entries && bcomm_cache_iterate_next(my_bs, &iter))
+	{
+		BufferTag tag;
+
+		BufTagInit(tag, iter.rel_number, iter.fork_number, iter.block_number, iter.spc_oid, iter.db_oid);
+		fcs->chunks[n_pages] = tag;
+		n_pages++;
+	}
+
+	/* fill bitmap. TODO: memset would be more efficient, but this is a silly format anyway */
+	for (size_t i = 0; i < n_pages; i++)
+	{
+		BITMAP_SET(bitmap, i);
+	}
+	fcs->n_pages = n_pages;
+
+	return fcs;
+}
+
 /*
  * Drain all in-flight requests from the queue.
  *
diff --git a/pgxn/neon/communicator_new.h b/pgxn/neon/communicator_new.h
index 7fbc167f0f..8de2fab57a 100644
--- a/pgxn/neon/communicator_new.h
+++ b/pgxn/neon/communicator_new.h
@@ -12,6 +12,7 @@
 #ifndef COMMUNICATOR_NEW_H
 #define COMMUNICATOR_NEW_H
 
+#include "lfc_prewarm.h"
 #include "neon_pgversioncompat.h"
 
 #include "storage/buf_internals.h"
@@ -61,4 +62,6 @@ extern void communicator_new_update_cached_rel_size(NRelFileInfo rinfo, ForkNumb
 /* other functions */
 extern int32 communicator_new_approximate_working_set_size_seconds(time_t duration, bool reset);
 
+extern FileCacheState *communicator_new_get_lfc_state(size_t max_entries);
+
 #endif							/* COMMUNICATOR_NEW_H */
diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c
index 91d8dac274..7c408c82da 100644
--- a/pgxn/neon/file_cache.c
+++ b/pgxn/neon/file_cache.c
@@ -134,15 +134,6 @@ typedef struct FileCacheEntry
 #define N_COND_VARS 	64
 #define CV_WAIT_TIMEOUT	10
 
-#define MAX_PREWARM_WORKERS 8
-
-typedef struct PrewarmWorkerState
-{
-	uint32		prewarmed_pages;
-	uint32		skipped_pages;
-	TimestampTz completed;
-} PrewarmWorkerState;
-
 typedef struct FileCacheControl
 {
 	uint64		generation;		/* generation is needed to handle correct hash
@@ -188,47 +179,27 @@ typedef struct FileCacheControl
 	 *   again.
 	 */
 	HyperLogLogState wss_estimation;
-
-	/* Prewarmer state */
-	PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
-	size_t n_prewarm_workers;
-	size_t n_prewarm_entries;
-	size_t total_prewarm_pages;
-	size_t prewarm_batch;
-	bool   prewarm_active;
-	bool   prewarm_canceled;
-	dsm_handle prewarm_lfc_state_handle;
 } FileCacheControl;
 
-#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
-
-#define FILE_CACHE_STATE_BITMAP(fcs)	((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
-#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks)	(sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * lfc_blocks_per_chunk)+7)/8)
-#define FILE_CACHE_STATE_SIZE(fcs)		(sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
-
 static HTAB *lfc_hash;
 static int	lfc_desc = -1;
 static LWLockId lfc_lock;
 int	lfc_max_size;
 int	lfc_size_limit;
-static int	lfc_prewarm_limit;
-static int	lfc_prewarm_batch;
 static int	lfc_chunk_size_log = MAX_BLOCKS_PER_CHUNK_LOG;
 static int	lfc_blocks_per_chunk = MAX_BLOCKS_PER_CHUNK;
 char *lfc_path;
 static uint64 lfc_generation;
 static FileCacheControl *lfc_ctl;
-static bool lfc_do_prewarm;
 
 bool lfc_store_prefetch_result;
 bool lfc_prewarm_update_ws_estimation;
 
-bool AmPrewarmWorker;
+bool lfc_do_prewarm;
+bool lfc_prewarm_cancel;
 
 #define LFC_ENABLED() (lfc_ctl->limit != 0)
 
-PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
-
 /*
  * Close LFC file if opened.
  * All backends should close their LFC files once LFC is disabled.
@@ -611,34 +582,13 @@ lfc_init(void)
 							lfc_check_chunk_size,
 							lfc_change_chunk_size,
 							NULL);
-
-	DefineCustomIntVariable("neon.file_cache_prewarm_limit",
-							"Maximal number of prewarmed chunks",
-							NULL,
-							&lfc_prewarm_limit,
-							INT_MAX,	/* no limit by default */
-							0,
-							INT_MAX,
-							PGC_SIGHUP,
-							0,
-							NULL,
-							NULL,
-							NULL);
-
-	DefineCustomIntVariable("neon.file_cache_prewarm_batch",
-							"Number of pages retrivied by prewarm from page server",
-							NULL,
-							&lfc_prewarm_batch,
-							64,
-							1,
-							INT_MAX,
-							PGC_SIGHUP,
-							0,
-							NULL,
-							NULL,
-							NULL);
 }
 
+/*
+ * Dump a list of pages that are currently in the LFC
+ *
+ * This is used to get a snapshot that can be used to prewarm the LFC later.
+ */
 FileCacheState*
 lfc_get_state(size_t max_entries)
 {
@@ -656,7 +606,7 @@ lfc_get_state(size_t max_entries)
 		uint8* bitmap;
 		size_t n_pages = 0;
 		size_t n_entries = Min(max_entries, lfc_ctl->used - lfc_ctl->pinned);
-		size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries);
+		size_t state_size = FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_entries, lfc_blocks_per_chunk);
 		fcs = (FileCacheState*)palloc0(state_size);
 		SET_VARSIZE(fcs, state_size);
 		fcs->magic = FILE_CACHE_STATE_MAGIC;
@@ -690,270 +640,6 @@ lfc_get_state(size_t max_entries)
 	return fcs;
 }
 
-/*
- * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to load prewarmed page without hoilding shared buffer lock
- * and avoid race conditions with other backends.
- */
-void
-lfc_prewarm(FileCacheState* fcs, uint32 n_workers)
-{
-	size_t fcs_chunk_size_log;
-	size_t n_entries;
-	size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
-	size_t fcs_size;
-	dsm_segment *seg;
-	BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
-
-	Assert(!neon_use_communicator_worker);
-
-	if (!lfc_ensure_opened())
-		return;
-
-	if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
-	{
-		elog(LOG, "LFC: prewarm is disabled");
-		return;
-	}
-
-	if (n_workers > MAX_PREWARM_WORKERS)
-	{
-		elog(ERROR, "LFC: Too much prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
-	}
-
-	if (fcs == NULL || fcs->n_chunks == 0)
-	{
-		elog(LOG, "LFC: nothing to prewarm");
-		return;
-	}
-
-	if (fcs->magic != FILE_CACHE_STATE_MAGIC)
-	{
-		elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
-	}
-
-	fcs_size = VARSIZE(fcs);
-	if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
-	{
-		elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
-	}
-
-	fcs_chunk_size_log = fcs->chunk_size_log;
-	if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
-	{
-		elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
-	}
-
-	n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
-	Assert(n_entries != 0);
-
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-
-	/* Do not prewarm more entries than LFC limit */
-	if (lfc_ctl->limit <= lfc_ctl->size)
-	{
-		elog(LOG, "LFC: skip prewarm because LFC is already filled");
-		LWLockRelease(lfc_lock);
-		return;
-	}
-
-	if (lfc_ctl->prewarm_active)
-	{
-		LWLockRelease(lfc_lock);
-		elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
-	}
-	lfc_ctl->n_prewarm_entries = n_entries;
-	lfc_ctl->n_prewarm_workers = n_workers;
-	lfc_ctl->prewarm_active = true;
-	lfc_ctl->prewarm_canceled = false;
-	lfc_ctl->prewarm_batch = prewarm_batch;
-	memset(lfc_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
-
-	LWLockRelease(lfc_lock);
-
-	/* Calculate total number of pages to be prewarmed */
-	lfc_ctl->total_prewarm_pages = fcs->n_pages;
-
-	seg = dsm_create(fcs_size, 0);
-	memcpy(dsm_segment_address(seg), fcs, fcs_size);
-	lfc_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
-
-	/* Spawn background workers */
-	for (uint32 i = 0; i < n_workers; i++)
-	{
-		BackgroundWorker worker = {0};
-
-		worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
-		worker.bgw_start_time = BgWorkerStart_ConsistentState;
-		worker.bgw_restart_time = BGW_NEVER_RESTART;
-		strcpy(worker.bgw_library_name, "neon");
-		strcpy(worker.bgw_function_name, "lfc_prewarm_main");
-		snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
-		strcpy(worker.bgw_type, "LFC prewarm worker");
-		worker.bgw_main_arg = Int32GetDatum(i);
-		/* must set notify PID to wait for shutdown */
-		worker.bgw_notify_pid = MyProcPid;
-
-		if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
-		{
-			ereport(LOG,
-					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
-					 errmsg("LFC: registering dynamic bgworker prewarm failed"),
-					 errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
-			n_workers = i;
-			lfc_ctl->prewarm_canceled = true;
-			break;
-		}
-	}
-
-	for (uint32 i = 0; i < n_workers; i++)
-	{
-		bool interrupted;
-		do
-		{
-			interrupted = false;
-			PG_TRY();
-			{
-				BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
-				if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
-				{
-					elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
-				}
-			}
-			PG_CATCH();
-			{
-				elog(LOG, "LFC: cancel prewarm");
-				lfc_ctl->prewarm_canceled = true;
-				interrupted = true;
-			}
-			PG_END_TRY();
-		} while (interrupted);
-
-		if (!lfc_ctl->prewarm_workers[i].completed)
-		{
-			/* Background worker doesn't set completion time: it means that it was abnormally terminated */
-			elog(LOG, "LFC: prewarm worker %d failed", i+1);
-			/* Set completion time to prevent get_prewarm_info from considering this worker as active */
-			lfc_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
-		}
-	}
-	dsm_detach(seg);
-
-	LWLockAcquire(lfc_lock, LW_EXCLUSIVE);
-	lfc_ctl->prewarm_active = false;
-	LWLockRelease(lfc_lock);
-}
-
-void
-lfc_prewarm_main(Datum main_arg)
-{
-	size_t snd_idx = 0, rcv_idx = 0;
-	size_t n_sent = 0, n_received = 0;
-	size_t fcs_chunk_size_log;
-	size_t max_prefetch_pages;
-	size_t prewarm_batch;
-	size_t n_workers;
-	dsm_segment *seg;
-	FileCacheState* fcs;
-	uint8* bitmap;
-	BufferTag tag;
-	PrewarmWorkerState* ws;
-	uint32 worker_id = DatumGetInt32(main_arg);
-
-	Assert(!neon_use_communicator_worker);
-
-	AmPrewarmWorker = true;
-
-	pqsignal(SIGTERM, die);
-	BackgroundWorkerUnblockSignals();
-
-	seg = dsm_attach(lfc_ctl->prewarm_lfc_state_handle);
-	if (seg == NULL)
-		ereport(ERROR,
-				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
-				 errmsg("could not map dynamic shared memory segment")));
-
-	fcs = (FileCacheState*) dsm_segment_address(seg);
-	prewarm_batch = lfc_ctl->prewarm_batch;
-	fcs_chunk_size_log = fcs->chunk_size_log;
-	n_workers = lfc_ctl->n_prewarm_workers;
-	max_prefetch_pages = lfc_ctl->n_prewarm_entries << fcs_chunk_size_log;
-	ws = &lfc_ctl->prewarm_workers[worker_id];
-	bitmap = FILE_CACHE_STATE_BITMAP(fcs);
-
-	/* enable prefetch in LFC */
-	lfc_store_prefetch_result = true;
-	lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */
-
-	elog(LOG, "LFC: worker %d start prewarming", worker_id);
-	while (!lfc_ctl->prewarm_canceled)
-	{
-		if (snd_idx < max_prefetch_pages)
-		{
-			if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
-			{
-				/* If there are multiple workers, split chunks between them */
-				snd_idx += 1 << fcs_chunk_size_log;
-			}
-			else
-			{
-				if (BITMAP_ISSET(bitmap, snd_idx))
-				{
-					tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
-					tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
-					if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
-					{
-						(void)communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
-						n_sent += 1;
-					}
-					else
-					{
-						ws->skipped_pages += 1;
-						BITMAP_CLR(bitmap, snd_idx);
-					}
-				}
-				snd_idx += 1;
-			}
-		}
-		if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
-		{
-			if (n_received == n_sent && snd_idx == max_prefetch_pages)
-			{
-				break;
-			}
-			if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
-			{
-				/* Skip chunks processed by other workers */
-				rcv_idx += 1 << fcs_chunk_size_log;
-				continue;
-			}
-
-			/* Locate next block to prefetch */
-			while (!BITMAP_ISSET(bitmap, rcv_idx))
-			{
-				rcv_idx += 1;
-			}
-			tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
-			tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
-			if (communicator_prefetch_receive(tag))
-			{
-				ws->prewarmed_pages += 1;
-			}
-			else
-			{
-				ws->skipped_pages += 1;
-			}
-			rcv_idx += 1;
-			n_received += 1;
-		}
-	}
-	/* No need to perform prefetch cleanup here because prewarm worker will be terminated and
-	 * connection to PS dropped just after return from this function.
-	 */
-	Assert(n_sent == n_received || lfc_ctl->prewarm_canceled);
-	elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
-	lfc_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
-}
-
 void
 lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks)
 {
@@ -1466,7 +1152,7 @@ lfc_init_new_entry(FileCacheEntry* entry, uint32 hash)
 		/* Can't add this chunk - we don't have the space for it */
 		hash_search_with_hash_value(lfc_hash, &entry->key, hash,
 									HASH_REMOVE, NULL);
-		lfc_ctl->prewarm_canceled = true; /* cancel prewarm if LFC limit is reached */
+		lfc_prewarm_cancel = true; /* cancel prewarm if LFC limit is reached */
 		return false;
 	}
 
@@ -2176,95 +1862,3 @@ lfc_approximate_working_set_size_seconds(time_t duration, bool reset)
 		memset(lfc_ctl->wss_estimation.regs, 0, sizeof lfc_ctl->wss_estimation.regs);
 	return dc;
 }
-
-PG_FUNCTION_INFO_V1(get_local_cache_state);
-
-Datum
-get_local_cache_state(PG_FUNCTION_ARGS)
-{
-	size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
-	FileCacheState* fcs;
-
-	if (neon_use_communicator_worker)
-		elog(ERROR, "TODO: not implemented");
-
-	fcs = lfc_get_state(max_entries);
-
-	if (fcs != NULL)
-		PG_RETURN_BYTEA_P((bytea*)fcs);
-	else
-		PG_RETURN_NULL();
-}
-
-PG_FUNCTION_INFO_V1(prewarm_local_cache);
-
-Datum
-prewarm_local_cache(PG_FUNCTION_ARGS)
-{
-	bytea* state = PG_GETARG_BYTEA_PP(0);
-	uint32 n_workers =  PG_GETARG_INT32(1);
-	FileCacheState* fcs;
-
-	if (neon_use_communicator_worker)
-		elog(ERROR, "TODO: not implemented");
-
-	fcs = (FileCacheState*)state;
-	lfc_prewarm(fcs, n_workers);
-
-	PG_RETURN_NULL();
-}
-
-PG_FUNCTION_INFO_V1(get_prewarm_info);
-
-Datum
-get_prewarm_info(PG_FUNCTION_ARGS)
-{
-	Datum		values[4];
-	bool		nulls[4];
-	TupleDesc	tupdesc;
-	uint32 prewarmed_pages = 0;
-	uint32 skipped_pages = 0;
-	uint32 active_workers = 0;
-	uint32 total_pages;
-	size_t n_workers;
-
-	if (neon_use_communicator_worker)
-		elog(ERROR, "TODO: not implemented");
-
-	if (lfc_size_limit == 0)
-		PG_RETURN_NULL();
-
-	LWLockAcquire(lfc_lock, LW_SHARED);
-	if (!lfc_ctl || lfc_ctl->n_prewarm_workers == 0)
-	{
-		LWLockRelease(lfc_lock);
-		PG_RETURN_NULL();
-	}
-	n_workers = lfc_ctl->n_prewarm_workers;
-	total_pages = lfc_ctl->total_prewarm_pages;
-	for (size_t i = 0; i < n_workers; i++)
-	{
-		PrewarmWorkerState* ws = &lfc_ctl->prewarm_workers[i];
-		prewarmed_pages += ws->prewarmed_pages;
-		skipped_pages += ws->skipped_pages;
-		active_workers += ws->completed != 0;
-	}
-	LWLockRelease(lfc_lock);
-
-	tupdesc = CreateTemplateTupleDesc(4);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
-	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
-	tupdesc = BlessTupleDesc(tupdesc);
-
-	MemSet(nulls, 0, sizeof(nulls));
-
-	values[0] = Int32GetDatum(total_pages);
-	values[1] = Int32GetDatum(prewarmed_pages);
-	values[2] = Int32GetDatum(skipped_pages);
-	values[3] = Int32GetDatum(active_workers);
-
-	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
-}
-
diff --git a/pgxn/neon/file_cache.h b/pgxn/neon/file_cache.h
index f8056e22ff..fd79eee532 100644
--- a/pgxn/neon/file_cache.h
+++ b/pgxn/neon/file_cache.h
@@ -11,18 +11,9 @@
 #ifndef FILE_CACHE_h
 #define FILE_CACHE_h
 
-#include "neon_pgversioncompat.h"
+#include "lfc_prewarm.h"
 
-typedef struct FileCacheState
-{
-	int32		vl_len_;		/* varlena header (do not touch directly!) */
-	uint32		magic;
-	uint32		n_chunks;
-	uint32		n_pages;
-	uint16		chunk_size_log;
-	BufferTag	chunks[FLEXIBLE_ARRAY_MEMBER];
-	/* followed by bitmap */
-} FileCacheState;
+#include "neon_pgversioncompat.h"
 
 /* GUCs */
 extern bool lfc_store_prefetch_result;
@@ -30,6 +21,9 @@ extern int	lfc_max_size;
 extern int	lfc_size_limit;
 extern char *lfc_path;
 
+extern bool lfc_do_prewarm;
+extern bool lfc_prewarm_cancel;
+
 /* functions for local file cache */
 extern void lfc_invalidate(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber nblocks);
 extern void lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum,
@@ -48,7 +42,6 @@ extern void lfc_init(void);
 extern bool lfc_prefetch(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno,
 						 const void* buffer, XLogRecPtr lsn);
 extern FileCacheState* lfc_get_state(size_t max_entries);
-extern void lfc_prewarm(FileCacheState* fcs, uint32 n_workers);
 
 extern int32 lfc_approximate_working_set_size_seconds(time_t duration, bool reset);
 
diff --git a/pgxn/neon/lfc_prewarm.c b/pgxn/neon/lfc_prewarm.c
new file mode 100644
index 0000000000..2acb805f9d
--- /dev/null
+++ b/pgxn/neon/lfc_prewarm.c
@@ -0,0 +1,654 @@
+/*-------------------------------------------------------------------------
+ *
+ * lfc_prewarm.c
+ *		Functions related to LFC prewarming
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "bitmap.h"
+#include "communicator.h"
+#include "communicator_new.h"
+#include "file_cache.h"
+#include "lfc_prewarm.h"
+#include "neon.h"
+#include "pagestore_client.h"
+
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "postmaster/bgworker.h"
+#include "storage/dsm.h"
+#include "tcop/tcopprot.h"
+#include "utils/timestamp.h"
+
+#define MAX_PREWARM_WORKERS 8
+
+typedef struct PrewarmWorkerState
+{
+	uint32		prewarmed_pages;
+	uint32		skipped_pages;
+	TimestampTz completed;
+} PrewarmWorkerState;
+
+typedef struct PrewarmControl
+{
+	/* -1 when not using workers, 0 when no prewarm has been performed */
+	size_t		n_prewarm_workers;
+	size_t		total_prewarm_pages;
+	bool		prewarm_active;
+	bool		prewarm_canceled;
+
+	/* These are used in the non-worker mode */
+	uint32		prewarmed_pages;
+	uint32		skipped_pages;
+	TimestampTz completed;
+
+	/* These are used with workers */
+	PrewarmWorkerState prewarm_workers[MAX_PREWARM_WORKERS];
+	dsm_handle	prewarm_lfc_state_handle;
+	size_t		prewarm_batch;
+	size_t		n_prewarm_entries;
+} PrewarmControl;
+
+static PrewarmControl *prewarm_ctl;
+
+static int	lfc_prewarm_limit;
+static int	lfc_prewarm_batch;
+
+static LWLockId prewarm_lock;
+
+bool AmPrewarmWorker;
+
+static void lfc_prewarm_with_workers(FileCacheState *fcs, uint32 n_workers);
+static void lfc_prewarm_with_async_requests(FileCacheState *fcs);
+PGDLLEXPORT void lfc_prewarm_main(Datum main_arg);
+
+void
+pg_init_prewarm(void)
+{
+	DefineCustomIntVariable("neon.file_cache_prewarm_limit",
+							"Maximal number of prewarmed chunks",
+							NULL,
+							&lfc_prewarm_limit,
+							INT_MAX,	/* no limit by default */
+							0,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL,
+							NULL,
+							NULL);
+
+	DefineCustomIntVariable("neon.file_cache_prewarm_batch",
+							"Number of pages retrivied by prewarm from page server",
+							NULL,
+							&lfc_prewarm_batch,
+							64,
+							1,
+							INT_MAX,
+							PGC_SIGHUP,
+							0,
+							NULL,
+							NULL,
+							NULL);
+}
+
+static size_t
+PrewarmShmemSize(void)
+{
+	return sizeof(PrewarmControl);
+}
+
+void
+PrewarmShmemRequest(void)
+{
+	RequestAddinShmemSpace(PrewarmShmemSize());
+	RequestNamedLWLockTranche("prewarm_lock", 1);
+}
+
+void
+PrewarmShmemInit(void)
+{
+	bool		found;
+
+	prewarm_ctl = (PrewarmControl *) ShmemInitStruct("Prewarmer shmem state",
+								PrewarmShmemSize(),
+								&found);
+	if (!found)
+	{
+		/* it's zeroed already */
+
+		prewarm_lock = (LWLockId) GetNamedLWLockTranche("prewarm_lock");
+	}
+}
+
+static void
+validate_fcs(FileCacheState *fcs)
+{
+	size_t fcs_size;
+#if 0
+	size_t fcs_chunk_size_log;
+#endif
+
+	if (fcs->magic != FILE_CACHE_STATE_MAGIC)
+	{
+		elog(ERROR, "LFC: Invalid file cache state magic: %X", fcs->magic);
+	}
+
+	fcs_size = VARSIZE(fcs);
+	if (FILE_CACHE_STATE_SIZE(fcs) != fcs_size)
+	{
+		elog(ERROR, "LFC: Invalid file cache state size: %u vs. %u", (unsigned)FILE_CACHE_STATE_SIZE(fcs), VARSIZE(fcs));
+	}
+
+	/* FIXME */
+#if 0
+	fcs_chunk_size_log = fcs->chunk_size_log;
+	if (fcs_chunk_size_log > MAX_BLOCKS_PER_CHUNK_LOG)
+	{
+		elog(ERROR, "LFC: Invalid chunk size log: %u", fcs->chunk_size_log);
+	}
+#endif
+}
+
+/*
+ * Prewarm LFC cache to the specified state. It uses lfc_prefetch function to
+ * load prewarmed page without hoilding shared buffer lock and avoid race
+ * conditions with other backends.
+ */
+void
+lfc_prewarm_with_workers(FileCacheState *fcs, uint32 n_workers)
+{
+	size_t n_entries;
+	size_t prewarm_batch = Min(lfc_prewarm_batch, readahead_buffer_size);
+	size_t fcs_size = VARSIZE(fcs);
+	dsm_segment *seg;
+	BackgroundWorkerHandle* bgw_handle[MAX_PREWARM_WORKERS];
+
+	Assert(!neon_use_communicator_worker);
+
+	if (prewarm_batch == 0 || lfc_prewarm_limit == 0 || n_workers == 0)
+	{
+		elog(LOG, "LFC: prewarm is disabled");
+		return;
+	}
+
+	if (n_workers > MAX_PREWARM_WORKERS)
+	{
+		elog(ERROR, "LFC: too many prewarm workers, maximum is %d", MAX_PREWARM_WORKERS);
+	}
+
+	if (fcs == NULL || fcs->n_chunks == 0)
+	{
+		elog(LOG, "LFC: nothing to prewarm");
+		return;
+	}
+
+	n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
+	Assert(n_entries != 0);
+
+	LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
+
+	/* Do not prewarm more entries than LFC limit */
+	/* FIXME */
+#if 0
+	if (prewarm_ctl->limit <= prewarm_ctl->size)
+	{
+		elog(LOG, "LFC: skip prewarm because LFC is already filled");
+		LWLockRelease(prewarm_lock);
+		return;
+	}
+#endif
+	
+	if (prewarm_ctl->prewarm_active)
+	{
+		LWLockRelease(prewarm_lock);
+		elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
+	}
+	prewarm_ctl->n_prewarm_entries = n_entries;
+	prewarm_ctl->n_prewarm_workers = n_workers;
+	prewarm_ctl->prewarm_active = true;
+	prewarm_ctl->prewarm_canceled = false;
+	prewarm_ctl->prewarm_batch = prewarm_batch;
+	memset(prewarm_ctl->prewarm_workers, 0, n_workers*sizeof(PrewarmWorkerState));
+
+	/* Calculate total number of pages to be prewarmed */
+	prewarm_ctl->total_prewarm_pages = fcs->n_pages;
+
+	LWLockRelease(prewarm_lock);
+
+	seg = dsm_create(fcs_size, 0);
+	memcpy(dsm_segment_address(seg), fcs, fcs_size);
+	prewarm_ctl->prewarm_lfc_state_handle = dsm_segment_handle(seg);
+
+	/* Spawn background workers */
+	for (uint32 i = 0; i < n_workers; i++)
+	{
+		BackgroundWorker worker = {0};
+
+		worker.bgw_flags = BGWORKER_SHMEM_ACCESS;
+		worker.bgw_start_time = BgWorkerStart_ConsistentState;
+		worker.bgw_restart_time = BGW_NEVER_RESTART;
+		strcpy(worker.bgw_library_name, "neon");
+		strcpy(worker.bgw_function_name, "lfc_prewarm_main");
+		snprintf(worker.bgw_name, BGW_MAXLEN, "LFC prewarm worker %d", i+1);
+		strcpy(worker.bgw_type, "LFC prewarm worker");
+		worker.bgw_main_arg = Int32GetDatum(i);
+		/* must set notify PID to wait for shutdown */
+		worker.bgw_notify_pid = MyProcPid;
+
+		if (!RegisterDynamicBackgroundWorker(&worker, &bgw_handle[i]))
+		{
+			ereport(LOG,
+					(errcode(ERRCODE_INSUFFICIENT_RESOURCES),
+					 errmsg("LFC: registering dynamic bgworker prewarm failed"),
+					 errhint("Consider increasing the configuration parameter \"%s\".", "max_worker_processes")));
+			n_workers = i;
+			prewarm_ctl->prewarm_canceled = true;
+			break;
+		}
+	}
+
+	for (uint32 i = 0; i < n_workers; i++)
+	{
+		bool interrupted;
+		do
+		{
+			interrupted = false;
+			PG_TRY();
+			{
+				BgwHandleStatus status = WaitForBackgroundWorkerShutdown(bgw_handle[i]);
+				if (status != BGWH_STOPPED && status != BGWH_POSTMASTER_DIED)
+				{
+					elog(LOG, "LFC: Unexpected status of prewarm worker termination: %d", status);
+				}
+			}
+			PG_CATCH();
+			{
+				elog(LOG, "LFC: cancel prewarm");
+				prewarm_ctl->prewarm_canceled = true;
+				interrupted = true;
+			}
+			PG_END_TRY();
+		} while (interrupted);
+
+		if (!prewarm_ctl->prewarm_workers[i].completed)
+		{
+			/* Background worker doesn't set completion time: it means that it was abnormally terminated */
+			elog(LOG, "LFC: prewarm worker %d failed", i+1);
+			/* Set completion time to prevent get_prewarm_info from considering this worker as active */
+			prewarm_ctl->prewarm_workers[i].completed = GetCurrentTimestamp();
+		}
+	}
+	dsm_detach(seg);
+
+	LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
+	prewarm_ctl->prewarm_active = false;
+	LWLockRelease(prewarm_lock);
+}
+
+
+void
+lfc_prewarm_main(Datum main_arg)
+{
+	size_t snd_idx = 0, rcv_idx = 0;
+	size_t n_sent = 0, n_received = 0;
+	size_t fcs_chunk_size_log;
+	size_t max_prefetch_pages;
+	size_t prewarm_batch;
+	size_t n_workers;
+	dsm_segment *seg;
+	FileCacheState* fcs;
+	uint8* bitmap;
+	BufferTag tag;
+	PrewarmWorkerState* ws;
+	uint32 worker_id = DatumGetInt32(main_arg);
+
+	Assert(!neon_use_communicator_worker);
+
+	AmPrewarmWorker = true;
+
+	pqsignal(SIGTERM, die);
+	BackgroundWorkerUnblockSignals();
+
+	seg = dsm_attach(prewarm_ctl->prewarm_lfc_state_handle);
+	if (seg == NULL)
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("could not map dynamic shared memory segment")));
+
+	fcs = (FileCacheState*) dsm_segment_address(seg);
+	prewarm_batch = prewarm_ctl->prewarm_batch;
+	fcs_chunk_size_log = fcs->chunk_size_log;
+	n_workers = prewarm_ctl->n_prewarm_workers;
+	max_prefetch_pages = prewarm_ctl->n_prewarm_entries << fcs_chunk_size_log;
+	ws = &prewarm_ctl->prewarm_workers[worker_id];
+	bitmap = FILE_CACHE_STATE_BITMAP(fcs);
+
+	/* enable prefetch in LFC */
+	lfc_store_prefetch_result = true;
+	lfc_do_prewarm = true; /* Flag for lfc_prefetch preventing replacement of existed entries if LFC cache is full */
+
+	elog(LOG, "LFC: worker %d start prewarming", worker_id);
+	while (!prewarm_ctl->prewarm_canceled)
+	{
+		if (snd_idx < max_prefetch_pages)
+		{
+			if ((snd_idx >> fcs_chunk_size_log) % n_workers != worker_id)
+			{
+				/* If there are multiple workers, split chunks between them */
+				snd_idx += 1 << fcs_chunk_size_log;
+			}
+			else
+			{
+				if (BITMAP_ISSET(bitmap, snd_idx))
+				{
+					tag = fcs->chunks[snd_idx >> fcs_chunk_size_log];
+					tag.blockNum += snd_idx & ((1 << fcs_chunk_size_log) - 1);
+					if (!lfc_cache_contains(BufTagGetNRelFileInfo(tag), tag.forkNum, tag.blockNum))
+					{
+						(void) communicator_prefetch_register_bufferv(tag, NULL, 1, NULL);
+						n_sent += 1;
+					}
+					else
+					{
+						ws->skipped_pages += 1;
+						BITMAP_CLR(bitmap, snd_idx);
+					}
+				}
+				snd_idx += 1;
+			}
+		}
+		if (n_sent >= n_received + prewarm_batch || snd_idx == max_prefetch_pages)
+		{
+			if (n_received == n_sent && snd_idx == max_prefetch_pages)
+			{
+				break;
+			}
+			if ((rcv_idx >> fcs_chunk_size_log) % n_workers != worker_id)
+			{
+				/* Skip chunks processed by other workers */
+				rcv_idx += 1 << fcs_chunk_size_log;
+				continue;
+			}
+
+			/* Locate next block to prefetch */
+			while (!BITMAP_ISSET(bitmap, rcv_idx))
+			{
+				rcv_idx += 1;
+			}
+			tag = fcs->chunks[rcv_idx >> fcs_chunk_size_log];
+			tag.blockNum += rcv_idx & ((1 << fcs_chunk_size_log) - 1);
+			if (communicator_prefetch_receive(tag))
+			{
+				ws->prewarmed_pages += 1;
+			}
+			else
+			{
+				ws->skipped_pages += 1;
+			}
+			rcv_idx += 1;
+			n_received += 1;
+		}
+	}
+	/* No need to perform prefetch cleanup here because prewarm worker will be terminated and
+	 * connection to PS dropped just after return from this function.
+	 */
+	Assert(n_sent == n_received || prewarm_ctl->prewarm_canceled);
+	elog(LOG, "LFC: worker %d complete prewarming: loaded %ld pages", worker_id, (long)n_received);
+	prewarm_ctl->prewarm_workers[worker_id].completed = GetCurrentTimestamp();
+}
+
+/*
+ * Prewarm LFC cache to the specified state. Uses the new communicator
+ *
+ * FIXME: Is there a race condition because we're not holding Postgres
+ * buffer manager locks?
+ */
+static void
+lfc_prewarm_with_async_requests(FileCacheState *fcs)
+{
+	size_t n_entries;
+	uint8	   *bitmap;
+	uint64		bitno;
+	int			blocks_per_chunk;
+
+	Assert(neon_use_communicator_worker);
+
+	if (lfc_prewarm_limit == 0)
+	{
+		elog(LOG, "LFC: prewarm is disabled");
+		return;
+	}
+
+	if (fcs == NULL || fcs->n_chunks == 0)
+	{
+		elog(LOG, "LFC: nothing to prewarm");
+		return;
+	}
+
+	n_entries = Min(fcs->n_chunks, lfc_prewarm_limit);
+	Assert(n_entries != 0);
+
+	LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
+
+	/* Do not prewarm more entries than LFC limit */
+	/* FIXME */
+#if 0
+	if (prewarm_ctl->limit <= prewarm_ctl->size)
+	{
+		elog(LOG, "LFC: skip prewarm because LFC is already filled");
+		LWLockRelease(prewarm_lock);
+		return;
+	}
+#endif
+
+	if (prewarm_ctl->prewarm_active)
+	{
+		LWLockRelease(prewarm_lock);
+		elog(ERROR, "LFC: skip prewarm because another prewarm is still active");
+	}
+	prewarm_ctl->n_prewarm_entries = n_entries;
+	prewarm_ctl->n_prewarm_workers = -1;
+	prewarm_ctl->prewarm_active = true;
+	prewarm_ctl->prewarm_canceled = false;
+
+	/* Calculate total number of pages to be prewarmed */
+	prewarm_ctl->total_prewarm_pages = fcs->n_pages;
+
+	LWLockRelease(prewarm_lock);
+
+	elog(LOG, "LFC: start prewarming");
+	lfc_do_prewarm = true;
+	lfc_prewarm_cancel = false;
+
+	bitmap = FILE_CACHE_STATE_BITMAP(fcs);
+
+	blocks_per_chunk = 1 << fcs->chunk_size_log;
+
+	bitno = 0;
+	for (uint32 chunkno = 0; chunkno < fcs->n_chunks; chunkno++)
+	{
+		BufferTag *chunk_tag = &fcs->chunks[chunkno];
+		BlockNumber request_startblkno = InvalidBlockNumber;
+		BlockNumber request_endblkno;
+
+		if (lfc_prewarm_cancel)
+		{
+			prewarm_ctl->prewarm_canceled = true;
+			break;
+		}
+
+		/* take next chunk */
+		for (int j = 0; j < blocks_per_chunk; j++)
+		{
+			BlockNumber blkno = chunk_tag->blockNum + j;
+
+			if (BITMAP_ISSET(bitmap, bitno))
+			{
+				if (request_startblkno != InvalidBlockNumber)
+				{
+					if (request_endblkno == blkno)
+					{
+						/* append this block to the request */
+						request_endblkno++;
+					}
+					else
+					{
+						/* flush this request, and start new one */
+						communicator_new_prefetch_register_bufferv(
+							BufTagGetNRelFileInfo(*chunk_tag),
+							chunk_tag->forkNum,
+							request_startblkno,
+							request_endblkno - request_startblkno
+							);
+						request_startblkno = blkno;
+						request_endblkno = blkno + 1;
+					}
+				}
+				else
+				{
+					/* flush this request, if any, and start new one */
+					if (request_startblkno != InvalidBlockNumber)
+					{
+						communicator_new_prefetch_register_bufferv(
+							BufTagGetNRelFileInfo(*chunk_tag),
+							chunk_tag->forkNum,
+							request_startblkno,
+							request_endblkno - request_startblkno
+							);
+					}
+					request_startblkno = blkno;
+					request_endblkno = blkno + 1;
+				}
+				prewarm_ctl->prewarmed_pages += 1;
+			}
+			bitno++;
+		}
+
+		/* flush this request */
+		communicator_new_prefetch_register_bufferv(
+			BufTagGetNRelFileInfo(*chunk_tag),
+			chunk_tag->forkNum,
+			request_startblkno,
+			request_endblkno - request_startblkno
+			);
+		request_startblkno = request_endblkno = InvalidBlockNumber;
+	}
+
+	Assert(n_sent == n_received || prewarm_ctl->prewarm_canceled);
+	elog(LOG, "LFC: complete prewarming: loaded %lu pages", (unsigned long) prewarm_ctl->prewarmed_pages);
+	prewarm_ctl->completed = GetCurrentTimestamp();
+
+	LWLockAcquire(prewarm_lock, LW_EXCLUSIVE);
+	prewarm_ctl->prewarm_active = false;
+	LWLockRelease(prewarm_lock);
+}
+
+PG_FUNCTION_INFO_V1(get_local_cache_state);
+
+Datum
+get_local_cache_state(PG_FUNCTION_ARGS)
+{
+	size_t max_entries = PG_ARGISNULL(0) ? lfc_prewarm_limit : PG_GETARG_INT32(0);
+	FileCacheState* fcs;
+
+	if (neon_use_communicator_worker)
+		fcs = communicator_new_get_lfc_state(max_entries);
+	else
+		fcs = lfc_get_state(max_entries);
+
+	if (fcs != NULL)
+		PG_RETURN_BYTEA_P((bytea*)fcs);
+	else
+		PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(prewarm_local_cache);
+
+Datum
+prewarm_local_cache(PG_FUNCTION_ARGS)
+{
+	bytea* state = PG_GETARG_BYTEA_PP(0);
+	uint32 n_workers =  PG_GETARG_INT32(1);
+	FileCacheState* fcs;
+
+	fcs = (FileCacheState *)state;
+	validate_fcs(fcs);
+
+	if (neon_use_communicator_worker)
+		lfc_prewarm_with_async_requests(fcs);
+	else
+		lfc_prewarm_with_workers(fcs, n_workers);
+
+	PG_RETURN_NULL();
+}
+
+PG_FUNCTION_INFO_V1(get_prewarm_info);
+
+Datum
+get_prewarm_info(PG_FUNCTION_ARGS)
+{
+	Datum		values[4];
+	bool		nulls[4];
+	TupleDesc	tupdesc;
+	uint32		prewarmed_pages = 0;
+	uint32		skipped_pages = 0;
+	uint32		active_workers = 0;
+	uint32		total_pages;
+
+	if (lfc_size_limit == 0)
+		PG_RETURN_NULL();
+
+	LWLockAcquire(prewarm_lock, LW_SHARED);
+	if (!prewarm_ctl || prewarm_ctl->n_prewarm_workers == 0)
+	{
+		LWLockRelease(prewarm_lock);
+		PG_RETURN_NULL();
+	}
+
+	if (prewarm_ctl->n_prewarm_workers == -1)
+	{
+		total_pages = prewarm_ctl->total_prewarm_pages;
+		prewarmed_pages = prewarm_ctl->prewarmed_pages;
+		skipped_pages = prewarm_ctl->prewarmed_pages;
+		active_workers = 1;
+	}
+	else
+	{
+		size_t		n_workers;
+
+		n_workers = prewarm_ctl->n_prewarm_workers;
+		total_pages = prewarm_ctl->total_prewarm_pages;
+		for (size_t i = 0; i < n_workers; i++)
+		{
+			PrewarmWorkerState *ws = &prewarm_ctl->prewarm_workers[i];
+
+			prewarmed_pages += ws->prewarmed_pages;
+			skipped_pages += ws->skipped_pages;
+			active_workers += ws->completed != 0;
+		}
+	}
+	LWLockRelease(prewarm_lock);
+
+	tupdesc = CreateTemplateTupleDesc(4);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 1, "total_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 2, "prewarmed_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 3, "skipped_pages", INT4OID, -1, 0);
+	TupleDescInitEntry(tupdesc, (AttrNumber) 4, "active_workers", INT4OID, -1, 0);
+	tupdesc = BlessTupleDesc(tupdesc);
+
+	MemSet(nulls, 0, sizeof(nulls));
+
+	values[0] = Int32GetDatum(total_pages);
+	values[1] = Int32GetDatum(prewarmed_pages);
+	values[2] = Int32GetDatum(skipped_pages);
+	values[3] = Int32GetDatum(active_workers);
+
+	PG_RETURN_DATUM(HeapTupleGetDatum(heap_form_tuple(tupdesc, values, nulls)));
+}
diff --git a/pgxn/neon/lfc_prewarm.h b/pgxn/neon/lfc_prewarm.h
new file mode 100644
index 0000000000..09d224b1fc
--- /dev/null
+++ b/pgxn/neon/lfc_prewarm.h
@@ -0,0 +1,39 @@
+/*-------------------------------------------------------------------------
+ *
+ * lfc_prewarm.h
+ *	  Local File Cache prewarmer
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef LFC_PREWARM_H
+#define LFC_PREWARM_H
+
+#include "storage/buf_internals.h"
+
+typedef struct FileCacheState
+{
+	int32		vl_len_;		/* varlena header (do not touch directly!) */
+	uint32		magic;
+	uint32		n_chunks;
+	uint32		n_pages;
+	uint16		chunk_size_log;
+	BufferTag	chunks[FLEXIBLE_ARRAY_MEMBER];
+	/* followed by bitmap */
+} FileCacheState;
+
+#define FILE_CACHE_STATE_MAGIC 0xfcfcfcfc
+
+#define FILE_CACHE_STATE_BITMAP(fcs)	((uint8*)&(fcs)->chunks[(fcs)->n_chunks])
+#define FILE_CACHE_STATE_SIZE_FOR_CHUNKS(n_chunks, blocks_per_chunk)	(sizeof(FileCacheState) + (n_chunks)*sizeof(BufferTag) + (((n_chunks) * blocks_per_chunk)+7)/8)
+#define FILE_CACHE_STATE_SIZE(fcs)		(sizeof(FileCacheState) + (fcs->n_chunks)*sizeof(BufferTag) + (((fcs->n_chunks) << fcs->chunk_size_log)+7)/8)
+
+extern void pg_init_prewarm(void);
+extern void PrewarmShmemRequest(void);
+extern void PrewarmShmemInit(void);
+
+#endif							/* LFC_PREWARM_H */
+
+
diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c
index 8efea63e72..59ecd9ab1c 100644
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -503,6 +503,7 @@ _PG_init(void)
 	pg_init_libpagestore();
 	relsize_hash_init();
 	lfc_init();
+	pg_init_prewarm();
 	pg_init_walproposer();
 	init_lwlsncache();
 
@@ -728,6 +729,7 @@ neon_shmem_request_hook(void)
 #endif
 
 	LfcShmemRequest();
+	PrewarmShmemRequest();
 	NeonPerfCountersShmemRequest();
 	PagestoreShmemRequest();
 	RelsizeCacheShmemRequest();
@@ -752,6 +754,7 @@ neon_shmem_startup_hook(void)
 	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
 
 	LfcShmemInit();
+	PrewarmShmemInit();
 	NeonPerfCountersShmemInit();
 	PagestoreShmemInit();
 	RelsizeCacheShmemInit();
diff --git a/pgxn/neon/neon_pgversioncompat.h b/pgxn/neon/neon_pgversioncompat.h
index 288b6dd42f..85646a6dc5 100644
--- a/pgxn/neon/neon_pgversioncompat.h
+++ b/pgxn/neon/neon_pgversioncompat.h
@@ -76,16 +76,16 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 		(tag).rnode = (rinfo); \
 	} while (false)
 
-#define BufTagGetNRelFileInfo(tag) tag.rnode
+#define BufTagGetNRelFileInfo(tag) (tag).rnode
 
 #define BufTagGetRelNumber(tagp) ((tagp)->rnode.relNode)
 
-#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \
+#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
 	do { \
-		RelFileNode rnode = { .spcNode = spcOid, .dbNode = dbOid, .relNode = relNumber}; \
-		(tag).forkNum = forknum; \
-		(tag).blockNum = blkno; \
-		(tag).rnode = rnode; \
+		RelFileNode rnode = { .spcNode = (spc_oid), .dbNode = (db_oid), .relNode = (rel_number)}; \
+		(tag).forkNum = (fork_number);								\
+		(tag).blockNum = (block_number);							\
+		(tag).rnode = rnode;										\
 	} while (false)
 
 #define InvalidRelFileNumber InvalidOid
@@ -137,13 +137,13 @@ InitBufferTag(BufferTag *tag, const RelFileNode *rnode,
 		.relNumber = (tag).relNumber, \
 	})
 
-#define BufTagInit(tag, relNumber, forknum, blkno, spcOid, dbOid) \
+#define BufTagInit(tag, rel_number, fork_number, block_number, spc_oid, db_oid) \
 	do { \
-		(tag).forkNum = forknum; \
-		(tag).blockNum = blkno; \
-		(tag).spcOid = spcOid; \
-		(tag).dbOid = dbOid; \
-		(tag).relNumber = relNumber; \
+		(tag).forkNum = (fork_number);					\
+		(tag).blockNum = (block_number);				\
+		(tag).spcOid = (spc_oid);						\
+		(tag).dbOid = (db_oid);							\
+		(tag).relNumber = (rel_number);					\
 	} while (false)
 
 #define SMgrRelGetRelInfo(reln) \