From 47c1640accbbf45071de91b28aadc7356775fc43 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 14 Jan 2025 21:37:32 +0000 Subject: [PATCH 01/40] storage controller: pagination for tenant listing API (#10365) ## Problem For large deployments, the `control/v1/tenant` listing API can time out transmitting a monolithic serialized response. ## Summary of changes - Add `limit` and `start_after` parameters to listing API - Update storcon_cli to use these parameters and limit requests to 1000 items at a time --- control_plane/storcon_cli/src/main.rs | 74 +++++++++++++------ storage_controller/src/http.rs | 6 +- storage_controller/src/service.rs | 33 ++++++++- test_runner/fixtures/neon_fixtures.py | 19 ++++- .../regress/test_storage_controller.py | 19 ++++- 5 files changed, 117 insertions(+), 34 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 9d133e4af1..2ba8f63678 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -477,16 +477,7 @@ async fn main() -> anyhow::Result<()> { println!("{table}"); } Command::Tenants { node_id: None } => { - let mut resp = storcon_client - .dispatch::<(), Vec>( - Method::GET, - "control/v1/tenant".to_string(), - None, - ) - .await?; - - resp.sort_by(|a, b| a.tenant_id.cmp(&b.tenant_id)); - + // Set up output formatting let mut table = comfy_table::Table::new(); table.set_header([ "TenantId", @@ -496,20 +487,55 @@ async fn main() -> anyhow::Result<()> { "Placement", "Scheduling", ]); - for tenant in resp { - let shard_zero = tenant.shards.into_iter().next().unwrap(); - table.add_row([ - format!("{}", tenant.tenant_id), - shard_zero - .preferred_az_id - .as_ref() - .cloned() - .unwrap_or("".to_string()), - format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), - format!("{:?}", tenant.stripe_size), - format!("{:?}", tenant.policy), - format!("{:?}", shard_zero.scheduling_policy), - ]); + + // Pagination loop over listing API + let mut start_after = None; + const LIMIT: usize = 1000; + loop { + let path = match start_after { + None => format!("control/v1/tenant?limit={LIMIT}"), + Some(start_after) => { + format!("control/v1/tenant?limit={LIMIT}&start_after={start_after}") + } + }; + + let resp = storcon_client + .dispatch::<(), Vec>(Method::GET, path, None) + .await?; + + if resp.is_empty() { + // End of data reached + break; + } + + // Give some visual feedback while we're building up the table (comfy_table doesn't have + // streaming output) + if resp.len() >= LIMIT { + eprint!("."); + } + + start_after = Some(resp.last().unwrap().tenant_id); + + for tenant in resp { + let shard_zero = tenant.shards.into_iter().next().unwrap(); + table.add_row([ + format!("{}", tenant.tenant_id), + shard_zero + .preferred_az_id + .as_ref() + .cloned() + .unwrap_or("".to_string()), + format!("{}", shard_zero.tenant_shard_id.shard_count.literal()), + format!("{:?}", tenant.stripe_size), + format!("{:?}", tenant.policy), + format!("{:?}", shard_zero.scheduling_policy), + ]); + } + } + + // Terminate progress dots + if table.row_count() > LIMIT { + eprint!(""); } println!("{table}"); diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index c8df4ffe28..03d8f11992 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -653,6 +653,10 @@ async fn handle_tenant_list( ) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; + let limit: Option = parse_query_param(&req, "limit")?; + let start_after: Option = 
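+    // Pagination of the listing: `limit` caps the number of tenants returned,
+    // `start_after` resumes the listing from the tenant after the given ID.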
parse_query_param(&req, "start_after")?; + tracing::info!("start_after: {:?}", start_after); + match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { return res; @@ -660,7 +664,7 @@ async fn handle_tenant_list( ForwardOutcome::NotForwarded(_req) => {} }; - json_response(StatusCode::OK, service.tenant_list()) + json_response(StatusCode::OK, service.tenant_list(limit, start_after)) } async fn handle_node_register(req: Request) -> Result, ApiError> { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cbb9103880..57f4cc8463 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4158,17 +4158,42 @@ impl Service { .ok_or_else(|| ApiError::NotFound(anyhow::anyhow!("Tenant {tenant_id} not found").into())) } - pub(crate) fn tenant_list(&self) -> Vec { + /// limit & offset are pagination parameters. Since we are walking an in-memory HashMap, `offset` does not + /// avoid traversing data, it just avoid returning it. This is suitable for our purposes, since our in memory + /// maps are small enough to traverse fast, our pagination is just to avoid serializing huge JSON responses + /// in our external API. + pub(crate) fn tenant_list( + &self, + limit: Option, + start_after: Option, + ) -> Vec { let locked = self.inner.read().unwrap(); + // Apply start_from parameter + let shard_range = match start_after { + None => locked.tenants.range(..), + Some(tenant_id) => locked.tenants.range( + TenantShardId { + tenant_id, + shard_number: ShardNumber(u8::MAX), + shard_count: ShardCount(u8::MAX), + }.., + ), + }; + let mut result = Vec::new(); - for (_tenant_id, tenant_shards) in - &locked.tenants.iter().group_by(|(id, _shard)| id.tenant_id) - { + for (_tenant_id, tenant_shards) in &shard_range.group_by(|(id, _shard)| id.tenant_id) { result.push( self.tenant_describe_impl(tenant_shards.map(|(_k, v)| v)) .expect("Groups are always non-empty"), ); + + // Enforce `limit` parameter + if let Some(limit) = limit { + if result.len() >= limit { + break; + } + } } result diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e22e452a52..c47739cd81 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1884,7 +1884,10 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() - def tenant_list(self): + def tenant_shard_dump(self): + """ + Debug listing API: dumps the internal map of tenant shards + """ response = self.request( "GET", f"{self.api}/debug/v1/tenant", @@ -1892,6 +1895,18 @@ class NeonStorageController(MetricsGetter, LogUtils): ) return response.json() + def tenant_list(self, **kwargs): + """ + Control API tenant listing: a vector of the same content returned by tenant_describe + """ + response = self.request( + "GET", + f"{self.api}/control/v1/tenant", + headers=self.headers(TokenScope.ADMIN), + params=kwargs, + ) + return response.json() + def node_configure(self, node_id, body: dict[str, Any]): log.info(f"node_configure({node_id}, {body})") body["node_id"] = node_id @@ -2238,7 +2253,7 @@ class NeonStorageController(MetricsGetter, LogUtils): """ Get the intent and observed placements of all tenants known to the storage controller. 
""" - tenants = self.tenant_list() + tenants = self.tenant_shard_dump() tenant_placement: defaultdict[str, dict[str, Any]] = defaultdict( lambda: { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 8ffb6ba6b2..b5d109559f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -113,6 +113,19 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for tid in tenant_ids: env.create_tenant(tid, shard_count=shards_per_tenant) + # Tenant listing API should work + listed_tenants = env.storage_controller.tenant_list() + log.info(f"listed_tenants: {listed_tenants}") + assert set(t["tenant_id"] for t in listed_tenants) == set(str(t) for t in tenant_ids) + paged = env.storage_controller.tenant_list(limit=2, start_after=listed_tenants[0]["tenant_id"]) + assert len(paged) == 2 + assert paged[0] == listed_tenants[1] + assert paged[1] == listed_tenants[2] + paged = env.storage_controller.tenant_list( + limit=1000, start_after="ffffffffffffffffffffffffffffffff" + ) + assert paged == [] + # Validate high level metrics assert ( env.storage_controller.get_metric_value("storage_controller_tenant_shards") @@ -1506,7 +1519,7 @@ class PageserverFailpoint(Failure): def build_node_to_tenants_map(env: NeonEnv) -> dict[int, list[TenantId]]: - tenants = env.storage_controller.tenant_list() + tenants = env.storage_controller.tenant_shard_dump() node_to_tenants: dict[int, list[TenantId]] = {} for t in tenants: @@ -2631,7 +2644,7 @@ def test_storage_controller_step_down(neon_env_builder: NeonEnvBuilder): # Validate that the storcon attempts to forward the request, but stops. # when it realises it is still the current leader. with pytest.raises(StorageControllerApiException, match="Leader is stepped down instance"): - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() # Validate that we can step down multiple times and the observed state # doesn't change. @@ -2781,7 +2794,7 @@ def test_storage_controller_leadership_transfer( # Check that the stepped down instance forwards requests # to the new leader while it's still running. storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_1_port}") - env.storage_controller.tenant_list() + env.storage_controller.tenant_shard_dump() env.storage_controller.node_configure(env.pageservers[0].id, {"scheduling": "Pause"}) status = env.storage_controller.node_status(env.pageservers[0].id) assert status["scheduling"] == "Pause" From c98cbbeac143837eb99dfea1b5246bad21647b22 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Jan 2025 12:41:49 +0300 Subject: [PATCH 02/40] Add migration details to safekeeper membership RFC. (#10272) ## Problem https://github.com/neondatabase/neon/pull/8455 wasn't specific enough on migration from current situation to enabling generations. ## Summary of changes Describe the missing parts, including control plane pushing generation to compute, which also defines whether generations are enabled -- non zero value does it. 
--- ...35-safekeeper-dynamic-membership-change.md | 188 ++++++++++++------ 1 file changed, 122 insertions(+), 66 deletions(-) diff --git a/docs/rfcs/035-safekeeper-dynamic-membership-change.md b/docs/rfcs/035-safekeeper-dynamic-membership-change.md index 239ec58186..cea9af34ab 100644 --- a/docs/rfcs/035-safekeeper-dynamic-membership-change.md +++ b/docs/rfcs/035-safekeeper-dynamic-membership-change.md @@ -81,7 +81,7 @@ configuration generation in them is less than its current one. Namely, it refuses to vote, to truncate WAL in `handle_elected` and to accept WAL. In response it sends its current configuration generation to let walproposer know. -Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` +Safekeeper gets `PUT /v1/tenants/{tenant_id}/timelines/{timeline_id}/configuration` accepting `Configuration`. Safekeeper switches to the given conf it is higher than its current one and ignores it otherwise. In any case it replies with ``` @@ -103,7 +103,7 @@ currently and tries to communicate with all of them. However, the list does not define consensus members. Instead, on start walproposer tracks highest configuration it receives from `AcceptorGreeting`s. Once it assembles greetings from majority of `sk_set` and majority of `new_sk_set` (if it is present), it -establishes this configuration as its own and moves to voting. +establishes this configuration as its own and moves to voting. It should stop talking to safekeepers not listed in the configuration at this point, though it is not unsafe to continue doing so. @@ -119,7 +119,7 @@ refusal to accept due to configuration change) it simply restarts. The following algorithm can be executed anywhere having access to configuration storage and safekeepers. It is safe to interrupt / restart it and run multiple instances of it concurrently, though likely one of them won't make -progress then. It accepts `desired_set: Vec` as input. +progress then. It accepts `desired_set: Vec` as input. Algorithm will refuse to make the change if it encounters previous interrupted change attempt, but in this case it will try to finish it. @@ -140,7 +140,7 @@ storage are reachable. safe. Failed CAS aborts the procedure. 4) Call `PUT` `configuration` on safekeepers from the current set, delivering them `joint_conf`. Collecting responses from majority is required - to proceed. If any response returned generation higher than + to proceed. If any response returned generation higher than `joint_conf.generation`, abort (another switch raced us). Otherwise, choose max `` among responses and establish it as (in memory) `sync_position`. Also choose max `term` and establish it as (in @@ -149,49 +149,49 @@ storage are reachable. without ack from the new set. Similarly, we'll bump term on new majority to `sync_term` so that two computes with the same term are never elected. 4) Initialize timeline on safekeeper(s) from `new_sk_set` where it - doesn't exist yet by doing `pull_timeline` from the majority of the + doesn't exist yet by doing `pull_timeline` from the majority of the current set. Doing that on majority of `new_sk_set` is enough to proceed, but it is reasonable to ensure that all `new_sk_set` members are initialized -- if some of them are down why are we migrating there? -5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. +5) Call `POST` `bump_term(sync_term)` on safekeepers from the new set. Success on majority is enough. 
 6) Repeatedly call `PUT` `configuration` on safekeepers from the new set,
    delivering them `joint_conf` and collecting their positions. This will
-   switch them to the `joint_conf` which generally won't be needed 
+   switch them to the `joint_conf`, which generally won't be needed
    because `pull_timeline` already includes it and additionally it would be
    broadcast by compute. More importantly, we may proceed to the next step
-   only when `` on the majority of the new set reached
-   `sync_position`. Similarly, on the happy path no waiting is not needed because 
+   only when `` on the majority of the new set reached
+   `sync_position`. Similarly, on the happy path no waiting is needed because
    `pull_timeline` already includes it. However, we should double check to be
    safe. For example, the timeline could have been created earlier, e.g.
-   manually or after try-to-migrate, abort, try-to-migrate-again sequence. 
-7) Create `new_conf: Configuration` incrementing `join_conf` generation and having new 
-   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration 
+   manually or after a try-to-migrate, abort, try-to-migrate-again sequence.
+7) Create `new_conf: Configuration` incrementing `joint_conf` generation and having the new
+   safekeeper set as `sk_set` and None `new_sk_set`. Write it to configuration
    storage under one more CAS.
 8) Call `PUT` `configuration` on safekeepers from the new set,
-   delivering them `new_conf`. It is enough to deliver it to the majority 
+   delivering them `new_conf`. It is enough to deliver it to the majority
    of the new set; the rest can be updated by compute.

 I haven't put huge effort into making the description above very precise,
 because it is natural language prone to interpretations anyway. Instead I'd
 like to make a TLA+ spec of it.

-Description above focuses on safety. To make the flow practical and live, here a few more 
+The description above focuses on safety. To make the flow practical and live, here are a few more
 considerations.
-1) It makes sense to ping new set to ensure it we are migrating to live node(s) before 
+1) It makes sense to ping the new set to ensure we are migrating to live node(s) before
    step 3.
-2) If e.g. accidentally wrong new sk set has been specified, before CAS in step `6` is completed 
+2) If e.g. accidentally a wrong new sk set has been specified, before CAS in step `6` is completed
    it is safe to rollback to the old conf with one more CAS.
-3) On step 4 timeline might be already created on members of the new set for various reasons; 
+3) On step 4 the timeline might already be created on members of the new set for various reasons;
    the simplest is the procedure restart. There are more complicated scenarios like the one mentioned
-   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving 
-   generations, so seems simpler to treat existing timeline as success. However, this also 
+   in step 5. Deleting and re-doing `pull_timeline` is generally unsafe without involving
+   generations, so it seems simpler to treat an existing timeline as success. However, this also
    has a disadvantage: you might imagine a surpassingly unlikely schedule where the condition in
    step 5 is never reached until compute is (re)awakened to synchronize new member(s).
    I don't think we'll observe this in practice, but can add waking up compute if needed.
 4) In the end timeline should be locally deleted on the safekeeper(s) which are
    in the old set but not in the new one, unless they are unreachable.
To be
-   safe this also should be done under generation number (deletion proceeds only if 
+   safe this also should be done under generation number (deletion proceeds only if
    current configuration is <= the one in the request and the safekeeper is not a member of it).
 5) If current conf fetched on step 1 is already not joint and members equal to
    `desired_set`, jump to step 7, using it as `new_conf`.

@@ -202,47 +202,87 @@
 The procedure ought to be driven from somewhere. Obvious candidates are control
 plane and storage_controller; and as each of them already has a db we don't
 want yet another storage. I propose to manage safekeepers in storage_controller
 because 1) since it is in Rust it simplifies simulation testing (more on this
-below) 2) it already manages pageservers. 
+below) 2) it already manages pageservers.

 This assumes that migration will be fully usable only after we migrate all
 tenants/timelines to storage_controller. It is debatable whether we also want
 to manage pageserver attachments for all of these, but likely we do.

-This requires us to define storcon <-> cplane interface.
+This requires us to define the storcon <-> cplane interface and changes.

-### storage_controller <-> control plane interface
+### storage_controller <-> control plane interface and changes

 First of all, control plane should
 [change](https://neondb.slack.com/archives/C03438W3FLZ/p1719226543199829)
 storing safekeepers per timeline instead of per tenant because we can't migrate
-tenants atomically. 
+tenants atomically.

 The important question is how updated configuration is delivered from
 storage_controller to control plane to provide it to computes. As always, there
 are two options, pull and push. Let's do it the same push as with pageserver
 `/notify-attach` because 1) it keeps storage_controller out of critical compute
-start path 2) provides easier upgrade: there won't be such a thing as 'timeline
-managed by control plane / storcon', cplane just takes the value out of its db
-when needed 3) uniformity. It makes storage_controller responsible for retrying notifying
-control plane until it succeeds.
+start path 2) uniformity. It makes storage_controller responsible for retrying
+notifying control plane until it succeeds.

-So, cplane `/notify-safekeepers` for the timeline accepts `Configuration` and
-updates it in the db if the provided conf generation is higher (the cplane db
-should also store generations for this). Similarly to [`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), it
-should update db which makes the call successful, and then try to schedule
-`apply_config` if possible, it is ok if not. storage_controller
-should rate limit calling the endpoint, but likely this won't be needed, as migration
+It is not needed for the control plane to fully know the `Configuration`. It is
+enough for it to be aware only of the list of safekeepers in the latest
+configuration to supply it to compute, plus the associated generation number to
+protect from stale update requests and to also pass it to compute.
+
+So, cplane `/notify-safekeepers` for the timeline can accept JSON like
+```
+{
+    tenant_id: String,
+    timeline_id: String,
+    generation: u32,
+    safekeepers: Vec<SafekeeperId>,
+}
+```
+where `SafekeeperId` is
+```
+{
+    node_id: u64,
+    host: String
+}
+```
+In principle `host` is redundant, but may be useful for observability.
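+
+As a non-authoritative sketch (struct names are illustrative; only the JSON
+shape above is normative), the storage_controller side could model this as:
+```
+use serde::{Deserialize, Serialize};
+
+#[derive(Serialize, Deserialize)]
+struct SafekeeperId {
+    node_id: u64,
+    host: String,
+}
+
+#[derive(Serialize, Deserialize)]
+struct NotifySafekeepersRequest {
+    tenant_id: String,
+    timeline_id: String,
+    // Generation of the configuration this list comes from; cplane ignores
+    // the request if it already stores a higher generation.
+    generation: u32,
+    safekeepers: Vec<SafekeeperId>,
+}
+```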
+ +The request updates list of safekeepers in the db if the provided conf +generation is higher (the cplane db should also store generations for this). +Similarly to +[`/notify-attach`](https://www.notion.so/neondatabase/Storage-Controller-Control-Plane-interface-6de56dd310a043bfa5c2f5564fa98365), +it should update db which makes the call successful, and then try to schedule +`apply_config` if possible, it is ok if not. storage_controller should rate +limit calling the endpoint, but likely this won't be needed, as migration throughput is limited by `pull_timeline`. Timeline (branch) creation in cplane should call storage_controller POST `tenant/:tenant_id/timeline` like it currently does for sharded tenants. -Response should be augmented with `safekeeper_conf: Configuration`. The call -should be retried until succeeds. +Response should be augmented with `safekeepers_generation` and `safekeepers` +fields like described in `/notify-safekeepers` above. Initially (currently) +these fields may be absent; in this case cplane chooses safekeepers on its own +like it currently does. The call should be retried until succeeds. Timeline deletion and tenant deletion in cplane should call appropriate storage_controller endpoints like it currently does for sharded tenants. The calls should be retried until they succeed. +When compute receives safekeepers list from control plane it needs to know the +generation to checked whether it should be updated (note that compute may get +safekeeper list from either cplane or safekeepers). Currently `neon.safekeepers` +GUC is just a comma separates list of `host:port`. Let's prefix it with +`g#:` to this end, so it will look like +``` +g#42:safekeeper-0.eu-central-1.aws.neon.tech:6401,safekeeper-2.eu-central-1.aws.neon.tech:6401,safekeeper-1.eu-central-1.aws.neon.tech:6401 +``` + +To summarize, list of cplane changes: +- per tenant -> per timeline safekeepers management and addition of int `safekeeper_generation` field. +- `/notify-safekeepers` endpoint. +- Branch creation call may return list of safekeepers and when it is + present cplane should adopt it instead of choosing on its own like it does currently. +- `neon.safekeepers` GUC should be prefixed with `g#:`. + ### storage_controller implementation Current 'load everything on startup and keep in memory' easy design is fine. @@ -360,10 +400,10 @@ source safekeeper might fail, which is not a problem if we are going to decomission the node but leaves garbage otherwise. I'd propose in the first version 1) Don't attempt deletion at all if node status is `offline`. 2) If it failed, just issue warning. -And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and -remove garbage timelines for manual use. It will 1) list all timelines on the -safekeeper 2) compare each one against configuration storage: if timeline -doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can +And add PUT `/control/v1/safekeepers/:node_id/scrub` endpoint which would find and +remove garbage timelines for manual use. It will 1) list all timelines on the +safekeeper 2) compare each one against configuration storage: if timeline +doesn't exist at all (had been deleted), it can be deleted. Otherwise, it can be deleted under generation number if node is not member of current generation. 
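+
+A rough sketch of that scrub rule (helper names here are hypothetical, not an
+existing API):
+```
+for ttid in safekeeper.list_timelines().await? {
+    match config_storage.get_configuration(&ttid).await? {
+        // Timeline was deleted everywhere: local data is garbage.
+        None => safekeeper.delete_timeline(&ttid).await?,
+        // Node is not a member of the current configuration: delete gated
+        // on the generation so a concurrent migration can refuse it.
+        Some(conf) if !conf.contains(node_id) => {
+            safekeeper.delete_timeline_gated(&ttid, conf.generation).await?;
+        }
+        // Still a member: keep the timeline.
+        Some(_) => {}
+    }
+}
+```
+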
Automating this is nontrivial; we'd need to register all potential missing
@@ -412,8 +452,8 @@ There should be following layers of tests:
 3) Since simulation testing injects at relatively high level points (not
    syscalls), it omits some code, in particular `pull_timeline`. Thus it is
    better to have basic tests covering whole system as well. Extended version of
-   `test_restarts_under_load` would do: start background load and do migration 
-   under it, then restart endpoint and check that no reported commits 
+   `test_restarts_under_load` would do: start background load and do migration
+   under it, then restart endpoint and check that no reported commits
    had been lost. I'd also add one more test creating a classic network split scenario,
    with one compute talking to AC and another to BD while migration from nodes
    ABC to ABD happens.

 ## Order of implementation and rollout

-Note that 
+Note that
 - Control plane parts and integration with it are fully independent from
   everything else (tests would use simulation and neon_local).
+- It is reasonable to make compute <-> safekeepers protocol change
+  independent of enabling generations.
 - There is a lot of infra work making storage_controller aware of timelines
   and safekeepers and its impl/rollout should be separate from migration
   itself.
-- Initially walproposer can just stop working while it observers joint configuration.
+- Initially walproposer can just stop working while it observes joint configuration.
   Such a window would typically be very short anyway.
+- Obviously we want to test the whole thing thoroughly on staging and only then
+  gradually enable in prod.

-To rollout smoothly, both walproposer and safekeeper should have flag
-`configurations_enabled`; when set to false, they would work as currently, i.e.
-walproposer is able to commit on whatever safekeeper set it is provided. Until
-all timelines are managed by storcon we'd need to use current script to migrate
-and update/drop entries in the storage_controller database if it has any.
-Safekeepers would need to be able to talk both current and new protocol version
-with compute to reduce number of computes restarted in prod once v2 protocol is
-deployed (though before completely switching we'd need to force this).
- -Let's have the following rollout order: -- storage_controller becomes aware of safekeepers; -- storage_controller gets timeline creation for new timelines and deletion requests, but - doesn't manage all timelines yet. Migration can be tested on these new timelines. - To keep control plane and storage_controller databases in sync while control - plane still chooses the safekeepers initially (until all timelines are imported - it can choose better), `TimelineCreateRequest` can get optional safekeepers - field with safekeepers chosen by cplane. -- Then we can import all existing timelines from control plane to - storage_controller and gradually enable configurations region by region. +Then the rollout for a region would be: +- Current situation: safekeepers are choosen by control_plane. +- We manually migrate some timelines, test moving them around. +- Then we enable `--set-safekeepers` so that all new timelines + are on storage controller. +- Finally migrate all existing timelines using the script (no + compute should be speaking old proto version at this point). +Until all timelines are managed by storcon we'd need to use current ad hoc +script to migrate if needed. To keep state clean, all storage controller managed +timelines must be migrated before that, or controller db and configurations +state of safekeepers dropped manually. Very rough implementation order: - Add concept of configurations to safekeepers (including control file), @@ -458,10 +514,10 @@ Very rough implementation order: - Implement walproposer changes, including protocol. - Implement storconn part. Use it in neon_local (and pytest). - Make cplane store safekeepers per timeline instead of per tenant. -- Implement cplane/storcon integration. Route branch creation/deletion +- Implement cplane/storcon integration. Route branch creation/deletion through storcon. Then we can test migration of new branches. -- Finally import existing branches. Then we can drop cplane - safekeeper selection code. Gradually enable configurations at +- Finally import existing branches. Then we can drop cplane + safekeeper selection code. Gradually enable configurations at computes and safekeepers. Before that, all computes must talk only v3 protocol version. From 2d0ea085244208a8b0238b69c7dc5a4f93890d90 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 15 Jan 2025 12:45:58 +0300 Subject: [PATCH 03/40] Add safekeeper membership conf to control file. (#10196) ## Problem https://github.com/neondatabase/neon/issues/9965 ## Summary of changes Add safekeeper membership configuration struct itself and storing it in the control file. In passing also add creation timestamp to the control file (there were cases where I wanted it in the past). Remove obsolete unused PersistedPeerInfo struct from control file (still keep it control_file_upgrade.rs to have it in old upgrade code). Remove the binary representation of cfile in the roundtrip test. Updating it is annoying, and we still test the actual roundtrip. Also add configuration to timeline creation http request, currently used only in one python test. In passing, slightly change LSNs meaning in the request: normally start_lsn is passed (the same as ancestor_start_lsn in similar pageserver call), but we allow specifying higher commit_lsn for manual intervention if needed. Also when given LSN initialize term_history with it. 
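
For example (all values illustrative), a creation request now carries the
membership configuration:
```
{
  "tenant_id": "cf0480929707ee75372337efaa5ecf96",
  "timeline_id": "112ded66422aa5e953e5440fa5427ac4",
  "mconf": {"generation": 1, "members": [{"id": 1, "host": "safekeeper-1", "pg_port": 5432}], "new_members": null},
  "pg_version": 160000,
  "start_lsn": "0/16B9188"
}
```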
--- Cargo.lock | 2 + libs/safekeeper_api/Cargo.toml | 2 + libs/safekeeper_api/src/lib.rs | 5 +- libs/safekeeper_api/src/membership.rs | 164 ++++++++++++++++++ libs/safekeeper_api/src/models.rs | 13 +- safekeeper/src/control_file.rs | 20 ++- safekeeper/src/control_file_upgrade.rs | 145 ++++++++++++++-- safekeeper/src/copy_timeline.rs | 5 +- safekeeper/src/http/routes.rs | 13 +- safekeeper/src/json_ctrl.rs | 2 + safekeeper/src/receive_wal.rs | 9 +- safekeeper/src/safekeeper.rs | 117 +++---------- safekeeper/src/state.rs | 69 ++++---- safekeeper/src/timelines_global_map.rs | 7 +- .../tests/walproposer_sim/safekeeper.rs | 11 +- test_runner/fixtures/pageserver/http.py | 11 +- test_runner/fixtures/safekeeper/http.py | 46 +++-- test_runner/fixtures/utils.py | 18 ++ test_runner/regress/test_wal_acceptor.py | 10 +- 19 files changed, 477 insertions(+), 192 deletions(-) create mode 100644 libs/safekeeper_api/src/membership.rs diff --git a/Cargo.lock b/Cargo.lock index 1e29f4fc08..f543057933 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5707,10 +5707,12 @@ dependencies = [ name = "safekeeper_api" version = "0.1.0" dependencies = [ + "anyhow", "const_format", "postgres_ffi", "pq_proto", "serde", + "serde_json", "tokio", "utils", ] diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml index 4234ec6779..7652c3d413 100644 --- a/libs/safekeeper_api/Cargo.toml +++ b/libs/safekeeper_api/Cargo.toml @@ -5,8 +5,10 @@ edition.workspace = true license.workspace = true [dependencies] +anyhow.workspace = true const_format.workspace = true serde.workspace = true +serde_json.workspace = true postgres_ffi.workspace = true pq_proto.workspace = true tokio.workspace = true diff --git a/libs/safekeeper_api/src/lib.rs b/libs/safekeeper_api/src/lib.rs index be6923aca9..fa86523ad7 100644 --- a/libs/safekeeper_api/src/lib.rs +++ b/libs/safekeeper_api/src/lib.rs @@ -4,12 +4,15 @@ use const_format::formatcp; use pq_proto::SystemId; use serde::{Deserialize, Serialize}; +pub mod membership; /// Public API types pub mod models; /// Consensus logical timestamp. Note: it is a part of sk control file. pub type Term = u64; -pub const INVALID_TERM: Term = 0; +/// With this term timeline is created initially. It +/// is a normal term except wp is never elected with it. +pub const INITIAL_TERM: Term = 0; /// Information about Postgres. Safekeeper gets it once and then verifies all /// further connections from computes match. Note: it is a part of sk control diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs new file mode 100644 index 0000000000..fe30204545 --- /dev/null +++ b/libs/safekeeper_api/src/membership.rs @@ -0,0 +1,164 @@ +//! Types defining safekeeper membership, see +//! rfcs/035-safekeeper-dynamic-membership-change.md +//! for details. + +use std::{collections::HashSet, fmt::Display}; + +use anyhow; +use anyhow::bail; +use serde::{Deserialize, Serialize}; +use utils::id::NodeId; + +/// Number uniquely identifying safekeeper configuration. +/// Note: it is a part of sk control file. +pub type Generation = u32; +/// 1 is the first valid generation, 0 is used as +/// a placeholder before we fully migrate to generations. +pub const INVALID_GENERATION: Generation = 0; +pub const INITIAL_GENERATION: Generation = 1; + +/// Membership is defined by ids so e.g. walproposer uses them to figure out +/// quorums, but we also carry host and port to give wp idea where to connect. 
+#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct SafekeeperId { + pub id: NodeId, + pub host: String, + pub pg_port: u16, +} + +impl Display for SafekeeperId { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "[id={}, ep={}:{}]", self.id, self.host, self.pg_port) + } +} + +/// Set of safekeepers. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +#[serde(transparent)] +pub struct MemberSet { + pub members: Vec, +} + +impl MemberSet { + pub fn empty() -> Self { + MemberSet { + members: Vec::new(), + } + } + + pub fn new(members: Vec) -> anyhow::Result { + let hs: HashSet = HashSet::from_iter(members.iter().map(|sk| sk.id)); + if hs.len() != members.len() { + bail!("duplicate safekeeper id in the set {:?}", members); + } + Ok(MemberSet { members }) + } + + pub fn contains(&self, sk: &SafekeeperId) -> bool { + self.members.iter().any(|m| m.id == sk.id) + } + + pub fn add(&mut self, sk: SafekeeperId) -> anyhow::Result<()> { + if self.contains(&sk) { + bail!(format!( + "sk {} is already member of the set {}", + sk.id, self + )); + } + self.members.push(sk); + Ok(()) + } +} + +impl Display for MemberSet { + /// Display as a comma separated list of members. + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let sks_str = self + .members + .iter() + .map(|m| m.to_string()) + .collect::>(); + write!(f, "({})", sks_str.join(", ")) + } +} + +/// Safekeeper membership configuration. +/// Note: it is a part of both control file and http API. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct Configuration { + /// Unique id. + pub generation: Generation, + /// Current members of the configuration. + pub members: MemberSet, + /// Some means it is a joint conf. + pub new_members: Option, +} + +impl Configuration { + /// Used for pre-generations timelines, will be removed eventually. 
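+    /// It carries `INVALID_GENERATION` (0) and an empty member set.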
+ pub fn empty() -> Self { + Configuration { + generation: INVALID_GENERATION, + members: MemberSet::empty(), + new_members: None, + } + } +} + +impl Display for Configuration { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!( + f, + "gen={}, members={}, new_members={}", + self.generation, + self.members, + self.new_members + .as_ref() + .map(ToString::to_string) + .unwrap_or(String::from("none")) + ) + } +} + +#[cfg(test)] +mod tests { + use super::{MemberSet, SafekeeperId}; + use utils::id::NodeId; + + #[test] + fn test_member_set() { + let mut members = MemberSet::empty(); + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .unwrap(); + + members + .add(SafekeeperId { + id: NodeId(42), + host: String::from("lala.org"), + pg_port: 5432, + }) + .expect_err("duplicate must not be allowed"); + + members + .add(SafekeeperId { + id: NodeId(43), + host: String::from("bubu.org"), + pg_port: 5432, + }) + .unwrap(); + + println!("members: {}", members); + + let j = serde_json::to_string(&members).expect("failed to serialize"); + println!("members json: {}", j); + assert_eq!( + j, + r#"[{"id":42,"host":"lala.org","pg_port":5432},{"id":43,"host":"bubu.org","pg_port":5432}]"# + ); + } +} diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index 3e424a792c..ad38986357 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -11,7 +11,7 @@ use utils::{ pageserver_feedback::PageserverFeedback, }; -use crate::{ServerInfo, Term}; +use crate::{membership::Configuration, ServerInfo, Term}; #[derive(Debug, Serialize)] pub struct SafekeeperStatus { @@ -22,13 +22,16 @@ pub struct SafekeeperStatus { pub struct TimelineCreateRequest { pub tenant_id: TenantId, pub timeline_id: TimelineId, - pub peer_ids: Option>, + pub mconf: Configuration, pub pg_version: u32, pub system_id: Option, + // By default WAL_SEGMENT_SIZE pub wal_seg_size: Option, - pub commit_lsn: Lsn, - // If not passed, it is assigned to the beginning of commit_lsn segment. - pub local_start_lsn: Option, + pub start_lsn: Lsn, + // Normal creation should omit this field (start_lsn initializes all LSNs). + // However, we allow specifying custom value higher than start_lsn for + // manual recovery case, see test_s3_wal_replay. 
+ pub commit_lsn: Option, } /// Same as TermLsn, but serializes LSN using display serializer diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 06e5afbf74..e92ca881e1 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -3,6 +3,7 @@ use anyhow::{bail, ensure, Context, Result}; use byteorder::{LittleEndian, ReadBytesExt, WriteBytesExt}; use camino::{Utf8Path, Utf8PathBuf}; +use safekeeper_api::membership::INVALID_GENERATION; use tokio::fs::File; use tokio::io::AsyncWriteExt; use utils::crashsafe::durable_rename; @@ -13,14 +14,14 @@ use std::ops::Deref; use std::path::Path; use std::time::Instant; -use crate::control_file_upgrade::downgrade_v9_to_v8; +use crate::control_file_upgrade::downgrade_v10_to_v9; use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; -pub const SK_FORMAT_VERSION: u32 = 9; +pub const SK_FORMAT_VERSION: u32 = 10; // contains persistent metadata for safekeeper pub const CONTROL_FILE_NAME: &str = "safekeeper.control"; @@ -169,10 +170,11 @@ impl TimelinePersistentState { let mut buf: Vec = Vec::new(); WriteBytesExt::write_u32::(&mut buf, SK_MAGIC)?; - if self.eviction_state == EvictionState::Present { - // temp hack for forward compatibility - const PREV_FORMAT_VERSION: u32 = 8; - let prev = downgrade_v9_to_v8(self); + if self.mconf.generation == INVALID_GENERATION { + // Temp hack for forward compatibility test: in case of none + // configuration save cfile in previous v9 format. + const PREV_FORMAT_VERSION: u32 = 9; + let prev = downgrade_v10_to_v9(self); WriteBytesExt::write_u32::(&mut buf, PREV_FORMAT_VERSION)?; prev.ser_into(&mut buf)?; } else { @@ -233,6 +235,7 @@ impl Storage for FileStorage { #[cfg(test)] mod test { use super::*; + use safekeeper_api::membership::{Configuration, MemberSet}; use tokio::fs; use utils::lsn::Lsn; @@ -242,6 +245,11 @@ mod test { async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { let tempdir = camino_tempfile::tempdir()?; let mut state = TimelinePersistentState::empty(); + state.mconf = Configuration { + generation: 42, + members: MemberSet::empty(), + new_members: None, + }; let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; // Make a change. diff --git a/safekeeper/src/control_file_upgrade.rs b/safekeeper/src/control_file_upgrade.rs index dd152fd4cc..904e79f976 100644 --- a/safekeeper/src/control_file_upgrade.rs +++ b/safekeeper/src/control_file_upgrade.rs @@ -1,17 +1,22 @@ //! 
Code to deal with safekeeper control file upgrades +use std::vec; + use crate::{ safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn}, - state::{EvictionState, PersistedPeers, TimelinePersistentState}, + state::{EvictionState, TimelinePersistentState}, wal_backup_partial, }; use anyhow::{bail, Result}; use pq_proto::SystemId; -use safekeeper_api::{ServerInfo, Term}; +use safekeeper_api::{ + membership::{Configuration, INVALID_GENERATION}, + ServerInfo, Term, +}; use serde::{Deserialize, Serialize}; use tracing::*; use utils::{ bin_ser::LeSer, - id::{TenantId, TimelineId}, + id::{NodeId, TenantId, TimelineId}, lsn::Lsn, }; @@ -233,6 +238,90 @@ pub struct SafeKeeperStateV8 { pub partial_backup: wal_backup_partial::State, } +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); + +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct PersistedPeerInfo { + /// LSN up to which safekeeper offloaded WAL to s3. + pub backup_lsn: Lsn, + /// Term of the last entry. + pub term: Term, + /// LSN of the last record. + pub flush_lsn: Lsn, + /// Up to which LSN safekeeper regards its WAL as committed. + pub commit_lsn: Lsn, +} + +impl PersistedPeerInfo { + pub fn new() -> Self { + Self { + backup_lsn: Lsn::INVALID, + term: safekeeper_api::INITIAL_TERM, + flush_lsn: Lsn(0), + commit_lsn: Lsn(0), + } + } +} + +// make clippy happy +impl Default for PersistedPeerInfo { + fn default() -> Self { + Self::new() + } +} + +/// Note: SafekeeperStateVn is old name for TimelinePersistentStateVn. +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] +pub struct TimelinePersistentStateV9 { + #[serde(with = "hex")] + pub tenant_id: TenantId, + #[serde(with = "hex")] + pub timeline_id: TimelineId, + /// persistent acceptor state + pub acceptor_state: AcceptorState, + /// information about server + pub server: ServerInfo, + /// Unique id of the last *elected* proposer we dealt with. Not needed + /// for correctness, exists for monitoring purposes. + #[serde(with = "hex")] + pub proposer_uuid: PgUuid, + /// Since which LSN this timeline generally starts. Safekeeper might have + /// joined later. + pub timeline_start_lsn: Lsn, + /// Since which LSN safekeeper has (had) WAL for this timeline. + /// All WAL segments next to one containing local_start_lsn are + /// filled with data from the beginning. + pub local_start_lsn: Lsn, + /// Part of WAL acknowledged by quorum *and available locally*. Always points + /// to record boundary. + pub commit_lsn: Lsn, + /// LSN that points to the end of the last backed up segment. Useful to + /// persist to avoid finding out offloading progress on boot. + pub backup_lsn: Lsn, + /// Minimal LSN which may be needed for recovery of some safekeeper (end_lsn + /// of last record streamed to everyone). Persisting it helps skipping + /// recovery in walproposer, generally we compute it from peers. In + /// walproposer proto called 'truncate_lsn'. Updates are currently drived + /// only by walproposer. + pub peer_horizon_lsn: Lsn, + /// LSN of the oldest known checkpoint made by pageserver and successfully + /// pushed to s3. We don't remove WAL beyond it. Persisted only for + /// informational purposes, we receive it from pageserver (or broker). + pub remote_consistent_lsn: Lsn, + /// Peers and their state as we remember it. Knowing peers themselves is + /// fundamental; but state is saved here only for informational purposes and + /// obviously can be stale. 
(Currently not saved at all, but let's provision + /// place to have less file version upgrades). + pub peers: PersistedPeers, + /// Holds names of partial segments uploaded to remote storage. Used to + /// clean up old objects without leaving garbage in remote storage. + pub partial_backup: wal_backup_partial::State, + /// Eviction state of the timeline. If it's Offloaded, we should download + /// WAL files from remote storage to serve the timeline. + pub eviction_state: EvictionState, +} + pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result { // migrate to storing full term history if version == 1 { @@ -248,6 +337,7 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result Result Result Result Result Result Result Result Result Result Result Result Result SafeKeeperStateV8 { - assert!(state.eviction_state == EvictionState::Present); - SafeKeeperStateV8 { +// Used as a temp hack to make forward compatibility test work. Should be +// removed after PR adding v10 is merged. +pub fn downgrade_v10_to_v9(state: &TimelinePersistentState) -> TimelinePersistentStateV9 { + assert!(state.mconf.generation == INVALID_GENERATION); + TimelinePersistentStateV9 { tenant_id: state.tenant_id, timeline_id: state.timeline_id, acceptor_state: state.acceptor_state.clone(), @@ -426,8 +542,9 @@ pub fn downgrade_v9_to_v8(state: &TimelinePersistentState) -> SafeKeeperStateV8 backup_lsn: state.backup_lsn, peer_horizon_lsn: state.peer_horizon_lsn, remote_consistent_lsn: state.remote_consistent_lsn, - peers: state.peers.clone(), + peers: PersistedPeers(vec![]), partial_backup: state.partial_backup.clone(), + eviction_state: state.eviction_state, } } @@ -437,7 +554,7 @@ mod tests { use utils::{id::NodeId, Hex}; - use crate::safekeeper::PersistedPeerInfo; + use crate::control_file_upgrade::PersistedPeerInfo; use super::*; diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 28ef2b1d23..10a761e1f5 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use camino::Utf8PathBuf; use postgres_ffi::{MAX_SEND_SIZE, WAL_SEGMENT_SIZE}; +use safekeeper_api::membership::Configuration; use std::sync::Arc; use tokio::{ fs::OpenOptions, @@ -147,10 +148,10 @@ pub async fn handle_request( let mut new_state = TimelinePersistentState::new( &request.destination_ttid, + Configuration::empty(), state.server.clone(), - vec![], - request.until_lsn, start_lsn, + request.until_lsn, )?; new_state.timeline_start_lsn = start_lsn; new_state.peer_horizon_lsn = request.until_lsn; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 6186f4c3ba..3835d39698 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -111,14 +111,15 @@ async fn timeline_create_handler(mut request: Request) -> Result NetworkReader<'_, IO> { }; let tli = self .global_timelines - .create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) .await .context("create timeline")?; tli.wal_residence_guard().await? 
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 6ceaf325b0..06403228e9 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -7,7 +7,6 @@ use bytes::{Buf, BufMut, Bytes, BytesMut}; use postgres_ffi::{TimeLineID, MAX_SEND_SIZE}; use safekeeper_api::models::HotStandbyFeedback; use safekeeper_api::Term; -use safekeeper_api::INVALID_TERM; use serde::{Deserialize, Serialize}; use std::cmp::max; use std::cmp::min; @@ -193,36 +192,6 @@ impl AcceptorState { } } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeerInfo { - /// LSN up to which safekeeper offloaded WAL to s3. - pub backup_lsn: Lsn, - /// Term of the last entry. - pub term: Term, - /// LSN of the last record. - pub flush_lsn: Lsn, - /// Up to which LSN safekeeper regards its WAL as committed. - pub commit_lsn: Lsn, -} - -impl PersistedPeerInfo { - pub fn new() -> Self { - Self { - backup_lsn: Lsn::INVALID, - term: INVALID_TERM, - flush_lsn: Lsn(0), - commit_lsn: Lsn(0), - } - } -} - -// make clippy happy -impl Default for PersistedPeerInfo { - fn default() -> Self { - Self::new() - } -} - // protocol messages /// Initial Proposer -> Acceptor message @@ -1010,7 +979,7 @@ where /// Update commit_lsn from peer safekeeper data. pub async fn record_safekeeper_info(&mut self, sk_info: &SafekeeperTimelineInfo) -> Result<()> { - if (Lsn(sk_info.commit_lsn) != Lsn::INVALID) && (sk_info.last_log_term != INVALID_TERM) { + if Lsn(sk_info.commit_lsn) != Lsn::INVALID { // Note: the check is too restrictive, generally we can update local // commit_lsn if our history matches (is part of) history of advanced // commit_lsn provider. @@ -1025,12 +994,20 @@ where #[cfg(test)] mod tests { use futures::future::BoxFuture; + use postgres_ffi::{XLogSegNo, WAL_SEGMENT_SIZE}; - use safekeeper_api::ServerInfo; + use safekeeper_api::{ + membership::{Configuration, MemberSet, SafekeeperId}, + ServerInfo, + }; use super::*; - use crate::state::{EvictionState, PersistedPeers, TimelinePersistentState}; - use std::{ops::Deref, str::FromStr, time::Instant}; + use crate::state::{EvictionState, TimelinePersistentState}; + use std::{ + ops::Deref, + str::FromStr, + time::{Instant, UNIX_EPOCH}, + }; // fake storage for tests struct InMemoryState { @@ -1313,12 +1290,21 @@ mod tests { #[test] fn test_sk_state_bincode_serde_roundtrip() { - use utils::Hex; let tenant_id = TenantId::from_str("cf0480929707ee75372337efaa5ecf96").unwrap(); let timeline_id = TimelineId::from_str("112ded66422aa5e953e5440fa5427ac4").unwrap(); let state = TimelinePersistentState { tenant_id, timeline_id, + mconf: Configuration { + generation: 42, + members: MemberSet::new(vec![SafekeeperId { + id: NodeId(1), + host: "hehe.org".to_owned(), + pg_port: 5432, + }]) + .expect("duplicate member"), + new_members: None, + }, acceptor_state: AcceptorState { term: 42, term_history: TermHistory(vec![TermLsn { @@ -1342,70 +1328,13 @@ mod tests { backup_lsn: Lsn(1234567300), peer_horizon_lsn: Lsn(9999999), remote_consistent_lsn: Lsn(1234560000), - peers: PersistedPeers(vec![( - NodeId(1), - PersistedPeerInfo { - backup_lsn: Lsn(1234567000), - term: 42, - flush_lsn: Lsn(1234567800 - 8), - commit_lsn: Lsn(1234567600), - }, - )]), partial_backup: crate::wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: UNIX_EPOCH, }; let ser = state.ser().unwrap(); - #[rustfmt::skip] - let expected = [ - // tenant_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x66, 
0x30, 0x34, 0x38, 0x30, 0x39, 0x32, 0x39, 0x37, 0x30, 0x37, 0x65, 0x65, 0x37, 0x35, 0x33, 0x37, 0x32, 0x33, 0x33, 0x37, 0x65, 0x66, 0x61, 0x61, 0x35, 0x65, 0x63, 0x66, 0x39, 0x36, - // timeline_id as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x31, 0x31, 0x32, 0x64, 0x65, 0x64, 0x36, 0x36, 0x34, 0x32, 0x32, 0x61, 0x61, 0x35, 0x65, 0x39, 0x35, 0x33, 0x65, 0x35, 0x34, 0x34, 0x30, 0x66, 0x61, 0x35, 0x34, 0x32, 0x37, 0x61, 0x63, 0x34, - // term - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // length prefix - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // unsure why this order is swapped - 0x29, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // pg_version - 0x0e, 0x00, 0x00, 0x00, - // systemid - 0x21, 0x43, 0x65, 0x87, 0x78, 0x56, 0x34, 0x12, - // wal_seg_size - 0x78, 0x56, 0x34, 0x12, - // pguuid as length prefixed hex - 0x20, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x63, 0x34, 0x37, 0x61, 0x34, 0x32, 0x61, 0x35, 0x30, 0x66, 0x34, 0x34, 0x65, 0x35, 0x35, 0x33, 0x65, 0x39, 0x61, 0x35, 0x32, 0x61, 0x34, 0x32, 0x36, 0x36, 0x65, 0x64, 0x32, 0x64, 0x31, 0x31, - - // timeline_start_lsn - 0x00, 0x56, 0x34, 0x12, 0x00, 0x00, 0x00, 0x00, - 0x12, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x78, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x84, 0x00, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x7f, 0x96, 0x98, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0xe4, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - // length prefix for persistentpeers - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // nodeid - 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // backuplsn - 0x58, 0xff, 0x95, 0x49, 0x00, 0x00, 0x00, 0x00, - 0x2a, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x70, 0x02, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - 0xb0, 0x01, 0x96, 0x49, 0x00, 0x00, 0x00, 0x00, - // partial_backup - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - // eviction_state - 0x00, 0x00, 0x00, 0x00, - ]; - - assert_eq!(Hex(&ser), Hex(&expected)); - let deser = TimelinePersistentState::des(&ser).unwrap(); assert_eq!(deser, state); diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index c6ae6c1d2b..1c3bb1b4dc 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -1,20 +1,22 @@ //! Defines per timeline data stored persistently (SafeKeeperPersistentState) //! and its wrapper with in memory layer (SafekeeperState). -use std::{cmp::max, ops::Deref}; +use std::{cmp::max, ops::Deref, time::SystemTime}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; -use safekeeper_api::{models::TimelineTermBumpResponse, ServerInfo, Term}; +use safekeeper_api::{ + membership::Configuration, models::TimelineTermBumpResponse, ServerInfo, Term, INITIAL_TERM, +}; use serde::{Deserialize, Serialize}; use utils::{ - id::{NodeId, TenantId, TenantTimelineId, TimelineId}, + id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, }; use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, TermHistory, UNKNOWN_SERVER_VERSION}, + safekeeper::{AcceptorState, PgUuid, TermHistory, TermLsn, UNKNOWN_SERVER_VERSION}, timeline::TimelineError, wal_backup_partial::{self}, }; @@ -27,6 +29,8 @@ pub struct TimelinePersistentState { pub tenant_id: TenantId, #[serde(with = "hex")] pub timeline_id: TimelineId, + /// Membership configuration. + pub mconf: Configuration, /// persistent acceptor state pub acceptor_state: AcceptorState, /// information about server @@ -58,22 +62,15 @@ pub struct TimelinePersistentState { /// pushed to s3. 
We don't remove WAL beyond it. Persisted only for /// informational purposes, we receive it from pageserver (or broker). pub remote_consistent_lsn: Lsn, - /// Peers and their state as we remember it. Knowing peers themselves is - /// fundamental; but state is saved here only for informational purposes and - /// obviously can be stale. (Currently not saved at all, but let's provision - /// place to have less file version upgrades). - pub peers: PersistedPeers, /// Holds names of partial segments uploaded to remote storage. Used to /// clean up old objects without leaving garbage in remote storage. pub partial_backup: wal_backup_partial::State, /// Eviction state of the timeline. If it's Offloaded, we should download /// WAL files from remote storage to serve the timeline. pub eviction_state: EvictionState, + pub creation_ts: SystemTime, } -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)] -pub struct PersistedPeers(pub Vec<(NodeId, PersistedPeerInfo)>); - /// State of the local WAL files. Used to track current timeline state, /// that can be either WAL files are present on disk or last partial segment /// is offloaded to remote storage. @@ -87,12 +84,14 @@ pub enum EvictionState { } impl TimelinePersistentState { + /// commit_lsn is the same as start_lsn in the normal creaiton; see + /// `TimelineCreateRequest` comments.` pub fn new( ttid: &TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, - peers: Vec, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> anyhow::Result { if server_info.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); @@ -102,49 +101,59 @@ impl TimelinePersistentState { bail!(TimelineError::UninitialinzedPgVersion(*ttid)); } - if commit_lsn < local_start_lsn { + if commit_lsn < start_lsn { bail!( - "commit_lsn {} is smaller than local_start_lsn {}", + "commit_lsn {} is smaller than start_lsn {}", commit_lsn, - local_start_lsn + start_lsn ); } + // If we are given with init LSN, initialize term history with it. It + // ensures that walproposer always must be able to find a common point + // in histories; if it can't something is corrupted. Not having LSN here + // is so far left for legacy case where timeline is created by compute + // and LSN during creation is not known yet. 
+ let term_history = if commit_lsn != Lsn::INVALID { + TermHistory(vec![TermLsn { + term: INITIAL_TERM, + lsn: start_lsn, + }]) + } else { + TermHistory::empty() + }; + Ok(TimelinePersistentState { tenant_id: ttid.tenant_id, timeline_id: ttid.timeline_id, + mconf, acceptor_state: AcceptorState { - term: 0, - term_history: TermHistory::empty(), + term: INITIAL_TERM, + term_history, }, server: server_info, proposer_uuid: [0; 16], - timeline_start_lsn: Lsn(0), - local_start_lsn, + timeline_start_lsn: start_lsn, + local_start_lsn: start_lsn, commit_lsn, - backup_lsn: local_start_lsn, - peer_horizon_lsn: local_start_lsn, + backup_lsn: start_lsn, + peer_horizon_lsn: start_lsn, remote_consistent_lsn: Lsn(0), - peers: PersistedPeers( - peers - .iter() - .map(|p| (*p, PersistedPeerInfo::new())) - .collect(), - ), partial_backup: wal_backup_partial::State::default(), eviction_state: EvictionState::Present, + creation_ts: SystemTime::now(), }) } pub fn empty() -> Self { TimelinePersistentState::new( &TenantTimelineId::empty(), + Configuration::empty(), ServerInfo { pg_version: 170000, /* Postgres server version (major * 10000) */ system_id: 0, /* Postgres system identifier */ wal_seg_size: WAL_SEGMENT_SIZE as u32, }, - vec![], Lsn::INVALID, Lsn::INVALID, ) diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index ad29c9f66c..a701534f65 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -12,6 +12,7 @@ use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; use camino_tempfile::Utf8TempDir; +use safekeeper_api::membership::Configuration; use safekeeper_api::ServerInfo; use serde::Serialize; use std::collections::HashMap; @@ -214,9 +215,10 @@ impl GlobalTimelines { pub(crate) async fn create( &self, ttid: TenantTimelineId, + mconf: Configuration, server_info: ServerInfo, + start_lsn: Lsn, commit_lsn: Lsn, - local_start_lsn: Lsn, ) -> Result> { let (conf, _, _) = { let state = self.state.lock().unwrap(); @@ -239,8 +241,7 @@ impl GlobalTimelines { // TODO: currently we create only cfile. It would be reasonable to // immediately initialize first WAL segment as well. 
- let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new(&ttid, mconf, server_info, start_lsn, commit_lsn)?; control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; let timeline = self.load_temp_timeline(ttid, &tmp_dir_path, true).await?; Ok(timeline) diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index efcdd89e7d..a99de71a04 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -21,7 +21,7 @@ use safekeeper::{ wal_storage::Storage, SafeKeeperConf, }; -use safekeeper_api::ServerInfo; +use safekeeper_api::{membership::Configuration, ServerInfo}; use tracing::{debug, info_span, warn}; use utils::{ id::{NodeId, TenantId, TenantTimelineId, TimelineId}, @@ -96,8 +96,13 @@ impl GlobalMap { let commit_lsn = Lsn::INVALID; let local_start_lsn = Lsn::INVALID; - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + let state = TimelinePersistentState::new( + &ttid, + Configuration::empty(), + server_info, + commit_lsn, + local_start_lsn, + )?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone()); diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 378e568622..364aff325d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -15,7 +15,6 @@ from requests.adapters import HTTPAdapter from urllib3.util.retry import Retry from fixtures.common_types import ( - Id, Lsn, TenantId, TenantShardId, @@ -25,7 +24,7 @@ from fixtures.common_types import ( from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.pg_version import PgVersion -from fixtures.utils import Fn +from fixtures.utils import EnhancedJSONEncoder, Fn class PageserverApiException(Exception): @@ -83,14 +82,6 @@ class TimelineCreateRequest: mode: TimelineCreateRequestMode def to_json(self) -> str: - class EnhancedJSONEncoder(json.JSONEncoder): - def default(self, o): - if dataclasses.is_dataclass(o) and not isinstance(o, type): - return dataclasses.asdict(o) - elif isinstance(o, Id): - return o.id.hex() - return super().default(o) - # mode is flattened this = dataclasses.asdict(self) mode = this.pop("mode") diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 286f80ba69..4826cae3ee 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -10,7 +10,7 @@ import requests from fixtures.common_types import Lsn, TenantId, TenantTimelineId, TimelineId from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics -from fixtures.utils import wait_until +from fixtures.utils import EnhancedJSONEncoder, wait_until if TYPE_CHECKING: from typing import Any @@ -69,6 +69,34 @@ class TermBumpResponse: ) +@dataclass +class SafekeeperId: + id: int + host: str + pg_port: str + + +@dataclass +class Configuration: + generation: int + members: list[SafekeeperId] + new_members: list[SafekeeperId] | None + + +@dataclass +class TimelineCreateRequest: + tenant_id: TenantId + timeline_id: TimelineId + mconf: Configuration + # not exactly PgVersion, for example 150002 for 15.2 + pg_version: int + start_lsn: Lsn + commit_lsn: Lsn | None + + def 
to_json(self) -> str:
+        return json.dumps(self, cls=EnhancedJSONEncoder)
+
+
 class SafekeeperHttpClient(requests.Session, MetricsGetter):
     HTTPError = requests.HTTPError
 
@@ -131,20 +159,8 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter):
         resj = res.json()
         return [TenantTimelineId.from_json(ttidj) for ttidj in resj]
 
-    def timeline_create(
-        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
-        pg_version: int,  # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2
-        commit_lsn: Lsn,
-    ):
-        body = {
-            "tenant_id": str(tenant_id),
-            "timeline_id": str(timeline_id),
-            "pg_version": pg_version,
-            "commit_lsn": str(commit_lsn),
-        }
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", json=body)
+    def timeline_create(self, r: TimelineCreateRequest):
+        res = self.post(f"http://localhost:{self.port}/v1/tenant/timeline", data=r.to_json())
         res.raise_for_status()
 
     def timeline_status(
diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py
index c34ac298d1..e160c617cd 100644
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import contextlib
+import dataclasses
 import json
 import os
 import re
@@ -21,6 +22,7 @@ import zstandard
 from psycopg2.extensions import cursor
 from typing_extensions import override
 
+from fixtures.common_types import Id, Lsn
 from fixtures.log_helper import log
 from fixtures.pageserver.common_types import (
     parse_delta_layer,
@@ -605,6 +607,22 @@ class PropagatingThread(threading.Thread):
         return self.ret
 
 
+class EnhancedJSONEncoder(json.JSONEncoder):
+    """
+    Default json.JSONEncoder works only on primitive builtins. Extend it to any
+    dataclass plus our custom types.
+    """
+
+    def default(self, o):
+        if dataclasses.is_dataclass(o) and not isinstance(o, type):
+            return dataclasses.asdict(o)
+        elif isinstance(o, Id):
+            return o.id.hex()
+        elif isinstance(o, Lsn):
+            return str(o)  # standard hex notation
+        return super().default(o)
+
+
 def human_bytes(amt: float) -> str:
     """
     Render a bytes amount into nice IEC bytes string.
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py
index 0a8900b351..d39c6a6b5b 100644
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -48,7 +48,7 @@ from fixtures.remote_storage import (
     default_remote_storage,
     s3_storage,
 )
-from fixtures.safekeeper.http import SafekeeperHttpClient
+from fixtures.safekeeper.http import Configuration, SafekeeperHttpClient, TimelineCreateRequest
 from fixtures.safekeeper.utils import wait_walreceivers_absent
 from fixtures.utils import (
     PropagatingThread,
@@ -658,7 +658,13 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder):
     for sk in env.safekeepers:
         sk.start()
         cli = sk.http_client()
-        cli.timeline_create(tenant_id, timeline_id, pg_version, last_lsn)
+        mconf = Configuration(generation=0, members=[], new_members=None)
+        # Set start_lsn to the beginning of the first segment to allow reading
+        # WAL from there (could use the initdb LSN as well).
+ r = TimelineCreateRequest( + tenant_id, timeline_id, mconf, pg_version, Lsn("0/1000000"), commit_lsn=last_lsn + ) + cli.timeline_create(r) f_partial_path = ( Path(sk.data_dir) / str(tenant_id) / str(timeline_id) / f_partial_saved.name ) From 05d17a10aee949b56317bc1ecaf24b76b097b9d2 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 15 Jan 2025 11:35:38 +0100 Subject: [PATCH 04/40] rfc: add CPU and heap profiling RFC (#10085) This document proposes a standard cross-team pattern for CPU and memory profiling across applications and languages, using the [pprof](https://github.com/google/pprof) profile format. It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via [Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. [Rendered](https://github.com/neondatabase/neon/blob/erik/profiling-rfc/docs/rfcs/040-profiling.md) Touches #9534. Touches https://github.com/neondatabase/cloud/issues/14888. --- docs/rfcs/040-profiling.md | 247 +++++++++++++++++++++++++++++++++++++ 1 file changed, 247 insertions(+) create mode 100644 docs/rfcs/040-profiling.md diff --git a/docs/rfcs/040-profiling.md b/docs/rfcs/040-profiling.md new file mode 100644 index 0000000000..8da9e50774 --- /dev/null +++ b/docs/rfcs/040-profiling.md @@ -0,0 +1,247 @@ +# CPU and Memory Profiling + +Created 2025-01-12 by Erik Grinaker. + +See also [internal user guide](https://www.notion.so/neondatabase/Storage-CPU-Memory-Profiling-14bf189e004780228ec7d04442742324?pvs=4). + +## Summary + +This document proposes a standard cross-team pattern for CPU and memory profiling across +applications and languages, using the [pprof](https://github.com/google/pprof) profile format. + +It enables both ad hoc profiles via HTTP endpoints, and continuous profiling across the fleet via +[Grafana Cloud Profiles](https://grafana.com/docs/grafana-cloud/monitor-applications/profiles/). +Continuous profiling incurs an overhead of about 0.1% CPU usage and 3% slower heap allocations. + +## Motivation + +CPU and memory profiles are crucial observability tools for understanding performance issues, +resource exhaustion, and resource costs. They allow answering questions like: + +* Why is this process using 100% CPU? +* How do I make this go faster? +* Why did this process run out of memory? +* Why are we paying for all these CPU cores and memory chips? + +Go has [first-class support](https://pkg.go.dev/net/http/pprof) for profiling included in its +standard library, using the [pprof](https://github.com/google/pprof) profile format and associated +tooling. + +This is not the case for Rust and C, where obtaining profiles can be rather cumbersome. It requires +installing and running additional tools like `perf` as root on production nodes, with analysis tools +that can be hard to use and often don't give good results. This is not only annoying, but can also +significantly affect the resolution time of production incidents. + +This proposal will: + +* Provide CPU and heap profiles in pprof format via HTTP API. +* Record continuous profiles in Grafana for aggregate historical analysis. +* Make it easy for anyone to see a flamegraph in less than one minute. +* Be reasonably consistent across teams and services (Rust, Go, C). 
+
+## Non Goals (For Now)
+
+* [Additional profile types](https://grafana.com/docs/pyroscope/next/configure-client/profile-types/)
+  like mutexes, locks, goroutines, etc.
+* [Runtime trace integration](https://grafana.com/docs/pyroscope/next/configure-client/trace-span-profiles/).
+* [Profile-guided optimization](https://en.wikipedia.org/wiki/Profile-guided_optimization).
+
+## Using Profiles
+
+Ready-to-use profiles can be obtained using e.g. `curl`. For Rust services:
+
+```
+$ curl localhost:9898/profile/cpu >profile.pb.gz
+```
+
+pprof profiles can be explored using the [`pprof`](https://github.com/google/pprof) web UI, which
+provides flamegraphs, call graphs, plain text listings, and more:
+
+```
+$ pprof -http :6060 profile.pb.gz
+```
+
+Some endpoints (e.g. Rust-based ones) can also generate flamegraph SVGs directly:
+
+```
+$ curl localhost:9898/profile/cpu?format=svg >profile.svg
+$ open profile.svg
+```
+
+Continuous profiles are available in Grafana under Explore → Profiles → Explore Profiles
+(currently only in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer)).
+
+## API Requirements
+
+* HTTP endpoints that return a profile in pprof format (with symbols).
+  * CPU: records a profile over the request time interval (`seconds` query parameter).
+  * Memory: returns the current in-use heap allocations.
+* Unauthenticated, as it should not expose user data or pose a denial-of-service risk.
+* Default sample frequency should not impact service (maximum 5% CPU overhead).
+* Linux compatibility.
+
+Nice to have:
+
+* Return flamegraph SVG directly from the HTTP endpoint if requested.
+* Configurable sample frequency for CPU profiles.
+* Historical heap allocations, by count and bytes.
+* macOS compatibility.
+
+## Rust Profiling
+
+[`libs/utils/src/http/endpoint.rs`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs)
+contains ready-to-use HTTP endpoints for CPU and memory profiling:
+[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338) and [`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416).
+
+### CPU
+
+CPU profiles are provided by [pprof-rs](https://github.com/tikv/pprof-rs) via
+[`profile_cpu_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L338).
+Expose it unauthenticated at `/profile/cpu`.
+
+Parameters:
+
+* `format`: profile output format (`pprof` or `svg`; default `pprof`).
+* `seconds`: duration to collect profile over, in seconds (default `5`).
+* `frequency`: how often to sample thread stacks, in Hz (default `99`).
+* `force`: if `true`, cancel a running profile and start a new one (default `false`).
+
+Works on Linux and macOS.
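+
+For example, to take a 10-second CPU profile and render it as a flamegraph in
+one step (a usage sketch combining the parameters above with the port from the
+earlier examples):
+
+```
+$ curl "localhost:9898/profile/cpu?seconds=10&format=svg" >profile.svg
+```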
+ +### Memory + +Use the jemalloc allocator via [`tikv-jemallocator`](https://github.com/tikv/jemallocator), +and enable profiling with samples every 2 MB allocated: + +```rust +#[global_allocator] +static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; + +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:21\0"; +``` + +pprof profiles are generated by +[`jemalloc-pprof`](https://github.com/polarsignals/rust-jemalloc-pprof) via +[`profile_heap_handler`](https://github.com/neondatabase/neon/blob/8327f68043e692c77f70d6a6dafa463636c01578/libs/utils/src/http/endpoint.rs#L416). +Expose it unauthenticated at `/profile/heap`. + +Parameters: + +* `format`: profile output format (`pprof`, `svg`, or `jemalloc`; default `pprof`). + +Works on Linux only, due to [jemalloc limitations](https://github.com/jemalloc/jemalloc/issues/26). + +## Go Profiling + +The Go standard library includes pprof profiling via HTTP API in +[`net/http/pprof`](https://pkg.go.dev/net/http/pprof). Expose it unauthenticated at +`/debug/pprof`. + +Works on Linux and macOS. + +### CPU + +Via `/debug/pprof/profile`. Parameters: + +* `debug`: profile output format (`0` is pprof, `1` or above is plaintext; default `0`). +* `seconds`: duration to collect profile over, in seconds (default `30`). + +Does not support a frequency parameter (see [#57488](https://github.com/golang/go/issues/57488)), +and defaults to 100 Hz. A lower frequency can be hardcoded via `SetCPUProfileRate`, but the default +is likely ok (estimated 1% overhead). + +### Memory + +Via `/debug/pprof/heap`. Parameters: + +* `seconds`: take a delta profile over the given duration, in seconds (default `0`). +* `gc`: if `1`, garbage collect before taking profile. + +## C Profiling + +[gperftools](https://github.com/gperftools/gperftools) provides in-process CPU and heap profiling +with pprof output. + +However, continuous profiling of PostgreSQL is expensive (many computes), and has limited value +since we don't own the internals anyway. + +Ad hoc profiling might still be useful, but the compute team considers existing tooling sufficient, +so this is not a priority at the moment. + +## Grafana Continuous Profiling + +[Grafana Alloy](https://grafana.com/docs/alloy/latest/) continually scrapes CPU and memory profiles +across the fleet, and archives them as time series. This can be used to analyze resource usage over +time, either in aggregate or zoomed in to specific events and nodes. + +Profiles are retained for 30 days. Profile ingestion volume for CPU+heap at 60-second intervals +is about 0.5 GB/node/day, or about $0.25/node/day = $7.5/node/month ($0.50/GB). + +It is currently enabled in [staging](https://neonstaging.grafana.net/a/grafana-pyroscope-app/profiles-explorer) +for Pageserver and Safekeeper. + +### Scraping + +* CPU profiling: 59 seconds at 19 Hz every 60 seconds. +* Heap profiling: heap snapshot with 2 MB frequency every 60 seconds. + +There are two main approaches that can be taken for CPU profiles: + +* Continuous low-frequency profiles (e.g. 19 Hz for 60 seconds every 60 seconds). +* Occasional high-frequency profiles (e.g. 99 Hz for 5 seconds every 60 seconds). + +We choose continuous low-frequency profiles where possible. This has a fixed low overhead, instead +of a spiky high overhead. It likely also gives a more representative view of resource usage. 
+However, a 19 Hz rate gives a minimum resolution of 52.6 ms per sample, which may be larger than the
+actual runtime of small functions. Note that Go does not support a frequency parameter, so we must
+use a fixed frequency for all profiles via `SetCPUProfileRate()` (default 100 Hz).
+
+Only one CPU profile can be taken at a time. With continuous profiling, one will always be running.
+To allow also taking an ad hoc CPU profile, the Rust endpoint supports a `force` query parameter to
+cancel a running profile and start a new one.
+
+### Overhead
+
+With Rust:
+
+* CPU profiles at 19 Hz frequency: 0.1% overhead.
+* Heap profiles at 2 MB frequency: 3% allocation overhead.
+* Profile call/encoding/symbolization: 20 ms every 60 seconds, or 0.03% of 1 CPU (for Pageserver).
+* Profile symbolization caches: 125 MB memory, or 0.4% of 32 GB (for Pageserver).
+
+Benchmarks with pprof-rs showed that the CPU time for taking a stack trace of a 40-frame stack was
+11 µs using the `frame-pointer` feature, and 1.4 µs using `libunwind` with DWARF. `libunwind` saw
+frequent seg faults, so we use `frame-pointer` and build binaries with frame pointers (negligible
+overhead).
+
+CPU profiles work by installing an `ITIMER_PROF` for the process, which triggers a `SIGPROF` signal
+after a given amount of cumulative CPU time across all CPUs. The signal handler will run for one
+of the currently executing threads and take a stack trace. Thus, a 19 Hz profile will take 1 stack
+trace every 52.6 ms CPU time -- assuming 11 µs for a stack trace, this is 0.02% overhead, but
+likely 0.1% in practice (given e.g. context switches).
+
+Heap profiles work by probabilistically taking a stack trace on allocations, adjusted for the
+allocation size. A 1 MB allocation takes about 15 µs in benchmarks, and a stack trace about 1 µs,
+so we can estimate that a 2 MB sampling frequency has about 3% allocation overhead -- this is
+consistent with benchmarks. This is significantly larger than CPU profiles, but mitigated by the
+fact that performance-sensitive code will avoid allocations as far as possible.
+
+Profile symbolization uses in-memory caches for symbol lookups. These take about 125 MB for
+Pageserver.
+
+## Alternatives Considered
+
+* eBPF profiles.
+  * Don't require instrumenting the binary.
+  * Use fewer resources.
+  * Can profile in kernel space too.
+  * Supported by Grafana.
+  * Less information about stack frames and spans.
+  * Limited tooling for local analysis.
+  * Does not support heap profiles.
+  * Does not work on macOS.
+
+* [Polar Signals](https://www.polarsignals.com) instead of Grafana.
+  * We already use Grafana for everything else. Appears good enough.

From 157743040896e59214260a78542899c07638f94d Mon Sep 17 00:00:00 2001
From: Vlad Lazar
Date: Wed, 15 Jan 2025 11:10:24 +0000
Subject: [PATCH 05/40] safekeeper: decode and interpret for multiple shards in one go (#10201)

## Problem

Currently, we call `InterpretedWalRecord::from_bytes_filtered`
from each shard. To serve multiple shards at the same time,
the API needs to allow for enquiring about multiple shards.

## Summary of changes

This commit tweaks it in a pretty brute force way. Naively, we could
just generate the shard for a key, but pre- and post-split shards may
be subscribed at the same time, so doing it efficiently is more complex.
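
A sketch of the resulting call pattern (names as introduced by this diff;
`shard_a` and `shard_b` stand in for real shard identities):

```rust
// Interpret one decoded WAL record for several shards in a single pass.
let shards = vec![shard_a, shard_b];
let mut per_shard = InterpretedWalRecord::from_bytes_filtered(
    recdata,
    &shards,
    next_record_lsn,
    pg_version,
)?;
// Each requested shard gets back its own filtered record.
let record_for_a = per_shard.remove(&shard_a).unwrap();
```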
--- Cargo.lock | 9 + libs/pageserver_api/src/shard.rs | 21 +- libs/postgres_ffi/src/walrecord.rs | 14 +- libs/wal_decoder/Cargo.toml | 15 ++ libs/wal_decoder/benches/README.md | 34 +++ .../benches/bench_interpret_wal.rs | 250 ++++++++++++++++++ libs/wal_decoder/src/decoder.rs | 140 ++++++---- libs/wal_decoder/src/models.rs | 54 ++-- libs/wal_decoder/src/serialized_batch.rs | 191 ++++++------- pageserver/src/import_datadir.rs | 15 +- .../walreceiver/walreceiver_connection.rs | 8 +- pageserver/src/walingest.rs | 4 +- safekeeper/src/send_interpreted_wal.rs | 8 +- 13 files changed, 574 insertions(+), 189 deletions(-) create mode 100644 libs/wal_decoder/benches/README.md create mode 100644 libs/wal_decoder/benches/bench_interpret_wal.rs diff --git a/Cargo.lock b/Cargo.lock index f543057933..0669899617 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7560,12 +7560,21 @@ dependencies = [ "anyhow", "async-compression", "bytes", + "camino", + "camino-tempfile", + "criterion", + "futures", "pageserver_api", "postgres_ffi", + "pprof", "prost", + "remote_storage", "serde", + "serde_json", "thiserror", + "tikv-jemallocator", "tokio", + "tokio-util", "tonic", "tonic-build", "tracing", diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index 4cc0a739e8..e03df02afb 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -31,6 +31,8 @@ //! - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive), //! and their slugs are 0004, 0104, 0204, and 0304. +use std::hash::{Hash, Hasher}; + use crate::{key::Key, models::ShardParameters}; use postgres_ffi::relfile_utils::INIT_FORKNUM; use serde::{Deserialize, Serialize}; @@ -48,6 +50,23 @@ pub struct ShardIdentity { layout: ShardLayout, } +/// Hash implementation +/// +/// The stripe size cannot change dynamically, so it can be ignored for efficiency reasons. 
+impl Hash for ShardIdentity {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        let ShardIdentity {
+            number,
+            count,
+            stripe_size: _,
+            layout: _,
+        } = self;
+
+        number.0.hash(state);
+        count.0.hash(state);
+    }
+}
+
 /// Stripe size in number of pages
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardStripeSize(pub u32);
@@ -59,7 +78,7 @@ impl Default for ShardStripeSize {
 }
 
 /// Layout version: for future upgrades where we might change how the key->shard mapping works
-#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Hash, Debug)]
 pub struct ShardLayout(u8);
 
 const LAYOUT_V1: ShardLayout = ShardLayout(1);
diff --git a/libs/postgres_ffi/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs
index b32106632a..fce37e2fdd 100644
--- a/libs/postgres_ffi/src/walrecord.rs
+++ b/libs/postgres_ffi/src/walrecord.rs
@@ -16,7 +16,7 @@ use utils::bin_ser::DeserializeError;
 use utils::lsn::Lsn;
 
 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlMultiXactCreate {
     pub mid: MultiXactId,
     /* new MultiXact's ID */
@@ -46,7 +46,7 @@ impl XlMultiXactCreate {
 }
 
 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlMultiXactTruncate {
     pub oldest_multi_db: Oid,
     /* to-be-truncated range of multixact offsets */
@@ -72,7 +72,7 @@ impl XlMultiXactTruncate {
 }
 
 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlRelmapUpdate {
     pub dbid: Oid,   /* database ID, or 0 for shared map */
     pub tsid: Oid,   /* database's tablespace, or pg_global */
@@ -90,7 +90,7 @@ impl XlRelmapUpdate {
 }
 
 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlReploriginDrop {
     pub node_id: RepOriginId,
 }
@@ -104,7 +104,7 @@ impl XlReploriginDrop {
 }
 
 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlReploriginSet {
     pub remote_lsn: Lsn,
     pub node_id: RepOriginId,
@@ -911,7 +911,7 @@ impl XlSmgrCreate {
 }
 
 #[repr(C)]
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Clone, Debug, Serialize, Deserialize)]
 pub struct XlSmgrTruncate {
     pub blkno: BlockNumber,
     pub rnode: RelFileNode,
@@ -984,7 +984,7 @@ impl XlDropDatabase {
 /// xl_xact_parsed_abort structs in PostgreSQL, but we use the same
 /// struct for commits and aborts.
/// -#[derive(Debug, Serialize, Deserialize)] +#[derive(Clone, Debug, Serialize, Deserialize)] pub struct XlXactParsedRecord { pub xid: TransactionId, pub info: u8, diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 8fac4e38ca..09c4afb18a 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -24,3 +24,18 @@ workspace_hack = { version = "0.1", path = "../../workspace_hack" } [build-dependencies] tonic-build.workspace = true + +[dev-dependencies] +criterion.workspace = true +camino.workspace = true +camino-tempfile.workspace = true +remote_storage.workspace = true +tokio-util.workspace = true +serde_json.workspace = true +futures.workspace = true +tikv-jemallocator.workspace = true +pprof.workspace = true + +[[bench]] +name = "bench_interpret_wal" +harness = false diff --git a/libs/wal_decoder/benches/README.md b/libs/wal_decoder/benches/README.md new file mode 100644 index 0000000000..14885afecf --- /dev/null +++ b/libs/wal_decoder/benches/README.md @@ -0,0 +1,34 @@ +## WAL Decoding and Interpretation Benchmarks + +Note that these benchmarks pull WAL from a public bucket in S3 +as a preparation step. Hence, you need a way to auth with AWS. +You can achieve this by copying the `~/.aws/config` file from +the AWS SSO notion page and exporting `AWS_PROFILE=dev` when invoking +the benchmarks. + +To run benchmarks: + +```sh +aws sso login --profile dev + +# All benchmarks. +AWS_PROFILE=dev cargo bench --package wal_decoder + +# Specific file. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal + +# Specific benchmark. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded + +# List available benchmarks. +cargo bench --package wal_decoder --benches -- --list + +# Generate flamegraph profiles using pprof-rs, profiling for 10 seconds. +# Output in target/criterion/*/profile/flamegraph.svg. +AWS_PROFILE=dev cargo bench --package wal_decoder --bench bench_interpret_wal unsharded -- --profile-time 10 +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. 
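+
+For example, to compare a branch against a saved baseline (the baseline name
+`main` is illustrative):
+
+```sh
+AWS_PROFILE=dev cargo bench --package wal_decoder -- --save-baseline main
+AWS_PROFILE=dev cargo bench --package wal_decoder -- --baseline main
+```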
diff --git a/libs/wal_decoder/benches/bench_interpret_wal.rs b/libs/wal_decoder/benches/bench_interpret_wal.rs
new file mode 100644
index 0000000000..846904cf87
--- /dev/null
+++ b/libs/wal_decoder/benches/bench_interpret_wal.rs
@@ -0,0 +1,250 @@
+use anyhow::Context;
+use criterion::{criterion_group, criterion_main, Criterion};
+use futures::{stream::FuturesUnordered, StreamExt};
+use pageserver_api::shard::{ShardIdentity, ShardStripeSize};
+use postgres_ffi::{waldecoder::WalStreamDecoder, MAX_SEND_SIZE, WAL_SEGMENT_SIZE};
+use pprof::criterion::{Output, PProfProfiler};
+use serde::Deserialize;
+use std::{env, num::NonZeroUsize, sync::Arc};
+
+use camino::{Utf8Path, Utf8PathBuf};
+use camino_tempfile::Utf8TempDir;
+use remote_storage::{
+    DownloadOpts, GenericRemoteStorage, ListingMode, RemoteStorageConfig, RemoteStorageKind,
+    S3Config,
+};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    lsn::Lsn,
+    shard::{ShardCount, ShardNumber},
+};
+use wal_decoder::models::InterpretedWalRecord;
+
+const S3_BUCKET: &str = "neon-github-public-dev";
+const S3_REGION: &str = "eu-central-1";
+const BUCKET_PREFIX: &str = "wal-snapshots/bulk-insert/";
+const METADATA_FILENAME: &str = "metadata.json";
+
+/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB.
+/// This mirrors the configuration in bin/safekeeper.rs.
+#[global_allocator]
+static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;
+
+#[allow(non_upper_case_globals)]
+#[export_name = "malloc_conf"]
+pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0";
+
+async fn create_s3_client() -> anyhow::Result<Arc<GenericRemoteStorage>> {
+    let remote_storage_config = RemoteStorageConfig {
+        storage: RemoteStorageKind::AwsS3(S3Config {
+            bucket_name: S3_BUCKET.to_string(),
+            bucket_region: S3_REGION.to_string(),
+            prefix_in_bucket: Some(BUCKET_PREFIX.to_string()),
+            endpoint: None,
+            concurrency_limit: NonZeroUsize::new(100).unwrap(),
+            max_keys_per_list_response: None,
+            upload_storage_class: None,
+        }),
+        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
+        small_timeout: RemoteStorageConfig::DEFAULT_SMALL_TIMEOUT,
+    };
+    Ok(Arc::new(
+        GenericRemoteStorage::from_config(&remote_storage_config)
+            .await
+            .context("remote storage init")?,
+    ))
+}
+
+async fn download_bench_data(
+    client: Arc<GenericRemoteStorage>,
+    cancel: &CancellationToken,
+) -> anyhow::Result<Utf8TempDir> {
+    let temp_dir_parent: Utf8PathBuf = env::current_dir().unwrap().try_into()?;
+    let temp_dir = camino_tempfile::tempdir_in(temp_dir_parent)?;
+
+    eprintln!("Downloading benchmark data to {:?}", temp_dir);
+
+    let listing = client
+        .list(None, ListingMode::NoDelimiter, None, cancel)
+        .await?;
+
+    let mut downloads = listing
+        .keys
+        .into_iter()
+        .map(|obj| {
+            let client = client.clone();
+            let temp_dir_path = temp_dir.path().to_owned();
+
+            async move {
+                let remote_path = obj.key;
+                let download = client
+                    .download(&remote_path, &DownloadOpts::default(), cancel)
+                    .await?;
+                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+
+                let file_name = remote_path.object_name().unwrap();
+                let file_path = temp_dir_path.join(file_name);
+                let file = tokio::fs::OpenOptions::new()
+                    .create(true)
+                    .truncate(true)
+                    .write(true)
+                    .open(&file_path)
+                    .await?;
+
+                let mut writer = tokio::io::BufWriter::new(file);
+                tokio::io::copy_buf(&mut body, &mut writer).await?;
+
+                Ok::<(), anyhow::Error>(())
+            }
+        })
+        .collect::<FuturesUnordered<_>>();
+
+    while let Some(download) = downloads.next().await {
+        download?;
+    }
+
+    Ok(temp_dir)
+}
+
+struct BenchmarkData {
+    wal: Vec<u8>,
+    meta: BenchmarkMetadata,
+}
+
+#[derive(Deserialize)]
+struct BenchmarkMetadata {
+    pg_version: u32,
+    start_lsn: Lsn,
+}
+
+async fn load_bench_data(path: &Utf8Path, input_size: usize) -> anyhow::Result<BenchmarkData> {
+    eprintln!("Loading benchmark data from {:?}", path);
+
+    let mut entries = tokio::fs::read_dir(path).await?;
+    let mut ordered_segment_paths = Vec::new();
+    let mut metadata = None;
+
+    while let Some(entry) = entries.next_entry().await? {
+        if entry.file_name() == METADATA_FILENAME {
+            let bytes = tokio::fs::read(entry.path()).await?;
+            metadata = Some(
+                serde_json::from_slice::<BenchmarkMetadata>(&bytes)
+                    .context("failed to deserialize metadata.json")?,
+            );
+        } else {
+            ordered_segment_paths.push(entry.path());
+        }
+    }
+
+    ordered_segment_paths.sort();
+
+    let mut buffer = Vec::new();
+    for path in ordered_segment_paths {
+        if buffer.len() >= input_size {
+            break;
+        }
+
+        use async_compression::tokio::bufread::ZstdDecoder;
+        let file = tokio::fs::File::open(path).await?;
+        let reader = tokio::io::BufReader::new(file);
+        let decoder = ZstdDecoder::new(reader);
+        let mut reader = tokio::io::BufReader::new(decoder);
+        tokio::io::copy_buf(&mut reader, &mut buffer).await?;
+    }
+
+    buffer.truncate(input_size);
+
+    Ok(BenchmarkData {
+        wal: buffer,
+        meta: metadata.unwrap(),
+    })
+}
+
+fn criterion_benchmark(c: &mut Criterion) {
+    const INPUT_SIZE: usize = 128 * 1024 * 1024;
+
+    let setup_runtime = tokio::runtime::Builder::new_current_thread()
+        .enable_all()
+        .build()
+        .unwrap();
+
+    let (_temp_dir, bench_data) = setup_runtime.block_on(async move {
+        let cancel = CancellationToken::new();
+        let client = create_s3_client().await.unwrap();
+        let temp_dir = download_bench_data(client, &cancel).await.unwrap();
+        let bench_data = load_bench_data(temp_dir.path(), INPUT_SIZE).await.unwrap();
+
+        (temp_dir, bench_data)
+    });
+
+    eprintln!(
+        "Benchmarking against {} MiB of WAL",
+        INPUT_SIZE / 1024 / 1024
+    );
+
+    let mut group = c.benchmark_group("decode-interpret-wal");
+    group.throughput(criterion::Throughput::Bytes(bench_data.wal.len() as u64));
+    group.sample_size(10);
+
+    group.bench_function("unsharded", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &[ShardIdentity::unsharded()]))
+    });
+
+    let eight_shards = (0..8)
+        .map(|i| ShardIdentity::new(ShardNumber(i), ShardCount(8), ShardStripeSize(8)).unwrap())
+        .collect::<Vec<_>>();
+
+    group.bench_function("8/8-shards", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &eight_shards))
+    });
+
+    let four_shards = eight_shards
+        .into_iter()
+        .filter(|s| s.number.0 % 2 == 0)
+        .collect::<Vec<_>>();
+    group.bench_function("4/8-shards", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &four_shards))
+    });
+
+    let two_shards = four_shards
+        .into_iter()
+        .filter(|s| s.number.0 % 4 == 0)
+        .collect::<Vec<_>>();
+    group.bench_function("2/8-shards", |b| {
+        b.iter(|| decode_interpret_main(&bench_data, &two_shards))
+    });
+}
+
+fn decode_interpret_main(bench: &BenchmarkData, shards: &[ShardIdentity]) {
+    let r = decode_interpret(bench, shards);
+    if let Err(e) = r {
+        panic!("{e:?}");
+    }
+}
+
+fn decode_interpret(bench: &BenchmarkData, shard: &[ShardIdentity]) -> anyhow::Result<()> {
+    let mut decoder = WalStreamDecoder::new(bench.meta.start_lsn, bench.meta.pg_version);
+    let xlogoff: usize = bench.meta.start_lsn.segment_offset(WAL_SEGMENT_SIZE);
+
+    for chunk in bench.wal[xlogoff..].chunks(MAX_SEND_SIZE) {
+        decoder.feed_bytes(chunk);
+        while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
+            assert!(lsn.is_aligned());
+            let _ = InterpretedWalRecord::from_bytes_filtered(
+                recdata,
+                shard,
+                lsn,
+                bench.meta.pg_version,
+            )
+            .unwrap();
+        }
+    }
+
+    Ok(())
+}
+criterion_group!(
+    name=benches;
+    config=Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
+    targets=criterion_benchmark
+);
+criterion_main!(benches);
diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs
index aa50c62911..ebb38ceb52 100644
--- a/libs/wal_decoder/src/decoder.rs
+++ b/libs/wal_decoder/src/decoder.rs
@@ -1,6 +1,8 @@
 //! This module contains logic for decoding and interpreting
 //! raw bytes which represent a raw Postgres WAL record.
 
+use std::collections::HashMap;
+
 use crate::models::*;
 use crate::serialized_batch::SerializedValueBatch;
 use bytes::{Buf, Bytes};
@@ -14,15 +16,15 @@ use utils::lsn::Lsn;
 
 impl InterpretedWalRecord {
     /// Decode and interpreted raw bytes which represent one Postgres WAL record.
-    /// Data blocks which do not match the provided shard identity are filtered out.
+    /// Data blocks which do not match any of the provided shard identities are filtered out.
     /// Shard 0 is a special case since it tracks all relation sizes. We only give it
     /// the keys that are being written as that is enough for updating relation sizes.
     pub fn from_bytes_filtered(
         buf: Bytes,
-        shard: &ShardIdentity,
+        shards: &[ShardIdentity],
         next_record_lsn: Lsn,
         pg_version: u32,
-    ) -> anyhow::Result<Self> {
+    ) -> anyhow::Result<HashMap<ShardIdentity, InterpretedWalRecord>> {
         let mut decoded = DecodedWALRecord::default();
         decode_wal_record(buf, &mut decoded, pg_version)?;
         let xid = decoded.xl_xid;
@@ -33,43 +35,57 @@ impl InterpretedWalRecord {
             FlushUncommittedRecords::No
         };
 
-        let metadata_record =
-            MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
-        let batch = SerializedValueBatch::from_decoded_filtered(
+        let mut shard_records: HashMap<ShardIdentity, InterpretedWalRecord> =
+            HashMap::with_capacity(shards.len());
+        for shard in shards {
+            shard_records.insert(
+                *shard,
+                InterpretedWalRecord {
+                    metadata_record: None,
+                    batch: SerializedValueBatch::default(),
+                    next_record_lsn,
+                    flush_uncommitted,
+                    xid,
+                },
+            );
+        }
+
+        MetadataRecord::from_decoded_filtered(
+            &decoded,
+            &mut shard_records,
+            next_record_lsn,
+            pg_version,
+        )?;
+        SerializedValueBatch::from_decoded_filtered(
             decoded,
-            shard,
+            &mut shard_records,
             next_record_lsn,
             pg_version,
         )?;
 
-        Ok(InterpretedWalRecord {
-            metadata_record,
-            batch,
-            next_record_lsn,
-            flush_uncommitted,
-            xid,
-        })
+        Ok(shard_records)
     }
 }
 
 impl MetadataRecord {
-    /// Builds a metadata record for this WAL record, if any.
+    /// Populates the given `shard_records` with metadata records from this WAL record, if any,
+    /// discarding those belonging to other shards.
     ///
-    /// Only metadata records relevant for the given shard are emitted. Currently, most metadata
+    /// Only metadata records relevant for the given shards are emitted. Currently, most metadata
     /// records are broadcast to all shards for simplicity, but this should be improved.
     fn from_decoded_filtered(
         decoded: &DecodedWALRecord,
-        shard: &ShardIdentity,
+        shard_records: &mut HashMap<ShardIdentity, InterpretedWalRecord>,
         next_record_lsn: Lsn,
         pg_version: u32,
-    ) -> anyhow::Result<Option<MetadataRecord>> {
+    ) -> anyhow::Result<()> {
         // Note: this doesn't actually copy the bytes since
         // the [`Bytes`] type implements it via a level of indirection.
         let mut buf = decoded.record.clone();
         buf.advance(decoded.main_data_offset);
 
         // First, generate metadata records from the decoded WAL record.
- let mut metadata_record = match decoded.xl_rmid { + let metadata_record = match decoded.xl_rmid { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { Self::decode_heapam_record(&mut buf, decoded, pg_version)? } @@ -112,41 +128,65 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - match metadata_record { - Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) => { - // Route VM page updates to the shards that own them. VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() - { - metadata_record = None + for (shard, record) in shard_records.iter_mut() { + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + let updated_old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + let updated_new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. + if updated_old_heap_blkno.is_some() || updated_new_heap_blkno.is_some() { + // Clone the record and update it for the current shard. + let mut for_shard = metadata_record.clone(); + match for_shard { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits( + ref mut clear_vm_bits, + )) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( + ref mut clear_vm_bits, + )), + ) => { + clear_vm_bits.old_heap_blkno = updated_old_heap_blkno; + clear_vm_bits.new_heap_blkno = updated_new_heap_blkno; + record.metadata_record = for_shard; + } + _ => { + unreachable!("for_shard is a clone of what we checked above") + } + } + } + } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if shard.is_shard_zero() { + record.metadata_record = metadata_record; + // No other shards should receive this record, so we stop traversing shards early. + break; + } + } + _ => { + // All other metadata records are sent to all shards. 
+ record.metadata_record = metadata_record.clone(); } } - Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { - // Filter LogicalMessage records (AUX files) to only be stored on shard zero - if !shard.is_shard_zero() { - metadata_record = None; - } - } - _ => {} } - Ok(metadata_record) + Ok(()) } fn decode_heapam_record( diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 6576dd0eba..8bfa48faac 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -48,7 +48,7 @@ pub mod proto { tonic::include_proto!("interpreted_wal"); } -#[derive(Serialize, Deserialize)] +#[derive(Copy, Clone, Serialize, Deserialize)] pub enum FlushUncommittedRecords { Yes, No, @@ -107,7 +107,7 @@ impl InterpretedWalRecord { /// The interpreted part of the Postgres WAL record which requires metadata /// writes to the underlying storage engine. -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MetadataRecord { Heapam(HeapamRecord), Neonrmgr(NeonrmgrRecord), @@ -123,12 +123,12 @@ pub enum MetadataRecord { Replorigin(ReploriginRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum HeapamRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClearVmBits { pub new_heap_blkno: Option, pub old_heap_blkno: Option, @@ -136,29 +136,29 @@ pub struct ClearVmBits { pub flags: u8, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum NeonrmgrRecord { ClearVmBits(ClearVmBits), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum SmgrRecord { Create(SmgrCreate), Truncate(XlSmgrTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct SmgrCreate { pub rel: RelTag, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum DbaseRecord { Create(DbaseCreate), Drop(DbaseDrop), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseCreate { pub db_id: Oid, pub tablespace_id: Oid, @@ -166,32 +166,32 @@ pub struct DbaseCreate { pub src_tablespace_id: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct DbaseDrop { pub db_id: Oid, pub tablespace_ids: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ClogRecord { ZeroPage(ClogZeroPage), Truncate(ClogTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogZeroPage { pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct ClogTruncate { pub pageno: u32, pub oldest_xid: TransactionId, pub oldest_xid_db: Oid, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XactRecord { Commit(XactCommon), Abort(XactCommon), @@ -200,7 +200,7 @@ pub enum XactRecord { Prepare(XactPrepare), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactCommon { pub parsed: XlXactParsedRecord, pub origin_id: u16, @@ -209,73 +209,73 @@ pub struct XactCommon { pub lsn: Lsn, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct XactPrepare { pub xl_xid: TransactionId, pub data: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum MultiXactRecord { ZeroPage(MultiXactZeroPage), 
Create(XlMultiXactCreate), Truncate(XlMultiXactTruncate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct MultiXactZeroPage { pub slru_kind: SlruKind, pub segno: u32, pub rpageno: u32, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum RelmapRecord { Update(RelmapUpdate), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RelmapUpdate { pub update: XlRelmapUpdate, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum XlogRecord { Raw(RawXlogRecord), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct RawXlogRecord { pub info: u8, pub lsn: Lsn, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum LogicalMessageRecord { Put(PutLogicalMessage), #[cfg(feature = "testing")] Failpoint, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct PutLogicalMessage { pub path: String, pub buf: Bytes, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum StandbyRecord { RunningXacts(StandbyRunningXacts), } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub struct StandbyRunningXacts { pub oldest_running_xid: TransactionId, } -#[derive(Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize)] pub enum ReploriginRecord { Set(XlReploriginSet), Drop(XlReploriginDrop), diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index af2b179e05..c70ff05b8e 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -5,7 +5,7 @@ //! Such batches are created from decoded PG wal records and ingested //! by the pageserver by writing directly to the ephemeral file. -use std::collections::BTreeSet; +use std::collections::{BTreeSet, HashMap}; use bytes::{Bytes, BytesMut}; use pageserver_api::key::rel_block_to_key; @@ -22,6 +22,8 @@ use utils::lsn::Lsn; use pageserver_api::key::Key; +use crate::models::InterpretedWalRecord; + static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// Accompanying metadata for the batch @@ -128,7 +130,8 @@ impl Default for SerializedValueBatch { } impl SerializedValueBatch { - /// Build a batch of serialized values from a decoded PG WAL record + /// Populates the given `shard_records` with value batches from this WAL record, if any, + /// discarding those belonging to other shards. /// /// The batch will only contain values for keys targeting the specifiec /// shard. Shard 0 is a special case, where any keys that don't belong to @@ -136,21 +139,20 @@ impl SerializedValueBatch { /// but absent from the raw buffer [`SerializedValueBatch::raw`]). pub(crate) fn from_decoded_filtered( decoded: DecodedWALRecord, - shard: &ShardIdentity, + shard_records: &mut HashMap, next_record_lsn: Lsn, pg_version: u32, - ) -> anyhow::Result { - // First determine how big the buffer needs to be and allocate it up-front. + ) -> anyhow::Result<()> { + // First determine how big the buffers need to be and allocate it up-front. // This duplicates some of the work below, but it's empirically much faster. 
- let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version); - let mut buf = Vec::::with_capacity(estimated_buffer_size); + for (shard, record) in shard_records.iter_mut() { + assert!(record.batch.is_empty()); + + let estimate = Self::estimate_buffer_size(&decoded, shard, pg_version); + record.batch.raw = Vec::with_capacity(estimate); + } - let mut metadata: Vec = Vec::with_capacity(decoded.blocks.len()); - let mut max_lsn: Lsn = Lsn(0); - let mut len: usize = 0; for blk in decoded.blocks.iter() { - let relative_off = buf.len() as u64; - let rel = RelTag { spcnode: blk.rnode_spcnode, dbnode: blk.rnode_dbnode, @@ -168,99 +170,98 @@ impl SerializedValueBatch { ); } - let key_is_local = shard.is_key_local(&key); + for (shard, record) in shard_records.iter_mut() { + let key_is_local = shard.is_key_local(&key); - tracing::debug!( - lsn=%next_record_lsn, - key=%key, - "ingest: shard decision {}", - if !key_is_local { "drop" } else { "keep" }, - ); + tracing::debug!( + lsn=%next_record_lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); - if !key_is_local { - if shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe - // its blkno in case it implicitly extends a relation. - metadata.push(ValueMeta::Observed(ObservedValueMeta { + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. + record + .batch + .metadata + .push(ValueMeta::Observed(ObservedValueMeta { + key: key.to_compact(), + lsn: next_record_lsn, + })) + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. + // + let val = if Self::block_is_image(&decoded, blk, pg_version) { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. 
+ // + if !page_is_new(&image) { + page_set_lsn(&mut image, next_record_lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + let relative_off = record.batch.raw.len() as u64; + + val.ser_into(&mut record.batch.raw) + .expect("Writing into in-memory buffer is infallible"); + + let val_ser_size = record.batch.raw.len() - relative_off as usize; + + record + .batch + .metadata + .push(ValueMeta::Serialized(SerializedValueMeta { key: key.to_compact(), lsn: next_record_lsn, - })) - } - - continue; + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + })); + record.batch.max_lsn = std::cmp::max(record.batch.max_lsn, next_record_lsn); + record.batch.len += 1; } - - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. - // - let val = if Self::block_is_image(&decoded, blk, pg_version) { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - // TODO(vlad): skip the copy - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. - // - if !page_is_new(&image) { - page_set_lsn(&mut image, next_record_lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - Value::Image(image.freeze()) - } else { - Value::WalRecord(NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }) - }; - - val.ser_into(&mut buf) - .expect("Writing into in-memory buffer is infallible"); - - let val_ser_size = buf.len() - relative_off as usize; - - metadata.push(ValueMeta::Serialized(SerializedValueMeta { - key: key.to_compact(), - lsn: next_record_lsn, - batch_offset: relative_off, - len: val_ser_size, - will_init: val.will_init(), - })); - max_lsn = std::cmp::max(max_lsn, next_record_lsn); - len += 1; } if cfg!(any(debug_assertions, test)) { - let batch = Self { - raw: buf, - metadata, - max_lsn, - len, - }; - - batch.validate_lsn_order(); - - return Ok(batch); + // Validate that the batches are correct + for record in shard_records.values() { + record.batch.validate_lsn_order(); + } } - Ok(Self { - raw: buf, - metadata, - max_lsn, - len, - }) + Ok(()) } /// Look into the decoded PG WAL record and determine diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index c061714010..a73fa5cec8 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -278,6 +278,8 @@ async fn import_wal( let mut walingest = WalIngest::new(tline, startpoint, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; + while last_lsn <= endpoint { // FIXME: assume postgresql tli 1 for now let filename = XLogFileName(1, segno, WAL_SEGMENT_SIZE); @@ -314,10 +316,12 @@ async fn import_wal( if let Some((lsn, recdata)) = waldecoder.poll_decode()? 
{ let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? + .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) @@ -411,6 +415,7 @@ pub async fn import_wal_from_tar( let mut offset = start_lsn.segment_offset(WAL_SEGMENT_SIZE); let mut last_lsn = start_lsn; let mut walingest = WalIngest::new(tline, start_lsn, ctx).await?; + let shard = vec![*tline.get_shard_identity()]; // Ingest wal until end_lsn info!("importing wal until {}", end_lsn); @@ -459,10 +464,12 @@ pub async fn import_wal_from_tar( if let Some((lsn, recdata)) = waldecoder.poll_decode()? { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - tline.get_shard_identity(), + &shard, lsn, tline.pg_version, - )?; + )? + .remove(tline.get_shard_identity()) + .unwrap(); walingest .ingest_record(interpreted, &mut modification, ctx) diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 3a8796add8..129b987e57 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -264,6 +264,8 @@ pub(super) async fn handle_walreceiver_connection( let mut walingest = WalIngest::new(timeline.as_ref(), startpoint, &ctx).await?; + let shard = vec![*timeline.get_shard_identity()]; + let interpreted_proto_config = match protocol { PostgresClientProtocol::Vanilla => None, PostgresClientProtocol::Interpreted { @@ -476,10 +478,12 @@ pub(super) async fn handle_walreceiver_connection( // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &shard, next_record_lsn, modification.tline.pg_version, - )?; + )? + .remove(timeline.get_shard_identity()) + .unwrap(); if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 7253af8507..ad7bcc0714 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -2163,10 +2163,12 @@ mod tests { while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - modification.tline.get_shard_identity(), + &[*modification.tline.get_shard_identity()], lsn, modification.tline.pg_version, ) + .unwrap() + .remove(modification.tline.get_shard_identity()) .unwrap(); walingest diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index 7d215176dd..a718c16a6a 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -57,6 +57,7 @@ impl InterpretedWalSender<'_, IO> { keepalive_ticker.reset(); let (tx, mut rx) = tokio::sync::mpsc::channel::(2); + let shard = vec![self.shard]; loop { tokio::select! { @@ -80,14 +81,17 @@ impl InterpretedWalSender<'_, IO> { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); + // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - &self.shard, + &shard, next_record_lsn, self.pg_version, ) - .with_context(|| "Failed to interpret WAL")?; + .with_context(|| "Failed to interpret WAL")? 
+                .remove(&self.shard)
+                .unwrap();

                         if !interpreted.is_empty() {
                             records.push(interpreted);

From b9464865b619d35257500bfade7651e3adfc07e4 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin
Date: Wed, 15 Jan 2025 13:05:05 +0000
Subject: [PATCH 06/40] benchmarks: report successful runs to slack as well
 (#10393)

## Problem

Successful `benchmarks` runs don't have enough visibility

Ref https://neondb.slack.com/archives/C069Z2199DL/p1736868055094539

## Summary of changes

- Report both successful and failed `benchmarks` runs to Slack
- Update the `slackapi/slack-github-action` action
---
 .github/actionlint.yml               |  1 +
 .github/workflows/build_and_test.yml | 25 +++++++++++--------------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/.github/actionlint.yml b/.github/actionlint.yml
index 7a97e2ae55..aec5b4ee75 100644
--- a/.github/actionlint.yml
+++ b/.github/actionlint.yml
@@ -25,3 +25,4 @@ config-variables:
   - PGREGRESS_PG17_PROJECT_ID
   - SLACK_ON_CALL_QA_STAGING_STREAM
   - DEV_AWS_OIDC_ROLE_MANAGE_BENCHMARK_EC2_VMS_ARN
+  - SLACK_ON_CALL_STORAGE_STAGING_STREAM

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index cd95a5b16d..489a93f46d 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -346,25 +346,22 @@ jobs:
   # XXX: no coverage data handling here, since benchmarks are run on release builds,
   # while coverage is currently collected for the debug ones

-  report-benchmarks-failures:
+  report-benchmarks-results-to-slack:
     needs: [ benchmarks, create-test-report ]
-    if: github.ref_name == 'main' && failure() && needs.benchmarks.result == 'failure'
-    permissions:
-      id-token: write # aws-actions/configure-aws-credentials
-      statuses: write
-      contents: write
-      pull-requests: write
+    if: github.ref_name == 'main' && !cancelled() && contains(fromJSON('["success", "failure"]'), needs.benchmarks.result)
     runs-on: ubuntu-22.04
     steps:
-      - uses: slackapi/slack-github-action@v1
+      - uses: slackapi/slack-github-action@v2
         with:
-          channel-id: C060CNA47S9 # on-call-staging-storage-stream
-          slack-message: |
-            Benchmarks failed on main <${{ github.event.head_commit.url }}|${{ github.sha }}>
-            <${{ needs.create-test-report.outputs.report-url }}|Allure report>
-        env:
-          SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
+          method: chat.postMessage
+          token: ${{ secrets.SLACK_BOT_TOKEN }}
+          payload: |
+            channel: "${{ vars.SLACK_ON_CALL_STORAGE_STAGING_STREAM }}"
+            text: |
+              Benchmarks on main: *${{ needs.benchmarks.result }}*
+              - <${{ needs.create-test-report.outputs.report-url }}|Allure report>
+              - <${{ github.event.head_commit.url }}|${{ github.sha }}>

   create-test-report:
     needs: [ check-permissions, build-and-test-locally, coverage-report, build-build-tools-image, benchmarks ]

From 05a71c7d6a14d471dcbce9d9d27d5eed124c9947 Mon Sep 17 00:00:00 2001
From: Arseny Sher
Date: Wed, 15 Jan 2025 17:16:04 +0300
Subject: [PATCH 07/40] safekeeper: add membership configuration switch
 endpoint (#10241)

## Problem

https://github.com/neondatabase/neon/issues/9965

## Summary of changes

Add an HTTP endpoint to safekeeper for switching the membership
configuration. Also add it to the Python client for tests, and add a
simple test for it.
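For illustration, driving the new endpoint from a test looks roughly like
this (a minimal sketch based on the fixtures added in this PR; the `env` and
timeline setup, and the generation numbers, are assumed):

```python
from fixtures.safekeeper.http import Configuration, SafekeeperId

sk = env.safekeepers[0]
http_cli = sk.http_client()
sk_id = SafekeeperId(sk.id, "localhost", sk.port.pg_tenant_only)

# Ask the safekeeper to switch to a higher-generation configuration.
# The response reports both the previous and the current configuration.
conf = Configuration(generation=2, members=[sk_id], new_members=None)
resp = http_cli.membership_switch(tenant_id, timeline_id, conf)
assert resp.current_conf.generation == 2
```

A request whose generation is not higher than the current one is ignored, and
the response echoes the unchanged current configuration.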
--- libs/safekeeper_api/src/membership.rs | 2 + libs/safekeeper_api/src/models.rs | 15 ++++++ safekeeper/src/http/routes.rs | 28 +++++++++++ safekeeper/src/state.rs | 30 ++++++++++- safekeeper/src/timeline.rs | 20 +++++++- test_runner/fixtures/safekeeper/http.py | 43 +++++++++++++++- test_runner/regress/test_wal_acceptor.py | 64 +++++++++++++++++++++++- 7 files changed, 198 insertions(+), 4 deletions(-) diff --git a/libs/safekeeper_api/src/membership.rs b/libs/safekeeper_api/src/membership.rs index fe30204545..a39fda526f 100644 --- a/libs/safekeeper_api/src/membership.rs +++ b/libs/safekeeper_api/src/membership.rs @@ -23,6 +23,8 @@ pub const INITIAL_GENERATION: Generation = 1; pub struct SafekeeperId { pub id: NodeId, pub host: String, + /// We include here only port for computes -- that is, pg protocol tenant + /// only port, or wide pg protocol port if the former is not configured. pub pg_port: u16, } diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs index ad38986357..a6f90154f4 100644 --- a/libs/safekeeper_api/src/models.rs +++ b/libs/safekeeper_api/src/models.rs @@ -175,6 +175,7 @@ pub enum WalReceiverStatus { pub struct TimelineStatus { pub tenant_id: TenantId, pub timeline_id: TimelineId, + pub mconf: Configuration, pub acceptor_state: AcceptorStateStatus, pub pg_info: ServerInfo, pub flush_lsn: Lsn, @@ -189,6 +190,20 @@ pub struct TimelineStatus { pub walreceivers: Vec, } +/// Request to switch membership configuration. +#[derive(Serialize, Deserialize)] +#[serde(transparent)] +pub struct TimelineMembershipSwitchRequest { + pub mconf: Configuration, +} + +/// In response both previous and current configuration are sent. +#[derive(Serialize, Deserialize)] +pub struct TimelineMembershipSwitchResponse { + pub previous_conf: Configuration, + pub current_conf: Configuration, +} + fn lsn_invalid() -> Lsn { Lsn::INVALID } diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 3835d39698..5ecde4b125 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -1,4 +1,5 @@ use hyper::{Body, Request, Response, StatusCode}; +use safekeeper_api::models; use safekeeper_api::models::AcceptorStateStatus; use safekeeper_api::models::SafekeeperStatus; use safekeeper_api::models::TermSwitchApiEntry; @@ -183,6 +184,7 @@ async fn timeline_status_handler(request: Request) -> Result) -> Result, +) -> Result, ApiError> { + let ttid = TenantTimelineId::new( + parse_request_param(&request, "tenant_id")?, + parse_request_param(&request, "timeline_id")?, + ); + check_permission(&request, Some(ttid.tenant_id))?; + + let global_timelines = get_global_timelines(&request); + let tli = global_timelines.get(ttid).map_err(ApiError::from)?; + + let data: models::TimelineMembershipSwitchRequest = json_request(&mut request).await?; + let response = tli + .membership_switch(data.mconf) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, response) +} + async fn timeline_copy_handler(mut request: Request) -> Result, ApiError> { check_permission(&request, None)?; @@ -619,6 +643,10 @@ pub fn make_router( "/v1/tenant/:tenant_id/timeline/:timeline_id/snapshot/:destination_id", |r| request_span(r, timeline_snapshot_handler), ) + .post( + "/v1/tenant/:tenant_id/timeline/:timeline_id/membership", + |r| request_span(r, timeline_membership_handler), + ) .post( "/v1/tenant/:tenant_id/timeline/:source_timeline_id/copy", |r| request_span(r, timeline_copy_handler), diff --git a/safekeeper/src/state.rs 
b/safekeeper/src/state.rs index 1c3bb1b4dc..4d566b12a0 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -6,9 +6,12 @@ use std::{cmp::max, ops::Deref, time::SystemTime}; use anyhow::{bail, Result}; use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::{ - membership::Configuration, models::TimelineTermBumpResponse, ServerInfo, Term, INITIAL_TERM, + membership::Configuration, + models::{TimelineMembershipSwitchResponse, TimelineTermBumpResponse}, + ServerInfo, Term, INITIAL_TERM, }; use serde::{Deserialize, Serialize}; +use tracing::info; use utils::{ id::{TenantId, TenantTimelineId, TimelineId}, lsn::Lsn, @@ -258,6 +261,31 @@ where current_term: after, }) } + + /// Switch into membership configuration `to` if it is higher than the + /// current one. + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + let before = self.mconf.clone(); + // Is switch allowed? + if to.generation <= self.mconf.generation { + info!( + "ignoring request to switch membership conf to lower {}, current conf {}", + to, self.mconf + ); + } else { + let mut state = self.start_change(); + state.mconf = to.clone(); + self.finish_change(&state).await?; + info!("switched membership conf to {} from {}", to, before); + } + Ok(TimelineMembershipSwitchResponse { + previous_conf: before, + current_conf: self.mconf.clone(), + }) + } } impl Deref for TimelineState diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 36860a0da2..2882391074 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -4,7 +4,10 @@ use anyhow::{anyhow, bail, Result}; use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; -use safekeeper_api::models::{PeerInfo, TimelineTermBumpResponse}; +use safekeeper_api::membership::Configuration; +use safekeeper_api::models::{ + PeerInfo, TimelineMembershipSwitchResponse, TimelineTermBumpResponse, +}; use safekeeper_api::Term; use tokio::fs::{self}; use tokio_util::sync::CancellationToken; @@ -188,6 +191,13 @@ impl StateSK { self.state_mut().term_bump(to).await } + pub async fn membership_switch( + &mut self, + to: Configuration, + ) -> Result { + self.state_mut().membership_switch(to).await + } + /// Close open WAL files to release FDs. 
fn close_wal_store(&mut self) { if let StateSK::Loaded(sk) = self { @@ -768,6 +778,14 @@ impl Timeline { state.sk.term_bump(to).await } + pub async fn membership_switch( + self: &Arc, + to: Configuration, + ) -> Result { + let mut state = self.write_shared_state().await; + state.sk.membership_switch(to).await + } + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] async fn do_wal_residence_guard( self: &Arc, diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 4826cae3ee..493ce7334e 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -25,6 +25,7 @@ class Walreceiver: @dataclass class SafekeeperTimelineStatus: + mconf: Configuration | None term: int last_log_term: int pg_version: int # Not exactly a PgVersion, safekeeper returns version as int, for example 150002 for 15.2 @@ -73,7 +74,7 @@ class TermBumpResponse: class SafekeeperId: id: int host: str - pg_port: str + pg_port: int @dataclass @@ -82,6 +83,16 @@ class Configuration: members: list[SafekeeperId] new_members: list[SafekeeperId] | None + @classmethod + def from_json(cls, d: dict[str, Any]) -> Configuration: + generation = d["generation"] + members = d["members"] + new_members = d.get("new_members") + return Configuration(generation, members, new_members) + + def to_json(self) -> str: + return json.dumps(self, cls=EnhancedJSONEncoder) + @dataclass class TimelineCreateRequest: @@ -97,6 +108,18 @@ class TimelineCreateRequest: return json.dumps(self, cls=EnhancedJSONEncoder) +@dataclass +class TimelineMembershipSwitchResponse: + previous_conf: Configuration + current_conf: Configuration + + @classmethod + def from_json(cls, d: dict[str, Any]) -> TimelineMembershipSwitchResponse: + previous_conf = Configuration.from_json(d["previous_conf"]) + current_conf = Configuration.from_json(d["current_conf"]) + return TimelineMembershipSwitchResponse(previous_conf, current_conf) + + class SafekeeperHttpClient(requests.Session, MetricsGetter): HTTPError = requests.HTTPError @@ -170,7 +193,10 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): res.raise_for_status() resj = res.json() walreceivers = [Walreceiver(wr["conn_id"], wr["status"]) for wr in resj["walreceivers"]] + # It is always normally not None, it is allowed only to make forward compat tests happy. + mconf = Configuration.from_json(resj["mconf"]) if "mconf" in resj else None return SafekeeperTimelineStatus( + mconf=mconf, term=resj["acceptor_state"]["term"], last_log_term=resj["acceptor_state"]["epoch"], pg_version=resj["pg_info"]["pg_version"], @@ -196,6 +222,11 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn + # Get timeline membership configuration. + def get_membership(self, tenant_id: TenantId, timeline_id: TimelineId) -> Configuration: + # make mypy happy + return self.timeline_status(tenant_id, timeline_id).mconf # type: ignore + # only_local doesn't remove segments in the remote storage. 
def timeline_delete( self, tenant_id: TenantId, timeline_id: TimelineId, only_local: bool = False @@ -242,6 +273,16 @@ class SafekeeperHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, dict) return res_json + def membership_switch( + self, tenant_id: TenantId, timeline_id: TimelineId, to: Configuration + ) -> TimelineMembershipSwitchResponse: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/membership", + data=to.to_json(), + ) + res.raise_for_status() + return TimelineMembershipSwitchResponse.from_json(res.json()) + def copy_timeline(self, tenant_id: TenantId, timeline_id: TimelineId, body: dict[str, Any]): res = self.post( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/copy", diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d39c6a6b5b..2b6a267bdf 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -48,7 +48,12 @@ from fixtures.remote_storage import ( default_remote_storage, s3_storage, ) -from fixtures.safekeeper.http import Configuration, SafekeeperHttpClient, TimelineCreateRequest +from fixtures.safekeeper.http import ( + Configuration, + SafekeeperHttpClient, + SafekeeperId, + TimelineCreateRequest, +) from fixtures.safekeeper.utils import wait_walreceivers_absent from fixtures.utils import ( PropagatingThread, @@ -2243,6 +2248,63 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) +# Basic test for http API membership related calls: create timeline and switch +# configuration. Normally these are called by storage controller, but this +# allows to test them separately. +@run_only_on_default_postgres("tests only safekeeper API") +def test_membership_api(neon_env_builder: NeonEnvBuilder): + neon_env_builder.num_safekeepers = 1 + env = neon_env_builder.init_start() + + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + sk = env.safekeepers[0] + http_cli = sk.http_client() + + sk_id_1 = SafekeeperId(env.safekeepers[0].id, "localhost", sk.port.pg_tenant_only) + sk_id_2 = SafekeeperId(11, "localhost", 5434) # just a mock + + # Request to switch before timeline creation should fail. + init_conf = Configuration(generation=1, members=[sk_id_1], new_members=None) + with pytest.raises(requests.exceptions.HTTPError): + http_cli.membership_switch(tenant_id, timeline_id, init_conf) + + # Create timeline. + create_r = TimelineCreateRequest( + tenant_id, timeline_id, init_conf, 150002, Lsn("0/1000000"), commit_lsn=None + ) + log.info(f"sending {create_r.to_json()}") + http_cli.timeline_create(create_r) + + # Switch into some conf. + joint_conf = Configuration(generation=4, members=[sk_id_1], new_members=[sk_id_2]) + resp = http_cli.membership_switch(tenant_id, timeline_id, joint_conf) + log.info(f"joint switch resp: {resp}") + assert resp.previous_conf.generation == 1 + assert resp.current_conf.generation == 4 + + # Restart sk, conf should be preserved. + sk.stop().start() + after_restart = http_cli.get_membership(tenant_id, timeline_id) + log.info(f"conf after restart: {after_restart}") + assert after_restart.generation == 4 + + # Switch into disjoint conf. 
+ non_joint = Configuration(generation=5, members=[sk_id_2], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, non_joint) + log.info(f"non joint switch resp: {resp}") + assert resp.previous_conf.generation == 4 + assert resp.current_conf.generation == 5 + + # Switch request to lower conf should be ignored. + lower_conf = Configuration(generation=3, members=[], new_members=None) + resp = http_cli.membership_switch(tenant_id, timeline_id, lower_conf) + log.info(f"lower switch resp: {resp}") + assert resp.previous_conf.generation == 5 + assert resp.current_conf.generation == 5 + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 3e529f124f45f20f0e10948b4b4a8ba881b59b06 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 15 Jan 2025 09:29:52 -0600 Subject: [PATCH 08/40] Remove leading slashes when downloading remote files (#10396) Signed-off-by: Tristan Partin --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 ++++---- 5 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 210a0ba3af..46082f2088 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 210a0ba3afd8134ea910b203f274b165bd4f05d7 +Subproject commit 46082f20884f087a2d974b33ac65d63af26142bd diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index d3141e17a7..dd0b28d6fb 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit d3141e17a7155e3d07c8deba4a10c748a29ba1e6 +Subproject commit dd0b28d6fbad39e227f3b77296fcca879af8b3a9 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index f63b141cfb..d674efd776 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit f63b141cfb0c813725a6b2574049565bff643018 +Subproject commit d674efd776f59d78e8fa1535bd2f95c3e6984fca diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 0f8da73ed0..a8dd6e779d 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 0f8da73ed08d4fc4ee58cccea008c75bfb20baa8 +Subproject commit a8dd6e779dde907778006adb436b557ad652fb97 diff --git a/vendor/revisions.json b/vendor/revisions.json index b4d57ab709..c899dbaa5a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.2", - "0f8da73ed08d4fc4ee58cccea008c75bfb20baa8" + "a8dd6e779dde907778006adb436b557ad652fb97" ], "v16": [ "16.6", - "f63b141cfb0c813725a6b2574049565bff643018" + "d674efd776f59d78e8fa1535bd2f95c3e6984fca" ], "v15": [ "15.10", - "d3141e17a7155e3d07c8deba4a10c748a29ba1e6" + "dd0b28d6fbad39e227f3b77296fcca879af8b3a9" ], "v14": [ "14.15", - "210a0ba3afd8134ea910b203f274b165bd4f05d7" + "46082f20884f087a2d974b33ac65d63af26142bd" ] } From dbebede7bf5ff0cefd303f266781457b7472d070 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 15 Jan 2025 15:33:54 +0000 Subject: [PATCH 09/40] safekeeper: fan out from single wal reader to multiple shards (#10190) ## Problem Safekeepers currently decode and interpret WAL for each shard separately. This is wasteful in terms of CPU memory usage - we've seen this in profiles. ## Summary of changes Fan-out interpreted WAL to multiple shards. 
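The sketch below shows the intended wiring (simplified, using the types added
in this PR; the reader setup, shard identities and error handling are elided):

```rust
// One reader task per WAL stream: it decodes and interprets the WAL once...
let (tx, rx) = tokio::sync::mpsc::channel::<Batch>(2);
let reader = InterpretedWalReader::spawn(
    wal_stream, start_pos, tx, shard_0, pg_version, &appname,
);

// ...and additional shards attach to the same reader instead of spawning
// their own, each receiving only the records relevant to that shard.
// (rx / rx_1 are handed to the per-shard InterpretedWalSenders.)
let (tx_1, rx_1) = tokio::sync::mpsc::channel::<Batch>(2);
reader.fanout(shard_1, tx_1, start_pos)?;
```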
The basic idea is that WAL decoding and interpretation happen in a separate
tokio task, and senders attach to it. Senders only receive batches concerning
their shard and only past the Lsn they've last seen.

Fan-out is gated behind the `wal_reader_fanout` safekeeper flag (disabled by
default for now).

When fan-out is enabled, it might be desirable to control the absolute delta
between the current position and a new shard's desired position (i.e. how far
behind or ahead a shard may be). `max_delta_for_fanout` is a new optional
safekeeper flag which dictates whether to create a new WAL reader or attach
to the existing one. By default, this behaviour is disabled. Let's consider
enabling it if we spot the need for it in the field.

## Testing

Tests passed [here](https://github.com/neondatabase/neon/pull/10301) with wal
reader fanout enabled as of
https://github.com/neondatabase/neon/pull/10190/commits/34f6a717182c431847bbd5b7828fd0f89027b2be.

Related: https://github.com/neondatabase/neon/issues/9337
Epic: https://github.com/neondatabase/neon/issues/9329
---
 Cargo.lock                               |   3 +
 libs/safekeeper_api/Cargo.toml           |   1 +
 libs/safekeeper_api/src/models.rs        |  20 +-
 libs/wal_decoder/src/models.rs           |   2 +-
 libs/wal_decoder/src/serialized_batch.rs |   8 +-
 safekeeper/Cargo.toml                    |   3 +
 safekeeper/src/bin/safekeeper.rs         |   9 +
 safekeeper/src/http/routes.rs            |   2 +-
 safekeeper/src/lib.rs                    |   4 +
 safekeeper/src/metrics.rs                |  32 +-
 safekeeper/src/send_interpreted_wal.rs   | 765 ++++++++++++++++--
 safekeeper/src/send_wal.rs               | 354 ++++++--
 safekeeper/src/test_utils.rs             |  65 +-
 safekeeper/src/timeline.rs               |  14 +-
 safekeeper/src/wal_reader_stream.rs      | 396 ++++++---
 .../tests/walproposer_sim/safekeeper.rs  |   2 +
 16 files changed, 1410 insertions(+), 270 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 0669899617..afe16ff848 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5655,6 +5655,7 @@ dependencies = [
 "crc32c",
 "criterion",
 "desim",
+ "env_logger 0.10.2",
 "fail",
 "futures",
 "hex",
@@ -5683,6 +5684,7 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
+ "smallvec",
 "storage_broker",
 "strum",
 "strum_macros",
@@ -5709,6 +5711,7 @@ version = "0.1.0"
 dependencies = [
 "anyhow",
 "const_format",
+ "pageserver_api",
 "postgres_ffi",
 "pq_proto",
 "serde",

diff --git a/libs/safekeeper_api/Cargo.toml b/libs/safekeeper_api/Cargo.toml
index 7652c3d413..6b72ace019 100644
--- a/libs/safekeeper_api/Cargo.toml
+++ b/libs/safekeeper_api/Cargo.toml
@@ -13,3 +13,4 @@ postgres_ffi.workspace = true
 pq_proto.workspace = true
 tokio.workspace = true
 utils.workspace = true
+pageserver_api.workspace = true

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index a6f90154f4..b5fa903820 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -1,5 +1,6 @@
 //! Types used in safekeeper http API. Many of them are also reused internally.

+use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::TimestampTz;
 use serde::{Deserialize, Serialize};
 use std::net::SocketAddr;
@@ -146,7 +147,13 @@ pub type ConnectionId = u32;

 /// Serialize is used only for json'ing in API response. Also used internally.
#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct WalSenderState { +pub enum WalSenderState { + Vanilla(VanillaWalSenderState), + Interpreted(InterpretedWalSenderState), +} + +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct VanillaWalSenderState { pub ttid: TenantTimelineId, pub addr: SocketAddr, pub conn_id: ConnectionId, @@ -155,6 +162,17 @@ pub struct WalSenderState { pub feedback: ReplicationFeedback, } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct InterpretedWalSenderState { + pub ttid: TenantTimelineId, + pub shard: ShardIdentity, + pub addr: SocketAddr, + pub conn_id: ConnectionId, + // postgres application_name + pub appname: Option, + pub feedback: ReplicationFeedback, +} + #[derive(Debug, Clone, Serialize, Deserialize)] pub struct WalReceiverState { /// None means it is recovery initiated by us (this safekeeper). diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 8bfa48faac..c2f9125b21 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -64,7 +64,7 @@ pub struct InterpretedWalRecords { } /// An interpreted Postgres WAL record, ready to be handled by the pageserver -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct InterpretedWalRecord { /// Optional metadata record - may cause writes to metadata keys /// in the storage engine diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs index c70ff05b8e..d76f75f51f 100644 --- a/libs/wal_decoder/src/serialized_batch.rs +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -32,7 +32,7 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); /// relation sizes. In the case of "observed" values, we only need to know /// the key and LSN, so two types of metadata are supported to save on network /// bandwidth. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub enum ValueMeta { Serialized(SerializedValueMeta), Observed(ObservedValueMeta), @@ -79,7 +79,7 @@ impl PartialEq for OrderedValueMeta { impl Eq for OrderedValueMeta {} /// Metadata for a [`Value`] serialized into the batch. -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct SerializedValueMeta { pub key: CompactKey, pub lsn: Lsn, @@ -91,14 +91,14 @@ pub struct SerializedValueMeta { } /// Metadata for a [`Value`] observed by the batch -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Clone)] pub struct ObservedValueMeta { pub key: CompactKey, pub lsn: Lsn, } /// Batch of serialized [`Value`]s. 
-#[derive(Serialize, Deserialize)]
+#[derive(Serialize, Deserialize, Clone)]
 pub struct SerializedValueBatch {
     /// [`Value`]s serialized in EphemeralFile's native format,
     /// ready for disk write by the pageserver

diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml
index 3ebb7097f2..0eb511f1cc 100644
--- a/safekeeper/Cargo.toml
+++ b/safekeeper/Cargo.toml
@@ -26,6 +26,7 @@ hex.workspace = true
 humantime.workspace = true
 http.workspace = true
 hyper0.workspace = true
+itertools.workspace = true
 futures.workspace = true
 once_cell.workspace = true
 parking_lot.workspace = true
@@ -39,6 +40,7 @@ scopeguard.workspace = true
 reqwest = { workspace = true, features = ["json"] }
 serde.workspace = true
 serde_json.workspace = true
+smallvec.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 thiserror.workspace = true
@@ -63,6 +65,7 @@ storage_broker.workspace = true
 tokio-stream.workspace = true
 utils.workspace = true
 wal_decoder.workspace = true
+env_logger.workspace = true

 workspace_hack.workspace = true

diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index bc7af02185..6cc53e0d23 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -207,6 +207,13 @@ struct Args {
     /// Also defines interval for eviction retries.
     #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)]
     eviction_min_resident: Duration,
+    /// Enable fanning out WAL to different shards from the same reader
+    #[arg(long)]
+    wal_reader_fanout: bool,
+    /// Only fan out the WAL reader if the absolute delta between the new requested position
+    /// and the current position of the reader is smaller than this value.
+    #[arg(long)]
+    max_delta_for_fanout: Option<u64>,
 }

 // Like PathBufValueParser, but allows empty string.
@@ -370,6 +377,8 @@ async fn main() -> anyhow::Result<()> { control_file_save_interval: args.control_file_save_interval, partial_backup_concurrency: args.partial_backup_concurrency, eviction_min_resident: args.eviction_min_resident, + wal_reader_fanout: args.wal_reader_fanout, + max_delta_for_fanout: args.max_delta_for_fanout, }); // initialize sentry if SENTRY_DSN is provided diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 5ecde4b125..4b9fb9eb67 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -195,7 +195,7 @@ async fn timeline_status_handler(request: Request) -> Result, } impl SafeKeeperConf { @@ -150,6 +152,8 @@ impl SafeKeeperConf { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, } } } diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index 5883f402c7..3ea9e3d674 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,9 +12,9 @@ use metrics::{ pow2_buckets, proto::MetricFamily, register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec, - Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, - IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, + register_int_gauge_vec, Gauge, GaugeVec, Histogram, HistogramVec, IntCounter, IntCounterPair, + IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, }; use once_cell::sync::Lazy; use postgres_ffi::XLogSegNo; @@ -211,6 +211,14 @@ pub static WAL_RECEIVERS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_receivers") }); +pub static WAL_READERS: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "safekeeper_wal_readers", + "Number of active WAL readers (may serve pageservers or other safekeepers)", + &["kind", "target"] + ) + .expect("Failed to register safekeeper_wal_receivers") +}); pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy = Lazy::new(|| { // Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and // full queues respectively. 
@@ -443,6 +451,7 @@ pub struct FullTimelineInfo { pub timeline_is_active: bool, pub num_computes: u32, pub last_removed_segno: XLogSegNo, + pub interpreted_wal_reader_tasks: usize, pub epoch_start_lsn: Lsn, pub mem_state: TimelineMemState, @@ -472,6 +481,7 @@ pub struct TimelineCollector { disk_usage: GenericGaugeVec, acceptor_term: GenericGaugeVec, written_wal_bytes: GenericGaugeVec, + interpreted_wal_reader_tasks: GenericGaugeVec, written_wal_seconds: GaugeVec, flushed_wal_seconds: GaugeVec, collect_timeline_metrics: Gauge, @@ -670,6 +680,16 @@ impl TimelineCollector { .unwrap(); descs.extend(active_timelines_count.desc().into_iter().cloned()); + let interpreted_wal_reader_tasks = GenericGaugeVec::new( + Opts::new( + "safekeeper_interpreted_wal_reader_tasks", + "Number of active interpreted wal reader tasks, grouped by timeline", + ), + &["tenant_id", "timeline_id"], + ) + .unwrap(); + descs.extend(interpreted_wal_reader_tasks.desc().into_iter().cloned()); + TimelineCollector { global_timelines, descs, @@ -693,6 +713,7 @@ impl TimelineCollector { collect_timeline_metrics, timelines_count, active_timelines_count, + interpreted_wal_reader_tasks, } } } @@ -721,6 +742,7 @@ impl Collector for TimelineCollector { self.disk_usage.reset(); self.acceptor_term.reset(); self.written_wal_bytes.reset(); + self.interpreted_wal_reader_tasks.reset(); self.written_wal_seconds.reset(); self.flushed_wal_seconds.reset(); @@ -782,6 +804,9 @@ impl Collector for TimelineCollector { self.written_wal_bytes .with_label_values(labels) .set(tli.wal_storage.write_wal_bytes); + self.interpreted_wal_reader_tasks + .with_label_values(labels) + .set(tli.interpreted_wal_reader_tasks as u64); self.written_wal_seconds .with_label_values(labels) .set(tli.wal_storage.write_wal_seconds); @@ -834,6 +859,7 @@ impl Collector for TimelineCollector { mfs.extend(self.disk_usage.collect()); mfs.extend(self.acceptor_term.collect()); mfs.extend(self.written_wal_bytes.collect()); + mfs.extend(self.interpreted_wal_reader_tasks.collect()); mfs.extend(self.written_wal_seconds.collect()); mfs.extend(self.flushed_wal_seconds.collect()); diff --git a/safekeeper/src/send_interpreted_wal.rs b/safekeeper/src/send_interpreted_wal.rs index a718c16a6a..ea09ce364d 100644 --- a/safekeeper/src/send_interpreted_wal.rs +++ b/safekeeper/src/send_interpreted_wal.rs @@ -1,100 +1,330 @@ +use std::collections::HashMap; +use std::fmt::Display; +use std::sync::Arc; use std::time::Duration; -use anyhow::Context; +use anyhow::{anyhow, Context}; +use futures::future::Either; use futures::StreamExt; use pageserver_api::shard::ShardIdentity; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackend}; -use postgres_ffi::MAX_SEND_SIZE; +use postgres_ffi::waldecoder::WalDecodeError; use postgres_ffi::{get_current_timestamp, waldecoder::WalStreamDecoder}; use pq_proto::{BeMessage, InterpretedWalRecordsBody, WalSndKeepAlive}; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::sync::mpsc::error::SendError; +use tokio::task::JoinHandle; use tokio::time::MissedTickBehavior; +use tracing::{info_span, Instrument}; use utils::lsn::Lsn; use utils::postgres_client::Compression; use utils::postgres_client::InterpretedFormat; use wal_decoder::models::{InterpretedWalRecord, InterpretedWalRecords}; use wal_decoder::wire_format::ToWireFormat; -use crate::send_wal::EndWatchView; -use crate::wal_reader_stream::{WalBytes, WalReaderStreamBuilder}; +use crate::metrics::WAL_READERS; +use crate::send_wal::{EndWatchView, WalSenderGuard}; +use crate::timeline::WalResidentTimeline; +use 
crate::wal_reader_stream::{StreamingWalReader, WalBytes};

-/// Shard-aware interpreted record sender.
-/// This is used for sending WAL to the pageserver. Said WAL
-/// is pre-interpreted and filtered for the shard.
-pub(crate) struct InterpretedWalSender<'a, IO> {
-    pub(crate) format: InterpretedFormat,
-    pub(crate) compression: Option<Compression>,
-    pub(crate) pgb: &'a mut PostgresBackend<IO>,
-    pub(crate) wal_stream_builder: WalReaderStreamBuilder,
-    pub(crate) end_watch_view: EndWatchView,
-    pub(crate) shard: ShardIdentity,
-    pub(crate) pg_version: u32,
-    pub(crate) appname: Option<String>,
+/// Identifier used to differentiate between senders of the same
+/// shard.
+///
+/// In the steady state there's only one, but two pageservers may
+/// temporarily have the same shard attached and attempt to ingest
+/// WAL for it. See also [`ShardSenderId`].
+#[derive(Hash, Eq, PartialEq, Copy, Clone)]
+struct SenderId(u8);
+
+impl SenderId {
+    fn first() -> Self {
+        SenderId(0)
+    }
+
+    fn next(&self) -> Self {
+        SenderId(self.0.checked_add(1).expect("few senders"))
+    }
 }

-struct Batch {
+#[derive(Hash, Eq, PartialEq)]
+struct ShardSenderId {
+    shard: ShardIdentity,
+    sender_id: SenderId,
+}
+
+impl Display for ShardSenderId {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}{}", self.sender_id.0, self.shard.shard_slug())
+    }
+}
+
+impl ShardSenderId {
+    fn new(shard: ShardIdentity, sender_id: SenderId) -> Self {
+        ShardSenderId { shard, sender_id }
+    }
+
+    fn shard(&self) -> ShardIdentity {
+        self.shard
+    }
+}
+
+/// Shard-aware fan-out interpreted record reader.
+/// Reads WAL from disk, decodes it, interprets it, and sends
+/// it to any [`InterpretedWalSender`] connected to it.
+/// Each [`InterpretedWalSender`] corresponds to one shard
+/// and gets interpreted records concerning that shard only.
+pub(crate) struct InterpretedWalReader {
+    wal_stream: StreamingWalReader,
+    shard_senders: HashMap<ShardIdentity, smallvec::SmallVec<[ShardSenderState; 1]>>,
+    shard_notification_rx: Option<tokio::sync::mpsc::UnboundedReceiver<AttachShardNotification>>,
+    state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
+    pg_version: u32,
+}
+
+/// A handle for [`InterpretedWalReader`] which allows for interacting with it
+/// when it runs as a separate tokio task.
+#[derive(Debug)]
+pub(crate) struct InterpretedWalReaderHandle {
+    join_handle: JoinHandle<Result<(), InterpretedWalReaderError>>,
+    state: Arc<std::sync::RwLock<InterpretedWalReaderState>>,
+    shard_notification_tx: tokio::sync::mpsc::UnboundedSender<AttachShardNotification>,
+}
+
+struct ShardSenderState {
+    sender_id: SenderId,
+    tx: tokio::sync::mpsc::Sender<Batch>,
+    next_record_lsn: Lsn,
+}
+
+/// State of [`InterpretedWalReader`] visible outside of the task running it.
+#[derive(Debug)]
+pub(crate) enum InterpretedWalReaderState {
+    Running { current_position: Lsn },
+    Done,
+}
+
+pub(crate) struct Batch {
     wal_end_lsn: Lsn,
     available_wal_end_lsn: Lsn,
     records: InterpretedWalRecords,
 }

-impl<IO: AsyncRead + AsyncWrite + Unpin> InterpretedWalSender<'_, IO> {
-    /// Send interpreted WAL to a receiver.
-    /// Stops when an error occurs or the receiver is caught up and there's no active compute.
-    ///
-    /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ?
-    /// convenience.
-    pub(crate) async fn run(self) -> Result<(), CopyStreamHandlerEnd> {
-        let mut wal_position = self.wal_stream_builder.start_pos();
-        let mut wal_decoder =
-            WalStreamDecoder::new(self.wal_stream_builder.start_pos(), self.pg_version);
+#[derive(thiserror::Error, Debug)]
+pub enum InterpretedWalReaderError {
+    /// Decoding the incoming WAL stream failed.
+ #[error("decode error: {0}")] + Decode(#[from] WalDecodeError), + #[error("read or interpret error: {0}")] + ReadOrInterpret(#[from] anyhow::Error), + #[error("wal stream closed")] + WalStreamClosed, +} - let stream = self.wal_stream_builder.build(MAX_SEND_SIZE).await?; - let mut stream = std::pin::pin!(stream); +impl InterpretedWalReaderState { + fn current_position(&self) -> Option { + match self { + InterpretedWalReaderState::Running { + current_position, .. + } => Some(*current_position), + InterpretedWalReaderState::Done => None, + } + } +} - let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); - keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); - keepalive_ticker.reset(); +pub(crate) struct AttachShardNotification { + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, +} - let (tx, mut rx) = tokio::sync::mpsc::channel::(2); - let shard = vec![self.shard]; +impl InterpretedWalReader { + /// Spawn the reader in a separate tokio task and return a handle + pub(crate) fn spawn( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + appname: &Option, + ) -> InterpretedWalReaderHandle { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + })); + + let (shard_notification_tx, shard_notification_rx) = tokio::sync::mpsc::unbounded_channel(); + + let reader = InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: Some(shard_notification_rx), + state: state.clone(), + pg_version, + }; + + let metric = WAL_READERS + .get_metric_with_label_values(&["task", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + let join_handle = tokio::task::spawn( + async move { + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + + let res = reader.run_impl(start_pos).await; + if let Err(ref err) = res { + tracing::error!("Task finished with error: {err}"); + } + res + } + .instrument(info_span!("interpreted wal reader")), + ); + + InterpretedWalReaderHandle { + join_handle, + state, + shard_notification_tx, + } + } + + /// Construct the reader without spawning anything + /// Callers should drive the future returned by [`Self::run`]. + pub(crate) fn new( + wal_stream: StreamingWalReader, + start_pos: Lsn, + tx: tokio::sync::mpsc::Sender, + shard: ShardIdentity, + pg_version: u32, + ) -> InterpretedWalReader { + let state = Arc::new(std::sync::RwLock::new(InterpretedWalReaderState::Running { + current_position: start_pos, + })); + + InterpretedWalReader { + wal_stream, + shard_senders: HashMap::from([( + shard, + smallvec::smallvec![ShardSenderState { + sender_id: SenderId::first(), + tx, + next_record_lsn: start_pos, + }], + )]), + shard_notification_rx: None, + state: state.clone(), + pg_version, + } + } + + /// Entry point for future (polling) based wal reader. + pub(crate) async fn run( + self, + start_pos: Lsn, + appname: &Option, + ) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&["future", appname.as_deref().unwrap_or("safekeeper")]) + .unwrap(); + + metric.inc(); + scopeguard::defer! 
{ + metric.dec(); + } + + let res = self.run_impl(start_pos).await; + if let Err(err) = res { + tracing::error!("Interpreted wal reader encountered error: {err}"); + } else { + tracing::info!("Interpreted wal reader exiting"); + } + + Err(CopyStreamHandlerEnd::Other(anyhow!( + "interpreted wal reader finished" + ))) + } + + /// Send interpreted WAL to one or more [`InterpretedWalSender`]s + /// Stops when an error is encountered or when the [`InterpretedWalReaderHandle`] + /// goes out of scope. + async fn run_impl(mut self, start_pos: Lsn) -> Result<(), InterpretedWalReaderError> { + let defer_state = self.state.clone(); + scopeguard::defer! { + *defer_state.write().unwrap() = InterpretedWalReaderState::Done; + } + + let mut wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); loop { tokio::select! { - // Get some WAL from the stream and then: decode, interpret and push it down the - // pipeline. - wal = stream.next(), if tx.capacity() > 0 => { - let WalBytes { wal, wal_start_lsn: _, wal_end_lsn, available_wal_end_lsn } = match wal { - Some(some) => some?, - None => { break; } + // Main branch for reading WAL and forwarding it + wal_or_reset = self.wal_stream.next() => { + let wal = wal_or_reset.map(|wor| wor.get_wal().expect("reset handled in select branch below")); + let WalBytes { + wal, + wal_start_lsn: _, + wal_end_lsn, + available_wal_end_lsn, + } = match wal { + Some(some) => some.map_err(InterpretedWalReaderError::ReadOrInterpret)?, + None => { + // [`StreamingWalReader::next`] is an endless stream of WAL. + // It shouldn't ever finish unless it panicked or became internally + // inconsistent. + return Result::Err(InterpretedWalReaderError::WalStreamClosed); + } }; - wal_position = wal_end_lsn; wal_decoder.feed_bytes(&wal); - let mut records = Vec::new(); + // Deserialize and interpret WAL records from this batch of WAL. + // Interpreted records for each shard are collected separately. + let shard_ids = self.shard_senders.keys().copied().collect::>(); + let mut records_by_sender: HashMap> = HashMap::new(); let mut max_next_record_lsn = None; - while let Some((next_record_lsn, recdata)) = wal_decoder - .poll_decode() - .with_context(|| "Failed to decode WAL")? + while let Some((next_record_lsn, recdata)) = wal_decoder.poll_decode()? { assert!(next_record_lsn.is_aligned()); max_next_record_lsn = Some(next_record_lsn); - - // Deserialize and interpret WAL record let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, - &shard, + &shard_ids, next_record_lsn, self.pg_version, ) - .with_context(|| "Failed to interpret WAL")? - .remove(&self.shard) - .unwrap(); + .with_context(|| "Failed to interpret WAL")?; - if !interpreted.is_empty() { - records.push(interpreted); + for (shard, record) in interpreted { + if record.is_empty() { + continue; + } + + let mut states_iter = self.shard_senders + .get(&shard) + .expect("keys collected above") + .iter() + .filter(|state| record.next_record_lsn > state.next_record_lsn) + .peekable(); + while let Some(state) = states_iter.next() { + let shard_sender_id = ShardSenderId::new(shard, state.sender_id); + + // The most commont case is one sender per shard. Peek and break to avoid the + // clone in that situation. 
+ if states_iter.peek().is_none() { + records_by_sender.entry(shard_sender_id).or_default().push(record); + break; + } else { + records_by_sender.entry(shard_sender_id).or_default().push(record.clone()); + } + } } } @@ -103,20 +333,170 @@ impl InterpretedWalSender<'_, IO> { None => { continue; } }; - let batch = InterpretedWalRecords { - records, - next_record_lsn: Some(max_next_record_lsn), - }; + // Update the current position such that new receivers can decide + // whether to attach to us or spawn a new WAL reader. + match &mut *self.state.write().unwrap() { + InterpretedWalReaderState::Running { current_position, .. } => { + *current_position = max_next_record_lsn; + }, + InterpretedWalReaderState::Done => { + unreachable!() + } + } - tx.send(Batch {wal_end_lsn, available_wal_end_lsn, records: batch}).await.unwrap(); + // Send interpreted records downstream. Anything that has already been seen + // by a shard is filtered out. + let mut shard_senders_to_remove = Vec::new(); + for (shard, states) in &mut self.shard_senders { + for state in states { + if max_next_record_lsn <= state.next_record_lsn { + continue; + } + + let shard_sender_id = ShardSenderId::new(*shard, state.sender_id); + let records = records_by_sender.remove(&shard_sender_id).unwrap_or_default(); + + let batch = InterpretedWalRecords { + records, + next_record_lsn: Some(max_next_record_lsn), + }; + + let res = state.tx.send(Batch { + wal_end_lsn, + available_wal_end_lsn, + records: batch, + }).await; + + if res.is_err() { + shard_senders_to_remove.push(shard_sender_id); + } else { + state.next_record_lsn = max_next_record_lsn; + } + } + } + + // Clean up any shard senders that have dropped out. + // This is inefficient, but such events are rare (connection to PS termination) + // and the number of subscriptions on the same shards very small (only one + // for the steady state). + for to_remove in shard_senders_to_remove { + let shard_senders = self.shard_senders.get_mut(&to_remove.shard()).expect("saw it above"); + if let Some(idx) = shard_senders.iter().position(|s| s.sender_id == to_remove.sender_id) { + shard_senders.remove(idx); + tracing::info!("Removed shard sender {}", to_remove); + } + + if shard_senders.is_empty() { + self.shard_senders.remove(&to_remove.shard()); + } + } }, - // For a previously interpreted batch, serialize it and push it down the wire. - batch = rx.recv() => { + // Listen for new shards that want to attach to this reader. + // If the reader is not running as a task, then this is not supported + // (see the pending branch below). + notification = match self.shard_notification_rx.as_mut() { + Some(rx) => Either::Left(rx.recv()), + None => Either::Right(std::future::pending()) + } => { + if let Some(n) = notification { + let AttachShardNotification { shard_id, sender, start_pos } = n; + + // Update internal and external state, then reset the WAL stream + // if required. 
+ let senders = self.shard_senders.entry(shard_id).or_default(); + let new_sender_id = match senders.last() { + Some(sender) => sender.sender_id.next(), + None => SenderId::first() + }; + + senders.push(ShardSenderState { sender_id: new_sender_id, tx: sender, next_record_lsn: start_pos}); + let current_pos = self.state.read().unwrap().current_position().unwrap(); + if start_pos < current_pos { + self.wal_stream.reset(start_pos).await; + wal_decoder = WalStreamDecoder::new(start_pos, self.pg_version); + } + + tracing::info!( + "Added shard sender {} with start_pos={} current_pos={}", + ShardSenderId::new(shard_id, new_sender_id), start_pos, current_pos + ); + } + } + } + } + } +} + +impl InterpretedWalReaderHandle { + /// Fan-out the reader by attaching a new shard to it + pub(crate) fn fanout( + &self, + shard_id: ShardIdentity, + sender: tokio::sync::mpsc::Sender, + start_pos: Lsn, + ) -> Result<(), SendError> { + self.shard_notification_tx.send(AttachShardNotification { + shard_id, + sender, + start_pos, + }) + } + + /// Get the current WAL position of the reader + pub(crate) fn current_position(&self) -> Option { + self.state.read().unwrap().current_position() + } + + pub(crate) fn abort(&self) { + self.join_handle.abort() + } +} + +impl Drop for InterpretedWalReaderHandle { + fn drop(&mut self) { + tracing::info!("Aborting interpreted wal reader"); + self.abort() + } +} + +pub(crate) struct InterpretedWalSender<'a, IO> { + pub(crate) format: InterpretedFormat, + pub(crate) compression: Option, + pub(crate) appname: Option, + + pub(crate) tli: WalResidentTimeline, + pub(crate) start_lsn: Lsn, + + pub(crate) pgb: &'a mut PostgresBackend, + pub(crate) end_watch_view: EndWatchView, + pub(crate) wal_sender_guard: Arc, + pub(crate) rx: tokio::sync::mpsc::Receiver, +} + +impl InterpretedWalSender<'_, IO> { + /// Send interpreted WAL records over the network. + /// Also manages keep-alives if nothing was sent for a while. + pub(crate) async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let mut keepalive_ticker = tokio::time::interval(Duration::from_secs(1)); + keepalive_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + keepalive_ticker.reset(); + + let mut wal_position = self.start_lsn; + + loop { + tokio::select! { + batch = self.rx.recv() => { let batch = match batch { Some(b) => b, - None => { break; } + None => { + return Result::Err( + CopyStreamHandlerEnd::Other(anyhow!("Interpreted WAL reader exited early")) + ); + } }; + wal_position = batch.wal_end_lsn; + let buf = batch .records .to_wire(self.format, self.compression) @@ -136,7 +516,21 @@ impl InterpretedWalSender<'_, IO> { })).await?; } // Send a periodic keep alive when the connection has been idle for a while. + // Since we've been idle, also check if we can stop streaming. _ = keepalive_ticker.tick() => { + if let Some(remote_consistent_lsn) = self.wal_sender_guard + .walsenders() + .get_ws_remote_consistent_lsn(self.wal_sender_guard.id()) + { + if self.tli.should_walsender_stop(remote_consistent_lsn).await { + // Stop streaming if the receivers are caught up and + // there's no active compute. This causes the loop in + // [`crate::send_interpreted_wal::InterpretedWalSender::run`] + // to exit and terminate the WAL stream. 
+ break; + } + } + self.pgb .write_message(&BeMessage::KeepAlive(WalSndKeepAlive { wal_end: self.end_watch_view.get().0, @@ -144,14 +538,259 @@ impl InterpretedWalSender<'_, IO> { request_reply: true, })) .await?; - } + }, } } - // The loop above ends when the receiver is caught up and there's no more WAL to send. Err(CopyStreamHandlerEnd::ServerInitiated(format!( "ending streaming to {:?} at {}, receiver is caughtup and there is no computes", self.appname, wal_position, ))) } } +#[cfg(test)] +mod tests { + use std::{collections::HashMap, str::FromStr, time::Duration}; + + use pageserver_api::shard::{ShardIdentity, ShardStripeSize}; + use postgres_ffi::MAX_SEND_SIZE; + use tokio::sync::mpsc::error::TryRecvError; + use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, + shard::{ShardCount, ShardNumber}, + }; + + use crate::{ + send_interpreted_wal::{Batch, InterpretedWalReader}, + test_utils::Env, + wal_reader_stream::StreamingWalReader, + }; + + #[tokio::test] + async fn test_interpreted_wal_reader_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let shard_1 = ShardIdentity::new( + ShardNumber(1), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let mut shards = HashMap::new(); + + for shard_number in 0..SHARD_COUNT { + let shard_id = ShardIdentity::new( + ShardNumber(shard_number), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + shards.insert(shard_id, (Some(tx), Some(rx))); + } + + let shard_0_tx = shards.get_mut(&shard_0).unwrap().0.take().unwrap(); + let mut shard_0_rx = shards.get_mut(&shard_0).unwrap().1.take().unwrap(); + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + shard_0_tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + tracing::info!("Reading all WAL with only shard 0 attached ..."); + + let mut shard_0_interpreted_records = Vec::new(); + while let Some(batch) = shard_0_rx.recv().await { + shard_0_interpreted_records.push(batch.records); + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + let shard_1_tx = shards.get_mut(&shard_1).unwrap().0.take().unwrap(); + let mut shard_1_rx = shards.get_mut(&shard_1).unwrap().1.take().unwrap(); + + tracing::info!("Attaching shard 1 to the reader at start of WAL"); + handle.fanout(shard_1, shard_1_tx, start_lsn).unwrap(); + + tracing::info!("Reading all WAL with shard 0 and shard 1 attached ..."); + + let mut shard_1_interpreted_records = Vec::new(); + while let Some(batch) = shard_1_rx.recv().await { + shard_1_interpreted_records.push(batch.records); + if batch.wal_end_lsn == 
batch.available_wal_end_lsn { + break; + } + } + + // This test uses logical messages. Those only go to shard 0. Check that the + // filtering worked and shard 1 did not get any. + assert!(shard_1_interpreted_records + .iter() + .all(|recs| recs.records.is_empty())); + + // Shard 0 should not receive anything more since the reader is + // going through wal that it has already processed. + let res = shard_0_rx.try_recv(); + if let Ok(ref ok) = res { + tracing::error!( + "Shard 0 received batch: wal_end_lsn={} available_wal_end_lsn={}", + ok.wal_end_lsn, + ok.available_wal_end_lsn + ); + } + assert!(matches!(res, Err(TryRecvError::Empty))); + + // Check that the next records lsns received by the two shards match up. + let shard_0_next_lsns = shard_0_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + let shard_1_next_lsns = shard_1_interpreted_records + .iter() + .map(|recs| recs.next_record_lsn) + .collect::>(); + assert_eq!(shard_0_next_lsns, shard_1_next_lsns); + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } + + #[tokio::test] + async fn test_interpreted_wal_reader_same_shard_fanout() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + const PG_VERSION: u32 = 17; + const SHARD_COUNT: u8 = 2; + const ATTACHED_SHARDS: u8 = 4; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + let streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let shard_0 = ShardIdentity::new( + ShardNumber(0), + ShardCount(SHARD_COUNT), + ShardStripeSize::default(), + ) + .unwrap(); + + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + let mut batch_receivers = vec![rx]; + + let handle = InterpretedWalReader::spawn( + streaming_wal_reader, + start_lsn, + tx, + shard_0, + PG_VERSION, + &Some("pageserver".to_string()), + ); + + for _ in 0..(ATTACHED_SHARDS - 1) { + let (tx, rx) = tokio::sync::mpsc::channel::(MSG_COUNT * 2); + handle.fanout(shard_0, tx, start_lsn).unwrap(); + batch_receivers.push(rx); + } + + loop { + let batch = batch_receivers.first_mut().unwrap().recv().await.unwrap(); + for rx in batch_receivers.iter_mut().skip(1) { + let other_batch = rx.recv().await.unwrap(); + + assert_eq!(batch.wal_end_lsn, other_batch.wal_end_lsn); + assert_eq!( + batch.available_wal_end_lsn, + other_batch.available_wal_end_lsn + ); + } + + if batch.wal_end_lsn == batch.available_wal_end_lsn { + break; + } + } + + handle.abort(); + let mut done = false; + for _ in 0..5 { + if handle.current_position().is_none() { + done = true; + break; + } + tokio::time::sleep(Duration::from_millis(1)).await; + } + + assert!(done); + } +} diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 8463221998..4a4a74a0fd 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -2,16 +2,18 @@ //! with the "START_REPLICATION" message, and registry of walsenders. 
use crate::handler::SafekeeperPostgresHandler; -use crate::metrics::RECEIVED_PS_FEEDBACKS; +use crate::metrics::{RECEIVED_PS_FEEDBACKS, WAL_READERS}; use crate::receive_wal::WalReceivers; use crate::safekeeper::TermLsn; -use crate::send_interpreted_wal::InterpretedWalSender; +use crate::send_interpreted_wal::{ + Batch, InterpretedWalReader, InterpretedWalReaderHandle, InterpretedWalSender, +}; use crate::timeline::WalResidentTimeline; -use crate::wal_reader_stream::WalReaderStreamBuilder; +use crate::wal_reader_stream::StreamingWalReader; use crate::wal_storage::WalReader; use anyhow::{bail, Context as AnyhowContext}; use bytes::Bytes; -use futures::future::Either; +use futures::FutureExt; use parking_lot::Mutex; use postgres_backend::PostgresBackend; use postgres_backend::{CopyStreamHandlerEnd, PostgresBackendReader, QueryError}; @@ -19,16 +21,16 @@ use postgres_ffi::get_current_timestamp; use postgres_ffi::{TimestampTz, MAX_SEND_SIZE}; use pq_proto::{BeMessage, WalSndKeepAlive, XLogDataBody}; use safekeeper_api::models::{ - ConnectionId, HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, - WalSenderState, INVALID_FULL_TRANSACTION_ID, + HotStandbyFeedback, ReplicationFeedback, StandbyFeedback, StandbyReply, + INVALID_FULL_TRANSACTION_ID, }; use safekeeper_api::Term; use tokio::io::{AsyncRead, AsyncWrite}; use utils::failpoint_support; -use utils::id::TenantTimelineId; use utils::pageserver_feedback::PageserverFeedback; use utils::postgres_client::PostgresClientProtocol; +use itertools::Itertools; use std::cmp::{max, min}; use std::net::SocketAddr; use std::sync::Arc; @@ -50,6 +52,12 @@ pub struct WalSenders { walreceivers: Arc, } +pub struct WalSendersTimelineMetricValues { + pub ps_feedback_counter: u64, + pub last_ps_feedback: PageserverFeedback, + pub interpreted_wal_reader_tasks: usize, +} + impl WalSenders { pub fn new(walreceivers: Arc) -> Arc { Arc::new(WalSenders { @@ -60,21 +68,8 @@ impl WalSenders { /// Register new walsender. Returned guard provides access to the slot and /// automatically deregisters in Drop. - fn register( - self: &Arc, - ttid: TenantTimelineId, - addr: SocketAddr, - conn_id: ConnectionId, - appname: Option, - ) -> WalSenderGuard { + fn register(self: &Arc, walsender_state: WalSenderState) -> WalSenderGuard { let slots = &mut self.mutex.lock().slots; - let walsender_state = WalSenderState { - ttid, - addr, - conn_id, - appname, - feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), - }; // find empty slot or create new one let pos = if let Some(pos) = slots.iter().position(|s| s.is_none()) { slots[pos] = Some(walsender_state); @@ -90,9 +85,79 @@ impl WalSenders { } } + fn create_or_update_interpreted_reader< + FUp: FnOnce(&Arc) -> anyhow::Result<()>, + FNew: FnOnce() -> InterpretedWalReaderHandle, + >( + self: &Arc, + id: WalSenderId, + start_pos: Lsn, + max_delta_for_fanout: Option, + update: FUp, + create: FNew, + ) -> anyhow::Result<()> { + let state = &mut self.mutex.lock(); + + let mut selected_interpreted_reader = None; + for slot in state.slots.iter().flatten() { + if let WalSenderState::Interpreted(slot_state) = slot { + if let Some(ref interpreted_reader) = slot_state.interpreted_wal_reader { + let select = match (interpreted_reader.current_position(), max_delta_for_fanout) + { + (Some(pos), Some(max_delta)) => { + let delta = pos.0.abs_diff(start_pos.0); + delta <= max_delta + } + // Reader is not active + (None, _) => false, + // Gating fanout by max delta is disabled. + // Attach to any active reader. 
+ (_, None) => true, + }; + + if select { + selected_interpreted_reader = Some(interpreted_reader.clone()); + break; + } + } + } + } + + let slot = state.get_slot_mut(id); + let slot_state = match slot { + WalSenderState::Interpreted(s) => s, + WalSenderState::Vanilla(_) => unreachable!(), + }; + + let selected_or_new = match selected_interpreted_reader { + Some(selected) => { + update(&selected)?; + selected + } + None => Arc::new(create()), + }; + + slot_state.interpreted_wal_reader = Some(selected_or_new); + + Ok(()) + } + /// Get state of all walsenders. - pub fn get_all(self: &Arc) -> Vec { - self.mutex.lock().slots.iter().flatten().cloned().collect() + pub fn get_all_public(self: &Arc) -> Vec { + self.mutex + .lock() + .slots + .iter() + .flatten() + .map(|state| match state { + WalSenderState::Vanilla(s) => { + safekeeper_api::models::WalSenderState::Vanilla(s.clone()) + } + WalSenderState::Interpreted(s) => { + safekeeper_api::models::WalSenderState::Interpreted(s.public_state.clone()) + } + }) + .collect() } /// Get LSN of the most lagging pageserver receiver. Return None if there are no @@ -103,7 +168,7 @@ impl WalSenders { .slots .iter() .flatten() - .filter_map(|s| match s.feedback { + .filter_map(|s| match s.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.last_received_lsn), ReplicationFeedback::Standby(_) => None, }) @@ -111,9 +176,25 @@ impl WalSenders { } /// Returns total counter of pageserver feedbacks received and last feedback. - pub fn get_ps_feedback_stats(self: &Arc) -> (u64, PageserverFeedback) { + pub fn info_for_metrics(self: &Arc) -> WalSendersTimelineMetricValues { let shared = self.mutex.lock(); - (shared.ps_feedback_counter, shared.last_ps_feedback) + + let interpreted_wal_reader_tasks = shared + .slots + .iter() + .filter_map(|ss| match ss { + Some(WalSenderState::Interpreted(int)) => int.interpreted_wal_reader.as_ref(), + Some(WalSenderState::Vanilla(_)) => None, + None => None, + }) + .unique_by(|reader| Arc::as_ptr(reader)) + .count(); + + WalSendersTimelineMetricValues { + ps_feedback_counter: shared.ps_feedback_counter, + last_ps_feedback: shared.last_ps_feedback, + interpreted_wal_reader_tasks, + } } /// Get aggregated hot standby feedback (we send it to compute). @@ -124,7 +205,7 @@ impl WalSenders { /// Record new pageserver feedback, update aggregated values. 
fn record_ps_feedback(self: &Arc<Self>, id: WalSenderId, feedback: &PageserverFeedback) { let mut shared = self.mutex.lock(); - shared.get_slot_mut(id).feedback = ReplicationFeedback::Pageserver(*feedback); + *shared.get_slot_mut(id).get_mut_feedback() = ReplicationFeedback::Pageserver(*feedback); shared.last_ps_feedback = *feedback; shared.ps_feedback_counter += 1; drop(shared); @@ -143,10 +224,10 @@ impl WalSenders { "Record standby reply: ts={} apply_lsn={}", reply.reply_ts, reply.apply_lsn ); - match &mut slot.feedback { + match &mut slot.get_mut_feedback() { ReplicationFeedback::Standby(sf) => sf.reply = *reply, ReplicationFeedback::Pageserver(_) => { - slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback { reply: *reply, hs_feedback: HotStandbyFeedback::empty(), }) @@ -158,10 +239,10 @@ impl WalSenders { fn record_hs_feedback(self: &Arc<Self>, id: WalSenderId, feedback: &HotStandbyFeedback) { let mut shared = self.mutex.lock(); let slot = shared.get_slot_mut(id); - match &mut slot.feedback { + match &mut slot.get_mut_feedback() { ReplicationFeedback::Standby(sf) => sf.hs_feedback = *feedback, ReplicationFeedback::Pageserver(_) => { - slot.feedback = ReplicationFeedback::Standby(StandbyFeedback { + *slot.get_mut_feedback() = ReplicationFeedback::Standby(StandbyFeedback { reply: StandbyReply::empty(), hs_feedback: *feedback, }) @@ -175,7 +256,7 @@ impl WalSenders { pub fn get_ws_remote_consistent_lsn(self: &Arc<Self>, id: WalSenderId) -> Option<Lsn> { let shared = self.mutex.lock(); let slot = shared.get_slot(id); - match slot.feedback { + match slot.get_feedback() { ReplicationFeedback::Pageserver(feedback) => Some(feedback.remote_consistent_lsn), _ => None, } @@ -199,6 +280,47 @@ struct WalSendersShared { slots: Vec<Option<WalSenderState>>, } +/// Safekeeper internal definitions of wal sender state +/// +/// As opposed to [`safekeeper_api::models::WalSenderState`] these structs may +/// include state that we do not wish to expose to the public API.
+#[derive(Debug, Clone)] +pub(crate) enum WalSenderState { + Vanilla(VanillaWalSenderInternalState), + Interpreted(InterpretedWalSenderInternalState), +} + +type VanillaWalSenderInternalState = safekeeper_api::models::VanillaWalSenderState; + +#[derive(Debug, Clone)] +pub(crate) struct InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState, + interpreted_wal_reader: Option>, +} + +impl WalSenderState { + fn get_addr(&self) -> &SocketAddr { + match self { + WalSenderState::Vanilla(state) => &state.addr, + WalSenderState::Interpreted(state) => &state.public_state.addr, + } + } + + fn get_feedback(&self) -> &ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &state.feedback, + WalSenderState::Interpreted(state) => &state.public_state.feedback, + } + } + + fn get_mut_feedback(&mut self) -> &mut ReplicationFeedback { + match self { + WalSenderState::Vanilla(state) => &mut state.feedback, + WalSenderState::Interpreted(state) => &mut state.public_state.feedback, + } + } +} + impl WalSendersShared { fn new() -> Self { WalSendersShared { @@ -225,7 +347,7 @@ impl WalSendersShared { let mut agg = HotStandbyFeedback::empty(); let mut reply_agg = StandbyReply::empty(); for ws_state in self.slots.iter().flatten() { - if let ReplicationFeedback::Standby(standby_feedback) = ws_state.feedback { + if let ReplicationFeedback::Standby(standby_feedback) = ws_state.get_feedback() { let hs_feedback = standby_feedback.hs_feedback; // doing Option math like op1.iter().chain(op2.iter()).min() // would be nicer, but we serialize/deserialize this struct @@ -317,7 +439,7 @@ impl SafekeeperPostgresHandler { /// Wrapper around handle_start_replication_guts handling result. Error is /// handled here while we're still in walsender ttid span; with API /// extension, this can probably be moved into postgres_backend. - pub async fn handle_start_replication( + pub async fn handle_start_replication( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -342,7 +464,7 @@ impl SafekeeperPostgresHandler { Ok(()) } - pub async fn handle_start_replication_guts( + pub async fn handle_start_replication_guts( &mut self, pgb: &mut PostgresBackend, start_pos: Lsn, @@ -352,12 +474,30 @@ impl SafekeeperPostgresHandler { let appname = self.appname.clone(); // Use a guard object to remove our entry from the timeline when we are done. - let ws_guard = Arc::new(tli.get_walsenders().register( - self.ttid, - *pgb.get_peer_addr(), - self.conn_id, - self.appname.clone(), - )); + let ws_guard = match self.protocol() { + PostgresClientProtocol::Vanilla => Arc::new(tli.get_walsenders().register( + WalSenderState::Vanilla(VanillaWalSenderInternalState { + ttid: self.ttid, + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }), + )), + PostgresClientProtocol::Interpreted { .. 
} => Arc::new(tli.get_walsenders().register( + WalSenderState::Interpreted(InterpretedWalSenderInternalState { + public_state: safekeeper_api::models::InterpretedWalSenderState { + ttid: self.ttid, + shard: self.shard.unwrap(), + addr: *pgb.get_peer_addr(), + conn_id: self.conn_id, + appname: self.appname.clone(), + feedback: ReplicationFeedback::Pageserver(PageserverFeedback::empty()), + }, + interpreted_wal_reader: None, + }), + )), + }; // Walsender can operate in one of two modes which we select by // application_name: give only committed WAL (used by pageserver) or all @@ -403,7 +543,7 @@ impl SafekeeperPostgresHandler { pgb, // should succeed since we're already holding another guard tli: tli.wal_residence_guard().await?, - appname, + appname: appname.clone(), start_pos, end_pos, term, @@ -413,7 +553,7 @@ impl SafekeeperPostgresHandler { send_buf: vec![0u8; MAX_SEND_SIZE], }; - Either::Left(sender.run()) + FutureExt::boxed(sender.run()) } PostgresClientProtocol::Interpreted { format, @@ -421,27 +561,96 @@ impl SafekeeperPostgresHandler { } => { let pg_version = tli.tli.get_state().await.1.server.pg_version / 10000; let end_watch_view = end_watch.view(); - let wal_stream_builder = WalReaderStreamBuilder { - tli: tli.wal_residence_guard().await?, - start_pos, - end_pos, - term, - end_watch, - wal_sender_guard: ws_guard.clone(), - }; + let wal_residence_guard = tli.wal_residence_guard().await?; + let (tx, rx) = tokio::sync::mpsc::channel::(2); + let shard = self.shard.unwrap(); - let sender = InterpretedWalSender { - format, - compression, - pgb, - wal_stream_builder, - end_watch_view, - shard: self.shard.unwrap(), - pg_version, - appname, - }; + if self.conf.wal_reader_fanout && !shard.is_unsharded() { + let ws_id = ws_guard.id(); + ws_guard.walsenders().create_or_update_interpreted_reader( + ws_id, + start_pos, + self.conf.max_delta_for_fanout, + { + let tx = tx.clone(); + |reader| { + tracing::info!( + "Fanning out interpreted wal reader at {}", + start_pos + ); + reader + .fanout(shard, tx, start_pos) + .with_context(|| "Failed to fan out reader") + } + }, + || { + tracing::info!("Spawning interpreted wal reader at {}", start_pos); - Either::Right(sender.run()) + let wal_stream = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + InterpretedWalReader::spawn( + wal_stream, start_pos, tx, shard, pg_version, &appname, + ) + }, + )?; + + let sender = InterpretedWalSender { + format, + compression, + appname, + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(sender.run()) + } else { + let wal_reader = StreamingWalReader::new( + wal_residence_guard, + term, + start_pos, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let reader = + InterpretedWalReader::new(wal_reader, start_pos, tx, shard, pg_version); + + let sender = InterpretedWalSender { + format, + compression, + appname: appname.clone(), + tli: tli.wal_residence_guard().await?, + start_lsn: start_pos, + pgb, + end_watch_view, + wal_sender_guard: ws_guard.clone(), + rx, + }; + + FutureExt::boxed(async move { + // Sender returns an Err on all code paths. + // If the sender finishes first, we will drop the reader future. + // If the reader finishes first, the sender will finish too since + // the wal sender has dropped. 
+ let res = tokio::try_join!(sender.run(), reader.run(start_pos, &appname)); + match res.map(|_| ()) { + Ok(_) => unreachable!("sender finishes with Err by convention"), + err_res => err_res, + } + }) + } } }; @@ -470,7 +679,8 @@ impl SafekeeperPostgresHandler { .clone(); info!( "finished streaming to {}, feedback={:?}", - ws_state.addr, ws_state.feedback, + ws_state.get_addr(), + ws_state.get_feedback(), ); // Join pg backend back. @@ -578,6 +788,18 @@ impl WalSender<'_, IO> { /// Err(CopyStreamHandlerEnd) is always returned; Result is used only for ? /// convenience. async fn run(mut self) -> Result<(), CopyStreamHandlerEnd> { + let metric = WAL_READERS + .get_metric_with_label_values(&[ + "future", + self.appname.as_deref().unwrap_or("safekeeper"), + ]) + .unwrap(); + + metric.inc(); + scopeguard::defer! { + metric.dec(); + } + loop { // Wait for the next portion if it is not there yet, or just // update our end of WAL available for sending value, we @@ -813,7 +1035,7 @@ impl ReplyReader { #[cfg(test)] mod tests { use safekeeper_api::models::FullTransactionId; - use utils::id::{TenantId, TimelineId}; + use utils::id::{TenantId, TenantTimelineId, TimelineId}; use super::*; @@ -830,13 +1052,13 @@ mod tests { // add to wss specified feedback setting other fields to dummy values fn push_feedback(wss: &mut WalSendersShared, feedback: ReplicationFeedback) { - let walsender_state = WalSenderState { + let walsender_state = WalSenderState::Vanilla(VanillaWalSenderInternalState { ttid: mock_ttid(), addr: mock_addr(), conn_id: 1, appname: None, feedback, - }; + }); wss.slots.push(Some(walsender_state)) } diff --git a/safekeeper/src/test_utils.rs b/safekeeper/src/test_utils.rs index c40a8bae5a..4e851c5b3d 100644 --- a/safekeeper/src/test_utils.rs +++ b/safekeeper/src/test_utils.rs @@ -1,13 +1,19 @@ use std::sync::Arc; use crate::rate_limit::RateLimiter; -use crate::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use crate::receive_wal::WalAcceptor; +use crate::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, + ProposerElected, SafeKeeper, TermHistory, +}; +use crate::send_wal::EndWatch; use crate::state::{TimelinePersistentState, TimelineState}; use crate::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::remote_timeline_path; -use crate::{control_file, wal_storage, SafeKeeperConf}; +use crate::{control_file, receive_wal, wal_storage, SafeKeeperConf}; use camino_tempfile::Utf8TempDir; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; use tokio::fs::create_dir_all; use utils::id::{NodeId, TenantTimelineId}; use utils::lsn::Lsn; @@ -107,4 +113,59 @@ impl Env { ); Ok(timeline) } + + // This will be dead code when building a non-benchmark target with the + // benchmarking feature enabled. 
+ #[allow(dead_code)] + pub(crate) async fn write_wal( + tli: Arc, + start_lsn: Lsn, + msg_size: usize, + msg_count: usize, + ) -> anyhow::Result { + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); + + let end_watch = EndWatch::Commit(tli.get_commit_lsn_watch_rx()); + + WalAcceptor::spawn(tli.wal_residence_guard().await?, msg_rx, reply_tx, Some(0)); + + let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(msg_size >= prefixlen); + let message = vec![0; msg_size - prefixlen]; + + let walgen = + &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message), start_lsn); + for _ in 0..msg_count { + let (lsn, record) = walgen.next().unwrap(); + + let req = AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: start_lsn, + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: lsn, + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }; + + let end_lsn = req.h.end_lsn; + + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await?; + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= end_lsn { + break; + } + } + } + } + + Ok(end_watch) + } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 2882391074..5eb0bd7146 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -35,7 +35,7 @@ use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, TermLsn}; -use crate::send_wal::WalSenders; +use crate::send_wal::{WalSenders, WalSendersTimelineMetricValues}; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; use crate::timeline_guard::ResidenceGuard; use crate::timeline_manager::{AtomicStatus, ManagerCtl}; @@ -712,16 +712,22 @@ impl Timeline { return None; } - let (ps_feedback_count, last_ps_feedback) = self.walsenders.get_ps_feedback_stats(); + let WalSendersTimelineMetricValues { + ps_feedback_counter, + last_ps_feedback, + interpreted_wal_reader_tasks, + } = self.walsenders.info_for_metrics(); + let state = self.read_shared_state().await; Some(FullTimelineInfo { ttid: self.ttid, - ps_feedback_count, + ps_feedback_count: ps_feedback_counter, last_ps_feedback, wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), timeline_is_active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, last_removed_segno: self.last_removed_segno.load(Ordering::Relaxed), + interpreted_wal_reader_tasks, epoch_start_lsn: state.sk.term_start_lsn(), mem_state: state.sk.state().inmem.clone(), persisted_state: TimelinePersistentState::clone(state.sk.state()), @@ -740,7 +746,7 @@ impl Timeline { debug_dump::Memory { is_cancelled: self.is_cancelled(), peers_info_len: state.peers_info.0.len(), - walsenders: self.walsenders.get_all(), + walsenders: self.walsenders.get_all_public(), wal_backup_active: self.wal_backup_active.load(Ordering::Relaxed), active: self.broker_active.load(Ordering::Relaxed), num_computes: self.walreceivers.get_num() as u32, diff --git a/safekeeper/src/wal_reader_stream.rs b/safekeeper/src/wal_reader_stream.rs index aea628c208..adac6067da 100644 --- a/safekeeper/src/wal_reader_stream.rs +++ b/safekeeper/src/wal_reader_stream.rs @@ 
-1,34 +1,16 @@ -use std::sync::Arc; - -use async_stream::try_stream; -use bytes::Bytes; -use futures::Stream; -use postgres_backend::CopyStreamHandlerEnd; -use safekeeper_api::Term; -use std::time::Duration; -use tokio::time::timeout; -use utils::lsn::Lsn; - -use crate::{ - send_wal::{EndWatch, WalSenderGuard}, - timeline::WalResidentTimeline, +use std::{ + pin::Pin, + task::{Context, Poll}, }; -pub(crate) struct WalReaderStreamBuilder { - pub(crate) tli: WalResidentTimeline, - pub(crate) start_pos: Lsn, - pub(crate) end_pos: Lsn, - pub(crate) term: Option, - pub(crate) end_watch: EndWatch, - pub(crate) wal_sender_guard: Arc, -} +use bytes::Bytes; +use futures::{stream::BoxStream, Stream, StreamExt}; +use utils::lsn::Lsn; -impl WalReaderStreamBuilder { - pub(crate) fn start_pos(&self) -> Lsn { - self.start_pos - } -} +use crate::{send_wal::EndWatch, timeline::WalResidentTimeline, wal_storage::WalReader}; +use safekeeper_api::Term; +#[derive(PartialEq, Eq, Debug)] pub(crate) struct WalBytes { /// Raw PG WAL pub(crate) wal: Bytes, @@ -44,106 +26,270 @@ pub(crate) struct WalBytes { pub(crate) available_wal_end_lsn: Lsn, } -impl WalReaderStreamBuilder { - /// Builds a stream of Postgres WAL starting from [`Self::start_pos`]. - /// The stream terminates when the receiver (pageserver) is fully caught up - /// and there's no active computes. - pub(crate) async fn build( - self, - buffer_size: usize, - ) -> anyhow::Result>> { - // TODO(vlad): The code below duplicates functionality from [`crate::send_wal`]. - // We can make the raw WAL sender use this stream too and remove the duplication. - let Self { - tli, - mut start_pos, - mut end_pos, - term, - mut end_watch, - wal_sender_guard, - } = self; - let mut wal_reader = tli.get_walreader(start_pos).await?; - let mut buffer = vec![0; buffer_size]; +struct PositionedWalReader { + start: Lsn, + end: Lsn, + reader: Option, +} - const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); +/// A streaming WAL reader wrapper which can be reset while running +pub(crate) struct StreamingWalReader { + stream: BoxStream<'static, WalOrReset>, + start_changed_tx: tokio::sync::watch::Sender, +} - Ok(try_stream! { - loop { - let have_something_to_send = end_pos > start_pos; +pub(crate) enum WalOrReset { + Wal(anyhow::Result), + Reset(Lsn), +} - if !have_something_to_send { - // wait for lsn - let res = timeout(POLL_STATE_TIMEOUT, end_watch.wait_for_lsn(start_pos, term)).await; - match res { - Ok(ok) => { - end_pos = ok?; - }, - Err(_) => { - if let EndWatch::Commit(_) = end_watch { - if let Some(remote_consistent_lsn) = wal_sender_guard - .walsenders() - .get_ws_remote_consistent_lsn(wal_sender_guard.id()) - { - if tli.should_walsender_stop(remote_consistent_lsn).await { - // Stop streaming if the receivers are caught up and - // there's no active compute. This causes the loop in - // [`crate::send_interpreted_wal::InterpretedWalSender::run`] - // to exit and terminate the WAL stream. - return; - } - } - } - - continue; - } - } - } - - - assert!( - end_pos > start_pos, - "nothing to send after waiting for WAL" - ); - - // try to send as much as available, capped by the buffer size - let mut chunk_end_pos = start_pos + buffer_size as u64; - // if we went behind available WAL, back off - if chunk_end_pos >= end_pos { - chunk_end_pos = end_pos; - } else { - // If sending not up to end pos, round down to page boundary to - // avoid breaking WAL record not at page boundary, as protocol - // demands. See walsender.c (XLogSendPhysical). 
- chunk_end_pos = chunk_end_pos - .checked_sub(chunk_end_pos.block_offset()) - .unwrap(); - } - let send_size = (chunk_end_pos.0 - start_pos.0) as usize; - let buffer = &mut buffer[..send_size]; - let send_size: usize; - { - // If uncommitted part is being pulled, check that the term is - // still the expected one. - let _term_guard = if let Some(t) = term { - Some(tli.acquire_term(t).await?) - } else { - None - }; - // Read WAL into buffer. send_size can be additionally capped to - // segment boundary here. - send_size = wal_reader.read(buffer).await? - }; - let wal = Bytes::copy_from_slice(&buffer[..send_size]); - - yield WalBytes { - wal, - wal_start_lsn: start_pos, - wal_end_lsn: start_pos + send_size as u64, - available_wal_end_lsn: end_pos - }; - - start_pos += send_size as u64; - } - }) +impl WalOrReset { + pub(crate) fn get_wal(self) -> Option> { + match self { + WalOrReset::Wal(wal) => Some(wal), + WalOrReset::Reset(_) => None, + } + } +} + +impl StreamingWalReader { + pub(crate) fn new( + tli: WalResidentTimeline, + term: Option, + start: Lsn, + end: Lsn, + end_watch: EndWatch, + buffer_size: usize, + ) -> Self { + let (start_changed_tx, start_changed_rx) = tokio::sync::watch::channel(start); + + let state = WalReaderStreamState { + tli, + wal_reader: PositionedWalReader { + start, + end, + reader: None, + }, + term, + end_watch, + buffer: vec![0; buffer_size], + buffer_size, + }; + + // When a change notification is received while polling the internal + // reader, stop polling the read future and service the change. + let stream = futures::stream::unfold( + (state, start_changed_rx), + |(mut state, mut rx)| async move { + let wal_or_reset = tokio::select! { + read_res = state.read() => { WalOrReset::Wal(read_res) }, + changed_res = rx.changed() => { + if changed_res.is_err() { + return None; + } + + let new_start_pos = rx.borrow_and_update(); + WalOrReset::Reset(*new_start_pos) + } + }; + + if let WalOrReset::Reset(lsn) = wal_or_reset { + state.wal_reader.start = lsn; + state.wal_reader.reader = None; + } + + Some((wal_or_reset, (state, rx))) + }, + ) + .boxed(); + + Self { + stream, + start_changed_tx, + } + } + + /// Reset the stream to a given position. + pub(crate) async fn reset(&mut self, start: Lsn) { + self.start_changed_tx.send(start).unwrap(); + while let Some(wal_or_reset) = self.stream.next().await { + match wal_or_reset { + WalOrReset::Reset(at) => { + // Stream confirmed the reset. + // There may only one ongoing reset at any given time, + // hence the assertion. 
+ assert_eq!(at, start); + break; + } + WalOrReset::Wal(_) => { + // Ignore wal generated before reset was handled + } + } + } + } +} + +impl Stream for StreamingWalReader { + type Item = WalOrReset; + + fn poll_next(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll> { + Pin::new(&mut self.stream).poll_next(cx) + } +} + +struct WalReaderStreamState { + tli: WalResidentTimeline, + wal_reader: PositionedWalReader, + term: Option, + end_watch: EndWatch, + buffer: Vec, + buffer_size: usize, +} + +impl WalReaderStreamState { + async fn read(&mut self) -> anyhow::Result { + // Create reader if needed + if self.wal_reader.reader.is_none() { + self.wal_reader.reader = Some(self.tli.get_walreader(self.wal_reader.start).await?); + } + + let have_something_to_send = self.wal_reader.end > self.wal_reader.start; + if !have_something_to_send { + tracing::debug!( + "Waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + self.wal_reader.end = self + .end_watch + .wait_for_lsn(self.wal_reader.start, self.term) + .await?; + tracing::debug!( + "Done waiting for wal: start={}, end={}", + self.wal_reader.end, + self.wal_reader.start + ); + } + + assert!( + self.wal_reader.end > self.wal_reader.start, + "nothing to send after waiting for WAL" + ); + + // Calculate chunk size + let mut chunk_end_pos = self.wal_reader.start + self.buffer_size as u64; + if chunk_end_pos >= self.wal_reader.end { + chunk_end_pos = self.wal_reader.end; + } else { + chunk_end_pos = chunk_end_pos + .checked_sub(chunk_end_pos.block_offset()) + .unwrap(); + } + + let send_size = (chunk_end_pos.0 - self.wal_reader.start.0) as usize; + let buffer = &mut self.buffer[..send_size]; + + // Read WAL + let send_size = { + let _term_guard = if let Some(t) = self.term { + Some(self.tli.acquire_term(t).await?) + } else { + None + }; + self.wal_reader + .reader + .as_mut() + .unwrap() + .read(buffer) + .await? 
+ }; + + let wal = Bytes::copy_from_slice(&buffer[..send_size]); + let result = WalBytes { + wal, + wal_start_lsn: self.wal_reader.start, + wal_end_lsn: self.wal_reader.start + send_size as u64, + available_wal_end_lsn: self.wal_reader.end, + }; + + self.wal_reader.start += send_size as u64; + + Ok(result) + } +} + +#[cfg(test)] +mod tests { + use std::str::FromStr; + + use futures::StreamExt; + use postgres_ffi::MAX_SEND_SIZE; + use utils::{ + id::{NodeId, TenantTimelineId}, + lsn::Lsn, + }; + + use crate::{test_utils::Env, wal_reader_stream::StreamingWalReader}; + + #[tokio::test] + async fn test_streaming_wal_reader_reset() { + let _ = env_logger::builder().is_test(true).try_init(); + + const SIZE: usize = 8 * 1024; + const MSG_COUNT: usize = 200; + + let start_lsn = Lsn::from_str("0/149FD18").unwrap(); + let env = Env::new(true).unwrap(); + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate(), start_lsn) + .await + .unwrap(); + + let resident_tli = tli.wal_residence_guard().await.unwrap(); + let end_watch = Env::write_wal(tli, start_lsn, SIZE, MSG_COUNT) + .await + .unwrap(); + let end_pos = end_watch.get(); + + tracing::info!("Doing first round of reads ..."); + + let mut streaming_wal_reader = StreamingWalReader::new( + resident_tli, + None, + start_lsn, + end_pos, + end_watch, + MAX_SEND_SIZE, + ); + + let mut before_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + before_reset.push(wal); + + if stop { + break; + } + } + + tracing::info!("Resetting the WAL stream ..."); + + streaming_wal_reader.reset(start_lsn).await; + + tracing::info!("Doing second round of reads ..."); + + let mut after_reset = Vec::new(); + while let Some(wor) = streaming_wal_reader.next().await { + let wal = wor.get_wal().unwrap().unwrap(); + let stop = wal.available_wal_end_lsn == wal.wal_end_lsn; + after_reset.push(wal); + + if stop { + break; + } + } + + assert_eq!(before_reset, after_reset); } } diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index a99de71a04..e0d593851e 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -178,6 +178,8 @@ pub fn run_server(os: NodeOs, disk: Arc) -> Result<()> { control_file_save_interval: Duration::from_secs(1), partial_backup_concurrency: 1, eviction_min_resident: Duration::ZERO, + wal_reader_fanout: false, + max_delta_for_fanout: None, }; let mut global = GlobalMap::new(disk, conf.clone())?; From 3d41069dc4002a0444c3cf6afaa87a05d95bcdb5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 15 Jan 2025 10:26:58 -0600 Subject: [PATCH 10/40] Update pgrx in extension builds to 0.12.9 (#10372) Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 299f4444a3..1ee159e5df 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -871,7 +871,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ - cargo install --locked --version 0.12.6 cargo-pgrx && \ + cargo install --locked --version 0.12.9 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init 
--pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' USER root @@ -908,19 +908,19 @@ RUN apt update && apt install --no-install-recommends --no-install-suggests -y p mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . && \ \ cd exts/rag && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ \ cd ../rag_bge_small_en_v15 && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ cargo pgrx install --release --features remote_onnx && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ \ cd ../rag_jina_reranker_v1_tiny_en && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ cargo pgrx install --release --features remote_onnx && \ @@ -945,7 +945,8 @@ RUN wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.3.tar. # against postgres forks that decided to change their ABI name (like us). # With that we can build extensions without forking them and using stock # pgx. As this feature is new few manual version bumps were required. - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -963,7 +964,8 @@ ARG PG_VERSION RUN wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.9.tar.gz -O pg_graphql.tar.gz && \ echo "cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "=0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -984,9 +986,8 @@ ARG PG_VERSION RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . && \ - # TODO update pgrx version in the pg_tiktoken repo and remove this line - sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ - sed -i 's/pgrx-tests = "=0.10.2"/pgrx-tests = "0.11.3"/g' Cargo.toml && \ + sed -i 's/pgrx = { version = "=0.12.6",/pgrx = { version = "0.12.9",/g' Cargo.toml && \ + sed -i 's/pgrx-tests = "=0.12.6"/pgrx-tests = "0.12.9"/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_tiktoken.control @@ -1028,7 +1029,11 @@ ARG PG_VERSION RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.2.0.tar.gz -O pg_session_jwt.tar.gz && \ echo "5ace028e591f2e000ca10afa5b1ca62203ebff014c2907c0ec3b29c36f28a1bb pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/version = "0.12.6"/version = "0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "=0.12.9", features = [ "unsafe-postgres" ] }/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-macros = "=0.12.6"/pgrx-macros = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ + sed -i 's/pgrx-pg-config = "=0.12.6"/pgrx-pg-config = "=0.12.9"/g' pgrx-tests/Cargo.toml && \ cargo pgrx install --release ######################################################################################### From efaec6cdf81f71ded5428aeccc6e9f8f607606e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 15 Jan 2025 19:15:30 +0100 Subject: [PATCH 11/40] Add endpoint and storcon cli cmd to set sk scheduling policy (#10400) Implementing the last missing endpoint of #9981, this adds support to set the scheduling policy of an individual safekeeper, as specified in the RFC. However, unlike in the RFC we call the endpoint `scheduling_policy` not `status` Closes #9981. As for why not use the upsert endpoint for this: we want to have the safekeeper upsert endpoint be used for testing and for deploying new safekeepers, but not for changes of the scheduling policy. We don't want to change any of the other fields when marking a safekeeper as decommissioned for example, so we'd have to first fetch them only to then specify them again. 
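For illustration, here is what setting the policy looks like against the new endpoint (a hypothetical invocation: the path, admin-scope requirement and body shape are taken from the diff below, while the host and token are placeholders):

```bash
# set safekeeper 42 to "Disabled"; requires an admin-scoped token
curl -X POST "$STORCON_URL/control/v1/safekeeper/42/scheduling_policy" \
  -H "Authorization: Bearer $ADMIN_TOKEN" \
  -H "Content-Type: application/json" \
  -d '{"scheduling_policy": "Disabled"}'
```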
Of course one can also design an endpoint where one can omit any field and it doesn't get modified, but it's still not great for observability to put everything into one big "change something about this safekeeper" endpoint. --- control_plane/storcon_cli/src/main.rs | 47 ++++++++++++++++++- libs/pageserver_api/src/controller_api.rs | 7 ++- storage_controller/src/http.rs | 44 ++++++++++++++++- storage_controller/src/persistence.rs | 31 ++++++++++++ storage_controller/src/service.rs | 12 ++++- test_runner/fixtures/neon_fixtures.py | 12 ++++- .../regress/test_storage_controller.py | 11 +++++ 7 files changed, 155 insertions(+), 9 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 2ba8f63678..96bfad4c86 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -9,8 +9,9 @@ use clap::{Parser, Subcommand}; use pageserver_api::{ controller_api::{ AvailabilityZone, NodeAvailabilityWrapper, NodeDescribeResponse, NodeShardResponse, - SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - TenantCreateRequest, TenantDescribeResponse, TenantPolicyRequest, + SafekeeperDescribeResponse, SafekeeperSchedulingPolicyRequest, ShardSchedulingPolicy, + ShardsPreferredAzsRequest, SkSchedulingPolicy, TenantCreateRequest, TenantDescribeResponse, + TenantPolicyRequest, }, models::{ EvictionPolicy, EvictionPolicyLayerAccessThreshold, LocationConfigSecondary, @@ -231,6 +232,13 @@ enum Command { }, /// List safekeepers known to the storage controller Safekeepers {}, + /// Set the scheduling policy of the specified safekeeper + SafekeeperScheduling { + #[arg(long)] + node_id: NodeId, + #[arg(long)] + scheduling_policy: SkSchedulingPolicyArg, + }, } #[derive(Parser)] @@ -283,6 +291,24 @@ impl FromStr for PlacementPolicyArg { } } +#[derive(Debug, Clone)] +struct SkSchedulingPolicyArg(SkSchedulingPolicy); + +impl FromStr for SkSchedulingPolicyArg { + type Err = anyhow::Error; + + fn from_str(s: &str) -> Result { + match s { + "active" => Ok(Self(SkSchedulingPolicy::Active)), + "disabled" => Ok(Self(SkSchedulingPolicy::Disabled)), + "decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)), + _ => Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,disabled,decomissioned" + )), + } + } +} + #[derive(Debug, Clone)] struct ShardSchedulingPolicyArg(ShardSchedulingPolicy); @@ -1202,6 +1228,23 @@ async fn main() -> anyhow::Result<()> { } println!("{table}"); } + Command::SafekeeperScheduling { + node_id, + scheduling_policy, + } => { + let scheduling_policy = scheduling_policy.0; + storcon_client + .dispatch::( + Method::POST, + format!("control/v1/safekeeper/{node_id}/scheduling_policy"), + Some(SafekeeperSchedulingPolicyRequest { scheduling_policy }), + ) + .await?; + println!( + "Scheduling policy of {node_id} set to {}", + String::from(scheduling_policy) + ); + } } Ok(()) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index f3880cb766..08d1fa55b9 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -416,8 +416,6 @@ pub struct MetadataHealthListOutdatedResponse { } /// Publicly exposed safekeeper description -/// -/// The `active` flag which we have in the DB is not included on purpose: it is deprecated. 
#[derive(Serialize, Deserialize, Clone)] pub struct SafekeeperDescribeResponse { pub id: NodeId, @@ -433,6 +431,11 @@ pub struct SafekeeperDescribeResponse { pub scheduling_policy: SkSchedulingPolicy, } +#[derive(Serialize, Deserialize, Clone)] +pub struct SafekeeperSchedulingPolicyRequest { + pub scheduling_policy: SkSchedulingPolicy, +} + #[cfg(test)] mod test { use super::*; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 03d8f11992..ac890b008f 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -15,7 +15,7 @@ use metrics::{BuildInfo, NeonMetrics}; use pageserver_api::controller_api::{ MetadataHealthListOutdatedRequest, MetadataHealthListOutdatedResponse, MetadataHealthListUnhealthyResponse, MetadataHealthUpdateRequest, MetadataHealthUpdateResponse, - ShardsPreferredAzsRequest, TenantCreateRequest, + SafekeeperSchedulingPolicyRequest, ShardsPreferredAzsRequest, TenantCreateRequest, }; use pageserver_api::models::{ TenantConfigPatchRequest, TenantConfigRequest, TenantLocationConfigRequest, @@ -1305,6 +1305,35 @@ async fn handle_upsert_safekeeper(mut req: Request) -> Result, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let body = json_request::(&mut req).await?; + let id = parse_request_param::(&req, "id")?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let state = get_state(&req); + + state + .service + .set_safekeeper_scheduling_policy(id, body.scheduling_policy) + .await?; + + Ok(Response::builder() + .status(StatusCode::NO_CONTENT) + .body(Body::empty()) + .unwrap()) +} + /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only /// be allowed to run if Service has finished its initial reconciliation. 
async fn tenant_service_handler( @@ -1873,7 +1902,18 @@ pub fn make_router( }) .post("/control/v1/safekeeper/:id", |r| { // id is in the body - named_request_span(r, handle_upsert_safekeeper, RequestName("v1_safekeeper")) + named_request_span( + r, + handle_upsert_safekeeper, + RequestName("v1_safekeeper_post"), + ) + }) + .post("/control/v1/safekeeper/:id/scheduling_policy", |r| { + named_request_span( + r, + handle_safekeeper_scheduling_policy, + RequestName("v1_safekeeper_status"), + ) }) // Tenant Shard operations .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| { diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index eb0bfc879e..37bfaf1139 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -1104,6 +1104,37 @@ impl Persistence { }) .await } + + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id_: i64, + scheduling_policy_: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + use crate::schema::safekeepers::dsl::*; + + self.with_conn(move |conn| -> DatabaseResult<()> { + #[derive(Insertable, AsChangeset)] + #[diesel(table_name = crate::schema::safekeepers)] + struct UpdateSkSchedulingPolicy<'a> { + id: i64, + scheduling_policy: &'a str, + } + let scheduling_policy_ = String::from(scheduling_policy_); + + let rows_affected = diesel::update(safekeepers.filter(id.eq(id_))) + .set(scheduling_policy.eq(scheduling_policy_)) + .execute(conn)?; + + if rows_affected != 1 { + return Err(DatabaseError::Logical(format!( + "unexpected number of rows ({rows_affected})", + ))); + } + + Ok(()) + }) + .await + } } /// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 57f4cc8463..1d85839881 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -47,7 +47,7 @@ use pageserver_api::{ AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, SafekeeperDescribeResponse, ShardSchedulingPolicy, ShardsPreferredAzsRequest, - ShardsPreferredAzsResponse, TenantCreateRequest, TenantCreateResponse, + ShardsPreferredAzsResponse, SkSchedulingPolicy, TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, @@ -7651,6 +7651,16 @@ impl Service { self.persistence.safekeeper_upsert(record).await } + pub(crate) async fn set_safekeeper_scheduling_policy( + &self, + id: i64, + scheduling_policy: SkSchedulingPolicy, + ) -> Result<(), DatabaseError> { + self.persistence + .set_safekeeper_scheduling_policy(id, scheduling_policy) + .await + } + pub(crate) async fn update_shards_preferred_azs( &self, req: ShardsPreferredAzsRequest, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index c47739cd81..c3950e9bf7 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2336,6 +2336,14 @@ class NeonStorageController(MetricsGetter, LogUtils): json=body, ) + def safekeeper_scheduling_policy(self, id: int, scheduling_policy: str): + self.request( + "POST", + f"{self.api}/control/v1/safekeeper/{id}/scheduling_policy", + headers=self.headers(TokenScope.ADMIN), + json={"id": id, "scheduling_policy": scheduling_policy}, + ) + 
def get_safekeeper(self, id: int) -> dict[str, Any] | None: try: response = self.request( @@ -4135,7 +4143,7 @@ class Endpoint(PgProtocol, LogUtils): # Checkpoints running endpoint and returns pg_wal size in MB. def get_pg_wal_size(self): - log.info(f'checkpointing at LSN {self.safe_psql("select pg_current_wal_lsn()")[0][0]}') + log.info(f"checkpointing at LSN {self.safe_psql('select pg_current_wal_lsn()')[0][0]}") self.safe_psql("checkpoint") assert self.pgdata_dir is not None # please mypy return get_dir_size(self.pgdata_dir / "pg_wal") / 1024 / 1024 @@ -4975,7 +4983,7 @@ def logical_replication_sync( if res: log.info(f"subscriber_lsn={res}") subscriber_lsn = Lsn(res) - log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={ publisher_lsn}") + log.info(f"Subscriber LSN={subscriber_lsn}, publisher LSN={publisher_lsn}") if subscriber_lsn >= publisher_lsn: return subscriber_lsn time.sleep(0.5) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b5d109559f..b1e1fd81d6 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3208,6 +3208,17 @@ def test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder): assert eq_safekeeper_records(body, inserted_now) + # some small tests for the scheduling policy querying and returning APIs + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Disabled" + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + newest_info = target.get_safekeeper(inserted["id"]) + assert newest_info + assert newest_info["scheduling_policy"] == "Decomissioned" + # Ensure idempotency + target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned") + def eq_safekeeper_records(a: dict[str, Any], b: dict[str, Any]) -> bool: compared = [dict(a), dict(b)] From fb0e2acb2f8333d279371da84c17e3e6db5c31f2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 15 Jan 2025 19:07:22 +0000 Subject: [PATCH 12/40] pageserver: add `page_trace` API for debugging (#10293) ## Problem When a pageserver is receiving high rates of requests, we don't have a good way to efficiently discover what the client's access pattern is. Closes: https://github.com/neondatabase/neon/issues/10275 ## Summary of changes - Add `/v1/tenant/x/timeline/y/page_trace?size_limit_bytes=...&time_limit_secs=...` API, which returns a binary buffer. - Add `pagectl page-trace` tool to decode and analyze the output. 
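A sketch of the intended workflow, assuming a pageserver whose HTTP API listens on localhost:9898 (the path, query parameters and `pagectl` subcommand below mirror the diff; the tenant and timeline IDs are placeholders):

```bash
# capture up to 1 MiB of GetPage metadata, or 5 seconds of traffic, whichever comes first
curl -s "http://localhost:9898/v1/tenant/$TENANT_SHARD_ID/timeline/$TIMELINE_ID/page_trace?size_limit_bytes=1048576&time_limit_secs=5" \
  -o page.trace
# decode the binary trace and print multi-read keys and per-relation read counts
pagectl page-trace page.trace
```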
--------- Co-authored-by: Erik Grinaker --- Cargo.lock | 3 ++ libs/pageserver_api/src/key.rs | 4 +- libs/pageserver_api/src/models.rs | 19 +++++++- pageserver/Cargo.toml | 1 + pageserver/ctl/Cargo.toml | 2 + pageserver/ctl/src/main.rs | 4 ++ pageserver/ctl/src/page_trace.rs | 73 +++++++++++++++++++++++++++++++ pageserver/src/http/routes.rs | 72 ++++++++++++++++++++++++++++++ pageserver/src/page_service.rs | 15 +++++++ pageserver/src/tenant/timeline.rs | 9 +++- 10 files changed, 199 insertions(+), 3 deletions(-) create mode 100644 pageserver/ctl/src/page_trace.rs diff --git a/Cargo.lock b/Cargo.lock index afe16ff848..3f184ebe0b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3981,9 +3981,11 @@ name = "pagectl" version = "0.1.0" dependencies = [ "anyhow", + "bincode", "camino", "clap", "humantime", + "itertools 0.10.5", "pageserver", "pageserver_api", "postgres_ffi", @@ -4005,6 +4007,7 @@ dependencies = [ "arc-swap", "async-compression", "async-stream", + "bincode", "bit_field", "byteorder", "bytes", diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 328dea5dec..dbd45da314 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -24,7 +24,9 @@ pub struct Key { /// When working with large numbers of Keys in-memory, it is more efficient to handle them as i128 than as /// a struct of fields. -#[derive(Clone, Copy, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug)] +#[derive( + Clone, Copy, Default, Hash, PartialEq, Eq, Ord, PartialOrd, Serialize, Deserialize, Debug, +)] pub struct CompactKey(i128); /// The storage key size. diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 9af6c4021d..87e8df2ab6 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -29,7 +29,7 @@ use utils::{ }; use crate::{ - key::Key, + key::{CompactKey, Key}, reltag::RelTag, shard::{ShardCount, ShardStripeSize, TenantShardId}, }; @@ -1981,6 +1981,23 @@ impl PagestreamBeMessage { } } +#[derive(Debug, Serialize, Deserialize)] +pub struct PageTraceEvent { + pub key: CompactKey, + pub effective_lsn: Lsn, + pub time: SystemTime, +} + +impl Default for PageTraceEvent { + fn default() -> Self { + Self { + key: Default::default(), + effective_lsn: Default::default(), + time: std::time::UNIX_EPOCH, + } + } +} + #[cfg(test)] mod tests { use serde_json::json; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 8547746d94..9195951191 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -16,6 +16,7 @@ arc-swap.workspace = true async-compression.workspace = true async-stream.workspace = true bit_field.workspace = true +bincode.workspace = true byteorder.workspace = true bytes.workspace = true camino.workspace = true diff --git a/pageserver/ctl/Cargo.toml b/pageserver/ctl/Cargo.toml index 39ca47568c..7b70f0dc87 100644 --- a/pageserver/ctl/Cargo.toml +++ b/pageserver/ctl/Cargo.toml @@ -8,9 +8,11 @@ license.workspace = true [dependencies] anyhow.workspace = true +bincode.workspace = true camino.workspace = true clap = { workspace = true, features = ["string"] } humantime.workspace = true +itertools.workspace = true pageserver = { path = ".." 
} pageserver_api.workspace = true remote_storage = { path = "../../libs/remote_storage" } diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index a0aac89dc8..353b4bd2f9 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -9,7 +9,9 @@ mod index_part; mod key; mod layer_map_analyzer; mod layers; +mod page_trace; +use page_trace::PageTraceCmd; use std::{ str::FromStr, time::{Duration, SystemTime}, @@ -64,6 +66,7 @@ enum Commands { Layer(LayerCmd), /// Debug print a hex key found from logs Key(key::DescribeKeyCommand), + PageTrace(PageTraceCmd), } /// Read and update pageserver metadata file @@ -183,6 +186,7 @@ async fn main() -> anyhow::Result<()> { .await?; } Commands::Key(dkc) => dkc.execute(), + Commands::PageTrace(cmd) => page_trace::main(&cmd)?, }; Ok(()) } diff --git a/pageserver/ctl/src/page_trace.rs b/pageserver/ctl/src/page_trace.rs new file mode 100644 index 0000000000..da0de72fd9 --- /dev/null +++ b/pageserver/ctl/src/page_trace.rs @@ -0,0 +1,73 @@ +use std::collections::HashMap; +use std::io::BufReader; + +use camino::Utf8PathBuf; +use clap::Parser; +use itertools::Itertools as _; +use pageserver_api::key::{CompactKey, Key}; +use pageserver_api::models::PageTraceEvent; +use pageserver_api::reltag::RelTag; + +/// Parses a page trace (as emitted by the `page_trace` timeline API), and outputs stats. +#[derive(Parser)] +pub(crate) struct PageTraceCmd { + /// Trace input file. + path: Utf8PathBuf, +} + +pub(crate) fn main(cmd: &PageTraceCmd) -> anyhow::Result<()> { + let mut file = BufReader::new(std::fs::OpenOptions::new().read(true).open(&cmd.path)?); + let mut events: Vec = Vec::new(); + loop { + match bincode::deserialize_from(&mut file) { + Ok(event) => events.push(event), + Err(err) => { + if let bincode::ErrorKind::Io(ref err) = *err { + if err.kind() == std::io::ErrorKind::UnexpectedEof { + break; + } + } + return Err(err.into()); + } + } + } + + let mut reads_by_relation: HashMap = HashMap::new(); + let mut reads_by_key: HashMap = HashMap::new(); + + for event in events { + let key = Key::from_compact(event.key); + let reltag = RelTag { + spcnode: key.field2, + dbnode: key.field3, + relnode: key.field4, + forknum: key.field5, + }; + + *reads_by_relation.entry(reltag).or_default() += 1; + *reads_by_key.entry(event.key).or_default() += 1; + } + + let multi_read_keys = reads_by_key + .into_iter() + .filter(|(_, count)| *count > 1) + .sorted_by_key(|(key, count)| (-*count, *key)) + .collect_vec(); + + println!("Multi-read keys: {}", multi_read_keys.len()); + for (key, count) in multi_read_keys { + println!(" {key}: {count}"); + } + + let reads_by_relation = reads_by_relation + .into_iter() + .sorted_by_key(|(rel, count)| (-*count, *rel)) + .collect_vec(); + + println!("Reads by relation:"); + for (reltag, count) in reads_by_relation { + println!(" {reltag}: {count}"); + } + + Ok(()) +} diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 94e0b101bd..33b2d04588 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -27,6 +27,7 @@ use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; use pageserver_api::models::OffloadedTimelineInfo; +use pageserver_api::models::PageTraceEvent; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantConfigPatchRequest; use pageserver_api::models::TenantDetails; @@ -51,7 +52,9 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; 
use remote_storage::GenericRemoteStorage; use remote_storage::TimeTravelError; +use scopeguard::defer; use tenant_size_model::{svg::SvgBranchKind, SizeResult, StorageModel}; +use tokio::time::Instant; use tokio_util::io::StreamReader; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1521,6 +1524,71 @@ async fn timeline_gc_unblocking_handler( block_or_unblock_gc(request, false).await } +/// Traces GetPage@LSN requests for a timeline, and emits metadata in an efficient binary encoding. +/// Use the `pagectl page-trace` command to decode and analyze the output. +async fn timeline_page_trace_handler( + request: Request<Body>, + cancel: CancellationToken, +) -> Result<Response<Body>, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let state = get_state(&request); + check_permission(&request, None)?; + + let size_limit: usize = parse_query_param(&request, "size_limit_bytes")?.unwrap_or(1024 * 1024); + let time_limit_secs: u64 = parse_query_param(&request, "time_limit_secs")?.unwrap_or(5); + + // Convert size limit to event limit based on the serialized size of an event. The event size is + // fixed, as the default bincode serializer uses fixed-width integer encoding. + let event_size = bincode::serialize(&PageTraceEvent::default()) + .map_err(|err| ApiError::InternalServerError(err.into()))? + .len(); + let event_limit = size_limit / event_size; + + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + // Install a page trace, unless one is already in progress. We just use a buffered channel, + // which may 2x the memory usage in the worst case, but it's still bounded. + let (trace_tx, mut trace_rx) = tokio::sync::mpsc::channel(event_limit); + let cur = timeline.page_trace.load(); + let installed = cur.is_none() + && timeline + .page_trace + .compare_and_swap(cur, Some(Arc::new(trace_tx))) + .is_none(); + if !installed { + return Err(ApiError::Conflict("page trace already active".to_string())); + } + defer!(timeline.page_trace.store(None)); // uninstall on return + + // Collect the trace and return it to the client. We could stream the response, but this is + // simple and fine. + let mut body = Vec::with_capacity(size_limit); + let deadline = Instant::now() + Duration::from_secs(time_limit_secs); + + while body.len() < size_limit { + tokio::select! { + event = trace_rx.recv() => { + let Some(event) = event else { + break; // shouldn't happen (sender doesn't close, unless timeline dropped) + }; + bincode::serialize_into(&mut body, &event) + .map_err(|err| ApiError::InternalServerError(err.into()))?; + } + _ = tokio::time::sleep_until(deadline) => break, // time limit reached + _ = cancel.cancelled() => return Err(ApiError::Cancelled), + } + } + + Ok(Response::builder() + .status(StatusCode::OK) + .header(header::CONTENT_TYPE, "application/octet-stream") + .body(hyper::Body::from(body)) + .unwrap()) +} + /// Adding a block is `POST ../block_gc`, removing a block is `POST ../unblock_gc`. /// /// Both are technically unsafe because they might fire off index uploads, thus they are POST.
@@ -3479,6 +3547,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/unblock_gc", |r| api_handler(r, timeline_gc_unblocking_handler), ) + .get( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/page_trace", + |r| api_handler(r, timeline_page_trace_handler), + ) .post("/v1/tenant/:tenant_shard_id/heatmap_upload", |r| { api_handler(r, secondary_upload_handler) }) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index b3e18fed99..da4180a927 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -67,6 +67,7 @@ use crate::tenant::PageReconstructError; use crate::tenant::Timeline; use crate::{basebackup, timed_after_cancellation}; use pageserver_api::key::rel_block_to_key; +use pageserver_api::models::PageTraceEvent; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID; use postgres_ffi::BLCKSZ; @@ -1718,6 +1719,20 @@ impl PageServerHandler { .query_metrics .observe_getpage_batch_start(requests.len()); + // If a page trace is running, submit an event for this request. + if let Some(page_trace) = timeline.page_trace.load().as_ref() { + let time = SystemTime::now(); + for batch in &requests { + let key = rel_block_to_key(batch.req.rel, batch.req.blkno).to_compact(); + // Ignore error (trace buffer may be full or tracer may have disconnected). + _ = page_trace.try_send(PageTraceEvent { + key, + effective_lsn, + time, + }); + } + } + let results = timeline .get_rel_page_at_lsn_batched( requests.iter().map(|p| (&p.req.rel, &p.req.blkno)), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4aa6b7a05a..d6ae11e67d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -14,7 +14,7 @@ pub mod uninit; mod walreceiver; use anyhow::{anyhow, bail, ensure, Context, Result}; -use arc_swap::ArcSwap; +use arc_swap::{ArcSwap, ArcSwapOption}; use bytes::Bytes; use camino::Utf8Path; use chrono::{DateTime, Utc}; @@ -23,6 +23,7 @@ use fail::fail_point; use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; +use pageserver_api::models::PageTraceEvent; use pageserver_api::{ config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ @@ -42,6 +43,7 @@ use rand::Rng; use remote_storage::DownloadError; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::sync::mpsc::Sender; use tokio::{ runtime::Handle, sync::{oneshot, watch}, @@ -433,6 +435,9 @@ pub struct Timeline { /// Cf. [`crate::tenant::CreateTimelineIdempotency`]. pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency, + + /// If Some, collects GetPage metadata for an ongoing PageTrace. + pub(crate) page_trace: ArcSwapOption<Sender<PageTraceEvent>>, } pub type TimelineDeleteProgress = Arc>; @@ -2380,6 +2385,8 @@ impl Timeline { attach_wal_lag_cooldown, create_idempotency, + + page_trace: Default::default(), }; result.repartition_threshold = From 55a68b28a27b27810255212994e601050b73917e Mon Sep 17 00:00:00 2001 From: Gleb Novikov Date: Wed, 15 Jan 2025 20:51:09 +0000 Subject: [PATCH 13/40] fast import: restore to neondb (not postgres) database (#10251) ## Problem `postgres` is the system database at Neon, so we need to do `pg_restore` into `neondb` instead. https://github.com/neondatabase/cloud/issues/22100 ## Summary of changes Changed fast_import a little bit: 1. After a successful connection, create `neondb` in the postgres instance 2. Changed the restore connstring to use the new db 3.
Added optional `source_connection_string`, which allows to skip `s3_prefix` and just connect directly. 4. Added `-i` that stops process until sigterm ## TODO - [x] test image in cplane e2e - [ ] Change import job image back to latest after this merged (partial revert of https://github.com/neondatabase/cloud/pull/22338) --- compute_tools/src/bin/fast_import.rs | 227 +++++++++++++++++---------- 1 file changed, 147 insertions(+), 80 deletions(-) diff --git a/compute_tools/src/bin/fast_import.rs b/compute_tools/src/bin/fast_import.rs index f554362751..5b008f8182 100644 --- a/compute_tools/src/bin/fast_import.rs +++ b/compute_tools/src/bin/fast_import.rs @@ -31,7 +31,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Parser; use compute_tools::extension_server::{get_pg_version, PostgresMajorVersion}; use nix::unistd::Pid; -use tracing::{info, info_span, warn, Instrument}; +use tracing::{error, info, info_span, warn, Instrument}; use utils::fs_ext::is_directory_empty; #[path = "fast_import/aws_s3_sync.rs"] @@ -41,12 +41,19 @@ mod child_stdio_to_log; #[path = "fast_import/s3_uri.rs"] mod s3_uri; +const PG_WAIT_TIMEOUT: std::time::Duration = std::time::Duration::from_secs(600); +const PG_WAIT_RETRY_INTERVAL: std::time::Duration = std::time::Duration::from_millis(300); + #[derive(clap::Parser)] struct Args { #[clap(long)] working_directory: Utf8PathBuf, #[clap(long, env = "NEON_IMPORTER_S3_PREFIX")] - s3_prefix: s3_uri::S3Uri, + s3_prefix: Option, + #[clap(long)] + source_connection_string: Option, + #[clap(short, long)] + interactive: bool, #[clap(long)] pg_bin_dir: Utf8PathBuf, #[clap(long)] @@ -77,30 +84,70 @@ pub(crate) async fn main() -> anyhow::Result<()> { info!("starting"); - let Args { - working_directory, - s3_prefix, - pg_bin_dir, - pg_lib_dir, - } = Args::parse(); + let args = Args::parse(); - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + // Validate arguments + if args.s3_prefix.is_none() && args.source_connection_string.is_none() { + anyhow::bail!("either s3_prefix or source_connection_string must be specified"); + } + if args.s3_prefix.is_some() && args.source_connection_string.is_some() { + anyhow::bail!("only one of s3_prefix or source_connection_string can be specified"); + } - let spec: Spec = { - let spec_key = s3_prefix.append("/spec.json"); - let s3_client = aws_sdk_s3::Client::new(&aws_config); - let object = s3_client - .get_object() - .bucket(&spec_key.bucket) - .key(spec_key.key) - .send() - .await - .context("get spec from s3")? - .body - .collect() - .await - .context("download spec body")?; - serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + let working_directory = args.working_directory; + let pg_bin_dir = args.pg_bin_dir; + let pg_lib_dir = args.pg_lib_dir; + + // Initialize AWS clients only if s3_prefix is specified + let (aws_config, kms_client) = if args.s3_prefix.is_some() { + let config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; + let kms = aws_sdk_kms::Client::new(&config); + (Some(config), Some(kms)) + } else { + (None, None) + }; + + // Get source connection string either from S3 spec or direct argument + let source_connection_string = if let Some(s3_prefix) = &args.s3_prefix { + let spec: Spec = { + let spec_key = s3_prefix.append("/spec.json"); + let s3_client = aws_sdk_s3::Client::new(aws_config.as_ref().unwrap()); + let object = s3_client + .get_object() + .bucket(&spec_key.bucket) + .key(spec_key.key) + .send() + .await + .context("get spec from s3")? 
+ .body + .collect() + .await + .context("download spec body")?; + serde_json::from_slice(&object.into_bytes()).context("parse spec as json")? + }; + + match spec.encryption_secret { + EncryptionSecret::KMS { key_id } => { + let mut output = kms_client + .unwrap() + .decrypt() + .key_id(key_id) + .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( + spec.source_connstring_ciphertext_base64, + )) + .send() + .await + .context("decrypt source connection string")?; + let plaintext = output + .plaintext + .take() + .context("get plaintext source connection string")?; + String::from_utf8(plaintext.into_inner()) + .context("parse source connection string as utf8")? + } + } + } else { + args.source_connection_string.unwrap() }; match tokio::fs::create_dir(&working_directory).await { @@ -123,15 +170,6 @@ pub(crate) async fn main() -> anyhow::Result<()> { .await .context("create pgdata directory")?; - // - // Setup clients - // - let aws_config = aws_config::load_defaults(BehaviorVersion::v2024_03_28()).await; - let kms_client = aws_sdk_kms::Client::new(&aws_config); - - // - // Initialize pgdata - // let pgbin = pg_bin_dir.join("postgres"); let pg_version = match get_pg_version(pgbin.as_ref()) { PostgresMajorVersion::V14 => 14, @@ -170,7 +208,13 @@ pub(crate) async fn main() -> anyhow::Result<()> { .args(["-c", &format!("max_parallel_workers={nproc}")]) .args(["-c", &format!("max_parallel_workers_per_gather={nproc}")]) .args(["-c", &format!("max_worker_processes={nproc}")]) - .args(["-c", "effective_io_concurrency=100"]) + .args([ + "-c", + &format!( + "effective_io_concurrency={}", + if cfg!(target_os = "macos") { 0 } else { 100 } + ), + ]) .env_clear() .stdout(std::process::Stdio::piped()) .stderr(std::process::Stdio::piped()) @@ -185,44 +229,58 @@ pub(crate) async fn main() -> anyhow::Result<()> { ) .instrument(info_span!("postgres")), ); + + // Create neondb database in the running postgres let restore_pg_connstring = format!("host=localhost port=5432 user={superuser} dbname=postgres"); + + let start_time = std::time::Instant::now(); + loop { - let res = tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await; - if res.is_ok() { - info!("postgres is ready, could connect to it"); - break; + if start_time.elapsed() > PG_WAIT_TIMEOUT { + error!( + "timeout exceeded: failed to poll postgres and create database within 10 minutes" + ); + std::process::exit(1); + } + + match tokio_postgres::connect(&restore_pg_connstring, tokio_postgres::NoTls).await { + Ok((client, connection)) => { + // Spawn the connection handling task to maintain the connection + tokio::spawn(async move { + if let Err(e) = connection.await { + warn!("connection error: {}", e); + } + }); + + match client.simple_query("CREATE DATABASE neondb;").await { + Ok(_) => { + info!("created neondb database"); + break; + } + Err(e) => { + warn!( + "failed to create database: {}, retying in {}s", + e, + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } + } + } + Err(_) => { + info!( + "postgres not ready yet, retrying in {}s", + PG_WAIT_RETRY_INTERVAL.as_secs_f32() + ); + tokio::time::sleep(PG_WAIT_RETRY_INTERVAL).await; + continue; + } } } - // - // Decrypt connection string - // - let source_connection_string = { - match spec.encryption_secret { - EncryptionSecret::KMS { key_id } => { - let mut output = kms_client - .decrypt() - .key_id(key_id) - .ciphertext_blob(aws_sdk_s3::primitives::Blob::new( - spec.source_connstring_ciphertext_base64, - )) - .send() - 
.await - .context("decrypt source connection string")?; - let plaintext = output - .plaintext - .take() - .context("get plaintext source connection string")?; - String::from_utf8(plaintext.into_inner()) - .context("parse source connection string as utf8")? - } - } - }; - - // - // Start the work - // + let restore_pg_connstring = restore_pg_connstring.replace("dbname=postgres", "dbname=neondb"); let dumpdir = working_directory.join("dumpdir"); @@ -310,6 +368,12 @@ pub(crate) async fn main() -> anyhow::Result<()> { } } + // If interactive mode, wait for Ctrl+C + if args.interactive { + info!("Running in interactive mode. Press Ctrl+C to shut down."); + tokio::signal::ctrl_c().await.context("wait for ctrl-c")?; + } + info!("shutdown postgres"); { nix::sys::signal::kill( @@ -325,21 +389,24 @@ pub(crate) async fn main() -> anyhow::Result<()> { .context("wait for postgres to shut down")?; } - info!("upload pgdata"); - aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) - .await - .context("sync dump directory to destination")?; - - info!("write status"); - { - let status_dir = working_directory.join("status"); - std::fs::create_dir(&status_dir).context("create status directory")?; - let status_file = status_dir.join("pgdata"); - std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) - .context("write status file")?; - aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) + // Only sync if s3_prefix was specified + if let Some(s3_prefix) = args.s3_prefix { + info!("upload pgdata"); + aws_s3_sync::sync(Utf8Path::new(&pgdata_dir), &s3_prefix.append("/pgdata/")) .await - .context("sync status directory to destination")?; + .context("sync dump directory to destination")?; + + info!("write status"); + { + let status_dir = working_directory.join("status"); + std::fs::create_dir(&status_dir).context("create status directory")?; + let status_file = status_dir.join("pgdata"); + std::fs::write(&status_file, serde_json::json!({"done": true}).to_string()) + .context("write status file")?; + aws_s3_sync::sync(&status_dir, &s3_prefix.append("/status/")) + .await + .context("sync status directory to destination")?; + } } Ok(()) From a753349cb0d8a76b7b5884c216236f6e57357d30 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:04:06 -0500 Subject: [PATCH 14/40] feat(pageserver): validate data integrity during gc-compaction (#10131) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem part of https://github.com/neondatabase/neon/issues/9114 part of investigation of https://github.com/neondatabase/neon/issues/10049 ## Summary of changes * If `cfg!(test) or cfg!(feature = testing)`, then we will always try generating an image to ensure the history is replayable, but not put the image layer into the final layer results, therefore discovering wrong key history before we hit a read error. * I suspect it's easier to trigger some races if gc-compaction is continuously run on a timeline, so I increased the frequency to twice per 10 churns. * Also, create branches in gc-compaction smoke tests to get more test coverage. 
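The first bullet above is the mechanism worth internalizing. A compilable sketch of the gate, with stand-in types and hypothetical helper names (the real code in the diff below operates on full key histories):

```rust
// Illustrative stand-ins for the pageserver types; only the gate logic is real.
struct Image;
enum Value {
    Image(Image),
}

// Stand-in for Timeline::reconstruct_value(): errors out if the key history
// cannot be replayed into an image.
fn reconstruct_value() -> anyhow::Result<Image> {
    Ok(Image)
}

fn retain_key_history(
    produce_image: bool,
    retention: &mut Vec<Vec<Value>>,
) -> anyhow::Result<()> {
    let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing");
    // Compute the image whenever we would keep it, and *also* in debug/testing
    // builds, so a corrupted history fails compaction instead of a later read.
    let generate_image = produce_image || debug_mode;
    if generate_image {
        let img = reconstruct_value()?;
        if produce_image {
            retention.push(vec![Value::Image(img)]);
        }
        // else: the image served purely as an integrity check and is dropped.
    }
    Ok(())
}
```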
--------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/tenant/timeline/compaction.rs | 77 +++++++++++++++----- test_runner/fixtures/workload.py | 16 ++++ test_runner/regress/test_compaction.py | 20 ++++- 3 files changed, 89 insertions(+), 24 deletions(-) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 05f8d476f9..2042a18e96 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1776,7 +1776,10 @@ impl Timeline { base_img_from_ancestor: Option<(Key, Lsn, Bytes)>, ) -> anyhow::Result { // Pre-checks for the invariants - if cfg!(debug_assertions) { + + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + + if debug_mode { for (log_key, _, _) in full_history { assert_eq!(log_key, &key, "mismatched key"); } @@ -1922,15 +1925,19 @@ impl Timeline { output } + let mut key_exists = false; for (i, split_for_lsn) in split_history.into_iter().enumerate() { // TODO: there could be image keys inside the splits, and we can compute records_since_last_image accordingly. records_since_last_image += split_for_lsn.len(); - let generate_image = if i == 0 && !has_ancestor { + // Whether to produce an image into the final layer files + let produce_image = if i == 0 && !has_ancestor { // We always generate images for the first batch (below horizon / lowest retain_lsn) true } else if i == batch_cnt - 1 { // Do not generate images for the last batch (above horizon) false + } else if records_since_last_image == 0 { + false } else if records_since_last_image >= delta_threshold_cnt { // Generate images when there are too many records true @@ -1945,29 +1952,45 @@ impl Timeline { break; } } - if let Some((_, _, val)) = replay_history.first() { - if !val.will_init() { - return Err(anyhow::anyhow!("invalid history, no base image")).with_context( - || { - generate_debug_trace( - Some(&replay_history), - full_history, - retain_lsn_below_horizon, - horizon, - ) - }, - ); - } + if replay_history.is_empty() && !key_exists { + // The key does not exist at earlier LSN, we can skip this iteration. + retention.push(Vec::new()); + continue; + } else { + key_exists = true; } - if generate_image && records_since_last_image > 0 { + let Some((_, _, val)) = replay_history.first() else { + unreachable!("replay history should not be empty once it exists") + }; + if !val.will_init() { + return Err(anyhow::anyhow!("invalid history, no base image")).with_context(|| { + generate_debug_trace( + Some(&replay_history), + full_history, + retain_lsn_below_horizon, + horizon, + ) + }); + } + // Whether to reconstruct the image. In debug mode, we will generate an image + // at every retain_lsn to ensure data is not corrupted, but we won't put the + // image into the final layer. 
+ let generate_image = produce_image || debug_mode; + if produce_image { records_since_last_image = 0; - let replay_history_for_debug = if cfg!(debug_assertions) { + } + let img_and_lsn = if generate_image { + let replay_history_for_debug = if debug_mode { Some(replay_history.clone()) } else { None }; let replay_history_for_debug_ref = replay_history_for_debug.as_deref(); - let history = std::mem::take(&mut replay_history); + let history = if produce_image { + std::mem::take(&mut replay_history) + } else { + replay_history.clone() + }; let mut img = None; let mut records = Vec::with_capacity(history.len()); if let (_, lsn, Value::Image(val)) = history.first().as_ref().unwrap() { @@ -2004,8 +2027,20 @@ impl Timeline { } records.reverse(); let state = ValueReconstructState { img, records }; - let request_lsn = lsn_split_points[i]; // last batch does not generate image so i is always in range + // last batch does not generate image so i is always in range, unless we force generate + // an image during testing + let request_lsn = if i >= lsn_split_points.len() { + Lsn::MAX + } else { + lsn_split_points[i] + }; let img = self.reconstruct_value(key, request_lsn, state).await?; + Some((request_lsn, img)) + } else { + None + }; + if produce_image { + let (request_lsn, img) = img_and_lsn.unwrap(); replay_history.push((key, request_lsn, Value::Image(img.clone()))); retention.push(vec![(request_lsn, Value::Image(img))]); } else { @@ -2273,6 +2308,8 @@ impl Timeline { let compact_key_range = job.compact_key_range; let compact_lsn_range = job.compact_lsn_range; + let debug_mode = cfg!(debug_assertions) || cfg!(feature = "testing"); + info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compact_key_range={}..{}, compact_lsn_range={}..{}", compact_key_range.start, compact_key_range.end, compact_lsn_range.start, compact_lsn_range.end); scopeguard::defer! 
{ @@ -2398,7 +2435,7 @@ impl Timeline { .first() .copied() .unwrap_or(job_desc.gc_cutoff); - if cfg!(debug_assertions) { + if debug_mode { assert_eq!( res, job_desc diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 1b8c9fef44..eea0ec2b95 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -53,6 +53,22 @@ class Workload: self._endpoint: Endpoint | None = None self._endpoint_opts = endpoint_opts or {} + def branch( + self, + timeline_id: TimelineId, + branch_name: str | None = None, + endpoint_opts: dict[str, Any] | None = None, + ) -> Workload: + """ + Checkpoint the current status of the workload in case of branching + """ + branch_workload = Workload( + self.env, self.tenant_id, timeline_id, branch_name, endpoint_opts + ) + branch_workload.expect_rows = self.expect_rows + branch_workload.churn_cursor = self.churn_cursor + return branch_workload + def reconfigure(self) -> None: """ Request the endpoint to reconfigure based on location reported by storage controller diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index fe0422088a..d0a2349ccf 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -112,7 +112,11 @@ page_cache_size=10 @skip_in_debug_build("only run with release build") -def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize( + "with_branches", + ["with_branches", "no_branches"], +) +def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder, with_branches: str): SMOKE_CONF = { # Run both gc and gc-compaction. "gc_period": "5s", @@ -143,12 +147,17 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Writing initial data ...") workload.write_rows(row_count, env.pageserver.id) + child_workloads: list[Workload] = [] + for i in range(1, churn_rounds + 1): if i % 10 == 0: log.info(f"Running churn round {i}/{churn_rounds} ...") - - if (i - 1) % 10 == 0: - # Run gc-compaction every 10 rounds to ensure the test doesn't take too long time. + if i % 10 == 5 and with_branches == "with_branches": + branch_name = f"child-{i}" + branch_timeline_id = env.create_branch(branch_name) + child_workloads.append(workload.branch(branch_timeline_id, branch_name)) + if (i - 1) % 10 == 0 or (i - 1) % 10 == 1: + # Run gc-compaction twice every 10 rounds to ensure the test doesn't take too long time. ps_http.timeline_compact( tenant_id, timeline_id, @@ -179,6 +188,9 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info("Validating at workload end ...") workload.validate(env.pageserver.id) + for child_workload in child_workloads: + log.info(f"Validating at branch {child_workload.branch_name}") + child_workload.validate(env.pageserver.id) # Run a legacy compaction+gc to ensure gc-compaction can coexist with legacy compaction. 
ps_http.timeline_checkpoint(tenant_id, timeline_id, wait_until_uploaded=True)

From c7429af8a0765b516bd3f6a49c8bc6632deffbf7 Mon Sep 17 00:00:00 2001
From: Mikhail Kot
Date: Wed, 15 Jan 2025 22:29:18 +0000
Subject: [PATCH 15/40] Enable dblink (#10358)

Update compute image to include dblink #3720
---
 compute/compute-node.Dockerfile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile
index 1ee159e5df..f56a8358d2 100644
--- a/compute/compute-node.Dockerfile
+++ b/compute/compute-node.Dockerfile
@@ -66,6 +66,7 @@ RUN cd postgres && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C src/interfaces/libpq install && \
     # Enable some of contrib extensions
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/autoinc.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/dblink.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/bloom.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/insert_username.control && \

From 2eda484ef6dac15e0e7a72955e9d194ccfe3326f Mon Sep 17 00:00:00 2001
From: Matthias van de Meent
Date: Thu, 16 Jan 2025 03:43:47 +0100
Subject: [PATCH 16/40] prefetch: Read more frequently from TCP buffer (#10394)

This reduces pressure on the OS TCP read buffer by increasing the moments at
which we read data out of the receive buffer, and by increasing the number of
bytes we can pull from that buffer when we do read.

## Problem

A backend may not always consume its prefetch data quickly enough.

## Summary of changes

We add a new function `prefetch_pump_state` which pulls as many prefetch
responses from the OS TCP receive buffer as possible, but without blocking.
This reduces pressure on OS-level TCP buffers, increasing throughput by
limiting the throttling caused by full TCP buffers.
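A self-contained sketch of that pump loop, written in Rust for brevity (the patch itself is C; the types and names below are illustrative stand-ins, not the real API):

```rust
struct Response;

struct PrefetchState {
    ring_receive: u64, // next slot we still expect a response for
    ring_flush: u64,   // slots below this have had their requests flushed
    buffered: Vec<Response>,
}

impl PrefetchState {
    // Stand-in for PQgetCopyData(conn, buf, /* async = */ 1): returns None
    // when no complete message is sitting in the receive buffer right now.
    fn try_receive(&mut self) -> Option<Response> {
        None
    }

    // Drain already-arrived responses without ever blocking, so the kernel
    // keeps room to accept more TCP data while the backend does other work.
    fn prefetch_pump_state(&mut self) {
        while self.ring_receive != self.ring_flush {
            let Some(resp) = self.try_receive() else {
                break; // nothing complete in the buffer yet; never wait here
            };
            self.buffered.push(resp); // slot goes REQUESTED -> RECEIVED
            self.ring_receive += 1;
        }
    }
}
```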
--- pgxn/neon/libpagestore.c | 70 +++++++++++++++++++++++++++++++++++- pgxn/neon/pagestore_client.h | 20 +++++++++++ pgxn/neon/pagestore_smgr.c | 66 ++++++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 769befb4e5..4460e3b40c 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -911,7 +911,74 @@ pageserver_receive(shardno_t shard_no) } PG_CATCH(); { - neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due malformatted response"); + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); + pageserver_disconnect(shard_no); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (message_level_is_interesting(PageStoreTrace)) + { + char *msg = nm_to_string((NeonMessage *) resp); + + neon_shard_log(shard_no, PageStoreTrace, "got response: %s", msg); + pfree(msg); + } + } + else if (rc == -1) + { + neon_shard_log(shard_no, LOG, "pageserver_receive disconnect: psql end of copy data: %s", pchomp(PQerrorMessage(pageserver_conn))); + pageserver_disconnect(shard_no); + resp = NULL; + } + else if (rc == -2) + { + char *msg = pchomp(PQerrorMessage(pageserver_conn)); + + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: could not read COPY data: %s", msg); + } + else + { + pageserver_disconnect(shard_no); + neon_shard_log(shard_no, ERROR, "pageserver_receive disconnect: unexpected PQgetCopyData return value: %d", rc); + } + + shard->nresponses_received++; + return (NeonResponse *) resp; +} + +static NeonResponse * +pageserver_try_receive(shardno_t shard_no) +{ + StringInfoData resp_buff; + NeonResponse *resp; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; + /* read response */ + int rc; + + if (shard->state != PS_Connected) + return NULL; + + Assert(pageserver_conn); + + rc = PQgetCopyData(shard->conn, &resp_buff.data, 1 /* async = true */); + + if (rc == 0) + return NULL; + else if (rc > 0) + { + PG_TRY(); + { + resp_buff.len = rc; + resp_buff.cursor = 0; + resp = nm_unpack_response(&resp_buff); + PQfreemem(resp_buff.data); + } + PG_CATCH(); + { + neon_shard_log(shard_no, LOG, "pageserver_receive: disconnect due to failure while parsing response"); pageserver_disconnect(shard_no); PG_RE_THROW(); } @@ -980,6 +1047,7 @@ page_server_api api = .send = pageserver_send, .flush = pageserver_flush, .receive = pageserver_receive, + .try_receive = pageserver_try_receive, .disconnect = pageserver_disconnect_shard }; diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index 37bc4f7886..b751235595 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -192,9 +192,29 @@ typedef uint16 shardno_t; typedef struct { + /* + * Send this request to the PageServer associated with this shard. + */ bool (*send) (shardno_t shard_no, NeonRequest * request); + /* + * Blocking read for the next response of this shard. + * + * When a CANCEL signal is handled, the connection state will be + * unmodified. + */ NeonResponse *(*receive) (shardno_t shard_no); + /* + * Try get the next response from the TCP buffers, if any. + * Returns NULL when the data is not yet available. + */ + NeonResponse *(*try_receive) (shardno_t shard_no); + /* + * Make sure all requests are sent to PageServer. + */ bool (*flush) (shardno_t shard_no); + /* + * Disconnect from this pageserver shard. 
+ */ void (*disconnect) (shardno_t shard_no); } page_server_api; diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index 7a4c0ef487..54cacea984 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -405,6 +405,56 @@ compact_prefetch_buffers(void) return false; } +/* + * If there might be responses still in the TCP buffer, then + * we should try to use those, so as to reduce any TCP backpressure + * on the OS/PS side. + * + * This procedure handles that. + * + * Note that this is only valid as long as the only pipelined + * operations in the TCP buffer are getPage@Lsn requests. + */ +static void +prefetch_pump_state(void) +{ + while (MyPState->ring_receive != MyPState->ring_flush) + { + NeonResponse *response; + PrefetchRequest *slot; + MemoryContext old; + + slot = GetPrfSlot(MyPState->ring_receive); + + old = MemoryContextSwitchTo(MyPState->errctx); + response = page_server->try_receive(slot->shard_no); + MemoryContextSwitchTo(old); + + if (response == NULL) + break; + + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(slot->shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + + /* update prefetch state */ + MyPState->n_responses_buffered += 1; + MyPState->n_requests_inflight -= 1; + MyPState->ring_receive += 1; + MyNeonCounters->getpage_prefetches_buffered = + MyPState->n_responses_buffered; + + /* update slot state */ + slot->status = PRFS_RECEIVED; + slot->response = response; + } +} + void readahead_buffer_resize(int newsize, void *extra) { @@ -2808,6 +2858,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, MyPState->ring_last <= ring_index); } + prefetch_pump_state(); + return false; } @@ -2849,6 +2901,8 @@ neon_prefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) Assert(ring_index < MyPState->ring_unused && MyPState->ring_last <= ring_index); + prefetch_pump_state(); + return false; } #endif /* PG_MAJORVERSION_NUM < 17 */ @@ -2891,6 +2945,8 @@ neon_writeback(SMgrRelation reln, ForkNumber forknum, */ neon_log(SmgrTrace, "writeback noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwriteback(reln, forknum, blocknum, nblocks); @@ -3145,6 +3201,8 @@ neon_read(SMgrRelation reln, ForkNumber forkNum, BlockNumber blkno, void *buffer neon_get_request_lsns(InfoFromSMgrRel(reln), forkNum, blkno, &request_lsns, 1, NULL); neon_read_at_lsn(InfoFromSMgrRel(reln), forkNum, blkno, request_lsns, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3282,6 +3340,8 @@ neon_readv(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, neon_read_at_lsnv(InfoFromSMgrRel(reln), forknum, blocknum, request_lsns, buffers, nblocks, read); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (forkNum == MAIN_FORKNUM && IS_LOCAL_REL(reln)) { @@ -3450,6 +3510,8 @@ neon_write(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, const vo lfc_write(InfoFromSMgrRel(reln), forknum, blocknum, buffer); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) #if PG_MAJORVERSION_NUM >= 17 @@ -3503,6 +3565,8 @@ neon_writev(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, lfc_writev(InfoFromSMgrRel(reln), forknum, blkno, 
buffers, nblocks); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdwritev(reln, forknum, blocknum, &buffer, 1, skipFsync); @@ -3792,6 +3856,8 @@ neon_immedsync(SMgrRelation reln, ForkNumber forknum) neon_log(SmgrTrace, "[NEON_SMGR] immedsync noop"); + prefetch_pump_state(); + #ifdef DEBUG_COMPARE_LOCAL if (IS_LOCAL_REL(reln)) mdimmedsync(reln, forknum); From 6fe4c6798f76aaf103f1f44166970f4677ff6c9d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 16 Jan 2025 11:01:19 +0300 Subject: [PATCH 17/40] Add START_WAL_PUSH proto_version and allow_timeline_creation options. (#10406) ## Problem As part of https://github.com/neondatabase/neon/issues/8614 we need to pass options to START_WAL_PUSH. ## Summary of changes Add two options. `allow_timeline_creation`, default true, disables implicit timeline creation in the connection from compute. Eventually such creation will be forbidden completely, but as we migrate to configurations we need to support both: current mode and configurations enabled where creation by compute is disabled. `proto_version` specifies compute <-> sk protocol version. We have it currently in the first greeting package also, but I plan to change tag size from u64 to u8, which would make it hard to use. Command is more appropriate place for it anyway. --- safekeeper/src/handler.rs | 107 ++++++++++++++++-- safekeeper/src/receive_wal.rs | 57 +++++++--- safekeeper/src/safekeeper.rs | 11 +- .../tests/walproposer_sim/safekeeper.rs | 6 +- 4 files changed, 152 insertions(+), 29 deletions(-) diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index bb639bfb32..e77eeb4130 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -52,16 +52,70 @@ pub struct SafekeeperPostgresHandler { /// Parsed Postgres command. enum SafekeeperPostgresCommand { - StartWalPush, - StartReplication { start_lsn: Lsn, term: Option }, + StartWalPush { + proto_version: u32, + // Eventually timelines will be always created explicitly by storcon. + // This option allows legacy behaviour for compute to do that until we + // fully migrate. + allow_timeline_creation: bool, + }, + StartReplication { + start_lsn: Lsn, + term: Option, + }, IdentifySystem, TimelineStatus, - JSONCtrl { cmd: AppendLogicalMessage }, + JSONCtrl { + cmd: AppendLogicalMessage, + }, } fn parse_cmd(cmd: &str) -> anyhow::Result { if cmd.starts_with("START_WAL_PUSH") { - Ok(SafekeeperPostgresCommand::StartWalPush) + // Allow additional options in postgres START_REPLICATION style like + // START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false'). + // Parsing here is very naive and breaks in case of commas or + // whitespaces in values, but enough for our purposes. 
+ let re = Regex::new(r"START_WAL_PUSH(\s+?\((.*)\))?").unwrap(); + let caps = re + .captures(cmd) + .context(format!("failed to parse START_WAL_PUSH command {}", cmd))?; + // capture () content + let options = caps.get(2).map(|m| m.as_str()).unwrap_or(""); + // default values + let mut proto_version = 2; + let mut allow_timeline_creation = true; + for kvstr in options.split(",") { + if kvstr.is_empty() { + continue; + } + let mut kvit = kvstr.split_whitespace(); + let key = kvit.next().context(format!( + "failed to parse key in kv {} in command {}", + kvstr, cmd + ))?; + let value = kvit.next().context(format!( + "failed to parse value in kv {} in command {}", + kvstr, cmd + ))?; + let value_trimmed = value.trim_matches('\''); + if key == "proto_version" { + proto_version = value_trimmed.parse::().context(format!( + "failed to parse proto_version value {} in command {}", + value, cmd + ))?; + } + if key == "allow_timeline_creation" { + allow_timeline_creation = value_trimmed.parse::().context(format!( + "failed to parse allow_timeline_creation value {} in command {}", + value, cmd + ))?; + } + } + Ok(SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + }) } else if cmd.starts_with("START_REPLICATION") { let re = Regex::new( // We follow postgres START_REPLICATION LOGICAL options to pass term. @@ -95,7 +149,7 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { - SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", + SafekeeperPostgresCommand::StartWalPush { .. } => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", @@ -293,8 +347,11 @@ impl postgres_backend::Handler self.ttid = TenantTimelineId::new(tenant_id, timeline_id); match cmd { - SafekeeperPostgresCommand::StartWalPush => { - self.handle_start_wal_push(pgb) + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + self.handle_start_wal_push(pgb, proto_version, allow_timeline_creation) .instrument(info_span!("WAL receiver")) .await } @@ -467,3 +524,39 @@ impl SafekeeperPostgresHandler { } } } + +#[cfg(test)] +mod tests { + use super::SafekeeperPostgresCommand; + + /// Test parsing of START_WAL_PUSH command + #[test] + fn test_start_wal_push_parse() { + let cmd = "START_WAL_PUSH"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 2); + assert!(allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + + let cmd = + "START_WAL_PUSH (proto_version '3', allow_timeline_creation 'false', unknown 'hoho')"; + let parsed = super::parse_cmd(cmd).expect("failed to parse"); + match parsed { + SafekeeperPostgresCommand::StartWalPush { + proto_version, + allow_timeline_creation, + } => { + assert_eq!(proto_version, 3); + assert!(!allow_timeline_creation); + } + _ => panic!("unexpected command"), + } + } +} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index daaa8a253d..cb42f6f414 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -200,9 +200,14 @@ impl SafekeeperPostgresHandler { pub async fn handle_start_wal_push( &mut self, pgb: &mut PostgresBackend, + proto_version: u32, + allow_timeline_creation: bool, 
) -> Result<(), QueryError> { let mut tli: Option = None; - if let Err(end) = self.handle_start_wal_push_guts(pgb, &mut tli).await { + if let Err(end) = self + .handle_start_wal_push_guts(pgb, &mut tli, proto_version, allow_timeline_creation) + .await + { // Log the result and probably send it to the client, closing the stream. let handle_end_fut = pgb.handle_copy_stream_end(end); // If we managed to create the timeline, augment logging with current LSNs etc. @@ -222,6 +227,8 @@ impl SafekeeperPostgresHandler { &mut self, pgb: &mut PostgresBackend, tli: &mut Option, + proto_version: u32, + allow_timeline_creation: bool, ) -> Result<(), CopyStreamHandlerEnd> { // The `tli` parameter is only used for passing _out_ a timeline, one should // not have been passed in. @@ -250,12 +257,17 @@ impl SafekeeperPostgresHandler { conn_id: self.conn_id, pgb_reader: &mut pgb_reader, peer_addr, + proto_version, acceptor_handle: &mut acceptor_handle, global_timelines: self.global_timelines.clone(), }; - // Read first message and create timeline if needed. - let res = network_reader.read_first_message().await; + // Read first message and create timeline if needed and allowed. This + // won't be when timelines will be always created by storcon and + // allow_timeline_creation becomes false. + let res = network_reader + .read_first_message(allow_timeline_creation) + .await; let network_res = if let Ok((timeline, next_msg)) = res { let pageserver_feedback_rx: tokio::sync::broadcast::Receiver = @@ -313,6 +325,7 @@ struct NetworkReader<'a, IO> { conn_id: ConnectionId, pgb_reader: &'a mut PostgresBackendReader, peer_addr: SocketAddr, + proto_version: u32, // WalAcceptor is spawned when we learn server info from walproposer and // create timeline; handle is put here. acceptor_handle: &'a mut Option>>, @@ -322,9 +335,10 @@ struct NetworkReader<'a, IO> { impl NetworkReader<'_, IO> { async fn read_first_message( &mut self, + allow_timeline_creation: bool, ) -> Result<(WalResidentTimeline, ProposerAcceptorMessage), CopyStreamHandlerEnd> { // Receive information about server to create timeline, if not yet. - let next_msg = read_message(self.pgb_reader).await?; + let next_msg = read_message(self.pgb_reader, self.proto_version).await?; let tli = match next_msg { ProposerAcceptorMessage::Greeting(ref greeting) => { info!( @@ -336,17 +350,22 @@ impl NetworkReader<'_, IO> { system_id: greeting.system_id, wal_seg_size: greeting.wal_seg_size, }; - let tli = self - .global_timelines - .create( - self.ttid, - Configuration::empty(), - server_info, - Lsn::INVALID, - Lsn::INVALID, - ) - .await - .context("create timeline")?; + let tli = if allow_timeline_creation { + self.global_timelines + .create( + self.ttid, + Configuration::empty(), + server_info, + Lsn::INVALID, + Lsn::INVALID, + ) + .await + .context("create timeline")? + } else { + self.global_timelines + .get(self.ttid) + .context("get timeline")? + }; tli.wal_residence_guard().await? } _ => { @@ -375,7 +394,7 @@ impl NetworkReader<'_, IO> { )); // Forward all messages to WalAcceptor - read_network_loop(self.pgb_reader, msg_tx, next_msg).await + read_network_loop(self.pgb_reader, msg_tx, next_msg, self.proto_version).await } } @@ -383,9 +402,10 @@ impl NetworkReader<'_, IO> { /// TODO: Return Ok(None) on graceful termination. 
async fn read_message( pgb_reader: &mut PostgresBackendReader, + proto_version: u32, ) -> Result { let copy_data = pgb_reader.read_copy_message().await?; - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, proto_version)?; Ok(msg) } @@ -393,6 +413,7 @@ async fn read_network_loop( pgb_reader: &mut PostgresBackendReader, msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, + proto_version: u32, ) -> Result<(), CopyStreamHandlerEnd> { /// Threshold for logging slow WalAcceptor sends. const SLOW_THRESHOLD: Duration = Duration::from_secs(5); @@ -425,7 +446,7 @@ async fn read_network_loop( WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc(); WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64); - next_msg = read_message(pgb_reader).await?; + next_msg = read_message(pgb_reader, proto_version).await?; } } diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index 06403228e9..45e19c31b6 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -29,7 +29,7 @@ use utils::{ lsn::Lsn, }; -const SK_PROTOCOL_VERSION: u32 = 2; +pub const SK_PROTOCOL_VERSION: u32 = 2; pub const UNKNOWN_SERVER_VERSION: u32 = 0; #[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, PartialOrd, Ord)] @@ -317,7 +317,14 @@ pub enum ProposerAcceptorMessage { impl ProposerAcceptorMessage { /// Parse proposer message. - pub fn parse(msg_bytes: Bytes) -> Result { + pub fn parse(msg_bytes: Bytes, proto_version: u32) -> Result { + if proto_version != SK_PROTOCOL_VERSION { + bail!( + "incompatible protocol version {}, expected {}", + proto_version, + SK_PROTOCOL_VERSION + ); + } // xxx using Reader is inefficient but easy to work with bincode let mut stream = msg_bytes.reader(); // u64 is here to avoid padding; it will be removed once we stop packing C structs into the wire as is diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index e0d593851e..0023a4d22a 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -15,7 +15,9 @@ use desim::{ }; use http::Uri; use safekeeper::{ - safekeeper::{ProposerAcceptorMessage, SafeKeeper, UNKNOWN_SERVER_VERSION}, + safekeeper::{ + ProposerAcceptorMessage, SafeKeeper, SK_PROTOCOL_VERSION, UNKNOWN_SERVER_VERSION, + }, state::{TimelinePersistentState, TimelineState}, timeline::TimelineError, wal_storage::Storage, @@ -285,7 +287,7 @@ impl ConnState { bail!("finished processing START_REPLICATION") } - let msg = ProposerAcceptorMessage::parse(copy_data)?; + let msg = ProposerAcceptorMessage::parse(copy_data, SK_PROTOCOL_VERSION)?; debug!("got msg: {:?}", msg); self.process(msg, global) } else { From 7be971081aee245b0b78ff22766d8eeb020b66f2 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Thu, 16 Jan 2025 09:34:11 +0100 Subject: [PATCH 18/40] Make sure we request pages with a known-flushed LSN. (#10413) This should fix the largest source of flakyness of test_nbtree_pagesplit_cycleid. ## Problem https://github.com/neondatabase/neon/issues/10390 ## Summary of changes By using a guaranteed-flushed LSN, we ensure that PS won't have to wait forever. 
(If it does wait forever, we know the issue can't be with Compute's WAL) --- .../regress/test_nbtree_pagesplit_cycleid.py | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_nbtree_pagesplit_cycleid.py b/test_runner/regress/test_nbtree_pagesplit_cycleid.py index 558557aeba..32ec6fcb92 100644 --- a/test_runner/regress/test_nbtree_pagesplit_cycleid.py +++ b/test_runner/regress/test_nbtree_pagesplit_cycleid.py @@ -4,9 +4,19 @@ import time from fixtures.neon_fixtures import NeonEnv BTREE_NUM_CYCLEID_PAGES = """ - WITH raw_pages AS ( - SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, NULL, NULL) page - FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) blkno + WITH lsns AS ( + /* + * pg_switch_wal() ensures we have an LSN that + * 1. is after any previous modifications, but also, + * 2. (critically) is flushed, preventing any issues with waiting for + * unflushed WAL in PageServer. + */ + SELECT pg_switch_wal() as lsn + ), + raw_pages AS ( + SELECT blkno, get_raw_page_at_lsn('t_uidx', 'main', blkno, lsn, lsn) page + FROM generate_series(1, pg_relation_size('t_uidx'::regclass) / 8192) AS blkno, + lsns l(lsn) ), parsed_pages AS ( /* cycle ID is the last 2 bytes of the btree page */ @@ -36,7 +46,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("CREATE UNIQUE INDEX t_uidx ON t(id);") ses1.execute("INSERT INTO t (txt) SELECT i::text FROM generate_series(1, 2035) i;") - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert ( @@ -57,7 +66,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): ses1.execute("DELETE FROM t WHERE id <= 610;") # Flush wal, for checking purposes - ses1.execute("SELECT neon_xlogflush();") ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() assert len(pages) == 0, f"No back splits with cycle ID expected, got batches of {pages} instead" @@ -108,8 +116,6 @@ def test_nbtree_pagesplit_cycleid(neon_simple_env: NeonEnv): # unpin the btree page, allowing s3's vacuum to complete ses2.execute("FETCH ALL FROM foo;") ses2.execute("ROLLBACK;") - # flush WAL to make sure PS is up-to-date - ses1.execute("SELECT neon_xlogflush();") # check that our expectations are correct ses1.execute(BTREE_NUM_CYCLEID_PAGES) pages = ses1.fetchall() From 58f6af6c9ad5bbc6238999769c801cdc52495bac Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 16 Jan 2025 02:35:36 -0600 Subject: [PATCH 19/40] Clean up compute_ctl extension server code (#10417) --- compute_tools/src/http/routes/extension_server.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/compute_tools/src/http/routes/extension_server.rs b/compute_tools/src/http/routes/extension_server.rs index ee5bc675ba..5cc9b6d277 100644 --- a/compute_tools/src/http/routes/extension_server.rs +++ b/compute_tools/src/http/routes/extension_server.rs @@ -17,7 +17,8 @@ use crate::{ #[derive(Debug, Clone, Deserialize)] pub(in crate::http) struct ExtensionServerParams { - is_library: Option, + #[serde(default)] + is_library: bool, } /// Download a remote extension. 
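For context on the `#[serde(default)]` change above: an absent parameter now deserializes to `bool::default()` (i.e. `false`), which is what removes the `unwrap_or(false)` at the call site below. A standalone illustration (serde_json is used here only for brevity; the real parameters arrive via the query string):

```rust
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct ExtensionServerParams {
    #[serde(default)]
    is_library: bool,
}

fn main() {
    // Field absent: falls back to Default::default() == false.
    let p: ExtensionServerParams = serde_json::from_str("{}").unwrap();
    assert!(!p.is_library);

    // Field present: parsed normally.
    let p: ExtensionServerParams = serde_json::from_str(r#"{"is_library":true}"#).unwrap();
    assert!(p.is_library);
}
```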
@@ -51,7 +52,7 @@ pub(in crate::http) async fn download_extension(
     remote_extensions.get_ext(
         &filename,
-        params.is_library.unwrap_or(false),
+        params.is_library,
         &compute.build_tag,
         &compute.pgversion,
     )

From 86dbc44db12d1ebb5b9532a270fcfc68caffd8d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?=
Date: Thu, 16 Jan 2025 10:20:24 +0100
Subject: [PATCH 20/40] CI: Run check-codestyle-rust as part of
 pre-merge-checks (#10387)

## Problem

When multiple changes are grouped in a merge group to be merged as part of
the merge queue, the changes might individually pass `check-codestyle-rust`
but not in their combined form.

## Summary of changes

- Move `check-codestyle-rust` into a reusable workflow that is called from
  its previous location in `build_and_test.yml`, and additionally call it
  from `pre_merge_checks.yml`. The additional call does not run on ARM, only
  x86, to ensure the merge queue continues being responsive.
- Trigger `pre_merge_checks.yml` on PRs that change any of the workflows
  running in `pre_merge_checks.yml`, so that we get feedback on those early
  and not only after trying to merge those changes.
---
 .github/workflows/_check-codestyle-rust.yml | 91 +++++++++++++++++++++
 .github/workflows/build_and_test.yml        | 76 ++---------------
 .github/workflows/pre-merge-checks.yml      | 33 ++++++++
 3 files changed, 129 insertions(+), 71 deletions(-)
 create mode 100644 .github/workflows/_check-codestyle-rust.yml

diff --git a/.github/workflows/_check-codestyle-rust.yml b/.github/workflows/_check-codestyle-rust.yml
new file mode 100644
index 0000000000..cbc47c6406
--- /dev/null
+++ b/.github/workflows/_check-codestyle-rust.yml
@@ -0,0 +1,91 @@
+name: Check Codestyle Rust
+
+on:
+  workflow_call:
+    inputs:
+      build-tools-image:
+        description: "build-tools image"
+        required: true
+        type: string
+      archs:
+        description: "Json array of architectures to run on"
+        type: string
+
+
+defaults:
+  run:
+    shell: bash -euxo pipefail {0}
+
+jobs:
+  check-codestyle-rust:
+    strategy:
+      matrix:
+        arch: ${{ fromJson(inputs.archs) }}
+    runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }}
+
+    container:
+      image: ${{ inputs.build-tools-image }}
+      credentials:
+        username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}
+        password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }}
+      options: --init
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          submodules: true
+
+      - name: Cache cargo deps
+        uses: actions/cache@v4
+        with:
+          path: |
+            ~/.cargo/registry
+            !~/.cargo/registry/src
+            ~/.cargo/git
+            target
+          key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust
+
+      # Some of our rust modules use FFI and need those to be checked
+      - name: Get postgres headers
+        run: make postgres-headers -j$(nproc)
+
+      # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations.
+      # This will catch compiler & clippy warnings in all feature combinations.
+      # TODO: use cargo hack for build and test as well, but, that's quite expensive.
+      # NB: keep clippy args in sync with ./run_clippy.sh
+      #
+      # The only difference between "clippy --debug" and "clippy --release" is that in --release mode,
+      # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second
+      # time just for that, so skip "clippy --release".
+ - run: | + CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" + if [ "$CLIPPY_COMMON_ARGS" = "" ]; then + echo "No clippy args found in .neon_clippy_args" + exit 1 + fi + echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV + - name: Run cargo clippy (debug) + run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS + + - name: Check documentation generation + run: cargo doc --workspace --no-deps --document-private-items + env: + RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" + + # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run + - name: Check formatting + if: ${{ !cancelled() }} + run: cargo fmt --all -- --check + + # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci + - name: Check rust dependencies + if: ${{ !cancelled() }} + run: | + cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date + cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack + + # https://github.com/EmbarkStudios/cargo-deny + - name: Check rust licenses/bans/advisories/sources + if: ${{ !cancelled() }} + run: cargo deny check --hide-inclusion-graph diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 489a93f46d..9ec5273af7 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -164,77 +164,11 @@ jobs: check-codestyle-rust: needs: [ check-permissions, build-build-tools-image ] - strategy: - matrix: - arch: [ x64, arm64 ] - runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} - - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - with: - submodules: true - - - name: Cache cargo deps - uses: actions/cache@v4 - with: - path: | - ~/.cargo/registry - !~/.cargo/registry/src - ~/.cargo/git - target - key: v1-${{ runner.os }}-${{ runner.arch }}-cargo-${{ hashFiles('./Cargo.lock') }}-${{ hashFiles('./rust-toolchain.toml') }}-rust - - # Some of our rust modules use FFI and need those to be checked - - name: Get postgres headers - run: make postgres-headers -j$(nproc) - - # cargo hack runs the given cargo subcommand (clippy in this case) for all feature combinations. - # This will catch compiler & clippy warnings in all feature combinations. - # TODO: use cargo hack for build and test as well, but, that's quite expensive. - # NB: keep clippy args in sync with ./run_clippy.sh - # - # The only difference between "clippy --debug" and "clippy --release" is that in --release mode, - # #[cfg(debug_assertions)] blocks are not built. It's not worth building everything for second - # time just for that, so skip "clippy --release". 
- - run: | - CLIPPY_COMMON_ARGS="$( source .neon_clippy_args; echo "$CLIPPY_COMMON_ARGS")" - if [ "$CLIPPY_COMMON_ARGS" = "" ]; then - echo "No clippy args found in .neon_clippy_args" - exit 1 - fi - echo "CLIPPY_COMMON_ARGS=${CLIPPY_COMMON_ARGS}" >> $GITHUB_ENV - - name: Run cargo clippy (debug) - run: cargo hack --features default --ignore-unknown-features --feature-powerset clippy $CLIPPY_COMMON_ARGS - - - name: Check documentation generation - run: cargo doc --workspace --no-deps --document-private-items - env: - RUSTDOCFLAGS: "-Dwarnings -Arustdoc::private_intra_doc_links" - - # Use `${{ !cancelled() }}` to run quck tests after the longer clippy run - - name: Check formatting - if: ${{ !cancelled() }} - run: cargo fmt --all -- --check - - # https://github.com/facebookincubator/cargo-guppy/tree/bec4e0eb29dcd1faac70b1b5360267fc02bf830e/tools/cargo-hakari#2-keep-the-workspace-hack-up-to-date-in-ci - - name: Check rust dependencies - if: ${{ !cancelled() }} - run: | - cargo hakari generate --diff # workspace-hack Cargo.toml is up-to-date - cargo hakari manage-deps --dry-run # all workspace crates depend on workspace-hack - - # https://github.com/EmbarkStudios/cargo-deny - - name: Check rust licenses/bans/advisories/sources - if: ${{ !cancelled() }} - run: cargo deny check --hide-inclusion-graph + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + archs: '["x64", "arm64"]' + secrets: inherit build-and-test-locally: needs: [ tag, build-build-tools-image ] diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index b2e00d94f7..e6dfbaeed8 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,6 +1,12 @@ name: Pre-merge checks on: + pull_request: + paths: + - .github/workflows/_check-codestyle-python.yml + - .github/workflows/_check-codestyle-rust.yml + - .github/workflows/build-build-tools-image.yml + - .github/workflows/pre-merge-checks.yml merge_group: branches: - main @@ -17,8 +23,10 @@ jobs: runs-on: ubuntu-22.04 outputs: python-changed: ${{ steps.python-src.outputs.any_changed }} + rust-changed: ${{ steps.rust-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 id: python-src with: @@ -30,11 +38,25 @@ jobs: poetry.lock pyproject.toml + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: rust-src + with: + files: | + .github/workflows/_check-codestyle-rust.yml + .github/workflows/build-build-tools-image.yml + .github/workflows/pre-merge-checks.yml + **/**.rs + **/Cargo.toml + Cargo.toml + Cargo.lock + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + RUST_CHANGED_FILES: ${{ steps.rust-src.outputs.all_changed_files }} run: | echo "${PYTHON_CHANGED_FILES}" + echo "${RUST_CHANGED_FILES}" build-build-tools-image: if: needs.get-changed-files.outputs.python-changed == 'true' @@ -55,6 +77,16 @@ jobs: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm-x64 secrets: inherit + check-codestyle-rust: + if: needs.get-changed-files.outputs.rust-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-rust.yml + with: + # `-bookworm-x64` suffix should match the combination in `build-build-tools-image` + build-tools-image: ${{ 
needs.build-build-tools-image.outputs.image }}-bookworm-x64 + archs: '["x64"]' + secrets: inherit + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". # Currently we require 2 jobs (checks with exact name): # - conclusion @@ -67,6 +99,7 @@ jobs: needs: - get-changed-files - check-codestyle-python + - check-codestyle-rust runs-on: ubuntu-22.04 steps: - name: Create fake `neon-cloud-e2e` check From 21d7b6a258e16542cf025790072850e80e40fdff Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 16 Jan 2025 14:11:33 +0000 Subject: [PATCH 21/40] tests: refactor test_tenant_delete_races_timeline_creation (#10425) ## Problem Threads spawned in `test_tenant_delete_races_timeline_creation` are not joined before the test ends, and can generate `PytestUnhandledThreadExceptionWarning` in other tests. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-10419/12805365523/index.html#/testresult/53a72568acd04dbd ## Summary of changes - Wrap threads in ThreadPoolExecutor which will join them before the test ends - Remove a spurious deletion call -- the background thread doing deletion ought to succeed. --- test_runner/regress/test_tenant_delete.py | 71 ++++++++++++----------- 1 file changed, 38 insertions(+), 33 deletions(-) diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 48e55c1ab1..3720f653c5 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +from concurrent.futures import ThreadPoolExecutor from threading import Thread import pytest @@ -253,29 +254,8 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "pause")) def timeline_create(): - try: - ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) - raise RuntimeError("creation succeeded even though it shouldn't") - except ReadTimeout: - pass - - Thread(target=timeline_create).start() - - def hit_initdb_upload_failpoint(): - env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - - wait_until(hit_initdb_upload_failpoint) - - def creation_connection_timed_out(): - env.pageserver.assert_log_contains( - "POST.*/timeline.* request was dropped before completing" - ) - - # Wait so that we hit the timeout and the connection is dropped - # (But timeline creation still continues) - wait_until(creation_connection_timed_out) - - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + ps_http.timeline_create(env.pg_version, tenant_id, TimelineId.generate(), timeout=1) + raise RuntimeError("creation succeeded even though it shouldn't") def tenant_delete(): def tenant_delete_inner(): @@ -283,21 +263,46 @@ def test_tenant_delete_races_timeline_creation(neon_env_builder: NeonEnvBuilder) wait_until(tenant_delete_inner) - Thread(target=tenant_delete).start() + # We will spawn background threads for timeline creation and tenant deletion. They will both + # get blocked on our failpoint. 
+ with ThreadPoolExecutor(max_workers=1) as executor: + create_fut = executor.submit(timeline_create) - def deletion_arrived(): - env.pageserver.assert_log_contains( - f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" - ) + def hit_initdb_upload_failpoint(): + env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(deletion_arrived) + wait_until(hit_initdb_upload_failpoint) - ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + def creation_connection_timed_out(): + env.pageserver.assert_log_contains( + "POST.*/timeline.* request was dropped before completing" + ) - # Disable the failpoint and wait for deletion to finish - ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + # Wait so that we hit the timeout and the connection is dropped + # (But timeline creation still continues) + wait_until(creation_connection_timed_out) - ps_http.tenant_delete(tenant_id) + with pytest.raises(ReadTimeout): + # Our creation failed from the client's point of view. + create_fut.result() + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) + + delete_fut = executor.submit(tenant_delete) + + def deletion_arrived(): + env.pageserver.assert_log_contains( + f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" + ) + + wait_until(deletion_arrived) + + ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) + + # Disable the failpoint and wait for deletion to finish + ps_http.configure_failpoints((BEFORE_INITDB_UPLOAD_FAILPOINT, "off")) + + delete_fut.result() # Physical deletion should have happened assert_prefix_empty( From e436dcad57d22f6d5fcb84ff2118b5f2ec96656f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 16 Jan 2025 15:30:49 +0100 Subject: [PATCH 22/40] Rename "disabled" safekeeper scheduling policy to "pause" (#10410) Rename the safekeeper scheduling policy "disabled" to "pause". A rename was requested in https://github.com/neondatabase/neon/pull/10400#discussion_r1916259124, as the "disabled" policy is meant to be analogous to the "pause" policy for pageservers. Also simplify the `SkSchedulingPolicyArg::from_str` function, relying on the `from_str` implementation of `SkSchedulingPolicy`. Latter is used for the database format as well, so it is quite stable. If we ever want to change the UI, we'll need to duplicate the function again but this is cheap. 
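Since `SkSchedulingPolicy::from_str` now backs both the CLI argument and, via `From<SkSchedulingPolicy> for String`, the on-disk database format, the two conversions need to remain inverses. A hypothetical round-trip check (not part of the patch) makes the invariant explicit:

```rust
use std::str::FromStr;
use pageserver_api::controller_api::SkSchedulingPolicy;

#[test]
fn sk_scheduling_policy_roundtrip() {
    for policy in [
        SkSchedulingPolicy::Active,
        SkSchedulingPolicy::Pause,
        SkSchedulingPolicy::Decomissioned,
    ] {
        // "active" / "pause" / "decomissioned" must parse back to the same variant.
        let s = String::from(policy);
        assert_eq!(SkSchedulingPolicy::from_str(&s).unwrap(), policy);
    }
}
```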
--- control_plane/storcon_cli/src/main.rs | 9 +-------- libs/pageserver_api/src/controller_api.rs | 12 ++++++++---- .../down.sql | 2 ++ .../up.sql | 2 ++ test_runner/regress/test_storage_controller.py | 2 +- 5 files changed, 14 insertions(+), 13 deletions(-) create mode 100644 storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql create mode 100644 storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 96bfad4c86..d9b76b9600 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -298,14 +298,7 @@ impl FromStr for SkSchedulingPolicyArg { type Err = anyhow::Error; fn from_str(s: &str) -> Result { - match s { - "active" => Ok(Self(SkSchedulingPolicy::Active)), - "disabled" => Ok(Self(SkSchedulingPolicy::Disabled)), - "decomissioned" => Ok(Self(SkSchedulingPolicy::Decomissioned)), - _ => Err(anyhow::anyhow!( - "Unknown scheduling policy '{s}', try active,disabled,decomissioned" - )), - } + SkSchedulingPolicy::from_str(s).map(Self) } } diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 08d1fa55b9..78e080981a 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -324,7 +324,7 @@ impl From for String { #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum SkSchedulingPolicy { Active, - Disabled, + Pause, Decomissioned, } @@ -334,9 +334,13 @@ impl FromStr for SkSchedulingPolicy { fn from_str(s: &str) -> Result { Ok(match s { "active" => Self::Active, - "disabled" => Self::Disabled, + "pause" => Self::Pause, "decomissioned" => Self::Decomissioned, - _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")), + _ => { + return Err(anyhow::anyhow!( + "Unknown scheduling policy '{s}', try active,pause,decomissioned" + )) + } }) } } @@ -346,7 +350,7 @@ impl From for String { use SkSchedulingPolicy::*; match value { Active => "active", - Disabled => "disabled", + Pause => "pause", Decomissioned => "decomissioned", } .to_string() diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql new file mode 100644 index 0000000000..3c7126e343 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/down.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'disabled'; +UPDATE safekeepers SET scheduling_policy = 'disabled' WHERE scheduling_policy = 'pause'; diff --git a/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql new file mode 100644 index 0000000000..9ff75444f3 --- /dev/null +++ b/storage_controller/migrations/2025-01-15-181207_safekeepers_disabled_to_pause/up.sql @@ -0,0 +1,2 @@ +ALTER TABLE safekeepers ALTER COLUMN scheduling_policy SET DEFAULT 'pause'; +UPDATE safekeepers SET scheduling_policy = 'pause' WHERE scheduling_policy = 'disabled'; diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index b1e1fd81d6..ff479e8fe2 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3211,7 +3211,7 @@ def 
test_safekeeper_deployment_time_update(neon_env_builder: NeonEnvBuilder):
     # some small tests for the scheduling policy querying and returning APIs
     newest_info = target.get_safekeeper(inserted["id"])
     assert newest_info
-    assert newest_info["scheduling_policy"] == "Disabled"
+    assert newest_info["scheduling_policy"] == "Pause"
     target.safekeeper_scheduling_policy(inserted["id"], "Decomissioned")
     newest_info = target.get_safekeeper(inserted["id"])
     assert newest_info

From cccc1968487c1f29c6ed190a538178b7c79410ce Mon Sep 17 00:00:00 2001
From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com>
Date: Thu, 16 Jan 2025 10:33:37 -0500
Subject: [PATCH 23/40] refactor(pageserver): make partitioning an ArcSwap (#10377)

## Problem

gc-compaction needs the partitioning data to decide the job split. This
refactor allows the partitioning to be read concurrently while it is
being computed.

## Summary of changes

Make `partitioning` an ArcSwap so that others can access the
partitioning while we compute it. This fully eliminates the
`repartition is called concurrently` warning while gc-compaction is
going on.

---------

Signed-off-by: Alex Chi Z

---
 libs/utils/src/guard_arc_swap.rs             | 54 ++++++++++++++++++++
 libs/utils/src/lib.rs                        |  2 +
 pageserver/src/tenant/timeline.rs            | 26 +++++-----
 pageserver/src/tenant/timeline/compaction.rs |  7 +--
 4 files changed, 70 insertions(+), 19 deletions(-)
 create mode 100644 libs/utils/src/guard_arc_swap.rs

diff --git a/libs/utils/src/guard_arc_swap.rs b/libs/utils/src/guard_arc_swap.rs
new file mode 100644
index 0000000000..cec5202460
--- /dev/null
+++ b/libs/utils/src/guard_arc_swap.rs
@@ -0,0 +1,54 @@
+//! A wrapper around `ArcSwap` that ensures there is only one writer at a time and writes
+//! don't block reads.
+
+use arc_swap::ArcSwap;
+use std::sync::Arc;
+use tokio::sync::TryLockError;
+
+pub struct GuardArcSwap<T> {
+    inner: ArcSwap<T>,
+    guard: tokio::sync::Mutex<()>,
+}
+
+pub struct Guard<'a, T> {
+    _guard: tokio::sync::MutexGuard<'a, ()>,
+    inner: &'a ArcSwap<T>,
+}
+
+impl<T> GuardArcSwap<T> {
+    pub fn new(inner: T) -> Self {
+        Self {
+            inner: ArcSwap::new(Arc::new(inner)),
+            guard: tokio::sync::Mutex::new(()),
+        }
+    }
+
+    pub fn read(&self) -> Arc<T> {
+        self.inner.load_full()
+    }
+
+    pub async fn write_guard(&self) -> Guard<'_, T> {
+        Guard {
+            _guard: self.guard.lock().await,
+            inner: &self.inner,
+        }
+    }
+
+    pub fn try_write_guard(&self) -> Result<Guard<'_, T>, TryLockError> {
+        let guard = self.guard.try_lock()?;
+        Ok(Guard {
+            _guard: guard,
+            inner: &self.inner,
+        })
+    }
+}
+
+impl<T> Guard<'_, T> {
+    pub fn read(&self) -> Arc<T> {
+        self.inner.load_full()
+    }
+
+    pub fn write(&mut self, value: T) {
+        self.inner.store(Arc::new(value));
+    }
+}
diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 2c56dd750f..1fb18e9e9a 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -98,6 +98,8 @@ pub mod try_rcu;
 
 pub mod pprof;
 
+pub mod guard_arc_swap;
+
 // Re-export used in macro. Avoids adding git-version as dep in target crates.
#[doc(hidden)] pub use git_version; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d6ae11e67d..f24611e1d8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -51,7 +51,9 @@ use tokio::{ use tokio_util::sync::CancellationToken; use tracing::*; use utils::{ - fs_ext, pausable_failpoint, + fs_ext, + guard_arc_swap::GuardArcSwap, + pausable_failpoint, postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; @@ -353,8 +355,8 @@ pub struct Timeline { // though let's keep them both for better error visibility. pub initdb_lsn: Lsn, - /// When did we last calculate the partitioning? Make it pub to test cases. - pub(super) partitioning: tokio::sync::Mutex<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, + /// The repartitioning result. Allows a single writer and multiple readers. + pub(crate) partitioning: GuardArcSwap<((KeyPartitioning, SparseKeyPartitioning), Lsn)>, /// Configuration: how often should the partitioning be recalculated. repartition_threshold: u64, @@ -2340,7 +2342,8 @@ impl Timeline { // initial logical size is 0. LogicalSize::empty_initial() }, - partitioning: tokio::sync::Mutex::new(( + + partitioning: GuardArcSwap::new(( (KeyPartitioning::new(), KeyPartitioning::new().into_sparse()), Lsn(0), )), @@ -4028,18 +4031,15 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> Result<((KeyPartitioning, SparseKeyPartitioning), Lsn), CompactionError> { - let Ok(mut partitioning_guard) = self.partitioning.try_lock() else { + let Ok(mut guard) = self.partitioning.try_write_guard() else { // NB: there are two callers, one is the compaction task, of which there is only one per struct Tenant and hence Timeline. // The other is the initdb optimization in flush_frozen_layer, used by `boostrap_timeline`, which runs before `.activate()` // and hence before the compaction task starts. - // Note that there are a third "caller" that will take the `partitioning` lock. It is `gc_compaction_split_jobs` for - // gc-compaction where it uses the repartition data to determine the split jobs. In the future, it might use its own - // heuristics, but for now, we should allow concurrent access to it and let the caller retry compaction. return Err(CompactionError::Other(anyhow!( - "repartition() called concurrently, this is rare and a retry should be fine" + "repartition() called concurrently" ))); }; - let ((dense_partition, sparse_partition), partition_lsn) = &*partitioning_guard; + let ((dense_partition, sparse_partition), partition_lsn) = &*guard.read(); if lsn < *partition_lsn { return Err(CompactionError::Other(anyhow!( "repartition() called with LSN going backwards, this should not happen" @@ -4067,9 +4067,9 @@ impl Timeline { let sparse_partitioning = SparseKeyPartitioning { parts: vec![sparse_ks], }; // no partitioning for metadata keys for now - *partitioning_guard = ((dense_partitioning, sparse_partitioning), lsn); - - Ok((partitioning_guard.0.clone(), partitioning_guard.1)) + let result = ((dense_partitioning, sparse_partitioning), lsn); + guard.write(result.clone()); + Ok(result) } // Is it time to create a new image layer for the given partition? 
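For illustration, a minimal usage sketch of the new `GuardArcSwap` (not part of the patch; assumes the `utils` crate from above and a tokio runtime):

```rust
use utils::guard_arc_swap::GuardArcSwap;

#[tokio::main]
async fn main() {
    // Readers load a cheap Arc snapshot and never block, even mid-write.
    let partitioning = GuardArcSwap::new(0u64);
    assert_eq!(*partitioning.read(), 0);

    // At most one writer at a time: while a write guard is held, a second
    // writer gets TryLockError from try_write_guard() instead of blocking.
    let mut guard = partitioning.write_guard().await;
    assert!(partitioning.try_write_guard().is_err());
    guard.write(*guard.read() + 1);
    drop(guard);

    assert_eq!(*partitioning.read(), 1);
}
```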
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index 2042a18e96..06a21f6b3c 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -2146,12 +2146,7 @@ impl Timeline {
         let mut compact_jobs = Vec::new();
         // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning
         // by estimating the amount of files read for a compaction job. We should also partition on LSN.
-        let ((dense_ks, sparse_ks), _) = {
-            let Ok(partition) = self.partitioning.try_lock() else {
-                bail!("failed to acquire partition lock during gc-compaction");
-            };
-            partition.clone()
-        };
+        let ((dense_ks, sparse_ks), _) = self.partitioning.read().as_ref().clone();
         // Truncate the key range to be within user specified compaction range.
         fn truncate_to(
             source_start: &Key,

From 2e13a3aa7ab117870b0b46525f65aa5cff0c3e2c Mon Sep 17 00:00:00 2001
From: John Spray Date: Thu, 16 Jan 2025 16:56:44 +0000
Subject: [PATCH 24/40] storage controller: handle legacy TenantConf in consistency_check (#10422)

## Problem

We were comparing serialized configs from the database with serialized
configs from memory. If fields have been added to or removed from
`TenantConfig`, this generates spurious consistency errors. This is fine
in test environments, but limits the usefulness of this debug API in the
field.

Closes: https://github.com/neondatabase/neon/issues/10369

## Summary of changes

- Do a decode/encode cycle on the config before comparing it, so that it
  will have exactly the expected fields.

---
 storage_controller/src/service.rs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 1d85839881..f56b683b9f 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -5411,6 +5411,15 @@ impl Service {
         expect_shards.sort_by_key(|tsp| (tsp.tenant_id.clone(), tsp.shard_number, tsp.shard_count));
 
+        // Because JSON contents of persistent tenants might disagree with the fields in current `TenantConfig`
+        // definition, we will do an encode/decode cycle to ensure any legacy fields are dropped and any new
+        // fields are added, before doing a comparison.
+        for tsp in &mut persistent_shards {
+            let config: TenantConfig = serde_json::from_str(&tsp.config)
+                .map_err(|e| ApiError::InternalServerError(e.into()))?;
+            tsp.config = serde_json::to_string(&config).expect("Encoding config is infallible");
+        }
+
         if persistent_shards != expect_shards {
             tracing::error!("Consistency check failed on shards.");
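The decode/encode cycle above is a general normalization pattern; a minimal sketch with a hypothetical stand-in config type (not the actual `TenantConfig`; assumes serde with derive and serde_json):

```rust
use serde::{Deserialize, Serialize};

// Stand-in for TenantConfig: serde drops unknown JSON fields on decode and
// fills missing ones with defaults, so a decode/encode cycle canonicalizes.
#[derive(Serialize, Deserialize, Default, PartialEq, Debug)]
#[serde(default)]
struct Config {
    checkpoint_distance: Option<u64>,
}

fn main() -> anyhow::Result<()> {
    // A row persisted by an older build, with a since-removed field.
    let stored = r#"{"checkpoint_distance": 1024, "legacy_knob": true}"#;
    let decoded: Config = serde_json::from_str(stored)?;
    let canonical = serde_json::to_string(&decoded)?;
    // The legacy field is gone; comparing against freshly serialized
    // in-memory configs no longer produces spurious mismatches.
    assert_eq!(canonical, r#"{"checkpoint_distance":1024}"#);
    Ok(())
}
```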

From da1315479173bed01804f72ad93594dbf36ee979 Mon Sep 17 00:00:00 2001
From: John Spray Date: Thu, 16 Jan 2025 17:33:46 +0000
Subject: [PATCH 25/40] storcon: revise fill logic to prioritize AZ (#10411)

## Problem

Node fills were limited to moving (total shards / node_count) shards. In
systems that aren't perfectly balanced already, that leads us to skip
migrating some of the shards that belong on this node, generating work
for the optimizer later to gradually move them back.

## Summary of changes

- Where a shard has a preferred AZ and is currently attached outside
  this AZ, then always promote it during fill, irrespective of target
  fill count

---
 storage_controller/src/service.rs | 121 ++++++++++++++++++------------
 1 file changed, 71 insertions(+), 50 deletions(-)

diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index f56b683b9f..9ac9ee17ca 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -7279,19 +7279,14 @@ impl Service {
         Ok(())
     }
 
-    /// Create a node fill plan (pick secondaries to promote) that meets the following requirements:
-    /// 1. The node should be filled until it reaches the expected cluster average of
-    ///    attached shards. If there are not enough secondaries on the node, the plan stops early.
-    /// 2. Select tenant shards to promote such that the number of attached shards is balanced
-    ///    throughout the cluster. We achieve this by picking tenant shards from each node,
-    ///    starting from the ones with the largest number of attached shards, until the node
-    ///    reaches the expected cluster average.
-    /// 3. Avoid promoting more shards of the same tenant than required. The upper bound
-    ///    for the number of tenants from the same shard promoted to the node being filled is:
-    ///    shard count for the tenant divided by the number of nodes in the cluster.
+    /// Create a node fill plan (pick secondaries to promote), based on:
+    /// 1. Shards which have a secondary on this node, and this node is in their home AZ, and are currently attached to a node
+    ///    outside their home AZ, should be migrated back here.
+    /// 2. If after step 1 we have not migrated enough shards for this node to have its fair share of
+    ///    attached shards, we will promote more shards from the nodes with the most attached shards, unless
+    ///    those shards have a home AZ that doesn't match the node we're filling.
     fn fill_node_plan(&self, node_id: NodeId) -> Vec<TenantShardId> {
         let mut locked = self.inner.write().unwrap();
-        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
         let (nodes, tenants, _scheduler) = locked.parts_mut();
 
         let node_az = nodes
@@ -7300,53 +7295,79 @@ impl Service {
             .get_availability_zone_id()
             .clone();
 
-        let mut tids_by_node = tenants
-            .iter_mut()
-            .filter_map(|(tid, tenant_shard)| {
-                if !matches!(
-                    tenant_shard.get_scheduling_policy(),
-                    ShardSchedulingPolicy::Active
-                ) {
-                    // Only include tenants in fills if they have a normal (Active) scheduling policy. We
-                    // even exclude Essential, because moving to fill a node is not essential to keeping this
-                    // tenant available.
-                    return None;
-                }
+        // The tenant shard IDs that we plan to promote from secondary to attached on this node
+        let mut plan = Vec::new();

-                // AZ check: when filling nodes after a restart, our intent is to move _back_ the
-                // shards which belong on this node, not to promote shards whose scheduling preference
-                // would be on their currently attached node. So will avoid promoting shards whose
-                // home AZ doesn't match the AZ of the node we're filling.
-                match tenant_shard.preferred_az() {
-                    None => {
-                        // Shard doesn't have an AZ preference: it is elegible to be moved.
-                    }
-                    Some(az) if az == &node_az => {
-                        // This shard's home AZ is equal to the node we're filling: it is
-                        // elegible to be moved: fall through;
-                    }
-                    Some(_) => {
-                        // This shard's home AZ is somewhere other than the node we're filling:
-                        // do not include it in the fill plan.
-                        return None;
-                    }
-                }
+        // Collect shards which do not have a preferred AZ & are eligible for moving in stage 2
+        let mut free_tids_by_node: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
 
-                if tenant_shard.intent.get_secondary().contains(&node_id) {
+        // Don't respect AZ preferences if there is only one AZ. This comes up in tests, but it could
+        // conceivably come up in real life if deploying a single-AZ region intentionally.
+        let respect_azs = nodes
+            .values()
+            .map(|n| n.get_availability_zone_id())
+            .unique()
+            .count()
+            > 1;
+
+        // Step 1: collect all shards that we are required to migrate back to this node because their AZ preference
+        // requires it.
+        for (tsid, tenant_shard) in tenants {
+            if !tenant_shard.intent.get_secondary().contains(&node_id) {
+                // Shard doesn't have a secondary on this node, ignore it.
+                continue;
+            }
+
+            // AZ check: when filling nodes after a restart, our intent is to move _back_ the
+            // shards which belong on this node, not to promote shards whose scheduling preference
+            // would be on their currently attached node. So will avoid promoting shards whose
+            // home AZ doesn't match the AZ of the node we're filling.
+            match tenant_shard.preferred_az() {
+                _ if !respect_azs => {
                     if let Some(primary) = tenant_shard.intent.get_attached() {
-                        return Some((*primary, *tid));
+                        free_tids_by_node.entry(*primary).or_default().push(*tsid);
                     }
                 }
+                None => {
+                    // Shard doesn't have an AZ preference: it is eligible to be moved, but we
+                    // will only do so if our target shard count requires it.
+                    if let Some(primary) = tenant_shard.intent.get_attached() {
+                        free_tids_by_node.entry(*primary).or_default().push(*tsid);
+                    }
+                }
+                Some(az) if az == &node_az => {
+                    // This shard's home AZ is equal to the node we're filling: it should
+                    // be moved back to this node as part of filling, unless its currently
+                    // attached location is also in its home AZ.
+                    if let Some(primary) = tenant_shard.intent.get_attached() {
+                        if nodes
+                            .get(primary)
+                            .expect("referenced node must exist")
+                            .get_availability_zone_id()
+                            != tenant_shard
+                                .preferred_az()
+                                .expect("tenant must have an AZ preference")
+                        {
+                            plan.push(*tsid)
+                        }
+                    } else {
+                        plan.push(*tsid)
+                    }
+                }
+                Some(_) => {
+                    // This shard's home AZ is somewhere other than the node we're filling,
+                    // it may not be moved back to this node as part of filling. Ignore it
+                }
+            }
+        }
 
-            None
-        })
-        .into_group_map();
+        // Step 2: also promote any AZ-agnostic shards as required to achieve the target number of attachments
+        let fill_requirement = locked.scheduler.compute_fill_requirement(node_id);
 
         let expected_attached = locked.scheduler.expected_attached_shard_count();
         let nodes_by_load = locked.scheduler.nodes_by_attached_shard_count();
 
         let mut promoted_per_tenant: HashMap<TenantId, usize> = HashMap::new();
-        let mut plan = Vec::new();
 
         for (node_id, attached) in nodes_by_load {
             let available = locked.nodes.get(&node_id).is_some_and(|n| n.is_available());
@@ -7355,7 +7376,7 @@ impl Service {
             }
 
             if plan.len() >= fill_requirement
-                || tids_by_node.is_empty()
+                || free_tids_by_node.is_empty()
                 || attached <= expected_attached
             {
                 break;
@@ -7367,7 +7388,7 @@ impl Service {
             let mut remove_node = false;
             while take > 0 {
-                match tids_by_node.get_mut(&node_id) {
+                match free_tids_by_node.get_mut(&node_id) {
                     Some(tids) => match tids.pop() {
                         Some(tid) => {
                             let max_promote_for_tenant = std::cmp::max(
@@ -7393,7 +7414,7 @@ impl Service {
             }
 
             if remove_node {
-                tids_by_node.remove(&node_id);
+                free_tids_by_node.remove(&node_id);
             }
         }
 

From 3a285a046b03954eb4ceb706d045510c70a41905 Mon Sep 17 00:00:00 2001
From: Vlad Lazar Date: Thu, 16 Jan 2025 18:51:56 +0000
Subject: [PATCH 26/40] pageserver: include node id when subscribing to SK (#10432)

## Problem

All pageservers have the same application name, which makes it hard to
distinguish them.

## Summary of changes

Include the node id in the application name sent to the safekeeper.
This should give us more visibility in logs. There are a few metrics
that will increase in cardinality by `pageserver_count`, but that's
fine.

---
 .../src/tenant/timeline/walreceiver/walreceiver_connection.rs | 2 +-
 test_runner/regress/test_tenants.py                           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
index 129b987e57..01c272633c 100644
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -140,7 +140,7 @@ pub(super) async fn handle_walreceiver_connection(
     let (replication_client, connection) = {
         let mut config = wal_source_connconf.to_tokio_postgres_config();
-        config.application_name("pageserver");
+        config.application_name(format!("pageserver-{}", node.0).as_str());
         config.replication_mode(tokio_postgres::config::ReplicationMode::Physical);
         match time::timeout(connect_timeout, config.connect(postgres::NoTls)).await {
             Ok(client_and_conn) => client_and_conn?,
diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py
index d31901b384..b4c968b217 100644
--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -194,7 +194,7 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder):
     io_metrics = query_all_safekeepers(
         "safekeeper_pg_io_bytes_total",
         {
-            "app_name": "pageserver",
+            "app_name": f"pageserver-{env.pageserver.id}",
             "client_az": "test_ps_az",
             "dir": io_direction,
             "same_az": "false",

From 8f2ebc068420b62a9cd89292870b68bb0b74664c Mon Sep 17 00:00:00 2001
From: John Spray Date: Thu, 16 Jan 2025 19:00:16 +0000
Subject: [PATCH 27/40] tests: stabilize test_storage_controller_node_deletion (#10420)

## Problem

`test_storage_controller_node_deletion` sometimes failed because shards
were moving around during timeline creation, and neon_local isn't
tolerant of that. The movements were unexpected because the shards had
only just been created. This was a regression from #9916

Closes: #10383

## Summary of changes

- Make this test use multiple AZs -- this makes the storage controller's
  scheduling reliably stable

Why this works: in #9916, I made a simplifying assumption that we would
have multiple AZs to get nice stable scheduling -- it's much easier,
because each tenant has a well-defined primary+secondary location when
they have an AZ preference and nodes have different AZs.

Everything still works if you don't have multiple AZs, but you just have
this quirk that sometimes the optimizer can disagree with initial
scheduling, so once in a while a shard moves after being created --
annoying for tests, harmless IRL.

---
 test_runner/fixtures/neon_fixtures.py          | 14 ++++++++++++--
 test_runner/regress/test_storage_controller.py |  4 ++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index c3950e9bf7..a01cb47984 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -370,6 +370,7 @@ class NeonEnvBuilder:
         pageserver_config_override: str | Callable[[dict[str, Any]], None] | None = None,
         num_safekeepers: int = 1,
         num_pageservers: int = 1,
+        num_azs: int = 1,
         # Use non-standard SK ids to check for various parsing bugs
         safekeepers_id_start: int = 0,
         # fsync is disabled by default to make the tests go faster
@@ -401,6 +402,7 @@ class NeonEnvBuilder:
         self.pageserver_config_override = pageserver_config_override
         self.num_safekeepers = num_safekeepers
         self.num_pageservers = num_pageservers
+        self.num_azs = num_azs
         self.safekeepers_id_start = safekeepers_id_start
         self.safekeepers_enable_fsync = safekeepers_enable_fsync
         self.auth_enabled = auth_enabled
@@ -990,6 +992,7 @@ class NeonEnv:
         self.endpoints = EndpointFactory(self)
         self.safekeepers: list[Safekeeper] = []
         self.pageservers: list[NeonPageserver] = []
+        self.num_azs = config.num_azs
         self.broker = NeonBroker(self)
         self.pageserver_remote_storage = config.pageserver_remote_storage
         self.safekeepers_remote_storage = config.safekeepers_remote_storage
@@ -1090,14 +1093,21 @@ class NeonEnv:
             http=self.port_distributor.get_port(),
         )
 
+        # Availability zones may also be configured manually with `NeonEnvBuilder.pageserver_config_override`
+        if self.num_azs > 1:
+            # Round-robin assignment of AZ names like us-east-2a, us-east-2b, etc.
+ az_prefix = DEFAULT_AZ_ID[:-1] + availability_zone = f"{az_prefix}{chr(ord('a') + (ps_id - 1) % self.num_azs)}" + else: + availability_zone = DEFAULT_AZ_ID + ps_cfg: dict[str, Any] = { "id": ps_id, "listen_pg_addr": f"localhost:{pageserver_port.pg}", "listen_http_addr": f"localhost:{pageserver_port.http}", "pg_auth_type": pg_auth_type, "http_auth_type": http_auth_type, - # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` - "availability_zone": DEFAULT_AZ_ID, + "availability_zone": availability_zone, # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index ff479e8fe2..350fe31099 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2394,6 +2394,7 @@ def test_storage_controller_node_deletion( Test that deleting a node works & properly reschedules everything that was on the node. """ neon_env_builder.num_pageservers = 3 + neon_env_builder.num_azs = 3 env = neon_env_builder.init_configs() env.start() @@ -2407,6 +2408,9 @@ def test_storage_controller_node_deletion( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) + # Sanity check: initial creations should not leave the system in an unstable scheduling state + assert env.storage_controller.reconcile_all() == 0 + victim = env.pageservers[-1] # The procedure a human would follow is: From b0838a68e50e7caf3ae107cfa647093e0f62bb64 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 16 Jan 2025 13:49:04 -0600 Subject: [PATCH 28/40] Enable pgx_ulid on Postgres 17 (#10397) The extension now supports Postgres 17. The release also seems to be binary compatible with the previous version. Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 43 +++++++++++++++++++++++++++------ 1 file changed, 35 insertions(+), 8 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f56a8358d2..8c7200c5cb 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -995,24 +995,50 @@ RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04 ######################################################################################### # # Layer "pg-pgx-ulid-build" -# Compile "pgx_ulid" extension +# Compile "pgx_ulid" extension for v16 and below # ######################################################################################### FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION -# doesn't support v17 yet -# https://github.com/pksunkara/pgx_ulid/pull/52 -RUN case "${PG_VERSION}" in "v17") \ - echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ esac && \ wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.5.tar.gz -O pgx_ulid.tar.gz && \ - echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ + echo "9d1659a2da65af0133d5451c454de31b37364e3502087dadf579f790bc8bef17 pgx_ulid.tar.gz" | sha256sum --check && \ mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . 
&& \ - sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "^0.11.2"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ - echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/ulid.control + +######################################################################################### +# +# Layer "pg-pgx-ulid-pgrx12-build" +# Compile "pgx_ulid" extension for v17 and up +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-pgx-ulid-pgrx12-build +ARG PG_VERSION + +RUN case "${PG_VERSION}" in \ + "v17") \ + ;; \ + *) \ + echo "skipping the version of pgx_ulid for $PG_VERSION" && exit 0 \ + ;; \ + esac && \ + wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.2.0.tar.gz -O pgx_ulid.tar.gz && \ + echo "cef6a9a2e5e7bd1a10a18989286586ee9e6c1c06005a4055cff190de41bf3e9f pgx_ulid.tar.gz" | sha256sum --check && \ + mkdir pgx_ulid-src && cd pgx_ulid-src && tar xzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \ + sed -i 's/pgrx = "^0.12.7"/pgrx = { version = "0.12.9", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgx_ulid.control ######################################################################################### # @@ -1157,6 +1183,7 @@ COPY --from=timescaledb-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-hint-plan-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-cron-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-pgx-ulid-pgrx12-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-session-jwt-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/ From c47c5f4acefa1cd2edbe0f39dac8cefd2fc95284 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 16 Jan 2025 21:34:02 +0100 Subject: [PATCH 29/40] fix(page_service pipelining): tenant cannot shut down because gate kept open while flushing responses (#10386) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Refs - fixes https://github.com/neondatabase/neon/issues/10309 - fixup of batching design, first introduced in https://github.com/neondatabase/neon/pull/9851 - refinement of https://github.com/neondatabase/neon/pull/8339 # Problem `Tenant::shutdown` was occasionally taking many minutes (sometimes up to 20) in staging and prod if the `page_service_pipelining.mode="concurrent-futures"` is enabled. # Symptoms The issue happens during shard migration between pageservers. There is page_service unavailability and hence effectively downtime for customers in the following case: 1. The source (state `AttachedStale`) gets stuck in `Tenant::shutdown`, waiting for the gate to close. 2. Cplane/Storcon decides to transition the target `AttachedMulti` to `AttachedSingle`. 3. That transition comes with a bump of the generation number, causing the `PUT .../location_config` endpoint to do a full `Tenant::shutdown` / `Tenant::attach` cycle for the target location. 4. That `Tenant::shutdown` on the target gets stuck, waiting for the gate to close. 5. 
Eventually the gate closes (`close completed`), correlating with a
`page_service` connection handler logging that it's exiting because of a
network error (`Connection reset by peer` or `Broken pipe`).

While in (4):
- `Tenant::shutdown` is stuck waiting for all `Timeline::shutdown` calls to
  complete. So, really, this is a `Timeline::shutdown` bug.
- Retries from Cplane/Storcon to complete the above state transitions fail
  with errors related to the tenant mgr slot being in state
  `TenantSlot::InProgress`, the tenant state being `TenantState::Stopping`,
  the timelines being in `TimelineState::Stopping`, and the
  `Timeline::cancel` being cancelled.
- Existing (and/or new?) page_service connections log errors
  `error reading relation or page version: Not found: Timed out waiting 30s
  for tenant active state. Latest state: None`

# Root-Cause

After a lengthy investigation ([internal
write-up](https://www.notion.so/neondatabase/2025-01-09-batching-deadlock-Slow-Log-Analysis-in-Staging-176f189e00478050bc21c1a072157ca4?pvs=4))
I arrived at the following root cause.

The `spsc_fold` channel (`batch_tx`/`batch_rx`) that connects the Batcher
and Executor stages of the pipelined mode was storing a `Handle`, and thus
a `GateGuard`, of the Timeline that was failing to shut down. The design
assumption with pipelining was that this would always be a short, transient
state. However, that was incorrect: the Executor was stuck on
writing/flushing an earlier response into the connection to the client,
i.e., the socket write was slow because of TCP backpressure.

The probable scenario of how we end up in that case:
1. Compute backend process sends a continuous stream of getpage prefetch
   requests into the connection, but never reads the responses (why this
   happens: see Appendix section).
2. Batch N is processed by Batcher and Executor, up to the point where
   Executor starts flushing the response.
3. Batch N+1 is processed by Batcher and queued in the `spsc_fold`.
4. Executor is still waiting for batch N flush to finish.
5. Batcher eventually hits the `TimeoutReader` error (10min). From here on
   it waits on the `spsc_fold.send(Err(QueryError(TimeoutReader_error)))`,
   which will never finish because the batch already inside the `spsc_fold`
   is not being read by the Executor, because the Executor is still stuck
   in the flush. (This state is not observable at our default `info` log
   level.)
6. Eventually, the Compute backend process is killed (`close()` on the
   socket) or Compute as a whole gets killed (probably no clean TCP
   shutdown happening in that case).
7. Eventually, the Pageserver TCP stack learns about (6) through RST
   packets and the Executor's flush() call fails with an error.
8. The Executor exits, dropping `cancel_batcher` and its end of the
   spsc_fold. This wakes Batcher, causing the `spsc_fold.send` to fail.
   Batcher exits. The pipeline shuts down as intended. We return from
   `process_query` and log the `Connection reset by peer` or `Broken pipe`
   error.

The following diagram visualizes the wait-for graph at (5):

```mermaid
flowchart TD
    Batcher --spsc_fold.send(TimeoutReader_error)--> Executor
    Executor --flush batch N responses--> socket.write_end
    socket.write_end --wait for TCP window to move forward--> Compute
```
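The same wait-for cycle can be reproduced in miniature with a bounded channel standing in for the `spsc_fold` (a sketch with stand-in types, not the actual pageserver code; assumes tokio):

```rust
use tokio::sync::mpsc;
use tokio::time::{timeout, Duration};

#[tokio::main]
async fn main() {
    // Stand-in for the spsc_fold: bounded, and already holding batch N+1.
    let (batch_tx, batch_rx) = mpsc::channel::<&'static str>(1);
    batch_tx.send("batch N+1").await.unwrap();

    // Stand-in for the Executor: it owns batch_rx but never polls it,
    // because it is stuck "flushing" batch N (TCP backpressure).
    let executor = tokio::spawn(async move {
        let _keep_alive = batch_rx;
        std::future::pending::<()>().await
    });

    // Stand-in for the Batcher: sending the TimeoutReader error can never
    // complete -- this is the wait-for cycle from the diagram above.
    let stuck = timeout(Duration::from_millis(100), batch_tx.send("timeout error")).await;
    assert!(stuck.is_err(), "Batcher's send is stuck behind the unread batch");
    executor.abort();
}
```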
# Analysis

By holding the GateGuard inside the `spsc_fold` open, the pipelining
implementation violated the principle established in
(https://github.com/neondatabase/neon/pull/8339). That is, that `Handle`s
must only be held across an await point if that await point is sensitive
to the `Timeline::cancel` token.

In this case, we were holding the Handle inside the `spsc_fold` while
awaiting the `pgb_writer.flush()` future.

One may jump to the conclusion that we should simply peek into the
spsc_fold to get that Timeline cancel token and be sensitive to it during
flush, then. But that violates another principle of the design from
https://github.com/neondatabase/neon/pull/8339. That is, that the
page_service connection lifecycle and the Timeline lifecycles must be
completely decoupled. It must be possible to shut down one shard without
shutting down the page_service connection, because on that single
connection we might be serving other shards attached to this pageserver.
(The current compute client opens separate connections per shard, but
there are plans to change that.)

# Solution

This PR adds a `handle::WeakHandle` struct that does _not_ hold the
timeline gate open. It must be `upgrade()`d to get a `handle::Handle`.
That `handle::Handle` _does_ hold the timeline gate open.

The batch queued inside the `spsc_fold` only holds a `WeakHandle`. We only
upgrade it while calling into the various `handle_` methods, i.e., while
interacting with the `Timeline` via an upgraded `Handle`. All that code
has always been required to be (and is!) sensitive to `Timeline::cancel`,
and therefore we're guaranteed to bail from it quickly when
`Timeline::shutdown` starts. We will drop the `Handle` immediately, before
we start `pgb_writer.flush()`ing the responses, thereby letting go of our
hold on the GateGuard and allowing the timeline shutdown to complete while
the page_service handler remains intact.

# Code Changes

* Reproducer & Regression Test
  * Developed and proven to reproduce the issue in
    https://github.com/neondatabase/neon/pull/10399
  * Add a `Test` message to the pagestream protocol (`cfg(feature = "testing")`).
  * Drive-by minimal improvement to the parsing code: we now have a
    `PagestreamFeMessageTag`.
  * Refactor `pageserver/client` to allow sending and receiving
    `page_service` requests independently.
  * Add a Rust helper binary to produce situation (4) from above
    * Rationale: (4) and (5) are the same bug class; we're holding a gate
      open while `flush()`ing.
  * Add a Python regression test that uses the helper binary to
    demonstrate the problem.
* Fix
  * Introduce and use `WeakHandle` as explained earlier.
  * Replace the `shut_down` atomic with two enum states for `HandleInner`,
    wrapped in a `Mutex`.
  * To make `WeakHandle::upgrade()` and `Handle::downgrade()` cache-efficient:
    * Wrap the `Types::Timeline` in an `Arc`
    * Wrap the `GateGuard` in an `Arc`
    * The separate `Arc`s enable uncontended cloning of the timeline
      reference in `upgrade()` and `downgrade()`. If we instead
      `Arc::clone`d a single shared object, different connection handlers
      would be hitting the same cache line on every
      `upgrade()`/`downgrade()`, causing contention.
  * Please read the updated module-level comment in `mod handle` for details.

# Testing & Performance

The reproducer test that failed before the changes now passes, and
obviously other tests are passing as well. We'll do more testing in
staging, where the issue happens every ~4h if chaos migrations are enabled
in storcon.

Existing perf testing will be sufficient; no perf degradation is expected.
It's a few more allocations due to the added `Arc`s, but they're
low-frequency.
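For intuition, the weak/strong split can be sketched with plain `std::sync` types (an analogy, not the actual `timeline::handle` implementation):

```rust
use std::sync::{Arc, Weak};

struct Timeline; // stand-in: holding an Arc here ~ holding the gate open

fn main() {
    let strong: Arc<Timeline> = Arc::new(Timeline); // ~ handle::Handle
    let queued: Weak<Timeline> = Arc::downgrade(&strong); // ~ WeakHandle in the spsc_fold

    // Executor: upgrade only for the duration of one handle_* call...
    if let Some(handle) = queued.upgrade() {
        // ... serve the batch (cancellation-sensitive code) ...
        drop(handle); // ...and release before the potentially slow flush
    }

    // Shutdown: once all strong refs are gone, upgrades fail immediately,
    // so a batch parked in the spsc_fold cannot block Timeline::shutdown.
    drop(strong);
    assert!(queued.upgrade().is_none());
}
```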

# Appendix: Why Compute Sometimes Doesn't Read Responses

Remember, the whole problem surfaced because flush() was slow because
Compute was not reading responses. Why is that?

In short, the way the compute works, it only advances the page_service
protocol processing when it has an interest in data, i.e., when the
pagestore smgr is called to return pages. Thus, if compute issues a bunch
of requests as part of prefetch but then it turns out it can service the
query without reading those pages, it may very well happen that these
messages stay in the TCP connection until the next smgr read happens,
either in that session, or possibly in another session. If there are too
many unread responses in the TCP connection, the kernel on the Pageserver
side is going to backpressure into userspace, resulting in our stuck
flush().

All of this stems from the way vanilla Postgres does prefetching and
"async IO": it issues `fadvise()` to make the kernel do the IO in the
background, buffering results in the kernel page cache. It then consumes
the results through synchronous `read()` system calls, which hopefully
will be fast because of the `fadvise()`. If it turns out that some / all
of the prefetch results are not needed, Postgres will not be issuing those
`read()` system calls. The kernel will eventually react to that by reusing
page cache pages that hold completed prefetched data. Uncompleted prefetch
requests may or may not be processed -- it's up to the kernel.

In Neon, the smgr + Pageserver together take on the role of the kernel in
the above paragraphs. In the current implementation, all prefetches are
sent as GetPage requests to Pageserver. The responses are only processed
in the places where vanilla Postgres would do the synchronous `read()`
system call. If we never get to that, the responses are queued inside the
TCP connection, which, once buffers run full, will backpressure into
Pageserver's sending code, i.e., the `pgb_writer.flush()` that was the
root cause of the problems we're fixing in this PR.

---
 libs/pageserver_api/src/models.rs             | 246 +++++++--
 pageserver/Cargo.toml                         |   6 +-
 pageserver/client/Cargo.toml                  |   3 +
 pageserver/client/src/page_service.rs         | 139 ++++-
 .../src/bin/test_helper_slow_client_reads.rs  |  65 +++
 pageserver/src/metrics.rs                     |   2 +
 pageserver/src/page_service.rs                | 184 ++++++-
 .../src/tenant/remote_timeline_client.rs      |   6 +
 pageserver/src/tenant/timeline.rs             |   7 +-
 pageserver/src/tenant/timeline/handle.rs      | 518 ++++++++++++------
 pgxn/neon/pagestore_client.h                  |   4 +
 .../test_page_service_batching_regressions.py |  60 ++
 12 files changed, 966 insertions(+), 274 deletions(-)
 create mode 100644 pageserver/src/bin/test_helper_slow_client_reads.rs
 create mode 100644 test_runner/regress/test_page_service_batching_regressions.py

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 87e8df2ab6..c38af9cb80 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -33,7 +33,6 @@ use crate::{
     reltag::RelTag,
     shard::{ShardCount, ShardStripeSize, TenantShardId},
 };
-use anyhow::bail;
 use bytes::{Buf, BufMut, Bytes, BytesMut};
 
 /// The state of a tenant in this pageserver.
@@ -1400,6 +1399,8 @@ pub enum PagestreamFeMessage { GetPage(PagestreamGetPageRequest), DbSize(PagestreamDbSizeRequest), GetSlruSegment(PagestreamGetSlruSegmentRequest), + #[cfg(feature = "testing")] + Test(PagestreamTestRequest), } // Wrapped in libpq CopyData @@ -1411,6 +1412,22 @@ pub enum PagestreamBeMessage { Error(PagestreamErrorResponse), DbSize(PagestreamDbSizeResponse), GetSlruSegment(PagestreamGetSlruSegmentResponse), + #[cfg(feature = "testing")] + Test(PagestreamTestResponse), +} + +// Keep in sync with `pagestore_client.h` +#[repr(u8)] +enum PagestreamFeMessageTag { + Exists = 0, + Nblocks = 1, + GetPage = 2, + DbSize = 3, + GetSlruSegment = 4, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 99, } // Keep in sync with `pagestore_client.h` @@ -1422,7 +1439,28 @@ enum PagestreamBeMessageTag { Error = 103, DbSize = 104, GetSlruSegment = 105, + /* future tags above this line */ + /// For testing purposes, not available in production. + #[cfg(feature = "testing")] + Test = 199, } + +impl TryFrom for PagestreamFeMessageTag { + type Error = u8; + fn try_from(value: u8) -> Result { + match value { + 0 => Ok(PagestreamFeMessageTag::Exists), + 1 => Ok(PagestreamFeMessageTag::Nblocks), + 2 => Ok(PagestreamFeMessageTag::GetPage), + 3 => Ok(PagestreamFeMessageTag::DbSize), + 4 => Ok(PagestreamFeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 99 => Ok(PagestreamFeMessageTag::Test), + _ => Err(value), + } + } +} + impl TryFrom for PagestreamBeMessageTag { type Error = u8; fn try_from(value: u8) -> Result { @@ -1433,6 +1471,8 @@ impl TryFrom for PagestreamBeMessageTag { 103 => Ok(PagestreamBeMessageTag::Error), 104 => Ok(PagestreamBeMessageTag::DbSize), 105 => Ok(PagestreamBeMessageTag::GetSlruSegment), + #[cfg(feature = "testing")] + 199 => Ok(PagestreamBeMessageTag::Test), _ => Err(value), } } @@ -1550,6 +1590,20 @@ pub struct PagestreamDbSizeResponse { pub db_size: i64, } +#[cfg(feature = "testing")] +#[derive(Debug, PartialEq, Eq, Clone)] +pub struct PagestreamTestRequest { + pub hdr: PagestreamRequest, + pub batch_key: u64, + pub message: String, +} + +#[cfg(feature = "testing")] +#[derive(Debug)] +pub struct PagestreamTestResponse { + pub req: PagestreamTestRequest, +} + // This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields // that require pageserver-internal types. It is sufficient to get the total size. 
#[derive(Serialize, Deserialize, Debug)] @@ -1569,7 +1623,7 @@ impl PagestreamFeMessage { match self { Self::Exists(req) => { - bytes.put_u8(0); + bytes.put_u8(PagestreamFeMessageTag::Exists as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1580,7 +1634,7 @@ impl PagestreamFeMessage { } Self::Nblocks(req) => { - bytes.put_u8(1); + bytes.put_u8(PagestreamFeMessageTag::Nblocks as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1591,7 +1645,7 @@ impl PagestreamFeMessage { } Self::GetPage(req) => { - bytes.put_u8(2); + bytes.put_u8(PagestreamFeMessageTag::GetPage as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1603,7 +1657,7 @@ impl PagestreamFeMessage { } Self::DbSize(req) => { - bytes.put_u8(3); + bytes.put_u8(PagestreamFeMessageTag::DbSize as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); @@ -1611,13 +1665,24 @@ impl PagestreamFeMessage { } Self::GetSlruSegment(req) => { - bytes.put_u8(4); + bytes.put_u8(PagestreamFeMessageTag::GetSlruSegment as u8); bytes.put_u64(req.hdr.reqid); bytes.put_u64(req.hdr.request_lsn.0); bytes.put_u64(req.hdr.not_modified_since.0); bytes.put_u8(req.kind); bytes.put_u32(req.segno); } + #[cfg(feature = "testing")] + Self::Test(req) => { + bytes.put_u8(PagestreamFeMessageTag::Test as u8); + bytes.put_u64(req.hdr.reqid); + bytes.put_u64(req.hdr.request_lsn.0); + bytes.put_u64(req.hdr.not_modified_since.0); + bytes.put_u64(req.batch_key); + let message = req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } bytes.into() @@ -1645,56 +1710,66 @@ impl PagestreamFeMessage { ), }; - match msg_tag { - 0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, + match PagestreamFeMessageTag::try_from(msg_tag) + .map_err(|tag: u8| anyhow::anyhow!("invalid tag {tag}"))? 
+ { + PagestreamFeMessageTag::Exists => { + Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::Nblocks => { + Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + })) + } + PagestreamFeMessageTag::GetPage => { + Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + rel: RelTag { + spcnode: body.read_u32::()?, + dbnode: body.read_u32::()?, + relnode: body.read_u32::()?, + forknum: body.read_u8()?, + }, + blkno: body.read_u32::()?, + })) + } + PagestreamFeMessageTag::DbSize => { + Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - })), - 2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - rel: RelTag { - spcnode: body.read_u32::()?, - dbnode: body.read_u32::()?, - relnode: body.read_u32::()?, - forknum: body.read_u8()?, - }, - blkno: body.read_u32::()?, - })), - 3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest { - hdr: PagestreamRequest { - reqid, - request_lsn, - not_modified_since, - }, - dbnode: body.read_u32::()?, - })), - 4 => Ok(PagestreamFeMessage::GetSlruSegment( + })) + } + PagestreamFeMessageTag::GetSlruSegment => Ok(PagestreamFeMessage::GetSlruSegment( PagestreamGetSlruSegmentRequest { hdr: PagestreamRequest { reqid, @@ -1705,7 +1780,21 @@ impl PagestreamFeMessage { segno: body.read_u32::()?, }, )), - _ => bail!("unknown smgr message tag: {:?}", msg_tag), + #[cfg(feature = "testing")] + PagestreamFeMessageTag::Test => Ok(PagestreamFeMessage::Test(PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key: body.read_u64::()?, + message: { + let len = body.read_u64::()?; + let mut buf = vec![0; len as usize]; + body.read_exact(&mut buf)?; + String::from_utf8(buf)? 
+ }, + })), } } } @@ -1748,6 +1837,15 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } PagestreamProtocolVersion::V3 => { @@ -1816,6 +1914,18 @@ impl PagestreamBeMessage { bytes.put_u32((resp.segment.len() / BLCKSZ as usize) as u32); bytes.put(&resp.segment[..]); } + + #[cfg(feature = "testing")] + Self::Test(resp) => { + bytes.put_u8(Tag::Test as u8); + bytes.put_u64(resp.req.hdr.reqid); + bytes.put_u64(resp.req.hdr.request_lsn.0); + bytes.put_u64(resp.req.hdr.not_modified_since.0); + bytes.put_u64(resp.req.batch_key); + let message = resp.req.message.as_bytes(); + bytes.put_u64(message.len() as u64); + bytes.put_slice(message); + } } } } @@ -1958,6 +2068,28 @@ impl PagestreamBeMessage { segment: segment.into(), }) } + #[cfg(feature = "testing")] + Tag::Test => { + let reqid = buf.read_u64::()?; + let request_lsn = Lsn(buf.read_u64::()?); + let not_modified_since = Lsn(buf.read_u64::()?); + let batch_key = buf.read_u64::()?; + let len = buf.read_u64::()?; + let mut msg = vec![0; len as usize]; + buf.read_exact(&mut msg)?; + let message = String::from_utf8(msg)?; + Self::Test(PagestreamTestResponse { + req: PagestreamTestRequest { + hdr: PagestreamRequest { + reqid, + request_lsn, + not_modified_since, + }, + batch_key, + message, + }, + }) + } }; let remaining = buf.into_inner(); if !remaining.is_empty() { @@ -1977,6 +2109,8 @@ impl PagestreamBeMessage { Self::Error(_) => "Error", Self::DbSize(_) => "DbSize", Self::GetSlruSegment(_) => "GetSlruSegment", + #[cfg(feature = "testing")] + Self::Test(_) => "Test", } } } diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 9195951191..9c835c956b 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing", "pageserver_client/testing"] [dependencies] anyhow.workspace = true @@ -114,3 +114,7 @@ harness = false [[bench]] name = "upload_queue" harness = false + +[[bin]] +name = "test_helper_slow_client_reads" +required-features = [ "testing" ] diff --git a/pageserver/client/Cargo.toml b/pageserver/client/Cargo.toml index d9b36bf3d4..f582d307a7 100644 --- a/pageserver/client/Cargo.toml +++ b/pageserver/client/Cargo.toml @@ -4,6 +4,9 @@ version = "0.1.0" edition.workspace = true license.workspace = true +[features] +testing = [ "pageserver_api/testing" ] + [dependencies] pageserver_api.workspace = true thiserror.workspace = true diff --git a/pageserver/client/src/page_service.rs b/pageserver/client/src/page_service.rs index 207ec4166c..27280912b4 100644 --- a/pageserver/client/src/page_service.rs +++ b/pageserver/client/src/page_service.rs @@ -1,6 +1,9 @@ -use std::pin::Pin; +use std::sync::{Arc, Mutex}; -use futures::SinkExt; +use futures::{ + stream::{SplitSink, SplitStream}, + SinkExt, StreamExt, +}; use pageserver_api::{ models::{ PagestreamBeMessage, PagestreamFeMessage, PagestreamGetPageRequest, @@ -10,7 +13,6 @@ use pageserver_api::{ }; use tokio::task::JoinHandle; use tokio_postgres::CopyOutStream; -use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; use utils::{ id::{TenantId, TimelineId}, @@ -62,15 +64,28 @@ impl Client { .client .copy_both_simple(&format!("pagestream_v3 {tenant_id} {timeline_id}")) .await?; + let (sink, stream) = copy_both.split(); // TODO: actually support splitting of the CopyBothDuplex so the lock inside this split adaptor goes away. let Client { cancel_on_client_drop, conn_task, client: _, } = self; + let shared = Arc::new(Mutex::new(PagestreamShared::ConnTaskRunning( + ConnTaskRunning { + cancel_on_client_drop, + conn_task, + }, + ))); Ok(PagestreamClient { - copy_both: Box::pin(copy_both), - conn_task, - cancel_on_client_drop, + sink: PagestreamSender { + shared: shared.clone(), + sink, + }, + stream: PagestreamReceiver { + shared: shared.clone(), + stream, + }, + shared, }) } @@ -97,7 +112,28 @@ impl Client { /// Create using [`Client::pagestream`]. pub struct PagestreamClient { - copy_both: Pin>>, + shared: Arc>, + sink: PagestreamSender, + stream: PagestreamReceiver, +} + +pub struct PagestreamSender { + #[allow(dead_code)] + shared: Arc>, + sink: SplitSink, bytes::Bytes>, +} + +pub struct PagestreamReceiver { + #[allow(dead_code)] + shared: Arc>, + stream: SplitStream>, +} + +enum PagestreamShared { + ConnTaskRunning(ConnTaskRunning), + ConnTaskCancelledJoinHandleReturnedOrDropped, +} +struct ConnTaskRunning { cancel_on_client_drop: Option, conn_task: JoinHandle<()>, } @@ -110,11 +146,11 @@ pub struct RelTagBlockNo { impl PagestreamClient { pub async fn shutdown(self) { let Self { - copy_both, - cancel_on_client_drop: cancel_conn_task, - conn_task, - } = self; - // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`. + shared, + sink, + stream, + } = { self }; + // The `copy_both` split into `sink` and `stream` contains internal channel sender, the receiver of which is polled by `conn_task`. // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection. 
// (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56). // @@ -131,27 +167,77 @@ impl PagestreamClient { // // NB: page_service doesn't have a use case to exit the `pagestream` mode currently. // => https://github.com/neondatabase/neon/issues/6390 - let _ = cancel_conn_task.unwrap(); + let ConnTaskRunning { + cancel_on_client_drop, + conn_task, + } = { + let mut guard = shared.lock().unwrap(); + match std::mem::replace( + &mut *guard, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped, + ) { + PagestreamShared::ConnTaskRunning(conn_task_running) => conn_task_running, + PagestreamShared::ConnTaskCancelledJoinHandleReturnedOrDropped => unreachable!(), + } + }; + let _ = cancel_on_client_drop.unwrap(); conn_task.await.unwrap(); - drop(copy_both); + + // Now drop the split copy_both. + drop(sink); + drop(stream); + } + + pub fn split(self) -> (PagestreamSender, PagestreamReceiver) { + let Self { + shared: _, + sink, + stream, + } = self; + (sink, stream) } pub async fn getpage( &mut self, req: PagestreamGetPageRequest, ) -> anyhow::Result { - let req = PagestreamFeMessage::GetPage(req); - let req: bytes::Bytes = req.serialize(); - // let mut req = tokio_util::io::ReaderStream::new(&req); - let mut req = tokio_stream::once(Ok(req)); + self.getpage_send(req).await?; + self.getpage_recv().await + } - self.copy_both.send_all(&mut req).await?; + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.sink.getpage_send(req).await + } - let next: Option> = self.copy_both.next().await; + pub async fn getpage_recv(&mut self) -> anyhow::Result { + self.stream.getpage_recv().await + } +} + +impl PagestreamSender { + // TODO: maybe make this impl Sink instead for better composability? + pub async fn send(&mut self, msg: PagestreamFeMessage) -> anyhow::Result<()> { + let msg = msg.serialize(); + self.sink.send_all(&mut tokio_stream::once(Ok(msg))).await?; + Ok(()) + } + + pub async fn getpage_send(&mut self, req: PagestreamGetPageRequest) -> anyhow::Result<()> { + self.send(PagestreamFeMessage::GetPage(req)).await + } +} + +impl PagestreamReceiver { + // TODO: maybe make this impl Stream instead for better composability? 
+ pub async fn recv(&mut self) -> anyhow::Result { + let next: Option> = self.stream.next().await; let next: bytes::Bytes = next.unwrap()?; + PagestreamBeMessage::deserialize(next) + } - let msg = PagestreamBeMessage::deserialize(next)?; - match msg { + pub async fn getpage_recv(&mut self) -> anyhow::Result { + let next: PagestreamBeMessage = self.recv().await?; + match next { PagestreamBeMessage::GetPage(p) => Ok(p), PagestreamBeMessage::Error(e) => anyhow::bail!("Error: {:?}", e), PagestreamBeMessage::Exists(_) @@ -160,7 +246,14 @@ impl PagestreamClient { | PagestreamBeMessage::GetSlruSegment(_) => { anyhow::bail!( "unexpected be message kind in response to getpage request: {}", - msg.kind() + next.kind() + ) + } + #[cfg(feature = "testing")] + PagestreamBeMessage::Test(_) => { + anyhow::bail!( + "unexpected be message kind in response to getpage request: {}", + next.kind() ) } } diff --git a/pageserver/src/bin/test_helper_slow_client_reads.rs b/pageserver/src/bin/test_helper_slow_client_reads.rs new file mode 100644 index 0000000000..c1ce332b6c --- /dev/null +++ b/pageserver/src/bin/test_helper_slow_client_reads.rs @@ -0,0 +1,65 @@ +use std::{ + io::{stdin, stdout, Read, Write}, + time::Duration, +}; + +use clap::Parser; +use pageserver_api::models::{PagestreamRequest, PagestreamTestRequest}; +use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, +}; + +#[derive(clap::Parser)] +struct Args { + connstr: String, + tenant_id: TenantId, + timeline_id: TimelineId, +} + +#[tokio::main] +async fn main() -> anyhow::Result<()> { + let Args { + connstr, + tenant_id, + timeline_id, + } = Args::parse(); + let client = pageserver_client::page_service::Client::new(connstr).await?; + let client = client.pagestream(tenant_id, timeline_id).await?; + let (mut sender, _receiver) = client.split(); + + eprintln!("filling the pipe"); + let mut msg = 0; + loop { + msg += 1; + let fut = sender.send(pageserver_api::models::PagestreamFeMessage::Test( + PagestreamTestRequest { + hdr: PagestreamRequest { + reqid: 0, + request_lsn: Lsn(23), + not_modified_since: Lsn(23), + }, + batch_key: 42, + message: format!("message {}", msg), + }, + )); + let Ok(res) = tokio::time::timeout(Duration::from_secs(10), fut).await else { + eprintln!("pipe seems full"); + break; + }; + let _: () = res?; + } + + let n = stdout().write(b"R")?; + assert_eq!(n, 1); + stdout().flush()?; + + eprintln!("waiting for signal to tell us to exit"); + + let mut buf = [0u8; 1]; + stdin().read_exact(&mut buf)?; + + eprintln!("termination signal received, exiting"); + + anyhow::Ok(()) +} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 5b1cbbad63..3c4830e3cd 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1463,6 +1463,8 @@ pub enum SmgrQueryType { GetPageAtLsn, GetDbSize, GetSlruSegment, + #[cfg(feature = "testing")] + Test, } pub(crate) struct SmgrQueryTimePerTimeline { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index da4180a927..b14a44f9e3 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -555,37 +555,52 @@ struct BatchedGetPageRequest { timer: SmgrOpTimer, } +#[cfg(feature = "testing")] +struct BatchedTestRequest { + req: models::PagestreamTestRequest, + timer: SmgrOpTimer, +} + +/// NB: we only hold [`timeline::handle::WeakHandle`] inside this enum, +/// so that we don't keep the [`Timeline::gate`] open while the batch +/// is being built up inside the [`spsc_fold`] (pagestream pipelining). 
enum BatchedFeMessage { Exists { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamNblocksRequest, }, GetPage { span: Span, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, effective_request_lsn: Lsn, pages: smallvec::SmallVec<[BatchedGetPageRequest; 1]>, }, DbSize { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, timer: SmgrOpTimer, - shard: timeline::handle::Handle, + shard: timeline::handle::WeakHandle, req: models::PagestreamGetSlruSegmentRequest, }, + #[cfg(feature = "testing")] + Test { + span: Span, + shard: timeline::handle::WeakHandle, + requests: Vec, + }, RespondError { span: Span, error: BatchedPageStreamError, @@ -606,6 +621,12 @@ impl BatchedFeMessage { page.timer.observe_execution_start(at); } } + #[cfg(feature = "testing")] + BatchedFeMessage::Test { requests, .. } => { + for req in requests { + req.timer.observe_execution_start(at); + } + } BatchedFeMessage::RespondError { .. } => {} } } @@ -735,7 +756,7 @@ impl PageServerHandler { BatchedFeMessage::Exists { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -754,7 +775,7 @@ impl PageServerHandler { BatchedFeMessage::Nblocks { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -773,7 +794,7 @@ impl PageServerHandler { BatchedFeMessage::DbSize { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -792,7 +813,7 @@ impl PageServerHandler { BatchedFeMessage::GetSlruSegment { span, timer, - shard, + shard: shard.downgrade(), req, } } @@ -844,6 +865,7 @@ impl PageServerHandler { ) .await?; + // We're holding the Handle let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, req.hdr.request_lsn, @@ -861,11 +883,27 @@ impl PageServerHandler { }; BatchedFeMessage::GetPage { span, - shard, + shard: shard.downgrade(), effective_request_lsn, pages: smallvec::smallvec![BatchedGetPageRequest { req, timer }], } } + #[cfg(feature = "testing")] + PagestreamFeMessage::Test(req) => { + let span = tracing::info_span!(parent: parent_span, "handle_test_request"); + let shard = timeline_handles + .get(tenant_id, timeline_id, ShardSelector::Zero) + .instrument(span.clone()) // sets `shard_id` field + .await?; + let timer = + record_op_start_and_throttle(&shard, metrics::SmgrQueryType::Test, received_at) + .await?; + BatchedFeMessage::Test { + span, + shard: shard.downgrade(), + requests: vec![BatchedTestRequest { req, timer }], + } + } }; Ok(Some(batched_msg)) } @@ -907,9 +945,7 @@ impl PageServerHandler { assert_eq!(accum_pages.len(), max_batch_size.get()); return false; } - if (accum_shard.tenant_shard_id, accum_shard.timeline_id) - != (this_shard.tenant_shard_id, this_shard.timeline_id) - { + if !accum_shard.is_same_handle_as(&this_shard) { trace!(%accum_lsn, %this_lsn, "stopping batching because timeline object mismatch"); // TODO: we _could_ batch & execute each shard seperately (and in parallel). // But the current logic for keeping responses in order does not support that. @@ -928,6 +964,44 @@ impl PageServerHandler { accum_pages.extend(this_pages); Ok(()) } + #[cfg(feature = "testing")] + ( + Ok(BatchedFeMessage::Test { + shard: accum_shard, + requests: accum_requests, + .. 
+ }),
+ BatchedFeMessage::Test {
+ shard: this_shard,
+ requests: this_requests,
+ ..
+ },
+ ) if (|| {
+ assert!(this_requests.len() == 1);
+ if accum_requests.len() >= max_batch_size.get() {
+ trace!(%max_batch_size, "stopping batching because of batch size");
+ assert_eq!(accum_requests.len(), max_batch_size.get());
+ return false;
+ }
+ if !accum_shard.is_same_handle_as(&this_shard) {
+ trace!("stopping batching because timeline object mismatch");
+ // TODO: we _could_ batch & execute each shard separately (and in parallel).
+ // But the current logic for keeping responses in order does not support that.
+ return false;
+ }
+ let this_batch_key = this_requests[0].req.batch_key;
+ let accum_batch_key = accum_requests[0].req.batch_key;
+ if this_batch_key != accum_batch_key {
+ trace!(%accum_batch_key, %this_batch_key, "stopping batching because batch key changed");
+ return false;
+ }
+ true
+ })() =>
+ {
+ // ok to batch
+ accum_requests.extend(this_requests);
+ Ok(())
+ }
 // something batched already but this message is unbatchable
 (_, this_msg) => {
 // by default, don't continue batching
@@ -969,7 +1043,7 @@ impl PageServerHandler {
 fail::fail_point!("ps::handle-pagerequest-message::exists");
 (
 vec![self
- .handle_get_rel_exists_request(&shard, &req, ctx)
+ .handle_get_rel_exists_request(&*shard.upgrade()?, &req, ctx)
 .instrument(span.clone())
 .await
 .map(|msg| (msg, timer))
@@ -986,7 +1060,7 @@ impl PageServerHandler {
 fail::fail_point!("ps::handle-pagerequest-message::nblocks");
 (
 vec![self
- .handle_get_nblocks_request(&shard, &req, ctx)
+ .handle_get_nblocks_request(&*shard.upgrade()?, &req, ctx)
 .instrument(span.clone())
 .await
 .map(|msg| (msg, timer))
@@ -1007,7 +1081,7 @@ impl PageServerHandler {
 trace!(npages, "handling getpage request");
 let res = self
 .handle_get_page_at_lsn_request_batched(
- &shard,
+ &*shard.upgrade()?,
 effective_request_lsn,
 pages,
 ctx,
@@ -1029,7 +1103,7 @@ impl PageServerHandler {
 fail::fail_point!("ps::handle-pagerequest-message::dbsize");
 (
 vec![self
- .handle_db_size_request(&shard, &req, ctx)
+ .handle_db_size_request(&*shard.upgrade()?, &req, ctx)
 .instrument(span.clone())
 .await
 .map(|msg| (msg, timer))
@@ -1046,7 +1120,7 @@ impl PageServerHandler {
 fail::fail_point!("ps::handle-pagerequest-message::slrusegment");
 (
 vec![self
- .handle_get_slru_segment_request(&shard, &req, ctx)
+ .handle_get_slru_segment_request(&*shard.upgrade()?, &req, ctx)
 .instrument(span.clone())
 .await
 .map(|msg| (msg, timer))
@@ -1054,6 +1128,27 @@ impl PageServerHandler {
 span,
 )
 }
+ #[cfg(feature = "testing")]
+ BatchedFeMessage::Test {
+ span,
+ shard,
+ requests,
+ } => {
+ fail::fail_point!("ps::handle-pagerequest-message::test");
+ (
+ {
+ let npages = requests.len();
+ trace!(npages, "handling test request batch");
+ let res = self
+ .handle_test_request_batch(&*shard.upgrade()?, requests, ctx)
+ .instrument(span.clone())
+ .await;
+ assert_eq!(res.len(), npages);
+ res
+ },
+ span,
+ )
+ }
 BatchedFeMessage::RespondError { span, error } => {
 // We've already decided to respond with an error, so we don't need to
 // call the handler.
@@ -1791,6 +1886,51 @@ impl PageServerHandler {
 ))
 }

+ // NB: this impl mimics what we do for batched getpage requests. 
+ #[cfg(feature = "testing")] + #[instrument(skip_all, fields(shard_id))] + async fn handle_test_request_batch( + &mut self, + timeline: &Timeline, + requests: Vec, + _ctx: &RequestContext, + ) -> Vec> { + // real requests would do something with the timeline + let mut results = Vec::with_capacity(requests.len()); + for _req in requests.iter() { + tokio::task::yield_now().await; + + results.push({ + if timeline.cancel.is_cancelled() { + Err(PageReconstructError::Cancelled) + } else { + Ok(()) + } + }); + } + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|(req, res)| { + res.map(|()| { + ( + PagestreamBeMessage::Test(models::PagestreamTestResponse { + req: req.req.clone(), + }), + req.timer, + ) + }) + .map_err(|e| BatchedPageStreamError { + err: PageStreamError::from(e), + req: req.req.hdr, + }) + }), + ) + } + /// Note on "fullbackup": /// Full basebackups should only be used for debugging purposes. /// Originally, it was introduced to enable breaking storage format changes, @@ -2406,6 +2546,14 @@ impl From for QueryError { } } +impl From for QueryError { + fn from(e: crate::tenant::timeline::handle::HandleUpgradeError) -> Self { + match e { + crate::tenant::timeline::handle::HandleUpgradeError::ShutDown => QueryError::Shutdown, + } + } +} + fn set_tracing_field_shard_id(timeline: &Timeline) { debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id(); tracing::Span::current().record( diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 47c4a8637d..a006647785 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -382,6 +382,12 @@ pub(crate) struct RemoteTimelineClient { cancel: CancellationToken, } +impl Drop for RemoteTimelineClient { + fn drop(&mut self) { + debug!("dropping RemoteTimelineClient"); + } +} + impl RemoteTimelineClient { /// /// Create a remote storage client for given timeline diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f24611e1d8..2ba71416b8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -76,6 +76,7 @@ use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, + page_service::TenantManagerTypes, tenant::{ config::AttachmentMode, layer_map::{LayerMap, SearchResult}, @@ -431,7 +432,7 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, - pub(crate) handles: handle::PerTimelineState, + pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, @@ -4625,6 +4626,10 @@ impl Drop for Timeline { } } } + info!( + "Timeline {} for tenant {} is being dropped", + self.timeline_id, self.tenant_shard_id.tenant_id + ); } } diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index e82559b8b3..35d8c75ce1 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -32,54 +32,151 @@ //! //! # Design //! +//! ## Data Structures +//! //! There are three user-facing data structures: //! - `PerTimelineState`: a struct embedded into each Timeline struct. Lifetime == Timeline lifetime. //! - `Cache`: a struct private to each connection handler; Lifetime == connection lifetime. //! - `Handle`: a smart pointer that holds the Timeline gate open and derefs to `&Timeline`. -//! 
Lifetime: for a single request dispatch on the Timeline (i.e., one getpage request)
+//! - `WeakHandle`: downgrade of a `Handle` that does not keep the gate open, but allows
+//! trying to upgrade back to a `Handle`, guaranteeing it's the same `Timeline` *object*.
 //!
-//! The `Handle` is just a wrapper around an `Arc`.
+//! Internally, there is 0 or 1 `HandleInner` per `(Cache,Timeline)`.
+//! Since Cache:Connection is 1:1, there is 0 or 1 `HandleInner` per `(Connection,Timeline)`.
 //!
-//! There is one long-lived `Arc`, which is stored in the `PerTimelineState`.
-//! The `Cache` stores a `Weak` for each cached Timeline.
+//! The `HandleInner` is allocated as an `Arc>` and
+//! referenced weakly and strongly from various places which we are now illustrating.
+//! For brevity, we will omit the `Arc>` part in the following and instead
+//! use `strong ref` and `weak ref` when referring to the `Arc>`
+//! or `Weak>`, respectively.
+//!
+//! - The `Handle` is a strong ref.
+//! - The `WeakHandle` is a weak ref.
+//! - The `PerTimelineState` contains a `HashMap`.
+//! - The `Cache` is a `HashMap`.
+//!
+//! Lifetimes:
+//! - `WeakHandle` and `Handle`: single pagestream request.
+//! - `Cache`: single page service connection.
+//! - `PerTimelineState`: lifetime of the Timeline object (i.e., till `Timeline::shutdown`).
+//!
+//! ## Request Handling Flow (= filling and using the `Cache`)
 //!
 //! To dispatch a request, the page service connection calls `Cache::get`.
 //!
 //! A cache miss means we consult the tenant manager for shard routing,
-//! resulting in an `Arc`. We enter its gate _once_ and construct an
-//! `Arc`. We store a `Weak` in the cache
-//! and the `Arc` in the `PerTimelineState`.
+//! resulting in an `Arc`. We enter its gate _once_ and store it in the
+//! `Arc>>`. A weak ref is stored in the `Cache`
+//! and a strong ref in the `PerTimelineState`.
+//! A strong ref is returned wrapped in a `Handle`.
 //!
 //! For subsequent requests, `Cache::get` will perform a "fast path" shard routing
-//! and find the `Weak` in the cache.
-//! We upgrade the `Weak` to an `Arc` and wrap it in the user-facing `Handle` type.
+//! and find the weak ref in the cache.
+//! We upgrade the weak ref to a strong ref and return it wrapped in a `Handle`.
 //!
-//! The request handler dispatches the request to the right `>::$request_method`.
+//! The pagestream processing is pipelined and involves a batching step.
+//! While a request is batching, the `Handle` is downgraded to a `WeakHandle`.
+//! When the batch is ready to be executed, the `WeakHandle` is upgraded back to a `Handle`
+//! and the request handler dispatches the request to the right `>::$request_method`.
 //! It then drops the `Handle`, which drops the `Arc`.
 //!
-//! # Memory Management / How The Reference Cycle Is Broken
+//! # Performance
 //!
-//! The attentive reader may have noticed the strong reference cycle
-//! from `Arc` to `PerTimelineState` to `Arc`.
+//! Remember from the introductory section:
 //!
-//! This cycle is intentional: while it exists, the `Cache` can upgrade its
-//! `Weak` to an `Arc` in a single atomic operation.
+//! > However, we want to avoid the overhead of entering the gate for every
+//! > method invocation.
+//!
+//! Why do we want to avoid that?
+//! Because the gate is a shared location in memory and entering it involves
+//! bumping refcounts, which leads to cache contention if done frequently
+//! from multiple cores in parallel.
+//!
+//! 
So, we only acquire the `GateGuard` once on `Cache` miss, and wrap it in an `Arc`.
+//! That `Arc` is private to the `HandleInner` and hence to the connection.
+//! (Review the "Data Structures" section if that is unclear to you.)
+//!
+//! A `WeakHandle` is a weak ref to the `HandleInner`.
+//! When upgrading a `WeakHandle`, we upgrade to a strong ref to the `HandleInner` and
+//! further acquire an additional strong ref to the `Arc` inside it.
+//! Again, this manipulation of ref counts is cheap because `Arc` is private to the connection.
+//!
+//! When downgrading a `Handle` to a `WeakHandle`, we drop the `Arc`.
+//! Again, this is cheap because the `Arc` is private to the connection.
+//!
+//! In addition to the GateGuard, we need to provide `Deref` impl.
+//! For this, the `Handle` needs infallible access to an `Arc`.
+//! We could clone the `Arc` when upgrading a `WeakHandle`, but that would cause contention
+//! on the shared memory location that tracks the refcount of the `Arc`.
+//! Instead, we wrap the `Arc` into another `Arc`
+//! so that we can clone it cheaply when upgrading a `WeakHandle`.
+//!
+//! # Shutdown
+//!
+//! The attentive reader may have noticed the following reference cycle around the `Arc`:
+//!
+//! ```text
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> Timeline
+//! ```
+//!
+//! Further, there is this cycle:
+//!
+//! ```text
+//! Timeline --owns--> PerTimelineState --strong--> HandleInner --strong--> GateGuard --keepalive--> Timeline
+//! ```
+//!
+//! The former cycle is a memory leak if not broken.
+//! The latter cycle further prevents the Timeline from shutting down
+//! because we certainly won't drop the Timeline while the GateGuard is alive.
+//! Preventing shutdown is the whole point of this handle/cache system,
+//! but when the Timeline needs to shut down, we need to break the cycle.
 //!
 //! The cycle is broken by either
-//! - `PerTimelineState::shutdown` or
-//! - dropping the `Cache`.
+//! - Timeline shutdown (=> `PerTimelineState::shutdown`)
+//! - Connection shutdown (=> dropping the `Cache`).
 //!
-//! Concurrently existing `Handle`s will extend the existence of the cycle.
+//! Both transition the `HandleInner` from [`HandleInner::KeepingTimelineGateOpen`] to
+//! [`HandleInner::ShutDown`], which drops the only long-lived strong ref to the
+//! `Arc`.
+//!
+//! `PerTimelineState::shutdown` drops all the `HandleInners` it contains,
+//! thereby breaking the cycle.
+//! It also initiates draining of already existing `Handle`s by
+//! poisoning things so that no new `HandleInner`s can be added
+//! to the `PerTimelineState`, which will make subsequent `Cache::get` fail.
+//!
+//! Concurrently existing / already upgraded `Handle`s will extend the
+//! lifetime of the `Arc>` and hence cycles.
 //! However, since `Handle`s are short-lived and new `Handle`s are not
-//! handed out after either `PerTimelineState::shutdown` or `Cache` drop,
-//! that extension of the cycle is bounded.
+//! handed out from `Cache::get` or `WeakHandle::upgrade` after
+//! `PerTimelineState::shutdown`, that extension of the cycle is bounded.
+//!
+//! Concurrently existing `WeakHandle`s will fail to `upgrade()`:
+//! while they will succeed in upgrading `Weak>`,
+//! they will find the inner in the `HandleInner::ShutDown` state, where the
+//! `Arc` and Timeline have already been dropped.
+//!
+//! Dropping the `Cache` undoes the registration of this `Cache`'s
+//! `HandleInner`s from all the `PerTimelineState`s, i.e., it
+//! 
removes the strong ref to each of its `HandleInner`s
+//! from all the `PerTimelineState`s.
+//!
+//! # Locking Rules
+//!
+//! To prevent deadlocks we:
+//!
+//! 1. Only ever hold one of the locks at a time.
+//! 2. Don't add more than one Drop impl that locks on the
+//! cycles above.
+//!
+//! As per (2), that impl is in `Drop for Cache`.
 //!
 //! # Fast Path for Shard Routing
 //!
 //! The `Cache` has a fast path for shard routing to avoid calling into
 //! the tenant manager for every request.
 //!
-//! The `Cache` maintains a hash map of `ShardTimelineId` to `Weak`.
+//! The `Cache` maintains a hash map of `ShardTimelineId` to `WeakHandle`s.
 //!
 //! The current implementation uses the first entry in the hash map
 //! to determine the `ShardParameters` and derive the correct
@@ -87,18 +184,18 @@
 //!
 //! It then looks up the hash map for that `ShardTimelineId := {ShardIndex,TimelineId}`.
 //!
-//! If the lookup is successful and the `Weak` can be upgraded,
+//! If the lookup is successful and the `WeakHandle` can be upgraded,
 //! it's a hit.
 //!
 //! ## Cache invalidation
 //!
-//! The insight is that cache invalidation is sufficient and most efficiently done lazily.
+//! The insight is that cache invalidation is sufficient and most efficient if done lazily.
 //! The only reasons why an entry in the cache can become stale are:
 //! 1. The `PerTimelineState` / Timeline is shutting down e.g. because the shard is
 //! being detached, timeline or shard deleted, or pageserver is shutting down.
 //! 2. We're doing a shard split and new traffic should be routed to the child shards.
 //!
-//! Regarding (1), we will eventually fail to upgrade the `Weak` once the
+//! Regarding (1), we will eventually fail to upgrade the `WeakHandle` once the
 //! timeline has shut down, and when that happens, we remove the entry from the cache.
 //!
 //! Regarding (2), the insight is that it is totally fine to keep dispatching requests
@@ -107,8 +204,6 @@

 use std::collections::hash_map;
 use std::collections::HashMap;
-use std::sync::atomic::AtomicBool;
-use std::sync::atomic::Ordering;
 use std::sync::Arc;
 use std::sync::Mutex;
 use std::sync::Weak;
@@ -152,7 +247,7 @@ pub(crate) struct Cache {
 map: Map,
 }

-type Map = HashMap>>;
+type Map = HashMap>;

 impl Default for Cache {
 fn default() -> Self {
@@ -170,12 +265,22 @@ pub(crate) struct ShardTimelineId {
 }

 /// See module-level comment.
-pub(crate) struct Handle(Arc>);
-struct HandleInner {
- shut_down: AtomicBool,
- timeline: T::Timeline,
- // The timeline's gate held open.
- _gate_guard: utils::sync::gate::GateGuard,
+pub(crate) struct Handle {
+ timeline: Arc,
+ #[allow(dead_code)] // the field exists to keep the gate open
+ gate_guard: Arc,
+ inner: Arc>>,
+}
+pub(crate) struct WeakHandle {
+ inner: Weak>>,
+}
+enum HandleInner {
+ KeepingTimelineGateOpen {
+ #[allow(dead_code)]
+ gate_guard: Arc,
+ timeline: Arc,
+ },
+ ShutDown,
 }

 /// Embedded in each [`Types::Timeline`] as the anchor for the only long-lived strong ref to `HandleInner`.
 ///
 /// See module-level comment for details. 
pub struct PerTimelineState {
 // None = shutting down
- handles: Mutex>>>>,
+ #[allow(clippy::type_complexity)]
+ handles: Mutex>>>>>,
 }

 impl Default for PerTimelineState {
@@ -243,49 +349,24 @@ impl Cache {
 shard_selector: ShardSelector,
 tenant_manager: &T::TenantManager,
 ) -> Result, GetError> {
- // terminates because each iteration removes an element from the map
- loop {
- let handle = self
- .get_impl(timeline_id, shard_selector, tenant_manager)
- .await?;
- if handle.0.shut_down.load(Ordering::Relaxed) {
- let removed = self
- .map
- .remove(&handle.0.timeline.shard_timeline_id())
- .expect("invariant of get_impl is that the returned handle is in the map");
- assert!(
- Weak::ptr_eq(&removed, &Arc::downgrade(&handle.0)),
- "shard_timeline_id() incorrect?"
- );
- } else {
- return Ok(handle);
- }
- }
- }
-
- #[instrument(level = "trace", skip_all)]
- async fn get_impl(
- &mut self,
- timeline_id: TimelineId,
- shard_selector: ShardSelector,
- tenant_manager: &T::TenantManager,
- ) -> Result, GetError> {
- let miss: ShardSelector = {
+ // terminates because on every iteration we remove an element from the map
+ let miss: ShardSelector = loop {
 let routing_state = self.shard_routing(timeline_id, shard_selector);
 match routing_state {
 RoutingResult::FastPath(handle) => return Ok(handle),
 RoutingResult::SlowPath(key) => match self.map.get(&key) {
 Some(cached) => match cached.upgrade() {
- Some(upgraded) => return Ok(Handle(upgraded)),
- None => {
+ Ok(upgraded) => return Ok(upgraded),
+ Err(HandleUpgradeError::ShutDown) => {
+ // TODO: dedup with shard_routing()
 trace!("handle cache stale");
 self.map.remove(&key).unwrap();
- ShardSelector::Known(key.shard_index)
+ continue;
 }
 },
- None => ShardSelector::Known(key.shard_index),
+ None => break ShardSelector::Known(key.shard_index),
 },
- RoutingResult::NeedConsultTenantManager => shard_selector,
+ RoutingResult::NeedConsultTenantManager => break shard_selector,
 }
 };
 self.get_miss(timeline_id, miss, tenant_manager).await
@@ -302,7 +383,7 @@ impl Cache {
 let Some((first_key, first_handle)) = self.map.iter().next() else {
 return RoutingResult::NeedConsultTenantManager;
 };
- let Some(first_handle) = first_handle.upgrade() else {
+ let Ok(first_handle) = first_handle.upgrade() else {
 // TODO: dedup with get()
 trace!("handle cache stale");
 let first_key_owned = *first_key;
@@ -310,7 +391,7 @@ impl Cache {
 continue;
 };

- let first_handle_shard_identity = first_handle.timeline.get_shard_identity();
+ let first_handle_shard_identity = first_handle.get_shard_identity();
 let make_shard_index = |shard_num: ShardNumber| ShardIndex {
 shard_number: shard_num,
 shard_count: first_handle_shard_identity.count,
@@ -329,11 +410,11 @@ impl Cache {
 };
 let first_handle_shard_timeline_id = ShardTimelineId {
 shard_index: first_handle_shard_identity.shard_index(),
- timeline_id: first_handle.timeline.shard_timeline_id().timeline_id,
+ timeline_id: first_handle.shard_timeline_id().timeline_id,
 };

 if need_shard_timeline_id == first_handle_shard_timeline_id {
- return RoutingResult::FastPath(Handle(first_handle));
+ return RoutingResult::FastPath(first_handle);
 } else {
 return RoutingResult::SlowPath(need_shard_timeline_id);
 }
@@ -357,23 +438,30 @@ impl Cache {
 ShardSelector::Known(idx) => assert_eq!(idx, &key.shard_index),
 }

- let gate_guard = match timeline.gate().enter() {
- Ok(guard) => guard,
- Err(_) => {
- return Err(GetError::TimelineGateClosed);
- }
- };
 trace!("creating new HandleInner");
- let handle = Arc::new(
- // TODO: global metric that keeps track of 
the number of live HandlerTimeline instances - // so we can identify reference cycle bugs. - HandleInner { - shut_down: AtomicBool::new(false), - _gate_guard: gate_guard, - timeline: timeline.clone(), - }, - ); - let handle = { + let handle_inner_arc = Arc::new(Mutex::new(HandleInner::KeepingTimelineGateOpen { + gate_guard: Arc::new( + // this enter() is expensive in production code because + // it hits the global Arc::gate refcounts + match timeline.gate().enter() { + Ok(guard) => guard, + Err(_) => { + return Err(GetError::TimelineGateClosed); + } + }, + ), + // this clone is expensive in production code because + // it hits the global Arc::clone refcounts + timeline: Arc::new(timeline.clone()), + })); + let handle_weak = WeakHandle { + inner: Arc::downgrade(&handle_inner_arc), + }; + let handle = handle_weak + .upgrade() + .ok() + .expect("we just created it and it's not linked anywhere yet"); + { let mut lock_guard = timeline .per_timeline_state() .handles @@ -381,7 +469,8 @@ impl Cache { .expect("mutex poisoned"); match &mut *lock_guard { Some(per_timeline_state) => { - let replaced = per_timeline_state.insert(self.id, Arc::clone(&handle)); + let replaced = + per_timeline_state.insert(self.id, Arc::clone(&handle_inner_arc)); assert!(replaced.is_none(), "some earlier code left a stale handle"); match self.map.entry(key) { hash_map::Entry::Occupied(_o) => { @@ -392,8 +481,7 @@ impl Cache { unreachable!() } hash_map::Entry::Vacant(v) => { - v.insert(Arc::downgrade(&handle)); - handle + v.insert(handle_weak); } } } @@ -401,14 +489,62 @@ impl Cache { return Err(GetError::PerTimelineStateShutDown); } } - }; - Ok(Handle(handle)) + } + Ok(handle) } Err(e) => Err(GetError::TenantManager(e)), } } } +pub(crate) enum HandleUpgradeError { + ShutDown, +} + +impl WeakHandle { + pub(crate) fn upgrade(&self) -> Result, HandleUpgradeError> { + let Some(inner) = Weak::upgrade(&self.inner) else { + return Err(HandleUpgradeError::ShutDown); + }; + let lock_guard = inner.lock().expect("poisoned"); + match &*lock_guard { + HandleInner::KeepingTimelineGateOpen { + timeline, + gate_guard, + } => { + let gate_guard = Arc::clone(gate_guard); + let timeline = Arc::clone(timeline); + drop(lock_guard); + Ok(Handle { + timeline, + gate_guard, + inner, + }) + } + HandleInner::ShutDown => Err(HandleUpgradeError::ShutDown), + } + } + + pub(crate) fn is_same_handle_as(&self, other: &WeakHandle) -> bool { + Weak::ptr_eq(&self.inner, &other.inner) + } +} + +impl std::ops::Deref for Handle { + type Target = T::Timeline; + fn deref(&self) -> &Self::Target { + &self.timeline + } +} + +impl Handle { + pub(crate) fn downgrade(&self) -> WeakHandle { + WeakHandle { + inner: Arc::downgrade(&self.inner), + } + } +} + impl PerTimelineState { /// After this method returns, [`Cache::get`] will never again return a [`Handle`] /// to the [`Types::Timeline`] that embeds this per-timeline state. @@ -430,43 +566,54 @@ impl PerTimelineState { trace!("already shut down"); return; }; - for handle in handles.values() { + for handle_inner_arc in handles.values() { // Make hits fail. 
- handle.shut_down.store(true, Ordering::Relaxed);
+ let mut lock_guard = handle_inner_arc.lock().expect("poisoned");
+ lock_guard.shutdown();
 }
 drop(handles);
 }
 }

-impl std::ops::Deref for Handle {
- type Target = T::Timeline;
- fn deref(&self) -> &Self::Target {
- &self.0.timeline
- }
-}
-
-#[cfg(test)]
-impl Drop for HandleInner {
- fn drop(&mut self) {
- trace!("HandleInner dropped");
- }
-}
-
 // When dropping a [`Cache`], prune its handles in the [`PerTimelineState`] to break the reference cycle.
 impl Drop for Cache {
 fn drop(&mut self) {
- for (_, weak) in self.map.drain() {
- if let Some(strong) = weak.upgrade() {
- // handle is still being kept alive in PerTimelineState
- let timeline = strong.timeline.per_timeline_state();
- let mut handles = timeline.handles.lock().expect("mutex poisoned");
- if let Some(handles) = &mut *handles {
- let Some(removed) = handles.remove(&self.id) else {
- // There could have been a shutdown inbetween us upgrading the weak and locking the mutex.
- continue;
- };
- assert!(Arc::ptr_eq(&removed, &strong));
- }
+ for (
+ _,
+ WeakHandle {
+ inner: handle_inner_weak,
+ },
+ ) in self.map.drain()
+ {
+ let Some(handle_inner_arc) = handle_inner_weak.upgrade() else {
+ continue;
+ };
+ let handle_timeline = handle_inner_arc
+ // locking rules: drop lock before acquiring other lock below
+ .lock()
+ .expect("poisoned")
+ .shutdown();
+ let per_timeline_state = handle_timeline.per_timeline_state();
+ let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned");
+ let Some(handles) = &mut *handles_lock_guard else {
+ continue;
+ };
+ let Some(removed_handle_inner_arc) = handles.remove(&self.id) else {
+ // There could have been a shutdown in between us upgrading the weak and locking the mutex.
+ continue;
+ };
+ drop(handles_lock_guard); // locking rules: only ever hold one lock at a time
+ assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc));
+ }
+ }
+}
+
+impl HandleInner {
+ fn shutdown(&mut self) -> Arc {
+ match std::mem::replace(self, HandleInner::ShutDown) {
+ HandleInner::KeepingTimelineGateOpen { timeline, .. 
} => timeline, + HandleInner::ShutDown => { + unreachable!("handles are only shut down once in their lifetime"); } } } @@ -474,6 +621,8 @@ impl Drop for Cache { #[cfg(test)] mod tests { + use std::sync::Weak; + use pageserver_api::{ key::{rel_block_to_key, Key, DBDIR_KEY}, models::ShardParameters, @@ -583,39 +732,13 @@ mod tests { // // fill the cache // - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); - let handle: Handle<_> = cache .get(timeline_id, ShardSelector::Page(key), &mgr) .await .expect("we have the timeline"); - let handle_inner_weak = Arc::downgrade(&handle.0); assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); - assert_eq!( - ( - Weak::strong_count(&handle_inner_weak), - Weak::weak_count(&handle_inner_weak) - ), - (2, 2), - "strong: handle, per_timeline_state, weak: handle_inner_weak, cache" - ); assert_eq!(cache.map.len(), 1); - - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); drop(handle); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(per_timeline_state), shard0, mgr; weak: myself" - ); // // demonstrate that Handle holds up gate closure @@ -640,21 +763,11 @@ mod tests { // SHUTDOWN shard0.per_timeline_state.shutdown(); // keeping handle alive across shutdown - assert_eq!( - 1, - Weak::strong_count(&handle_inner_weak), - "through local var handle" - ); assert_eq!( cache.map.len(), 1, "this is an implementation detail but worth pointing out: we can't clear the cache from shutdown(), it's cleared on first access after" ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (3, 1), - "strong: handleinner(via handle), shard0, mgr; weak: myself" - ); // this handle is perfectly usable handle.getpage(); @@ -678,16 +791,6 @@ mod tests { } drop(handle); - assert_eq!( - 0, - Weak::strong_count(&handle_inner_weak), - "the HandleInner destructor already ran" - ); - assert_eq!( - (Arc::strong_count(&shard0), Arc::weak_count(&shard0)), - (2, 1), - "strong: shard0, mgr; weak: myself" - ); // closing gate succeeds after dropping handle tokio::select! { @@ -706,10 +809,8 @@ mod tests { assert_eq!(cache.map.len(), 0); // ensure all refs to shard0 are gone and we're not leaking anything - let myself = Weak::clone(&shard0.myself); drop(shard0); drop(mgr); - assert_eq!(Weak::strong_count(&myself), 0); } #[tokio::test] @@ -948,15 +1049,11 @@ mod tests { handle }; handle.getpage(); - used_handles.push(Arc::downgrade(&handle.0)); + used_handles.push(Arc::downgrade(&handle.timeline)); } - // No handles exist, thus gates are closed and don't require shutdown - assert!(used_handles - .iter() - .all(|weak| Weak::strong_count(weak) == 0)); - - // ... thus the gate should close immediately, even without shutdown + // No handles exist, thus gates are closed and don't require shutdown. + // Thus the gate should close immediately, even without shutdown. tokio::select! 
{ _ = shard0.gate.close() => { } _ = tokio::time::sleep(FOREVER) => { @@ -964,4 +1061,75 @@ mod tests { } } } + + #[tokio::test(start_paused = true)] + async fn test_weak_handles() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + + let refcount_start = Arc::strong_count(&shard0); + + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + + let weak_handle = handle.downgrade(); + + drop(handle); + + let upgraded_handle = weak_handle.upgrade().ok().expect("we can upgrade it"); + + // Start shutdown + shard0.per_timeline_state.shutdown(); + + // Upgrades during shutdown don't work, even if upgraded_handle exists. + weak_handle + .upgrade() + .err() + .expect("can't upgrade weak handle as soon as shutdown started"); + + // But upgraded_handle is still alive, so the gate won't close. + tokio::select! { + _ = shard0.gate.close() => { + panic!("handle is keeping gate open"); + } + _ = tokio::time::sleep(FOREVER) => { } + } + + // Drop the last handle. + drop(upgraded_handle); + + // The gate should close now, despite there still being a weak_handle. + tokio::select! { + _ = shard0.gate.close() => { } + _ = tokio::time::sleep(FOREVER) => { + panic!("only strong handle is dropped and we shut down per-timeline-state") + } + } + + // The weak handle still can't be upgraded. + weak_handle + .upgrade() + .err() + .expect("still shouldn't be able to upgrade the weak handle"); + + // There should be no strong references to the timeline object except the one on "stack". + assert_eq!(Arc::strong_count(&shard0), refcount_start); + } } diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h index b751235595..7b748d7252 100644 --- a/pgxn/neon/pagestore_client.h +++ b/pgxn/neon/pagestore_client.h @@ -34,6 +34,8 @@ typedef enum T_NeonGetPageRequest, T_NeonDbSizeRequest, T_NeonGetSlruSegmentRequest, + /* future tags above this line */ + T_NeonTestRequest = 99, /* only in cfg(feature = "testing") */ /* pagestore -> pagestore_client */ T_NeonExistsResponse = 100, @@ -42,6 +44,8 @@ typedef enum T_NeonErrorResponse, T_NeonDbSizeResponse, T_NeonGetSlruSegmentResponse, + /* future tags above this line */ + T_NeonTestResponse = 199, /* only in cfg(feature = "testing") */ } NeonMessageTag; typedef uint64 NeonRequestId; diff --git a/test_runner/regress/test_page_service_batching_regressions.py b/test_runner/regress/test_page_service_batching_regressions.py new file mode 100644 index 0000000000..fa85e1210b --- /dev/null +++ b/test_runner/regress/test_page_service_batching_regressions.py @@ -0,0 +1,60 @@ +# NB: there are benchmarks that double-serve as tests inside the `performance` directory. 
+
+import subprocess
+from pathlib import Path
+
+import pytest
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import NeonEnvBuilder
+
+
+@pytest.mark.timeout(30) # test takes <20s if pageserver impl is correct
+@pytest.mark.parametrize("kind", ["pageserver-stop", "tenant-detach"])
+def test_slow_flush(neon_env_builder: NeonEnvBuilder, neon_binpath: Path, kind: str):
+ def patch_pageserver_toml(config):
+ config["page_service_pipelining"] = {
+ "mode": "pipelined",
+ "max_batch_size": 32,
+ "execution": "concurrent-futures",
+ }
+
+ neon_env_builder.pageserver_config_override = patch_pageserver_toml
+ env = neon_env_builder.init_start()
+
+ log.info("make flush appear slow")
+
+ log.info("sending requests until pageserver accepts no more")
+ # TODO: extract this into a helper, like subprocess_capture,
+ # so that we capture the stderr from the helper somewhere.
+ child = subprocess.Popen(
+ [
+ neon_binpath / "test_helper_slow_client_reads",
+ env.pageserver.connstr(),
+ str(env.initial_tenant),
+ str(env.initial_timeline),
+ ],
+ bufsize=0, # unbuffered
+ stdin=subprocess.PIPE,
+ stdout=subprocess.PIPE,
+ )
+ assert child.stdout is not None
+ buf = child.stdout.read(1)
+ if len(buf) != 1:
+ raise Exception("unexpected EOF")
+ if buf != b"R":
+ raise Exception(f"unexpected data: {buf!r}")
+ log.info("helper reports pageserver accepts no more requests")
+ log.info(
+ "assuming pageserver connection handle is in a state where TCP has backpressured pageserver=>client response flush() into userspace"
+ )
+
+ if kind == "pageserver-stop":
+ log.info("try to shut down the pageserver cleanly")
+ env.pageserver.stop()
+ elif kind == "tenant-detach":
+ log.info("try to shut down the tenant")
+ env.pageserver.tenant_detach(env.initial_tenant)
+ else:
+ raise ValueError(f"unexpected kind: {kind}")
+
+ log.info("shutdown did not time out, test passed")

From 871e8b325f1509c0ec5cba03537297847345c02e Mon Sep 17 00:00:00 2001
From: Tristan Partin 
Date: Thu, 16 Jan 2025 14:46:53 -0600
Subject: [PATCH 30/40] Use the request ID given by the control plane in
 compute_ctl (#10418)

Instead of generating our own request ID, we can just use the one
provided by the control plane. In the event we get a request from a
client which doesn't set X-Request-ID, we just generate one, which
is useful for tracing purposes.
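The behavior could be exercised with a test along these lines (a sketch assuming
axum 0.7-style APIs; only `maybe_add_request_id_header` comes from this patch, the
echo handler and test harness are illustrative):

```rust
use axum::{body::Body, middleware, routing::get, Router};
use http::{HeaderMap, Request};
use tower::ServiceExt; // for `oneshot`

// Hypothetical echo handler so the test can observe the request id.
async fn echo_id(headers: HeaderMap) -> String {
    headers["x-request-id"].to_str().unwrap().to_string()
}

#[tokio::test]
async fn request_id_is_propagated_or_generated() {
    let app = Router::new()
        .route("/", get(echo_id))
        .layer(middleware::from_fn(maybe_add_request_id_header));

    // Control plane style request: the supplied UUID must pass through untouched.
    let req = Request::builder()
        .uri("/")
        .header("x-request-id", "11111111-2222-3333-4444-555555555555")
        .body(Body::empty())
        .unwrap();
    let resp = app.clone().oneshot(req).await.unwrap();
    let body = axum::body::to_bytes(resp.into_body(), 1024).await.unwrap();
    assert_eq!(body, "11111111-2222-3333-4444-555555555555");

    // No header supplied: the middleware must have generated a parseable UUID.
    let req = Request::builder().uri("/").body(Body::empty()).unwrap();
    let resp = app.oneshot(req).await.unwrap();
    let body = axum::body::to_bytes(resp.into_body(), 1024).await.unwrap();
    assert!(uuid::Uuid::try_parse(std::str::from_utf8(&body).unwrap()).is_ok());
}
```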
Signed-off-by: Tristan Partin --- Cargo.lock | 1 + compute_tools/Cargo.toml | 1 + compute_tools/src/http/server.rs | 44 ++++++++++++-------------------- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f184ebe0b..02b02a09c1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1312,6 +1312,7 @@ dependencies = [ "tracing-utils", "url", "utils", + "uuid", "vm_monitor", "workspace_hack", "zstd", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 33892813c4..b04f364cbb 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -51,6 +51,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +uuid.workspace = true prometheus.workspace = true postgres_initdb.workspace = true diff --git a/compute_tools/src/http/server.rs b/compute_tools/src/http/server.rs index 33d4b489a0..40fb1f4b4d 100644 --- a/compute_tools/src/http/server.rs +++ b/compute_tools/src/http/server.rs @@ -1,15 +1,14 @@ use std::{ net::{IpAddr, Ipv6Addr, SocketAddr}, - sync::{ - atomic::{AtomicU64, Ordering}, - Arc, - }, + sync::Arc, thread, time::Duration, }; use anyhow::Result; use axum::{ + extract::Request, + middleware::{self, Next}, response::{IntoResponse, Response}, routing::{get, post}, Router, @@ -17,11 +16,9 @@ use axum::{ use http::StatusCode; use tokio::net::TcpListener; use tower::ServiceBuilder; -use tower_http::{ - request_id::{MakeRequestId, PropagateRequestIdLayer, RequestId, SetRequestIdLayer}, - trace::TraceLayer, -}; +use tower_http::{request_id::PropagateRequestIdLayer, trace::TraceLayer}; use tracing::{debug, error, info, Span}; +use uuid::Uuid; use super::routes::{ check_writability, configure, database_schema, dbs_and_roles, extension_server, extensions, @@ -34,30 +31,24 @@ async fn handle_404() -> Response { StatusCode::NOT_FOUND.into_response() } -#[derive(Clone, Default)] -struct ComputeMakeRequestId(Arc); +const X_REQUEST_ID: &str = "x-request-id"; -impl MakeRequestId for ComputeMakeRequestId { - fn make_request_id( - &mut self, - _request: &http::Request, - ) -> Option { - let request_id = self - .0 - .fetch_add(1, Ordering::SeqCst) - .to_string() - .parse() - .unwrap(); +/// This middleware function allows compute_ctl to generate its own request ID +/// if one isn't supplied. The control plane will always send one as a UUID. The +/// neon Postgres extension on the other hand does not send one. +async fn maybe_add_request_id_header(mut request: Request, next: Next) -> Response { + let headers = request.headers_mut(); - Some(RequestId::new(request_id)) + if headers.get(X_REQUEST_ID).is_none() { + headers.append(X_REQUEST_ID, Uuid::new_v4().to_string().parse().unwrap()); } + + next.run(request).await } /// Run the HTTP server and wait on it forever. 
#[tokio::main]
 async fn serve(port: u16, compute: Arc) {
- const X_REQUEST_ID: &str = "x-request-id";
-
 let mut app = Router::new()
 .route("/check_writability", post(check_writability::is_writable))
 .route("/configure", post(configure::configure))
@@ -82,9 +73,8 @@ async fn serve(port: u16, compute: Arc) {
 .fallback(handle_404)
 .layer(
 ServiceBuilder::new()
- .layer(SetRequestIdLayer::x_request_id(
- ComputeMakeRequestId::default(),
- ))
+ // Add this middleware since we assume the request ID exists
+ .layer(middleware::from_fn(maybe_add_request_id_header))
 .layer(
 TraceLayer::new_for_http()
 .on_request(|request: &http::Request<_>, _span: &Span| {

From 053abff71f41a2d3eefaef4c94ac7f65b6956c47 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= 
Date: Fri, 17 Jan 2025 15:21:30 +0100
Subject: [PATCH 31/40] Fix dependency on neon-image in promote-images-dev
 (#10437)

## Problem

871e8b325f1509c0ec5cba03537297847345c02e failed CI on main because a
job ran too soon. This was caused by
ea84ec357fa4caa5a48ec65a0aab9e37d1a9fda4. While `promote-images-dev`
does not inherently need `neon-image`, a few jobs depending on
`promote-images-dev` do need it, and previously had it when it was
`promote-images`, which depended on `test-images`, which in turn
depended on `neon-image`.

## Summary of changes

To ensure jobs depending on `docker.io/neondatabase/neon` images get
them, `promote-images-dev` gets the dependency on `neon-image` back,
which it previously had transitively through `test-images`.
---
 .github/workflows/build_and_test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 9ec5273af7..b0e07535b3 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -824,7 +824,7 @@ jobs:
 docker compose -f ./docker-compose/docker-compose.yml down

 promote-images-dev:
- needs: [ check-permissions, tag, vm-compute-node-image ]
+ needs: [ check-permissions, tag, vm-compute-node-image, neon-image ]
 runs-on: ubuntu-22.04

 permissions:

From 6975228a766bc2e5df36559a49fee0ef3417283a Mon Sep 17 00:00:00 2001
From: Vlad Lazar 
Date: Fri, 17 Jan 2025 14:51:33 +0000
Subject: [PATCH 32/40] pageserver: add initdb metrics (#10434)

## Problem

Initdb observability is poor.

## Summary of changes

Add some metrics so we can figure out which part, if any, is slow.
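For example, queries along these lines could separate queueing from execution
time (assuming standard Prometheus histogram suffixes; the metric names are the
ones added below):

```
# p99 of time spent waiting on the global initdb semaphore vs. actually running initdb
histogram_quantile(0.99, rate(pageserver_initdb_semaphore_seconds_global_bucket[5m]))
histogram_quantile(0.99, rate(pageserver_initdb_seconds_global_bucket[5m]))

# number of initdb processes currently in flight
pageserver_concurrent_initdb
```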
Closes https://github.com/neondatabase/neon/issues/10423 --- pageserver/src/metrics.rs | 26 ++++++++++++++++++++++++++ pageserver/src/tenant.rs | 14 +++++++++++++- 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3c4830e3cd..4758aaf230 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -100,6 +100,32 @@ pub(crate) static VEC_READ_NUM_LAYERS_VISITED: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +pub(crate) static CONCURRENT_INITDBS: Lazy = Lazy::new(|| { + register_uint_gauge!( + "pageserver_concurrent_initdb", + "Number of initdb processes running" + ) + .expect("failed to define a metric") +}); + +pub(crate) static INITDB_SEMAPHORE_ACQUISITION_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_semaphore_seconds_global", + "Time spent getting a permit from the global initdb semaphore", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") +}); + +pub(crate) static INITDB_RUN_TIME: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_initdb_seconds_global", + "Time spent performing initdb", + STORAGE_OP_BUCKETS.into() + ) + .expect("failed to define metric") +}); + // Metrics collected on operations on the storage repository. #[derive( Clone, Copy, enum_map::Enum, strum_macros::EnumString, strum_macros::Display, IntoStaticStr, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f6d758ad22..bb1b36aed6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -95,6 +95,9 @@ use crate::deletion_queue::DeletionQueueError; use crate::import_datadir; use crate::is_uninit_mark; use crate::l0_flush::L0FlushGlobalState; +use crate::metrics::CONCURRENT_INITDBS; +use crate::metrics::INITDB_RUN_TIME; +use crate::metrics::INITDB_SEMAPHORE_ACQUISITION_TIME; use crate::metrics::TENANT; use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, @@ -5347,8 +5350,17 @@ async fn run_initdb( initdb_bin_path, initdb_target_dir, initdb_lib_dir, ); - let _permit = INIT_DB_SEMAPHORE.acquire().await; + let _permit = { + let _timer = INITDB_SEMAPHORE_ACQUISITION_TIME.start_timer(); + INIT_DB_SEMAPHORE.acquire().await + }; + CONCURRENT_INITDBS.inc(); + scopeguard::defer! { + CONCURRENT_INITDBS.dec(); + } + + let _timer = INITDB_RUN_TIME.start_timer(); let res = postgres_initdb::do_run_initdb(postgres_initdb::RunInitdbArgs { superuser: &conf.superuser, locale: &conf.locale, From b0f34099f90cfa08223ed653a7c7460943f34f0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 17 Jan 2025 22:43:52 +0100 Subject: [PATCH 33/40] Add safekeeper utilization endpoint (#10429) Add an endpoint to obtain the utilization of a safekeeper. Future changes to the storage controller can use this endpoint to find the most suitable safekeepers for newly created timelines, analogously to how it's done for pageservers already. Initially we just want to assign by timeline count, then we can iterate from there. 
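As a sketch of the intended consumer (illustrative only; `SafekeeperUtilization`
is from this patch, the surrounding scheduler code is a hypothetical stand-in):

```rust
use safekeeper_api::models::SafekeeperUtilization;
use utils::id::NodeId;

/// Pick the `count` safekeepers with the fewest timelines for a new timeline.
fn pick_safekeepers(
    mut candidates: Vec<(NodeId, SafekeeperUtilization)>,
    count: usize,
) -> Vec<NodeId> {
    // Initial heuristic described above: assign by timeline count.
    candidates.sort_by_key(|(_, util)| util.timeline_count);
    candidates.into_iter().take(count).map(|(id, _)| id).collect()
}
```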
Part of https://github.com/neondatabase/neon/issues/9011
---
 libs/safekeeper_api/src/models.rs | 5 +++++
 safekeeper/client/src/mgmt_api.rs | 5 +++++
 safekeeper/src/http/routes.rs | 8 ++++++++
 safekeeper/src/timelines_global_map.rs | 15 +++++++++++++++
 4 files changed, 33 insertions(+)

diff --git a/libs/safekeeper_api/src/models.rs b/libs/safekeeper_api/src/models.rs
index b5fa903820..30418b0efd 100644
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -277,3 +277,8 @@ pub struct TimelineTermBumpResponse {
 pub previous_term: u64,
 pub current_term: u64,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct SafekeeperUtilization {
+ pub timeline_count: u64,
+}
diff --git a/safekeeper/client/src/mgmt_api.rs b/safekeeper/client/src/mgmt_api.rs
index f78745043a..5727f32509 100644
--- a/safekeeper/client/src/mgmt_api.rs
+++ b/safekeeper/client/src/mgmt_api.rs
@@ -102,6 +102,11 @@ impl Client {
 self.get(&uri).await
 }

+ pub async fn utilization(&self) -> Result {
+ let uri = format!("{}/v1/utilization", self.mgmt_api_endpoint);
+ self.get(&uri).await
+ }
+
 async fn get(&self, uri: U) -> Result {
 self.request(Method::GET, uri, ()).await
 }
diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs
index 4b9fb9eb67..7ec08ecf9a 100644
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -127,6 +127,13 @@ async fn timeline_create_handler(mut request: Request) -> Result
+async fn utilization_handler(request: Request) -> Result, ApiError> {
+ check_permission(&request, None)?;
+ let global_timelines = get_global_timelines(&request);
+ let utilization = global_timelines.get_timeline_counts();
+ json_response(StatusCode::OK, utilization)
+}
+
 /// List all (not deleted) timelines.
 /// Note: it is possible to do the same with debug_dump.
 async fn timeline_list_handler(request: Request) -> Result, ApiError> {
@@ -620,6 +627,7 @@ pub fn make_router(
 failpoints_handler(r, cancel).await
 })
 })
+ .get("/v1/utilization", |r| request_span(r, utilization_handler))
 .delete("/v1/tenant/:tenant_id", |r| {
 request_span(r, tenant_delete_handler)
 })
diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs
index a701534f65..01c6aff6c3 100644
--- a/safekeeper/src/timelines_global_map.rs
+++ b/safekeeper/src/timelines_global_map.rs
@@ -13,6 +13,7 @@ use anyhow::{bail, Context, Result};
 use camino::Utf8PathBuf;
 use camino_tempfile::Utf8TempDir;
 use safekeeper_api::membership::Configuration;
+use safekeeper_api::models::SafekeeperUtilization;
 use safekeeper_api::ServerInfo;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -416,6 +417,20 @@ impl GlobalTimelines {
 .collect()
 }

+ /// Returns statistics about timeline counts
+ pub fn get_timeline_counts(&self) -> SafekeeperUtilization {
+ let global_lock = self.state.lock().unwrap();
+ let timeline_count = global_lock
+ .timelines
+ .values()
+ .filter(|t| match t {
+ GlobalMapTimeline::CreationInProgress => false,
+ GlobalMapTimeline::Timeline(t) => !t.is_cancelled(),
+ })
+ .count() as u64;
+ SafekeeperUtilization { timeline_count }
+ }
+
 /// Returns all timelines belonging to a given tenant. Used for deleting all timelines of a tenant,
 /// and that's why it can return cancelled timelines, to retry deleting them. 
fn get_all_for_tenant(&self, tenant_id: TenantId) -> Vec> {

From 8bdaee35f3dec86b37bb6b91be57a88a86d9ad33 Mon Sep 17 00:00:00 2001
From: John Spray 
Date: Mon, 20 Jan 2025 09:20:31 +0000
Subject: [PATCH 34/40] pageserver: safety checks on validity of uploaded
 indices (#10403)

## Problem

Occasionally, we encounter bugs in test environments that can be
detected at the point of uploading an index, but we proceed to upload
it anyway and leave a tenant in a broken state that's awkward to
handle.

## Summary of changes

- Validate index when submitting it for upload, so that we can see the
issue quickly e.g. in an API invoking compaction
- Validate index before executing the upload, so that we have a hard
enforcement that any code path that tries to upload an index will not
overwrite a valid index with an invalid one.
---
 .../src/tenant/remote_timeline_client.rs | 6 ++
 .../tenant/remote_timeline_client/index.rs | 15 +++++
 .../tenant/remote_timeline_client/upload.rs | 4 ++
 .../src/tenant/storage_layer/layer/tests.rs | 56 ++++++++++++++++---
 pageserver/src/tenant/timeline.rs | 20 ++++++-
 5 files changed, 91 insertions(+), 10 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index a006647785..bcba6d1f62 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -803,6 +803,12 @@ impl RemoteTimelineClient {

 upload_queue.dirty.metadata.apply(update);

+ // Defense in depth: if we somehow generated invalid metadata, do not persist it.
+ upload_queue
+ .dirty
+ .validate()
+ .map_err(|e| anyhow::anyhow!(e))?;
+
 self.schedule_index_upload(upload_queue);

 Ok(())
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 244be5bbb7..08e94ae197 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -152,6 +152,21 @@ impl IndexPart {
 };
 is_same_remote_layer_path(name, metadata, name, index_metadata)
 }
+
+ /// Check for invariants in the index: this is useful when uploading an index to ensure that if
+ /// we encounter a bug, we do not persist buggy metadata.
+ pub(crate) fn validate(&self) -> Result<(), String> {
+ if self.import_pgdata.is_none()
+ && self.metadata.ancestor_timeline().is_none()
+ && self.layer_metadata.is_empty()
+ {
+ // Unless we're in the middle of a raw pgdata import, or this is a child timeline, the index must
+ // always have at least one layer.
+ return Err("Index has no ancestor and no layers".to_string());
+ }
+
+ Ok(())
+ }
 }

 /// Metadata gathered for each of the layer files. 
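The invariant can be summarized as a property (a sketch, not code from this
patch; the boolean inputs stand in for the three `IndexPart` fields checked
above):

```rust
// Mirrors `IndexPart::validate`: an index that is not mid-import and has no
// ancestor must describe at least one layer; otherwise it is refused.
fn index_is_valid(import_pgdata: bool, has_ancestor: bool, layer_count: usize) -> bool {
    import_pgdata || has_ancestor || layer_count > 0
}

#[test]
fn validate_invariant() {
    assert!(!index_is_valid(false, false, 0)); // the broken state we refuse to persist
    assert!(index_is_valid(true, false, 0)); // raw pgdata imports may be layer-less
    assert!(index_is_valid(false, true, 0)); // child timelines may be layer-less
}
```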
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index e434d24e5f..af4dbbbfb6 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -40,6 +40,10 @@ pub(crate) async fn upload_index_part( }); pausable_failpoint!("before-upload-index-pausable"); + // Safety: refuse to persist invalid index metadata, to mitigate the impact of any bug that produces this + // (this should never happen) + index_part.validate().map_err(|e| anyhow::anyhow!(e))?; + // FIXME: this error comes too late let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 36dcc8d805..fcb73ad20d 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -1,6 +1,6 @@ use std::time::UNIX_EPOCH; -use pageserver_api::key::CONTROLFILE_KEY; +use pageserver_api::key::{Key, CONTROLFILE_KEY}; use tokio::task::JoinSet; use utils::{ completion::{self, Completion}, @@ -9,7 +9,10 @@ use utils::{ use super::failpoints::{Failpoint, FailpointKind}; use super::*; -use crate::{context::DownloadBehavior, tenant::storage_layer::LayerVisibilityHint}; +use crate::{ + context::DownloadBehavior, + tenant::{harness::test_img, storage_layer::LayerVisibilityHint}, +}; use crate::{task_mgr::TaskKind, tenant::harness::TenantHarness}; /// Used in tests to advance a future to wanted await point, and not futher. @@ -31,20 +34,51 @@ async fn smoke_test() { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); + let image_layers = vec![( + Lsn(0x40), + vec![( + Key::from_hex("620000000033333333444444445500000000").unwrap(), + test_img("foo"), + )], + )]; + + // Create a test timeline with one real layer, and one synthetic test layer. The synthetic + // one is only there so that we can GC the real one without leaving the timeline's metadata + // empty, which is an illegal state (see [`IndexPart::validate`]). let timeline = tenant - .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx) + .create_test_timeline_with_layers( + TimelineId::generate(), + Lsn(0x10), + 14, + &ctx, + Default::default(), + image_layers, + Lsn(0x100), + ) .await .unwrap(); - let layer = { + // Grab one of the timeline's layers to exercise in the test, and the other layer that is just + // there to avoid the timeline being illegally empty + let (layer, dummy_layer) = { let mut layers = { let layers = timeline.layers.read().await; layers.likely_resident_layers().cloned().collect::>() }; - assert_eq!(layers.len(), 1); + assert_eq!(layers.len(), 2); - layers.swap_remove(0) + layers.sort_by_key(|l| l.layer_desc().get_key_range().start); + let synthetic_layer = layers.pop().unwrap(); + let real_layer = layers.pop().unwrap(); + tracing::info!( + "real_layer={:?} ({}), synthetic_layer={:?} ({})", + real_layer, + real_layer.layer_desc().file_size, + synthetic_layer, + synthetic_layer.layer_desc().file_size + ); + (real_layer, synthetic_layer) }; // all layers created at pageserver are like `layer`, initialized with strong @@ -173,10 +207,13 @@ async fn smoke_test() { let rtc = &timeline.remote_client; + // Simulate GC removing our test layer. 
     {
-        let layers = &[layer];
         let mut g = timeline.layers.write().await;
+
+        let layers = &[layer];
         g.open_mut().unwrap().finish_gc_timeline(layers);
+
+        // this just updates the remote_physical_size for demonstration purposes
         rtc.schedule_gc_update(layers).unwrap();
     }
@@ -191,7 +228,10 @@ async fn smoke_test() {
 
     rtc.wait_completion().await.unwrap();
 
-    assert_eq!(rtc.get_remote_physical_size(), 0);
+    assert_eq!(
+        rtc.get_remote_physical_size(),
+        dummy_layer.metadata().file_size
+    );
 
     assert_eq!(0, LAYER_IMPL_METRICS.inits_cancelled.get())
 }
 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2ba71416b8..5f4272fb2b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -5678,9 +5678,17 @@ impl Timeline {
         info!("force created image layer {}", image_layer.local_path());
         {
             let mut guard = self.layers.write().await;
-            guard.open_mut().unwrap().force_insert_layer(image_layer);
+            guard
+                .open_mut()
+                .unwrap()
+                .force_insert_layer(image_layer.clone());
         }
 
+        // Update remote_timeline_client state to reflect existence of this layer
+        self.remote_client
+            .schedule_layer_file_upload(image_layer)
+            .unwrap();
+
         Ok(())
     }
 
@@ -5731,9 +5739,17 @@ impl Timeline {
         info!("force created delta layer {}", delta_layer.local_path());
         {
             let mut guard = self.layers.write().await;
-            guard.open_mut().unwrap().force_insert_layer(delta_layer);
+            guard
+                .open_mut()
+                .unwrap()
+                .force_insert_layer(delta_layer.clone());
         }
 
+        // Update remote_timeline_client state to reflect existence of this layer
+        self.remote_client
+            .schedule_layer_file_upload(delta_layer)
+            .unwrap();
+
         Ok(())
     }
 

From 7d761a9d22e0c3ca0e337af1793b65cb4d3f7203 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 20 Jan 2025 09:47:23 +0000
Subject: [PATCH 35/40] storage controller: make chaos less disruptive to AZ
 locality (#10438)

## Problem

Since #9916, the chaos code is actively fighting the optimizer: tenants
tend to be attached in their preferred AZ, so most chaos migrations were
moving them to a non-preferred AZ.

## Summary of changes

- When picking migrations, prefer to migrate things _toward_ their
  preferred AZ when possible. Then pick shards to move the other way when
  necessary.

The resulting behavior should be an alternating "back and forth" where the
chaos code migrates things away from home, and then migrates them back on
the next iteration.

The side effect will be that the chaos code actively helps to push things
into their home AZ. That's not contrary to its purpose though: we mainly
just want it to continuously migrate things to exercise
migration+notification code.
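To make the selection concrete, the intended policy is a two-pass pick, sketched below with invented types (an illustration only; the real implementation is in `chaos_injector.rs` in the diff that follows):

```
use rand::seq::SliceRandom;

// Invented stand-in for the controller's per-shard state.
struct ShardView {
    id: u64,
    attached_outside_preferred_az: bool,
}

fn pick_victims(shards: &[ShardView], batch_size: usize) -> Vec<u64> {
    // First pass: prefer shards currently outside their home AZ, so this
    // round of chaos tends to migrate them *toward* the preferred AZ.
    let mut victims: Vec<u64> = shards
        .iter()
        .filter(|s| s.attached_outside_preferred_az)
        .map(|s| s.id)
        .take(batch_size)
        .collect();

    // Second pass: top up with random shards, which get pushed away from
    // home now and will likely be caught by the first pass next iteration.
    // (Like the real code, this draws from the full ID list, so a shard
    // already picked above may be drawn again.)
    let all: Vec<u64> = shards.iter().map(|s| s.id).collect();
    let n_random = batch_size.saturating_sub(victims.len());
    victims.extend(
        all.choose_multiple(&mut rand::thread_rng(), n_random)
            .cloned(),
    );
    victims
}
```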
--- .../src/service/chaos_injector.rs | 110 ++++++++++++------ storage_controller/src/tenant_shard.rs | 17 +++ 2 files changed, 93 insertions(+), 34 deletions(-) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 0e551beaa7..98034421d6 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,11 +1,17 @@ -use std::{sync::Arc, time::Duration}; +use std::{ + collections::{BTreeMap, HashMap}, + sync::Arc, + time::Duration, +}; use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; use rand::thread_rng; use tokio_util::sync::CancellationToken; +use utils::id::NodeId; +use utils::shard::TenantShardId; -use super::Service; +use super::{Node, Scheduler, Service, TenantShard}; pub struct ChaosInjector { service: Arc, @@ -35,50 +41,86 @@ impl ChaosInjector { } } + /// If a shard has a secondary and attached location, then re-assign the secondary to be + /// attached and the attached to be secondary. + /// + /// Only modifies tenants if they're in Active scheduling policy. + fn maybe_migrate_to_secondary( + &self, + tenant_shard_id: TenantShardId, + nodes: &Arc>, + tenants: &mut BTreeMap, + scheduler: &mut Scheduler, + ) { + let shard = tenants + .get_mut(&tenant_shard_id) + .expect("Held lock between choosing ID and this get"); + + if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { + // Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + tracing::info!( + "Skipping shard {tenant_shard_id}: scheduling policy is {:?}", + shard.get_scheduling_policy() + ); + return; + } + + // Pick a secondary to promote + let Some(new_location) = shard + .intent + .get_secondary() + .choose(&mut thread_rng()) + .cloned() + else { + tracing::info!( + "Skipping shard {tenant_shard_id}: no secondary location, can't migrate" + ); + return; + }; + + let Some(old_location) = *shard.intent.get_attached() else { + tracing::info!("Skipping shard {tenant_shard_id}: currently has no attached location"); + return; + }; + + tracing::info!("Injecting chaos: migrate {tenant_shard_id} {old_location}->{new_location}"); + + shard.intent.demote_attached(scheduler, old_location); + shard.intent.promote_attached(scheduler, new_location); + self.service.maybe_reconcile_shard(shard, nodes); + } + async fn inject_chaos(&mut self) { // Pick some shards to interfere with let batch_size = 128; let mut inner = self.service.inner.write().unwrap(); let (nodes, tenants, scheduler) = inner.parts_mut(); let tenant_ids = tenants.keys().cloned().collect::>(); - let victims = tenant_ids.choose_multiple(&mut thread_rng(), batch_size); - for victim in victims { - let shard = tenants - .get_mut(victim) - .expect("Held lock between choosing ID and this get"); - - if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { - // Skip non-active scheduling policies, so that a shard with a policy like Pause can - // be pinned without being disrupted by us. - tracing::info!( - "Skipping shard {victim}: scheduling policy is {:?}", - shard.get_scheduling_policy() - ); - continue; + // Prefer to migrate tenants that are currently outside their home AZ. 
This avoids the chaos injector + // continuously pushing tenants outside their home AZ: instead, we'll tend to cycle between picking some + // random tenants to move, and then on next chaos iteration moving them back, then picking some new + // random tenants on the next iteration. + let mut victims = Vec::with_capacity(batch_size); + for shard in tenants.values() { + if shard.is_attached_outside_preferred_az(nodes) { + victims.push(shard.tenant_shard_id); } - // Pick a secondary to promote - let Some(new_location) = shard - .intent - .get_secondary() - .choose(&mut thread_rng()) - .cloned() - else { - tracing::info!("Skipping shard {victim}: no secondary location, can't migrate"); - continue; - }; + if victims.len() >= batch_size { + break; + } + } - let Some(old_location) = *shard.intent.get_attached() else { - tracing::info!("Skipping shard {victim}: currently has no attached location"); - continue; - }; + let choose_random = batch_size.saturating_sub(victims.len()); + tracing::info!("Injecting chaos: found {} shards to migrate back to home AZ, picking {choose_random} random shards to migrate", victims.len()); - tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}"); + let random_victims = tenant_ids.choose_multiple(&mut thread_rng(), choose_random); + victims.extend(random_victims); - shard.intent.demote_attached(scheduler, old_location); - shard.intent.promote_attached(scheduler, new_location); - self.service.maybe_reconcile_shard(shard, nodes); + for victim in victims { + self.maybe_migrate_to_secondary(victim, nodes, tenants, scheduler); } } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 79ed628c25..cbc2696b26 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1793,6 +1793,23 @@ impl TenantShard { } } } + + /// Returns true if the tenant shard is attached to a node that is outside the preferred AZ. + /// + /// If the shard does not have a preferred AZ, returns false. + pub(crate) fn is_attached_outside_preferred_az(&self, nodes: &HashMap) -> bool { + self.intent + .get_attached() + .map(|node_id| { + Some( + nodes + .get(&node_id) + .expect("referenced node exists") + .get_availability_zone_id(), + ) == self.intent.preferred_az_id.as_ref() + }) + .unwrap_or(false) + } } impl Drop for TenantShard { From b312a3c320695a4b528968250225dfbd40af0e2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 20 Jan 2025 13:50:44 +0100 Subject: [PATCH 36/40] Move DeleteTimelineFlow::prepare to separate function and use enum (#10334) It was requested by review in #10305 to use an enum or something like it for distinguishing the different modes instead of two parameters, because two flags allow four combinations, and two of them don't really make sense/ aren't used. follow-up of #10305 --- pageserver/src/tenant/timeline/delete.rs | 149 +++++++++++----------- pageserver/src/tenant/timeline/offload.rs | 12 +- 2 files changed, 81 insertions(+), 80 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index bdc315d985..3c828c8a9e 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -112,7 +112,7 @@ pub(super) async fn delete_local_timeline_directory( } /// It is important that this gets called when DeletionGuard is being held. 
-/// For more context see comments in [`DeleteTimelineFlow::prepare`] +/// For more context see comments in [`make_timeline_delete_guard`] async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, timeline: &TimelineOrOffloaded, @@ -193,10 +193,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let allow_offloaded_children = false; - let set_stopping = true; let (timeline, mut guard) = - Self::prepare(tenant, timeline_id, allow_offloaded_children, set_stopping)?; + make_timeline_delete_guard(tenant, timeline_id, TimelineDeleteGuardKind::Delete)?; guard.mark_in_progress()?; @@ -333,75 +331,6 @@ impl DeleteTimelineFlow { Ok(()) } - pub(super) fn prepare( - tenant: &Tenant, - timeline_id: TimelineId, - allow_offloaded_children: bool, - set_stopping: bool, - ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { - // Note the interaction between this guard and deletion guard. - // Here we attempt to lock deletion guard when we're holding a lock on timelines. - // This is important because when you take into account `remove_timeline_from_tenant` - // we remove timeline from memory when we still hold the deletion guard. - // So here when timeline deletion is finished timeline wont be present in timelines map at all - // which makes the following sequence impossible: - // T1: get preempted right before the try_lock on `Timeline::delete_progress` - // T2: do a full deletion, acquire and drop `Timeline::delete_progress` - // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` - // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` - let timelines = tenant.timelines.lock().unwrap(); - let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); - - let timeline = match timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => match timelines_offloaded.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - }, - }; - - // Ensure that there are no child timelines, because we are about to remove files, - // which will break child branches - let mut children = Vec::new(); - if !allow_offloaded_children { - children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { - (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) - })); - } - children.extend(timelines.iter().filter_map(|(id, entry)| { - (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) - })); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - // Note that using try_lock here is important to avoid a deadlock. - // Here we take lock on timelines and then the deletion guard. - // At the end of the operation we're holding the guard and need to lock timelines map - // to remove the timeline from it. - // Always if you have two locks that are taken in different order this can result in a deadlock. - - let delete_progress = Arc::clone(timeline.delete_progress()); - let delete_lock_guard = match delete_progress.try_lock_owned() { - Ok(guard) => DeletionGuard(guard), - Err(_) => { - // Unfortunately if lock fails arc is consumed. 
- return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - timeline.delete_progress(), - ))); - } - }; - - if set_stopping { - if let TimelineOrOffloaded::Timeline(timeline) = &timeline { - timeline.set_state(TimelineState::Stopping); - } - } - - Ok((timeline, delete_lock_guard)) - } - fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, @@ -483,6 +412,80 @@ impl DeleteTimelineFlow { } } +#[derive(Copy, Clone, PartialEq, Eq)] +pub(super) enum TimelineDeleteGuardKind { + Offload, + Delete, +} + +pub(super) fn make_timeline_delete_guard( + tenant: &Tenant, + timeline_id: TimelineId, + guard_kind: TimelineDeleteGuardKind, +) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, + }; + + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if guard_kind == TimelineDeleteGuardKind::Delete { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. + // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + + let delete_progress = Arc::clone(timeline.delete_progress()); + let delete_lock_guard = match delete_progress.try_lock_owned() { + Ok(guard) => DeletionGuard(guard), + Err(_) => { + // Unfortunately if lock fails arc is consumed. 
+            return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
+                timeline.delete_progress(),
+            )));
+        }
+    };
+
+    if guard_kind == TimelineDeleteGuardKind::Delete {
+        if let TimelineOrOffloaded::Timeline(timeline) = &timeline {
+            timeline.set_state(TimelineState::Stopping);
+        }
+    }
+
+    Ok((timeline, delete_lock_guard))
+}
+
 pub(super) struct DeletionGuard(OwnedMutexGuard);
 
 impl Deref for DeletionGuard {
diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs
index 6c6b19e8b1..3b5bf8290c 100644
--- a/pageserver/src/tenant/timeline/offload.rs
+++ b/pageserver/src/tenant/timeline/offload.rs
@@ -2,10 +2,11 @@ use std::sync::Arc;
 
 use pageserver_api::models::{TenantState, TimelineState};
 
-use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard};
+use super::delete::{delete_local_timeline_directory, DeletionGuard};
 use super::Timeline;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::ShutdownIfArchivedError;
+use crate::tenant::timeline::delete::{make_timeline_delete_guard, TimelineDeleteGuardKind};
 use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded};
 
 #[derive(thiserror::Error, Debug)]
@@ -36,13 +37,10 @@ pub(crate) async fn offload_timeline(
     debug_assert_current_span_has_tenant_and_timeline_id();
     tracing::info!("offloading archived timeline");
 
-    let allow_offloaded_children = true;
-    let set_stopping = false;
-    let (timeline, guard) = DeleteTimelineFlow::prepare(
+    let (timeline, guard) = make_timeline_delete_guard(
         tenant,
         timeline.timeline_id,
-        allow_offloaded_children,
-        set_stopping,
+        TimelineDeleteGuardKind::Offload,
     )
     .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?;
 
@@ -106,7 +104,7 @@ pub(crate) async fn offload_timeline(
 }
 
 /// It is important that this gets called when DeletionGuard is being held.
-/// For more context see comments in [`DeleteTimelineFlow::prepare`]
+/// For more context see comments in [`make_timeline_delete_guard`]
 ///
 /// Returns the strong count of the timeline `Arc`
 fn remove_timeline_from_tenant(

From 02fc58b878d4342c05c084cd7db7a01940a70c3f Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 20 Jan 2025 15:37:24 +0100
Subject: [PATCH 37/40] impr(timeline handles): add more tests covering
 reference cycle (#10446)

The other tests focus on the external interface usage, while the tests
added in this PR add coverage of HandleInner's lifecycle, ensuring we
don't leak it once either the connection gets dropped or the
per-timeline state is shut down explicitly.
---
 pageserver/src/tenant/timeline/handle.rs | 97 ++++++++++++++++++++++++
 1 file changed, 97 insertions(+)

diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs
index 35d8c75ce1..4c7bea25be 100644
--- a/pageserver/src/tenant/timeline/handle.rs
+++ b/pageserver/src/tenant/timeline/handle.rs
@@ -1132,4 +1132,101 @@ mod tests {
         // There should be no strong references to the timeline object except the one on "stack".
assert_eq!(Arc::strong_count(&shard0), refcount_start); } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_cache_is_dropped() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + + // helper to check if a handle is referenced by per_timeline_state + let per_timeline_state_refs_handle = |handle_weak: &Weak>>| { + let per_timeline_state = shard0.per_timeline_state.handles.lock().unwrap(); + let per_timeline_state = per_timeline_state.as_ref().unwrap(); + per_timeline_state + .values() + .any(|v| Weak::ptr_eq(&Arc::downgrade(v), handle_weak)) + }; + + // Fill the cache. + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + assert!(Weak::ptr_eq(&handle.myself, &shard0.myself)); + let handle_inner_weak = Arc::downgrade(&handle.inner); + assert!( + per_timeline_state_refs_handle(&handle_inner_weak), + "we still hold `handle` _and_ haven't dropped `cache` yet" + ); + + // Drop the cache. + drop(cache); + + assert!( + !(per_timeline_state_refs_handle(&handle_inner_weak)), + "nothing should reference the handle allocation anymore" + ); + assert!( + Weak::upgrade(&handle_inner_weak).is_some(), + "the local `handle` still keeps the allocation alive" + ); + // but obviously the cache is gone so no new allocations can be handed out. + + // Drop handle. + drop(handle); + assert!( + Weak::upgrade(&handle_inner_weak).is_none(), + "the local `handle` is dropped, so the allocation should be dropped by now" + ); + } + + #[tokio::test(start_paused = true)] + async fn test_reference_cycle_broken_when_per_timeline_state_shutdown() { + crate::tenant::harness::setup_logging(); + let timeline_id = TimelineId::generate(); + let shard0 = Arc::new_cyclic(|myself| StubTimeline { + gate: Default::default(), + id: timeline_id, + shard: ShardIdentity::unsharded(), + per_timeline_state: PerTimelineState::default(), + myself: myself.clone(), + }); + let mgr = StubManager { + shards: vec![shard0.clone()], + }; + let key = DBDIR_KEY; + + let mut cache = Cache::::default(); + let handle = cache + .get(timeline_id, ShardSelector::Page(key), &mgr) + .await + .expect("we have the timeline"); + // grab a weak reference to the inner so can later try to Weak::upgrade it and assert that fails + let handle_inner_weak = Arc::downgrade(&handle.inner); + + // drop the handle, obviously the lifetime of `inner` is at least as long as each strong reference to it + drop(handle); + assert!(Weak::upgrade(&handle_inner_weak).is_some(), "can still"); + + // Shutdown the per_timeline_state. + shard0.per_timeline_state.shutdown(); + assert!(Weak::upgrade(&handle_inner_weak).is_none(), "can no longer"); + + // cache only contains Weak's, so, it can outlive the per_timeline_state without + // Drop explicitly solely to make this point. 
+ drop(cache); + } } From 2657b7ec7540df3d9060ff2ed15442ed14d7843c Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 20 Jan 2025 17:33:07 +0000 Subject: [PATCH 38/40] rfcs: add sharded ingest RFC (#8754) ## Summary Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives only the data it needs. This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). Touches #9329. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Vlad Lazar Co-authored-by: Vlad Lazar --- docs/rfcs/041-sharded-ingest.md | 255 ++++++++++++++++++++++++++++++++ 1 file changed, 255 insertions(+) create mode 100644 docs/rfcs/041-sharded-ingest.md diff --git a/docs/rfcs/041-sharded-ingest.md b/docs/rfcs/041-sharded-ingest.md new file mode 100644 index 0000000000..47b314891c --- /dev/null +++ b/docs/rfcs/041-sharded-ingest.md @@ -0,0 +1,255 @@ +# +Created on Aug 2024 +Implemented on Jan 2025 + +## Summary + +Data in large tenants is split up between multiple pageservers according to key hashes, as +introduced in the [sharding RFC](031-sharding-static.md) and [shard splitting RFC](032-shard-splitting.md). + +Whereas currently we send all WAL to all pageserver shards, and each shard filters out the data that it needs, +in this RFC we add a mechanism to filter the WAL on the safekeeper, so that each shard receives +only the data it needs. + +This will place some extra CPU load on the safekeepers, in exchange for reducing the network bandwidth +for ingesting WAL back to scaling as O(1) with shard count, rather than O(N_shards). + +## Motivation + +1. Large databases require higher shard counts. Whereas currently we run with up to 8 shards for tenants +with a few TB of storage, the next order of magnitude capacity increase will require tens of shards, such +that sending all WAL to all shards is impractical in terms of bandwidth. +2. For contemporary database sizes (~2TB), the pageserver is the bottleneck for ingest: since each + shard has to decode and process the whole WAL, sharding doesn't fully relieve this bottleneck. To achieve significantly higher ingest speeds, we need to filter the WAL earlier so that each pageserver + only has to process relevant parts. + +## Non Goals (if relevant) + +We do not seek to introduce multiple WALs per timeline, or to share the work of handling a timeline's +WAL across safekeepers (beyond simple 3x replication). This RFC may be thought of as an incremental +move of the ingestion bottleneck up the stack: instead of high write rates bottlenecking on the +pageserver, they will bottleneck on the safekeeper. + +## Impacted components (e.g. pageserver, safekeeper, console, etc) + +Safekeeper, pageserver. 
+
+There will be no control plane or storage controller coordination needed, as pageservers will directly
+indicate their sharding parameters to the safekeeper when subscribing for WAL.
+
+## Proposed implementation
+
+Terminology:
+- "Data pages" refers to postgres relation blocks, and SLRU blocks.
+- "Metadata pages" refers to everything else the pageserver stores, such as relation sizes and
+  directories of relations.
+
+### Phase 1: Refactor ingest
+
+Currently, pageserver ingest code is structured approximately as follows:
+1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network
+   socket
+2. `WalIngest::ingest_record` translates the record into a series of page-level modifications
+3. `DatadirModification` accumulates page updates from several `ingest_record` calls, and when
+   its `commit()` method is called, flushes these into a Timeline's open `InMemoryLayer`.
+
+This process currently assumes access to a pageserver `Timeline` throughout `ingest_record` and
+from `DatadirModification`, which is used to do read-modify-write cycles on metadata pages
+such as relation sizes and the master DBDIR page. It also assumes that records are ingested
+strictly one after the other: they cannot be ingested in parallel, because each record assumes
+that earlier records' changes have already been applied to `Timeline`.
+
+This code will be refactored to disentangle the simple, fast decode of relation page writes
+from the more complex logic for updating internal metadata. An intermediate representation
+called `InterpretedWalRecords` will be introduced. This is similar to the internal state of
+a `DatadirModification`, but does not require access to a Timeline. Instead of storing
+metadata updates as materialized writes to pages, it will accumulate these as abstract operations:
+for example, rather than including a write to a relation size key, this structure will include
+an operation that indicates "Update relation _foo_'s size to the max of its current value and
+_bar_", such that these may be applied later to a real Timeline.
+
+The `DatadirModification` will be aware of the `EphemeralFile` format, so that as it accumulates
+simple page writes of relation blocks, it can write them directly into a buffer in the serialized
+format. This will avoid the need to later deserialize/reserialize this data when passing the
+structure between safekeeper and pageserver.
+
+The new pipeline will be:
+1. `handle_walreceiver_connection` reads a stream of binary WAL records off a network socket
+2. An `InterpretedWalRecords` is generated from the incoming WAL records. This does not
+   require a reference to a Timeline.
+3. The logic that is currently spread between `WalIngest` and `DatadirModification` for updating
+   metadata will be refactored to consume the metadata operations from the `InterpretedWalRecords`
+   and turn them into literal writes to metadata pages. This part must be done sequentially.
+4. The resulting buffer of metadata page writes is combined with the buffer of relation block
+   writes, and written into the `InMemoryLayer`.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/9472
+2. https://github.com/neondatabase/neon/pull/9504
+3. https://github.com/neondatabase/neon/pull/9524
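+
+To make the relation-size example concrete, such an abstract operation could be modelled as
+follows (a sketch with invented names, not the actual types from the patches linked above):
+
+```
+// Hypothetical sketch of the "abstract operations" idea.
+enum MetadataOp {
+    // "Update relation _foo_'s size to the max of its current value and _bar_".
+    RelSizeAtLeast { rel_id: u32, blocks: u32 },
+}
+
+// The sequential apply phase is the only part that needs the real Timeline
+// state: it resolves each abstract operation into a literal page write.
+fn apply(current_blocks: u32, op: &MetadataOp) -> u32 {
+    match op {
+        MetadataOp::RelSizeAtLeast { blocks, .. } => current_blocks.max(*blocks),
+    }
+}
+```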
+
+### Phase 2: Decode & filter on safekeeper
+
+In the previous phase, the ingest code was modified to be able to do most of its work without access to
+a Timeline: this first stage of ingest simply converts a series of binary WAL records into
+a buffer of relation/SLRU page writes, and a buffer of abstract metadata writes.
+
+The modified ingest code may be transplanted from pageserver to safekeeper (probably via a
+shared crate). The safekeeper->pageserver network protocol is modified to:
+ - in subscription requests, send the `ShardIdentity` from the pageserver to the safekeeper
+ - in responses, transmit an `InterpretedWalRecords` instead of a raw `WalRecord`.
+ - use the `ShardIdentity` to filter the `ProcessedWalIngest` down to the content relevant to
+   the subscribing shard before transmitting it.
+
+The overall behavior of the pageserver->safekeeper interaction remains the same, in terms of
+consistent LSN feedback, and connection management. Only the payload of the subscriptions
+changes, to express an LSN range of WAL as a filtered `ProcessedWalIngest` instead of the
+raw data.
+
+The ingest code on the pageserver can now skip the part where it does the first phase of
+processing, as it will receive pre-processed, compressed data off the wire.
+
+Note that `InterpretedWalRecords` batches multiple `InterpretedWalRecord`s in the same network
+message. The safekeeper reads WAL in chunks of 16 blocks and then decodes as many Postgres WAL records
+as possible. Each Postgres WAL record maps to one `InterpretedWalRecord` for potentially multiple shards.
+Hence, the size of the batch is given by the number of Postgres WAL records that fit in 16 blocks.
+
+The protocol needs to support evolution. Protobuf was chosen here with the view that, in the future,
+we may migrate it to GRPC altogether.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/9746
+2. https://github.com/neondatabase/neon/pull/9821
+
+### Phase 3: Fan out interpreted WAL
+
+In the previous phase, the initial processing of WAL was moved to the safekeeper, but it is still
+done once for each shard: this will generate O(N_shards) CPU work on the safekeeper (especially
+when considering converting to Protobuf format and compression).
+
+To avoid this, we fan out WAL from one (tenant, timeline, shard) to all other shards subscribed on
+the same safekeeper. Under normal operation, the WAL will be read from disk, decoded and interpreted
+_only_ once per (safekeeper, timeline).
+
+When the first shard of a sharded timeline subscribes to a given safekeeper, a task is spawned
+for the WAL reader (`InterpretedWalReader`). This task reads WAL, decodes and interprets it, and sends
+it to the sender (`InterpretedWalSender`). The sender is a future that is polled from the connection
+task. When further shards subscribe on the safekeeper, they will attach themselves to the existing WAL reader.
+There are two cases to consider:
+1. The shard's requested `start_lsn` is ahead of the current position of the WAL reader. In this case, the shard
+will start receiving data when the reader reaches that LSN. The intuition here is that there's little to gain
+by letting shards "front-run" since compute backpressure is based on the laggard LSN.
+2. The shard's requested `start_lsn` is below the current position of the WAL reader. In this case, the WAL reader
+gets reset to this requested position (same intuition).
+Special care is taken such that advanced shards do not receive interpreted WAL records below their
+current position.
+
+The approach above implies that there is at most one WAL reader per (tenant, timeline) on a given safekeeper at any point in time.
+If this turns out to be operationally problematic, there's a trick we can deploy: `--max-delta-for-fanout` is an optional safekeeper
+argument that controls the max absolute delta between a new shard and the current WAL position of the WAL reader. If the absolute
+delta is above that value, a new reader is spawned. Note that there's currently no concurrency control on the number of WAL readers,
+so it's recommended to use large values to avoid pushing CPU utilisation too high.
+
+Unsharded tenants do not spawn a separate task for the interpreted WAL reader, since there's no benefit to it. Instead they poll
+the reader and sender concurrently from the connection task.
+
+Shard splits are interesting here because they are the only case in which the same shard might have two subscriptions at the same time.
+This is handled by giving readers a unique identifier. Both shards will receive the same data while respecting their requested start
+positions.
+
+Implemented in:
+1. https://github.com/neondatabase/neon/pull/10190
+
+## Deployment
+
+Each phase shall be deployed independently. Special care should be taken around protocol changes.
+
+## Observability Tips
+
+* The safekeeper logs the protocol requested by the pageserver
+along with the pageserver ID, tenant, timeline and shard: `starting streaming from`.
+* There are metrics for the number of WAL readers:
+  * `safekeeper_wal_readers{kind="task", target=~"pageserver.*"}` gives the number of WAL reader tasks for each SK
+  * `safekeeper_wal_readers{kind="future", target=~"pageserver.*"}` gives the number of WAL readers polled inline by each SK
+  * `safekeeper_interpreted_wal_reader_tasks` gives the number of WAL reader tasks per tenant, timeline
+* Interesting log lines for the fan-out reader:
+  * `Spawning interpreted`: the first shard creates the interpreted WAL reader
+  * `Fanning out`: a subsequent shard attaches itself to an interpreted WAL reader
+  * `Aborting interpreted`: all senders have finished and the reader task is being aborted
+
+## Future Optimizations
+
+This section describes some improvement areas which may be revisited in the future.
+
+### Buffering of Interpreted WAL
+
+The interpreted WAL reader may buffer interpreted WAL records in user space to help with serving
+subscriptions that are lagging behind the current position of the reader.
+
+Counterpoints:
+* Safekeepers serve many thousands of timelines and allocating a buffer for each might be wasteful,
+especially given that it would go unused on the happy path.
+* WAL is buffered in the kernel page cache. Usually we'd only pay the CPU cost of decoding and interpreting.
+
+### Tweaking the Pageserver Safekeeper Selection Algorithm
+
+We could make the pageserver aware of which safekeepers already host shards for the timeline, along
+with their current WAL positions. The pageserver should then prefer safekeepers that are in the same
+AZ _and_ already have a shard with a position close to the desired start position.
+
+We currently run one safekeeper per AZ, so the point is moot until that changes.
+
+### Pipelining first ingest phase
+
+The first ingest phase is a stateless transformation of a binary WAL record into a pre-processed
+output per shard. To put multiple CPUs to work, we may pipeline this processing up to some defined
+buffer depth.
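+
+As a rough illustration of that shape (a sketch only, with invented types; the RFC does not
+prescribe an implementation), a bounded channel gives the pipeline its depth limit:
+
+```
+use std::sync::mpsc;
+use std::thread;
+
+// Invented stand-ins for raw and per-shard interpreted WAL records.
+struct RawRecord(Vec<u8>);
+struct InterpretedRecord(Vec<u8>);
+
+// Stateless first phase: decode and shard filtering would happen here.
+fn interpret(raw: RawRecord) -> InterpretedRecord {
+    InterpretedRecord(raw.0)
+}
+
+fn main() {
+    // The channel bound is the "defined buffer depth": the decoder stalls
+    // when the (sequential) apply stage falls behind.
+    let (tx, rx) = mpsc::sync_channel::<InterpretedRecord>(16);
+    let decoder = thread::spawn(move || {
+        for i in 0..64u8 {
+            tx.send(interpret(RawRecord(vec![i]))).unwrap();
+        }
+    });
+    for record in rx {
+        // Second phase: apply metadata operations in order.
+        let _ = record.0.len();
+    }
+    decoder.join().unwrap();
+}
+```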
+
+## Alternatives considered
+
+### Give safekeepers enough state to fully decode WAL
+
+In this RFC, we only do the first phase of ingest on the safekeeper, because this is
+the phase that is stateless. Subsequent changes then happen on the pageserver, with
+access to the `Timeline` state.
+
+We could do more work on the safekeeper if we transmitted metadata state to the safekeeper
+when subscribing to the WAL: for example, by telling the safekeeper all the relation sizes,
+so that it could then generate all the metadata writes for relation sizes.
+
+We avoid doing this for several reasons:
+1. Complexity: it's a more invasive protocol change
+2. Decoupling: having the safekeeper understand the `ProcessedWalIngest` already somewhat
+   infects it with knowledge of the pageserver, but this is mainly an abstract structure
+   that describes postgres writes. However, if we taught the safekeeper about the exact
+   way that the pageserver deals with metadata keys, this would be a much tighter coupling.
+3. Load: once the WAL has been processed to the point that it can be split between shards,
+   it is preferable to share out work on the remaining shards rather than adding extra CPU
+   load to the safekeeper.
+
+### Do pre-processing on the compute instead of the safekeeper
+
+Since our first stage of ingest is stateless, it could be done at any stage in the pipeline,
+all the way up to the compute.
+
+We choose not to do this, because it is useful for the safekeeper to store the raw WAL rather
+than just the preprocessed WAL:
+- The safekeeper still needs to be able to serve raw WAL back to postgres for e.g. physical replication
+- It simplifies our paxos implementation to have the offset in the write log be literally
+  the same as the LSN
+- Raw WAL must have a stable protocol since we might have to re-ingest it at arbitrary points in the future.
+  Storing raw WAL gives us more flexibility to evolve the pageserver/safekeeper protocol.
+
+### Do WAL pre-processing on shard 0 or a separate service, send it to other shards from there
+
+If we wanted to keep the safekeepers as entirely pure stores of raw WAL bytes, then
+we could do the initial decode and shard-splitting in some other location:
+- Shard 0 could subscribe to the full WAL and then send writes to other shards
+- A new intermediate service between the safekeeper and pageserver could do the splitting.
+
+So why not?
+- Extra network hop from shard 0 to the final destination shard
+- Clearly there is more infrastructure involved here compared with doing it inline on the safekeeper.
+- Safekeepers already have very light CPU load: typical cloud instance shapes with appropriate
+  disks for the safekeepers effectively have "free" CPU resources.
+- Doing extra work on shard 0 would complicate scheduling of shards on pageservers, because
+  shard 0 would have significantly higher CPU load under write workloads than other shards.

From 72130d7d6c975df81249b4c3862d16d4fff40cf6 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 20 Jan 2025 18:51:30 +0100
Subject: [PATCH 39/40] fix(page_service / handle): panic when parallel client
 disconnect & Timeline shutdown (#10445)

## Refs

- fixes https://github.com/neondatabase/neon/issues/10444

## Problem

We're seeing a panic `handles are only shut down once in their lifetime`
in our performance testbed.

## Hypothesis

Annotated code in
https://github.com/neondatabase/neon/issues/10444#issuecomment-2602286415.
``` T1: drop Cache, executes up to (1) => HandleInner is now in state ShutDown T2: Timeline::shutdown => PerTimelineState::shutdown executes shutdown() again => panics ``` Likely this snuck in the final touches of #10386 where I narrowed down the locking rules. ## Summary of changes Make duplicate shutdowns a no-op. --- pageserver/src/tenant/timeline/handle.rs | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/pageserver/src/tenant/timeline/handle.rs b/pageserver/src/tenant/timeline/handle.rs index 4c7bea25be..5b39daaaf8 100644 --- a/pageserver/src/tenant/timeline/handle.rs +++ b/pageserver/src/tenant/timeline/handle.rs @@ -588,32 +588,40 @@ impl Drop for Cache { let Some(handle_inner_arc) = handle_inner_weak.upgrade() else { continue; }; - let handle_timeline = handle_inner_arc + let Some(handle_timeline) = handle_inner_arc // locking rules: drop lock before acquiring other lock below .lock() .expect("poisoned") - .shutdown(); + .shutdown() + else { + // Concurrent PerTimelineState::shutdown. + continue; + }; + // Clean up per_timeline_state so the HandleInner allocation can be dropped. let per_timeline_state = handle_timeline.per_timeline_state(); let mut handles_lock_guard = per_timeline_state.handles.lock().expect("mutex poisoned"); let Some(handles) = &mut *handles_lock_guard else { continue; }; let Some(removed_handle_inner_arc) = handles.remove(&self.id) else { - // There could have been a shutdown inbetween us upgrading the weak and locking the mutex. + // Concurrent PerTimelineState::shutdown. continue; }; - drop(handles_lock_guard); // locking rules: remember them when! - assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc,)); + drop(handles_lock_guard); // locking rules! + assert!(Arc::ptr_eq(&removed_handle_inner_arc, &handle_inner_arc)); } } } impl HandleInner { - fn shutdown(&mut self) -> Arc { + fn shutdown(&mut self) -> Option> { match std::mem::replace(self, HandleInner::ShutDown) { - HandleInner::KeepingTimelineGateOpen { timeline, .. } => timeline, + HandleInner::KeepingTimelineGateOpen { timeline, .. } => Some(timeline), HandleInner::ShutDown => { - unreachable!("handles are only shut down once in their lifetime"); + // Duplicate shutdowns are possible because both Cache::drop and PerTimelineState::shutdown + // may do it concurrently, but locking rules disallow holding per-timeline-state lock and + // the handle lock at the same time. + None } } } From e781cf6dd82a150133621ad0165e1c6b03c844ad Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 20 Jan 2025 19:29:21 +0100 Subject: [PATCH 40/40] Compute/LFC: Apply limits consistently (#10449) Otherwise we might hit ERRORs in otherwise safe situations (such as user queries), which isn't a great user experience. ## Problem https://github.com/neondatabase/neon/pull/10376 ## Summary of changes Instead of accepting internal errors as acceptable, we ensure we don't exceed our allocated usage. 
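The resulting admission logic, stripped of locking and bookkeeping, follows this preference order (a Rust rendering for illustration only; the names are invented, and the real logic is in `file_cache.c` below):

```
// Outcomes for a chunk that is not already present in the LFC, in order of
// preference.
enum Placement {
    ReuseHole,   // 1. take an offset from the `holes` list
    AppendChunk, // 2. allocate a new chunk at the end of the cache file
    EvictLru,    // 3. steal the least-recently-used entry's chunk
    SkipWrite,   // 4. drop the write: the LFC is a cache, not a store of record
}

fn place_chunk(used: u32, limit: u32, holes_empty: bool, lru_empty: bool) -> Placement {
    if used < limit {
        // Below the limit, we may consume more space.
        if !holes_empty {
            Placement::ReuseHole
        } else {
            Placement::AppendChunk
        }
    } else if !lru_empty {
        // At the limit: recycle an unpinned entry instead of growing.
        Placement::EvictLru
    } else {
        // Every entry is pinned by concurrent IO: refuse the write rather
        // than exceed the configured limit.
        Placement::SkipWrite
    }
}

fn main() {
    assert!(matches!(place_chunk(10, 10, true, true), Placement::SkipWrite));
}
```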
--- pgxn/neon/file_cache.c | 110 ++++++++++++------- test_runner/regress/test_local_file_cache.py | 71 +++++++++++- 2 files changed, 139 insertions(+), 42 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index ad5667cbab..64b236061d 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -911,57 +911,85 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (entry->access_count++ == 0) dlist_delete(&entry->list_node); } - else + /*----------- + * If the chunk wasn't already in the LFC then we have these + * options, in order of preference: + * + * Unless there is no space available, we can: + * 1. Use an entry from the `holes` list, and + * 2. Create a new entry. + * We can always, regardless of space in the LFC: + * 3. evict an entry from LRU, and + * 4. ignore the write operation (the least favorite option) + */ + else if (lfc_ctl->used < lfc_ctl->limit) { - /* - * We have two choices if all cache pages are pinned (i.e. used in IO - * operations): - * - * 1) Wait until some of this operation is completed and pages is - * unpinned. - * - * 2) Allocate one more chunk, so that specified cache size is more - * recommendation than hard limit. - * - * As far as probability of such event (that all pages are pinned) is - * considered to be very very small: there are should be very large - * number of concurrent IO operations and them are limited by - * max_connections, we prefer not to complicate code and use second - * approach. - */ - if (lfc_ctl->used >= lfc_ctl->limit && !dlist_is_empty(&lfc_ctl->lru)) - { - /* Cache overflow: evict least recently used chunk */ - FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->lru)); - - for (int i = 0; i < BLOCKS_PER_CHUNK; i++) - { - lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; - } - CriticalAssert(victim->access_count == 0); - entry->offset = victim->offset; /* grab victim's chunk */ - hash_search_with_hash_value(lfc_hash, &victim->key, victim->hash, HASH_REMOVE, NULL); - neon_log(DEBUG2, "Swap file cache page"); - } - else if (!dlist_is_empty(&lfc_ctl->holes)) + if (!dlist_is_empty(&lfc_ctl->holes)) { /* We can reuse a hole that was left behind when the LFC was shrunk previously */ - FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, dlist_pop_head_node(&lfc_ctl->holes)); - uint32 offset = hole->offset; - bool hole_found; - - hash_search_with_hash_value(lfc_hash, &hole->key, hole->hash, HASH_REMOVE, &hole_found); + FileCacheEntry *hole = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->holes)); + uint32 offset = hole->offset; + bool hole_found; + + hash_search_with_hash_value(lfc_hash, &hole->key, + hole->hash, HASH_REMOVE, &hole_found); CriticalAssert(hole_found); - + lfc_ctl->used += 1; - entry->offset = offset; /* reuse the hole */ + entry->offset = offset; /* reuse the hole */ } else { lfc_ctl->used += 1; - entry->offset = lfc_ctl->size++; /* allocate new chunk at end - * of file */ + entry->offset = lfc_ctl->size++;/* allocate new chunk at end + * of file */ } + } + /* + * We've already used up all allocated LFC entries. + * + * If we can clear an entry from the LRU, do that. + * If we can't (e.g. because all other slots are being accessed) + * then we will remove this entry from the hash and continue + * on to the next chunk, as we may not exceed the limit. 
+ */ + else if (!dlist_is_empty(&lfc_ctl->lru)) + { + /* Cache overflow: evict least recently used chunk */ + FileCacheEntry *victim = dlist_container(FileCacheEntry, list_node, + dlist_pop_head_node(&lfc_ctl->lru)); + + for (int i = 0; i < BLOCKS_PER_CHUNK; i++) + { + lfc_ctl->used_pages -= (victim->bitmap[i >> 5] >> (i & 31)) & 1; + } + + CriticalAssert(victim->access_count == 0); + entry->offset = victim->offset; /* grab victim's chunk */ + hash_search_with_hash_value(lfc_hash, &victim->key, + victim->hash, HASH_REMOVE, NULL); + neon_log(DEBUG2, "Swap file cache page"); + } + else + { + /* Can't add this chunk - we don't have the space for it */ + hash_search_with_hash_value(lfc_hash, &entry->key, hash, + HASH_REMOVE, NULL); + + /* + * We can't process this chunk due to lack of space in LFC, + * so skip to the next one + */ + LWLockRelease(lfc_lock); + blkno += blocks_in_chunk; + buf_offset += blocks_in_chunk; + nblocks -= blocks_in_chunk; + continue; + } + + if (!found) + { entry->access_count = 1; entry->hash = hash; memset(entry->bitmap, 0, sizeof entry->bitmap); diff --git a/test_runner/regress/test_local_file_cache.py b/test_runner/regress/test_local_file_cache.py index 21c9e97a42..52ee2f32a2 100644 --- a/test_runner/regress/test_local_file_cache.py +++ b/test_runner/regress/test_local_file_cache.py @@ -7,9 +7,78 @@ import threading import time import pytest -from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder from fixtures.utils import USE_LFC, query_scalar +""" +Test whether LFC doesn't error out when the LRU is empty, but the LFC is +already at its maximum size. + +If we don't handle this safely, we might allocate more hash entries than +otherwise considered safe, thus causing ERRORs in hash_search(HASH_ENTER) once +we hit lfc->used >= lfc->limit. +""" + + +@pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") +def test_local_file_cache_all_pinned(neon_simple_env: NeonEnv): + env = neon_simple_env + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + "neon.max_file_cache_size='1MB'", + "neon.file_cache_size_limit='1MB'", + ], + ) + top_cur = endpoint.connect().cursor() + + stop = threading.Event() + n_rows = 10000 + n_threads = 5 + n_updates_per_connection = 1000 + + top_cur.execute("CREATE TABLE lfctest (id int4 PRIMARY KEY, n int) WITH (fillfactor=10)") + top_cur.execute(f"INSERT INTO lfctest SELECT g, 1 FROM generate_series(1, {n_rows}) g") + + # Start threads that will perform random UPDATEs. Each UPDATE + # increments the counter on the row, so that we can check at the + # end that the sum of all the counters match the number of updates + # performed (plus the initial 1 on each row). + # + # Furthermore, each thread will reconnect between every 1000 updates. 
+ def run_updates(n_updates_performed_q: queue.Queue[int]): + n_updates_performed = 0 + conn = endpoint.connect() + cur = conn.cursor() + while not stop.is_set(): + id = random.randint(1, n_rows) + cur.execute(f"UPDATE lfctest SET n = n + 1 WHERE id = {id}") + n_updates_performed += 1 + if n_updates_performed % n_updates_per_connection == 0: + cur.close() + conn.close() + conn = endpoint.connect() + cur = conn.cursor() + n_updates_performed_q.put(n_updates_performed) + + n_updates_performed_q: queue.Queue[int] = queue.Queue() + threads: list[threading.Thread] = [] + for _i in range(n_threads): + thread = threading.Thread(target=run_updates, args=(n_updates_performed_q,), daemon=True) + thread.start() + threads.append(thread) + + time.sleep(15) + + stop.set() + + n_updates_performed = 0 + for thread in threads: + thread.join() + n_updates_performed += n_updates_performed_q.get() + + assert query_scalar(top_cur, "SELECT SUM(n) FROM lfctest") == n_rows + n_updates_performed + @pytest.mark.skipif(not USE_LFC, reason="LFC is disabled, skipping") def test_local_file_cache_unlink(neon_env_builder: NeonEnvBuilder):