From 73c6626b381bd013064d72332c3a0a372c555877 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 15 Oct 2024 09:31:18 +0100 Subject: [PATCH 001/239] pageserver: stabilize & refine controller scale test (#8971) ## Problem We were seeing timeouts on migrations in this test. The test unfortunately tends to saturate local storage, which is shared between the pageservers and the control plane database, which makes the test kind of unrealistic. We will also want to increase the scale of this test, so it's worth fixing that. ## Summary of changes - Instead of randomly creating timelines at the same time as the other background operations, explicitly identify a subset of tenants which will have timelines, and create them at the start. This avoids pageservers putting a lot of load on the test node during the main body of the test. - Adjust the tenants created to create some number of 8 shard tenants and the rest 1 shard tenants, instead of just creating a lot of 2 shard tenants. - Use archival_config to exercise tenant-mutating operations, instead of using timeline creation for this. - Adjust reconcile_until_idle calls to avoid waiting 5 seconds between calls, which causes timeouts with large shard count tenants. - Fix a pageserver bug where calls to archival_config during activation get 404 --- libs/utils/src/http/error.rs | 7 + pageserver/src/http/routes.rs | 2 + proxy/src/serverless/http_util.rs | 4 + storage_controller/src/service.rs | 5 + test_runner/fixtures/neon_fixtures.py | 6 +- .../test_storage_controller_scale.py | 225 ++++++++++++++---- 6 files changed, 204 insertions(+), 45 deletions(-) diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs index 5e05e4e713..02fc9e3b99 100644 --- a/libs/utils/src/http/error.rs +++ b/libs/utils/src/http/error.rs @@ -28,6 +28,9 @@ pub enum ApiError { #[error("Resource temporarily unavailable: {0}")] ResourceUnavailable(Cow<'static, str>), + #[error("Too many requests: {0}")] + TooManyRequests(Cow<'static, str>), + #[error("Shutting down")] ShuttingDown, @@ -73,6 +76,10 @@ impl ApiError { err.to_string(), StatusCode::SERVICE_UNAVAILABLE, ), + ApiError::TooManyRequests(err) => HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2985ab1efb..1079d8df29 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -715,6 +715,8 @@ async fn timeline_archival_config_handler( .tenant_manager .get_attached_tenant_shard(tenant_shard_id)?; + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + tenant .apply_timeline_archival_config(timeline_id, request_data.state, ctx) .await?; diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index 87a72ec5f0..c1c5764d17 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -41,6 +41,10 @@ pub(crate) fn api_error_into_response(this: ApiError) -> Response HttpErrorBody::response_from_msg_and_status( + err.to_string(), + StatusCode::TOO_MANY_REQUESTS, + ), ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status( err.to_string(), StatusCode::REQUEST_TIMEOUT, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cc735dc27e..cedee54534 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@
-246,6 +246,11 @@ fn passthrough_api_error(node: &Node, e: mgmt_api::Error) -> ApiError { // storage controller's auth configuration. ApiError::InternalServerError(anyhow::anyhow!("{node} {status}: {msg}")) } + mgmt_api::Error::ApiError(status @ StatusCode::TOO_MANY_REQUESTS, msg) => { + // Pass through 429 errors: if pageserver is asking us to wait + retry, we in + // turn ask our clients to wait + retry + ApiError::Conflict(format!("{node} {status}: {status} {msg}")) + } mgmt_api::Error::ApiError(status, msg) => { // Presume general case of pageserver API errors is that we tried to do something // that can't be done right now. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 059707c8ed..a313ac2ed3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1986,11 +1986,11 @@ class NeonStorageController(MetricsGetter, LogUtils): log.info(f"reconcile_all waited for {n} shards") return n - def reconcile_until_idle(self, timeout_secs=30): + def reconcile_until_idle(self, timeout_secs=30, max_interval=5): start_at = time.time() n = 1 - delay_sec = 0.5 - delay_max = 5 + delay_sec = 0.1 + delay_max = max_interval while n > 0: n = self.reconcile_all() if n == 0: diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 452a856714..d2eba751f8 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -4,9 +4,10 @@ import concurrent.futures import random import time from collections import defaultdict +from enum import Enum import pytest -from fixtures.common_types import TenantId, TenantShardId, TimelineId +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -34,6 +35,7 @@ def get_consistent_node_shard_counts(env: NeonEnv, total_shards) -> defaultdict[ if tenant_placement[tid]["intent"]["attached"] == tenant_placement[tid]["observed"]["attached"] } + assert len(matching) == total_shards attached_per_node: defaultdict[str, int] = defaultdict(int) @@ -107,15 +109,48 @@ def test_storage_controller_many_tenants( ps.allowed_errors.append(".*request was dropped before completing.*") # Total tenants - tenant_count = 4000 + small_tenant_count = 7800 + large_tenant_count = 200 + tenant_count = small_tenant_count + large_tenant_count + large_tenant_shard_count = 8 + total_shards = small_tenant_count + large_tenant_count * large_tenant_shard_count - # Shards per tenant - shard_count = 2 - stripe_size = 1024 + # A small stripe size to encourage all shards to get some data + stripe_size = 1 - total_shards = tenant_count * shard_count + # We use a fixed seed to make the test somewhat reproducible: we want a randomly + # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. 
+ rng = random.Random(1234) - tenants = set(TenantId.generate() for _i in range(0, tenant_count)) + class Tenant: + def __init__(self): + # Tenants may optionally contain a timeline + self.timeline_id = None + + # Tenants may be marked as 'large' to get multiple shard during creation phase + self.large = False + + tenant_ids = list(TenantId.generate() for _i in range(0, tenant_count)) + tenants = dict((tid, Tenant()) for tid in tenant_ids) + + # We will create timelines in only a subset of tenants, because creating timelines + # does many megabytes of IO, and we want to densely simulate huge tenant counts on + # a single test node. + tenant_timelines_count = 100 + + # These lists are maintained for use with rng.choice + tenants_with_timelines = list(rng.sample(tenants.keys(), tenant_timelines_count)) + tenants_without_timelines = list( + tenant_id for tenant_id in tenants if tenant_id not in tenants_with_timelines + ) + + # For our sharded tenants, we will make half of them with timelines and half without + assert large_tenant_count >= tenant_timelines_count / 2 + for tenant_id in tenants_with_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True + + for tenant_id in tenants_without_timelines[0 : large_tenant_count // 2]: + tenants[tenant_id].large = True virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) @@ -125,23 +160,39 @@ def test_storage_controller_many_tenants( rss = env.storage_controller.get_metric_value("process_resident_memory_bytes") assert rss is not None - log.info(f"Resident memory: {rss} ({ rss / (shard_count * tenant_count)} per shard)") - assert rss < expect_memory_per_shard * shard_count * tenant_count - - # We use a fixed seed to make the test somewhat reproducible: we want a randomly - # chosen order in the sense that it's arbitrary, but not in the sense that it should change every run. - rng = random.Random(1234) + log.info(f"Resident memory: {rss} ({ rss / total_shards} per shard)") + assert rss < expect_memory_per_shard * total_shards # Issue more concurrent operations than the storage controller's reconciler concurrency semaphore # permits, to ensure that we are exercising stressing that. api_concurrency = 135 - # We will create tenants directly via API, not via neon_local, to avoid any false - # serialization of operations in neon_local (it e.g. loads/saves a config file on each call) - with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: - futs = [] + # A different concurrency limit for bulk tenant+timeline creations: these do I/O and will + # start timing on test nodes if we aren't a bit careful. + create_concurrency = 16 + + class Operation(str, Enum): + TIMELINE_OPS = "timeline_ops" + SHARD_MIGRATE = "shard_migrate" + TENANT_PASSTHROUGH = "tenant_passthrough" + + run_ops = api_concurrency * 4 + assert run_ops < len(tenants) + + # Creation phase: make a lot of tenants, and create timelines in a subset of them + # This executor has concurrency set modestly, to avoid overloading pageservers with timeline creations. + with concurrent.futures.ThreadPoolExecutor(max_workers=create_concurrency) as executor: + tenant_create_futs = [] t1 = time.time() - for tenant_id in tenants: + + for tenant_id, tenant in tenants.items(): + if tenant.large: + shard_count = large_tenant_shard_count + else: + shard_count = 1 + + # We will create tenants directly via API, not via neon_local, to avoid any false + # serialization of operations in neon_local (it e.g. 
loads/saves a config file on each call) f = executor.submit( env.storage_controller.tenant_create, tenant_id, @@ -152,44 +203,106 @@ def test_storage_controller_many_tenants( tenant_config={"heatmap_period": "10s"}, placement_policy={"Attached": 1}, ) - futs.append(f) + tenant_create_futs.append(f) - # Wait for creations to finish - for f in futs: + # Wait for tenant creations to finish + for f in tenant_create_futs: f.result() log.info( f"Created {len(tenants)} tenants in {time.time() - t1}, {len(tenants) / (time.time() - t1)}/s" ) - run_ops = api_concurrency * 4 - assert run_ops < len(tenants) - op_tenants = list(tenants)[0:run_ops] + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + + # Create timelines in those tenants which are going to get one + t1 = time.time() + timeline_create_futs = [] + for tenant_id in tenants_with_timelines: + timeline_id = TimelineId.generate() + tenants[tenant_id].timeline_id = timeline_id + f = executor.submit( + env.storage_controller.pageserver_api().timeline_create, + PgVersion.NOT_SET, + tenant_id, + timeline_id, + ) + timeline_create_futs.append(f) + + for f in timeline_create_futs: + f.result() + log.info( + f"Created {len(tenants_with_timelines)} timelines in {time.time() - t1}, {len(tenants_with_timelines) / (time.time() - t1)}/s" + ) + + # Plan operations: ensure each tenant with a timeline gets at least + # one of each operation type. Then add other tenants to make up the + # numbers. + ops_plan = [] + for tenant_id in tenants_with_timelines: + ops_plan.append((tenant_id, Operation.TIMELINE_OPS)) + ops_plan.append((tenant_id, Operation.SHARD_MIGRATE)) + ops_plan.append((tenant_id, Operation.TENANT_PASSTHROUGH)) + + # Fill up remaining run_ops with migrations of tenants without timelines + other_migrate_tenants = rng.sample(tenants_without_timelines, run_ops - len(ops_plan)) + + for tenant_id in other_migrate_tenants: + ops_plan.append( + ( + tenant_id, + rng.choice([Operation.SHARD_MIGRATE, Operation.TENANT_PASSTHROUGH]), + ) + ) + + # Exercise phase: pick pseudo-random operations to do on the tenants + timelines + # This executor has concurrency high enough to stress the storage controller API. + with concurrent.futures.ThreadPoolExecutor(max_workers=api_concurrency) as executor: + + def exercise_timeline_ops(tenant_id, timeline_id): + # A read operation: this requires looking up shard zero and routing there + detail = virtual_ps_http.timeline_detail(tenant_id, timeline_id) + assert detail["timeline_id"] == str(timeline_id) + + # A fan-out write operation to all shards in a tenant. + # - We use a metadata operation rather than something like a timeline create, because + # timeline creations are I/O intensive and this test isn't meant to be a stress test for + # doing lots of concurrent timeline creations. 
+ archival_state = rng.choice( + [TimelineArchivalState.ARCHIVED, TimelineArchivalState.UNARCHIVED] + ) + virtual_ps_http.timeline_archival_config(tenant_id, timeline_id, archival_state) # Generate a mixture of operations and dispatch them all concurrently futs = [] - for tenant_id in op_tenants: - op = rng.choice([0, 1, 2]) - if op == 0: - # A fan-out write operation to all shards in a tenant (timeline creation) + for tenant_id, op in ops_plan: + if op == Operation.TIMELINE_OPS: + op_timeline_id = tenants[tenant_id].timeline_id + assert op_timeline_id is not None + + # Exercise operations that modify tenant scheduling state but require traversing + # the fan-out-to-all-shards functionality. f = executor.submit( - virtual_ps_http.timeline_create, - PgVersion.NOT_SET, + exercise_timeline_ops, tenant_id, - TimelineId.generate(), + op_timeline_id, ) - elif op == 1: + elif op == Operation.SHARD_MIGRATE: # A reconciler operation: migrate a shard. - shard_number = rng.randint(0, shard_count - 1) - tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + desc = env.storage_controller.tenant_describe(tenant_id) + + shard_number = rng.randint(0, len(desc["shards"]) - 1) + tenant_shard_id = TenantShardId(tenant_id, shard_number, len(desc["shards"])) # Migrate it to its secondary location - desc = env.storage_controller.tenant_describe(tenant_id) dest_ps_id = desc["shards"][shard_number]["node_secondary"][0] f = executor.submit( env.storage_controller.tenant_shard_migrate, tenant_shard_id, dest_ps_id ) - elif op == 2: + elif op == Operation.TENANT_PASSTHROUGH: # A passthrough read to shard zero f = executor.submit(virtual_ps_http.tenant_status, tenant_id) @@ -199,10 +312,18 @@ def test_storage_controller_many_tenants( for f in futs: f.result() + log.info("Completed mixed operations phase") + # Some of the operations above (notably migrations) might leave the controller in a state where it has # some work to do, for example optimizing shard placement after we do a random migration. Wait for the system # to reach a quiescent state before doing following checks. - env.storage_controller.reconcile_until_idle() + # + # - Set max_interval low because we probably have a significant number of optimizations to complete and would like + # the test to run quickly. + # - Set timeout high because we might be waiting for optimizations that reuqire a secondary + # to warm up, and if we just started a secondary in the previous step, it might wait some time + # before downloading its heatmap + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() check_memory() @@ -213,6 +334,7 @@ def test_storage_controller_many_tenants( # # We do not require that the system is quiescent already here, although at present in this point in the test # that may be the case. 
+ log.info("Reconciling all & timing") while True: t1 = time.time() reconcilers = env.storage_controller.reconcile_all() @@ -225,6 +347,7 @@ def test_storage_controller_many_tenants( break # Restart the storage controller + log.info("Restarting controller") env.storage_controller.stop() env.storage_controller.start() @@ -246,7 +369,16 @@ def test_storage_controller_many_tenants( # Restart pageservers gracefully: this exercises the /re-attach pageserver API # and the storage controller drain and fill API + log.info("Restarting pageservers...") + + # Parameters for how long we expect it to take to migrate all of the tenants from/to + # a node during a drain/fill operation + DRAIN_FILL_TIMEOUT = 240 + DRAIN_FILL_BACKOFF = 5 + for ps in env.pageservers: + log.info(f"Draining pageserver {ps.id}") + t1 = time.time() env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_drain(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -255,9 +387,10 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.PAUSE_FOR_RESTART, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Drained pageserver {ps.id} in {time.time() - t1}s") shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after draining node {ps.id}: {shard_counts}") @@ -275,6 +408,7 @@ def test_storage_controller_many_tenants( backoff=1, ) + log.info(f"Filling pageserver {ps.id}") env.storage_controller.retryable_node_operation( lambda ps_id: env.storage_controller.node_fill(ps_id), ps.id, max_attempts=3, backoff=2 ) @@ -282,16 +416,23 @@ def test_storage_controller_many_tenants( ps.id, PageserverAvailability.ACTIVE, PageserverSchedulingPolicy.ACTIVE, - max_attempts=24, - backoff=5, + max_attempts=DRAIN_FILL_TIMEOUT // DRAIN_FILL_BACKOFF, + backoff=DRAIN_FILL_BACKOFF, ) + log.info(f"Filled pageserver {ps.id} in {time.time() - t1}s") + + # Waiting for optimizer to stabilize, if it disagrees with scheduling (the correct behavior + # would be for original scheduling decisions to always match optimizer's preference) + # (workaround for https://github.com/neondatabase/neon/issues/8969) + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) + shard_counts = get_consistent_node_shard_counts(env, total_shards) log.info(f"Shard counts after filling node {ps.id}: {shard_counts}") assert_consistent_balanced_attachments(env, total_shards) - env.storage_controller.reconcile_until_idle() + env.storage_controller.reconcile_until_idle(max_interval=0.1, timeout_secs=120) env.storage_controller.consistency_check() # Consistency check is safe here: restarting pageservers should not have caused any Reconcilers to spawn, From ec4cc30de9bc1140761a7f8b7e4a5886c4d3b4c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 15 Oct 2024 11:46:51 +0200 Subject: [PATCH 002/239] Shut down timelines during offload and add offload tests (#9289) Add a test for timeline offloading, and subsequent unoffloading. Also adds a manual endpoint, and issues a proper timeline shutdown during offloading which prevents a pageserver hang at shutdown. Part of #8088. 
--- pageserver/src/http/routes.rs | 49 ++++++++++++ pageserver/src/tenant.rs | 29 +++++++ pageserver/src/tenant/timeline/offload.rs | 3 + test_runner/fixtures/pageserver/http.py | 16 ++++ test_runner/regress/test_timeline_archive.py | 84 ++++++++++++++++++++ 5 files changed, 181 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1079d8df29..dd403c1cef 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -77,6 +77,7 @@ use crate::tenant::secondary::SecondaryController; use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; +use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -325,6 +326,7 @@ impl From for ApiError { match value { NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found").into()), Timeout => ApiError::Timeout("hit pageserver internal timeout".into()), + Cancelled => ApiError::ShuttingDown, e @ HasArchivedParent(_) => { ApiError::PreconditionFailed(e.to_string().into_boxed_str()) } @@ -1785,6 +1787,49 @@ async fn timeline_compact_handler( .await } +// Run offload immediately on given timeline. +async fn timeline_offload_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + if tenant.get_offloaded_timeline(timeline_id).is_ok() { + return json_response(StatusCode::OK, ()); + } + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + if !tenant.timeline_has_no_attached_children(timeline_id) { + return Err(ApiError::PreconditionFailed( + "timeline has attached children".into(), + )); + } + if !timeline.can_offload() { + return Err(ApiError::PreconditionFailed( + "Timeline::can_offload() returned false".into(), + )); + } + offload_timeline(&tenant, &timeline) + .await + .map_err(ApiError::InternalServerError)?; + + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("manual_timeline_offload", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run checkpoint immediately on given timeline. 
async fn timeline_checkpoint_handler( request: Request, @@ -3008,6 +3053,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .put( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", + |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint", |r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 397778d4c8..44d1bb74ca 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -619,6 +619,9 @@ pub enum TimelineArchivalError { #[error("Timeout")] Timeout, + #[error("Cancelled")] + Cancelled, + #[error("ancestor is archived: {}", .0)] HasArchivedParent(TimelineId), @@ -637,6 +640,7 @@ impl Debug for TimelineArchivalError { match self { Self::NotFound => write!(f, "NotFound"), Self::Timeout => write!(f, "Timeout"), + Self::Cancelled => write!(f, "Cancelled"), Self::HasArchivedParent(p) => f.debug_tuple("HasArchivedParent").field(p).finish(), Self::HasUnarchivedChildren(c) => { f.debug_tuple("HasUnarchivedChildren").field(c).finish() @@ -1552,6 +1556,7 @@ impl Tenant { timeline_id: TimelineId, ctx: RequestContext, ) -> Result, TimelineArchivalError> { + info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) @@ -1566,6 +1571,7 @@ impl Tenant { error!(%timeline_id, "index_part not found on remote"); return Err(TimelineArchivalError::NotFound); } + Err(DownloadError::Cancelled) => return Err(TimelineArchivalError::Cancelled), Err(e) => { // Some (possibly ephemeral) error happened during index_part download. warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); @@ -1603,6 +1609,7 @@ impl Tenant { if offloaded_timelines.remove(&timeline_id).is_none() { warn!("timeline already removed from offloaded timelines"); } + info!("timeline unoffloading complete"); Ok(Arc::clone(timeline)) } else { warn!("timeline not available directly after attach"); @@ -1683,6 +1690,21 @@ impl Tenant { Ok(()) } + pub fn get_offloaded_timeline( + &self, + timeline_id: TimelineId, + ) -> Result, GetTimelineError> { + self.timelines_offloaded + .lock() + .unwrap() + .get(&timeline_id) + .map(Arc::clone) + .ok_or(GetTimelineError::NotFound { + tenant_id: self.tenant_shard_id, + timeline_id, + }) + } + pub(crate) fn tenant_shard_id(&self) -> TenantShardId { self.tenant_shard_id } @@ -2218,6 +2240,13 @@ impl Tenant { } } + pub fn timeline_has_no_attached_children(&self, timeline_id: TimelineId) -> bool { + let timelines = self.timelines.lock().unwrap(); + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(timeline_id)) + } + pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index fb906d906b..7e6084baaf 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -19,6 +19,9 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. 
+ timeline.shutdown(super::ShutdownMode::Hard).await; + // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index aa4435af4e..18d65cb7de 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -583,6 +583,22 @@ class PageserverHttpClient(requests.Session, MetricsGetter): log.info(f"Got GC request response code: {res.status_code}") self.verbose_error(res) + def timeline_offload( + self, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + ): + self.is_testing_enabled_or_skip() + + log.info(f"Requesting offload: tenant {tenant_id}, timeline {timeline_id}") + res = self.put( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/offload", + ) + log.info(f"Got offload request response code: {res.status_code}") + self.verbose_error(res) + res_json = res.json() + assert res_json is None + def timeline_compact( self, tenant_id: Union[TenantId, TenantShardId], diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 841707d32e..971cc57a1c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -6,6 +6,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.utils import wait_until @pytest.mark.parametrize("shard_count", [0, 4]) @@ -114,3 +115,86 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): leaf_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) + + +@pytest.mark.parametrize("manual_offload", [False, True]) +def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, initial_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s" if manual_offload else "1s", + } + ) + + # Create two branches and archive them + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded(timeline_id: TimelineId) -> bool: + return ( + env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") + is not None + ) + + if manual_offload: + with pytest.raises( + PageserverApiException, + match="timeline has attached children", + ): + # This only tests the (made for testing only) http handler, + # but still demonstrates the constraints we have. 
+ ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + + def parent_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) + assert timeline_offloaded(parent_timeline_id) + + def leaf_offloaded(): + if manual_offload: + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) + assert timeline_offloaded(leaf_timeline_id) + + wait_until(30, 1, leaf_offloaded) + wait_until(30, 1, parent_offloaded) + + ps_http.timeline_archival_config( + tenant_id, + parent_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + ps_http.timeline_archival_config( + tenant_id, + leaf_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + leaf_timeline_id, + ) + assert leaf_detail["is_archived"] is False + + assert not timeline_offloaded(initial_timeline_id) From d92d36a315f955cd39bc6f6b0948bae25ed195ad Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 15 Oct 2024 13:13:57 +0100 Subject: [PATCH 003/239] [local_proxy] update api for pg_session_jwt (#9359) pg_session_jwt now: 1. Sets the JWK in a PGU_BACKEND session guc, no longer in the init() function. 2. JWK no longer needs the kid. --- Cargo.lock | 7 +- Cargo.toml | 1 + compute/Dockerfile.compute-node | 4 +- proxy/Cargo.toml | 3 +- proxy/src/serverless/backend.rs | 49 ++++---- proxy/src/serverless/local_conn_pool.rs | 143 ++++++++++++++++-------- workspace_hack/Cargo.toml | 6 +- 7 files changed, 139 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5edf5cf7b4..7e772814ec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2695,6 +2695,7 @@ checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e" dependencies = [ "equivalent", "hashbrown 0.14.5", + "serde", ] [[package]] @@ -2794,9 +2795,9 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.6" +version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" @@ -4296,6 +4297,7 @@ dependencies = [ "indexmap 2.0.1", "ipnet", "itertools 0.10.5", + "itoa", "jose-jwa", "jose-jwk", "lasso", @@ -7307,6 +7309,7 @@ dependencies = [ "hyper 1.4.1", "hyper-util", "indexmap 1.9.3", + "indexmap 2.0.1", "itertools 0.12.1", "lazy_static", "libc", diff --git a/Cargo.toml b/Cargo.toml index dde80f5020..a1a974b33b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -107,6 +107,7 @@ indexmap = "2" indoc = "2" ipnet = "2.9.0" itertools = "0.10" +itoa = "1.0.11" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 91528618da..412c64eda4 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -929,8 +929,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/ff0a72440e8ff584dab24b3f9b7c00c56c660b8e.tar.gz -O pg_session_jwt.tar.gz && \ - echo "1fbb2b5a339263bcf6daa847fad8bccbc0b451cea6a62e6d3bf232b0087f05cb pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ + echo 
"5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 963fb94a7d..e25d2fcbab 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -42,9 +42,10 @@ hyper0.workspace = true hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } -indexmap.workspace = true +indexmap = { workspace = true, features = ["serde"] } ipnet.workspace = true itertools.workspace = true +itoa.workspace = true lasso = { workspace = true, features = ["multi-threaded"] } measured = { workspace = true, features = ["lasso"] } metrics.workspace = true diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 2b060af9e1..927854897f 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -2,8 +2,9 @@ use std::{io, sync::Arc, time::Duration}; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; +use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tokio_postgres::types::ToSql; use tracing::{debug, field::display, info}; use crate::{ @@ -267,50 +268,58 @@ impl PoolingBackend { auth::Backend::Local(local) => local.node_info.clone(), }; + let (key, jwk) = create_random_jwk(); + let config = node_info .config .user(&conn_info.user_info.user) - .dbname(&conn_info.dbname); + .dbname(&conn_info.dbname) + .options(&format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(tokio_postgres::NoTls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); + let pid = client.get_process_id(); + tracing::Span::current().record("pid", pid); - let handle = local_conn_pool::poll_client( + let mut handle = local_conn_pool::poll_client( self.local_pool.clone(), ctx, conn_info, client, connection, + key, conn_id, node_info.aux.clone(), ); - let kid = handle.get_client().get_process_id() as i64; - let jwk = p256::PublicKey::from(handle.key().verifying_key()).to_jwk(); + { + let (client, mut discard) = handle.inner(); + debug!("setting up backend session state"); - debug!(kid, ?jwk, "setting up backend session state"); + // initiates the auth session + if let Err(e) = client.query("select auth.init()", &[]).await { + discard.discard(); + return Err(e.into()); + } - // initiates the auth session - handle - .get_client() - .query( - "select auth.init($1, $2);", - &[ - &kid as &(dyn ToSql + Sync), - &tokio_postgres::types::Json(jwk), - ], - ) - .await?; - - info!(?kid, "backend session state init"); + info!("backend session state initialized"); + } Ok(handle) } } +fn create_random_jwk() -> (SigningKey, JwkEcKey) { + let key = SigningKey::random(&mut OsRng); + let jwk = p256::PublicKey::from(key.verifying_key()).to_jwk(); + (key, jwk) +} + #[derive(Debug, thiserror::Error)] pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent 
state")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 1dde5952e1..4ab14ad35f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,9 +1,9 @@ use futures::{future::poll_fn, Future}; +use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; -use rand::rngs::OsRng; -use serde_json::Value; +use serde_json::value::RawValue; use signature::Signer; use std::task::{ready, Poll}; use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; @@ -12,14 +12,13 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; -use typed_json::json; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{context::RequestMonitoring, DbName, RoleName}; -use tracing::{debug, error, warn, Span}; +use tracing::{error, warn, Span}; use tracing::{info, info_span, Instrument}; use super::backend::HttpConnError; @@ -245,12 +244,14 @@ impl LocalConnPool { } } +#[allow(clippy::too_many_arguments)] pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, client: tokio_postgres::Client, mut connection: tokio_postgres::Connection, + key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> LocalClient { @@ -346,8 +347,6 @@ pub(crate) fn poll_client( } .instrument(span)); - let key = SigningKey::random(&mut OsRng); - let inner = ClientInner { inner: client, session: tx, @@ -430,13 +429,6 @@ impl LocalClient { let inner = inner.as_mut().expect("client inner should not be removed"); (&mut inner.inner, Discard { conn_info, pool }) } - pub(crate) fn key(&self) -> &SigningKey { - let inner = &self - .inner - .as_ref() - .expect("client inner should not be removed"); - &inner.key - } } impl LocalClient { @@ -445,25 +437,9 @@ impl LocalClient { .inner .as_mut() .expect("client inner should not be removed"); + inner.jti += 1; - - let kid = inner.inner.get_process_id(); - let header = json!({"kid":kid}).to_string(); - - let mut payload = serde_json::from_slice::>(payload) - .map_err(HttpConnError::JwtPayloadError)?; - payload.insert("jti".to_string(), Value::Number(inner.jti.into())); - let payload = Value::Object(payload).to_string(); - - debug!( - kid, - jti = inner.jti, - ?header, - ?payload, - "signing new ephemeral JWT" - ); - - let token = sign_jwt(&inner.key, header, payload); + let token = resign_jwt(&inner.key, payload, inner.jti)?; // initiates the auth session inner.inner.simple_query("discard all").await?; @@ -475,20 +451,74 @@ impl LocalClient { ) .await?; - info!(kid, jti = inner.jti, "user session state init"); + let pid = inner.inner.get_process_id(); + info!(pid, jti = inner.jti, "user session state init"); Ok(()) } } -fn sign_jwt(sk: &SigningKey, header: String, payload: String) -> String { - let header = Base64UrlUnpadded::encode_string(header.as_bytes()); - let payload = Base64UrlUnpadded::encode_string(payload.as_bytes()); +/// implements relatively efficient in-place json object key upserting +/// +/// only supports top-level keys +fn upsert_json_object( + payload: &[u8], + key: &str, + value: &RawValue, +) -> Result { + let mut payload = serde_json::from_slice::>(payload)?; + payload.insert(key, value); + 
serde_json::to_string(&payload) +} - let message = format!("{header}.{payload}"); - let sig: Signature = sk.sign(message.as_bytes()); - let base64_sig = Base64UrlUnpadded::encode_string(&sig.to_bytes()); - format!("{message}.{base64_sig}") +fn resign_jwt(sk: &SigningKey, payload: &[u8], jti: u64) -> Result { + let mut buffer = itoa::Buffer::new(); + + // encode the jti integer to a json rawvalue + let jti = serde_json::from_str::<&RawValue>(buffer.format(jti)).unwrap(); + + // update the jti in-place + let payload = + upsert_json_object(payload, "jti", jti).map_err(HttpConnError::JwtPayloadError)?; + + // sign the jwt + let token = sign_jwt(sk, payload.as_bytes()); + + Ok(token) +} + +fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { + let header_len = 20; + let payload_len = Base64UrlUnpadded::encoded_len(payload); + let signature_len = Base64UrlUnpadded::encoded_len(&[0; 64]); + let total_len = header_len + payload_len + signature_len + 2; + + let mut jwt = String::with_capacity(total_len); + let cap = jwt.capacity(); + + // we only need an empty header with the alg specified. + // base64url(r#"{"alg":"ES256"}"#) == "eyJhbGciOiJFUzI1NiJ9" + jwt.push_str("eyJhbGciOiJFUzI1NiJ9."); + + // encode the jwt payload in-place + base64::encode_config_buf(payload, base64::URL_SAFE_NO_PAD, &mut jwt); + + // create the signature from the encoded header || payload + let sig: Signature = sk.sign(jwt.as_bytes()); + + jwt.push('.'); + + // encode the jwt signature in-place + base64::encode_config_buf(sig.to_bytes(), base64::URL_SAFE_NO_PAD, &mut jwt); + + debug_assert_eq!( + jwt.len(), + total_len, + "the jwt len should match our expected len" + ); + debug_assert_eq!(jwt.capacity(), cap, "the jwt capacity should not change"); + + jwt } impl Discard<'_, C> { @@ -509,14 +539,6 @@ impl Discard<'_, C> { } impl LocalClient { - pub fn get_client(&self) -> &C { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } - fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -542,3 +564,30 @@ impl Drop for LocalClient { } } } + +#[cfg(test)] +mod tests { + use p256::ecdsa::SigningKey; + use typed_json::json; + + use super::resign_jwt; + + #[test] + fn jwt_token_snapshot() { + let key = SigningKey::from_bytes(&[1; 32].into()).unwrap(); + let data = + json!({"foo":"bar","jti":"foo\nbar","nested":{"jti":"tricky nesting"}}).to_string(); + + let jwt = resign_jwt(&key, data.as_bytes(), 2).unwrap(); + + // To validate the JWT, copy the JWT string and paste it into https://jwt.io/. 
+ // In the public-key box, paste the following jwk public key + // `{"kty":"EC","crv":"P-256","x":"b_A7lJJBzh2t1DUZ5pYOCoW0GmmgXDKBA6orzhWUyhY","y":"PE91OlW_AdxT9sCwx-7ni0DG_30lqW4igrmJzvccFEo"}` + + // let pub_key = p256::ecdsa::VerifyingKey::from(&key); + // let pub_key = p256::PublicKey::from(pub_key); + // println!("{}", pub_key.to_jwk_string()); + + assert_eq!(jwt, "eyJhbGciOiJFUzI1NiJ9.eyJmb28iOiJiYXIiLCJqdGkiOjIsIm5lc3RlZCI6eyJqdGkiOiJ0cmlja3kgbmVzdGluZyJ9fQ.pYf0LxoJ8sDgpmsYOgrbNecOSipnPBEGwnZzB-JhW2cONrKlqRsgXwK8_cOsyolGy-hTTe8GXbWTl_UdpF5RyA"); + } +} diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 0a90b6b6f7..1347d6ddff 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -46,7 +46,8 @@ hmac = { version = "0.12", default-features = false, features = ["reset"] } hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] } hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] } hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } @@ -101,7 +102,8 @@ either = { version = "1" } getrandom = { version = "0.2", default-features = false, features = ["std"] } half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } -indexmap = { version = "1", default-features = false, features = ["std"] } +indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } +indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } From fb74c21e8cae23831b7728232772315297463e63 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 15 Oct 2024 15:24:56 +0200 Subject: [PATCH 004/239] proxy: Migrate jwt module away from anyhow (#9361) --- proxy/src/auth/backend/jwt.rs | 188 +++++++++++++++++------ proxy/src/auth/backend/local.rs | 6 +- proxy/src/auth/backend/mod.rs | 3 +- proxy/src/control_plane/provider/mock.rs | 10 +- proxy/src/control_plane/provider/mod.rs | 43 +++++- proxy/src/control_plane/provider/neon.rs | 27 ++-- proxy/src/proxy/tests/mod.rs | 42 ++--- proxy/src/proxy/wake_compute.rs | 2 +- 8 files changed, 228 insertions(+), 93 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 17ab7eda22..402e59fdb3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -4,21 +4,20 @@ use std::{ time::{Duration, SystemTime}, }; -use anyhow::{bail, ensure, Context}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; use serde::{de::Visitor, Deserialize, Deserializer}; use signature::Verifier; +use thiserror::Error; use tokio::time::Instant; use crate::{ - context::RequestMonitoring, http::parse_json_body_with_limit, intern::RoleNameInt, EndpointId, - RoleName, + auth::backend::ComputeCredentialKeys, 
context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, + intern::RoleNameInt, EndpointId, RoleName, }; -use super::ComputeCredentialKeys; - // TODO(conrad): make these configurable. const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); const MIN_RENEW: Duration = Duration::from_secs(30); @@ -32,7 +31,16 @@ pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> impl Future>> + Send; + ) -> impl Future, FetchAuthRulesError>> + Send; +} + +#[derive(Error, Debug)] +pub(crate) enum FetchAuthRulesError { + #[error(transparent)] + GetEndpointJwks(#[from] GetEndpointJwksError), + + #[error("JWKs settings for this role were not configured")] + RoleJwksNotConfigured, } pub(crate) struct AuthRule { @@ -122,7 +130,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, auth_rules: &F, - ) -> anyhow::Result> { + ) -> Result, JwtError> { // double check that no one beat us to updating the cache. let now = Instant::now(); let guard = self.cached.load_full(); @@ -188,7 +196,7 @@ impl JwkCacheEntryLock { client: &reqwest::Client, endpoint: EndpointId, fetch: &F, - ) -> Result, anyhow::Error> { + ) -> Result, JwtError> { let now = Instant::now(); let guard = self.cached.load_full(); @@ -243,27 +251,24 @@ impl JwkCacheEntryLock { endpoint: EndpointId, role_name: &RoleName, fetch: &F, - ) -> Result { + ) -> Result { // JWT compact form is defined to be // || . || || . || // where Signature = alg( || . || ); let (header_payload, signature) = jwt .rsplit_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; let (header, payload) = header_payload .split_once('.') - .context("Provided authentication token is not a valid JWT encoding")?; + .ok_or(JwtEncodingError::InvalidCompactForm)?; - let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; - let header = serde_json::from_slice::>(&header) - .context("Provided authentication token is not a valid JWT encoding")?; + let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; + let header = serde_json::from_slice::>(&header)?; - let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD) - .context("Provided authentication token is not a valid JWT encoding")?; + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; - let kid = header.key_id.context("missing key id")?; + let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; let mut guard = self .get_or_update_jwk_cache(ctx, client, endpoint.clone(), fetch) @@ -281,16 +286,13 @@ impl JwkCacheEntryLock { .renew_jwks(permit, ctx, client, endpoint.clone(), fetch) .await?; } - _ => { - bail!("jwk not found"); - } + _ => return Err(JwtError::JwkNotFound), } }; - ensure!( - jwk.is_supported(&header.algorithm), - "signature algorithm not supported" - ); + if !jwk.is_supported(&header.algorithm) { + return Err(JwtError::SignatureAlgorithmNotSupported); + } match &jwk.key { jose_jwk::Key::Ec(key) => { @@ -299,34 +301,32 @@ impl JwkCacheEntryLock { jose_jwk::Key::Rsa(key) => { verify_rsa_signature(header_payload.as_bytes(), &sig, key, &header.algorithm)?; } - key => bail!("unsupported key type {key:?}"), + key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD) - .context("Provided authentication 
token is not a valid JWT encoding")?; - let payload = serde_json::from_slice::>(&payloadb) - .context("Provided authentication token is not a valid JWT encoding")?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { - ensure!( - payload.audience.0.iter().any(|s| s == aud), - "invalid JWT token audience" - ); + if payload.audience.0.iter().all(|s| s != aud) { + return Err(JwtError::InvalidJwtTokenAudience); + } } let now = SystemTime::now(); if let Some(exp) = payload.expiration { - ensure!(now < exp + CLOCK_SKEW_LEEWAY, "JWT token has expired"); + if now >= exp + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenHasExpired); + } } if let Some(nbf) = payload.not_before { - ensure!( - nbf < now + CLOCK_SKEW_LEEWAY, - "JWT token is not yet ready to use" - ); + if nbf >= now + CLOCK_SKEW_LEEWAY { + return Err(JwtError::JwtTokenNotYetReadyToUse); + } } Ok(ComputeCredentialKeys::JwtPayload(payloadb)) @@ -341,7 +341,7 @@ impl JwkCache { role_name: &RoleName, fetch: &F, jwt: &str, - ) -> Result { + ) -> Result { // try with just a read lock first let key = (endpoint.clone(), role_name.clone()); let entry = self.map.get(&key).as_deref().map(Arc::clone); @@ -357,19 +357,18 @@ impl JwkCache { } } -fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> anyhow::Result<()> { +fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; match key.crv { jose_jwk::EcCurves::P256 => { - let pk = - p256::PublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid P256 key"))?; + let pk = p256::PublicKey::try_from(key).map_err(JwtError::InvalidP256Key)?; let key = p256::ecdsa::VerifyingKey::from(&pk); let sig = Signature::from_slice(sig)?; key.verify(data, &sig)?; } - key => bail!("unsupported ec key type {key:?}"), + key => return Err(JwtError::UnsupportedEcKeyType(key)), } Ok(()) @@ -380,14 +379,14 @@ fn verify_rsa_signature( sig: &[u8], key: &jose_jwk::Rsa, alg: &jose_jwa::Algorithm, -) -> anyhow::Result<()> { +) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; use rsa::{ pkcs1v15::{Signature, VerifyingKey}, RsaPublicKey, }; - let key = RsaPublicKey::try_from(key).map_err(|_| anyhow::anyhow!("invalid RSA key"))?; + let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; match alg { Algorithm::Signing(Signing::Rs256) => { @@ -395,7 +394,7 @@ fn verify_rsa_signature( let sig = Signature::try_from(sig)?; key.verify(data, &sig)?; } - _ => bail!("invalid RSA signing algorithm"), + _ => return Err(JwtError::InvalidRsaSigningAlgorithm), }; Ok(()) @@ -561,6 +560,99 @@ impl Drop for JwkRenewalPermit<'_> { } } +#[derive(Error, Debug)] +#[non_exhaustive] +pub(crate) enum JwtError { + #[error("jwk not found")] + JwkNotFound, + + #[error("missing key id")] + MissingKeyId, + + #[error("Provided authentication token is not a valid JWT encoding")] + JwtEncoding(#[from] JwtEncodingError), + + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, + + #[error("invalid P256 key")] + InvalidP256Key(jose_jwk::crypto::Error), + + #[error("invalid RSA key")] + InvalidRsaKey(jose_jwk::crypto::Error), + + #[error("invalid RSA signing algorithm")] + InvalidRsaSigningAlgorithm, + + 
#[error("unsupported EC key type {0:?}")] + UnsupportedEcKeyType(jose_jwk::EcCurves), + + #[error("unsupported key type {0:?}")] + UnsupportedKeyType(KeyType), + + #[error("signature algorithm not supported")] + SignatureAlgorithmNotSupported, + + #[error("signature error: {0}")] + Signature(#[from] signature::Error), + + #[error("failed to fetch auth rules: {0}")] + FetchAuthRules(#[from] FetchAuthRulesError), +} + +impl From for JwtError { + fn from(err: base64::DecodeError) -> Self { + JwtEncodingError::Base64Decode(err).into() + } +} + +impl From for JwtError { + fn from(err: serde_json::Error) -> Self { + JwtEncodingError::SerdeJson(err).into() + } +} + +#[derive(Error, Debug)] +#[non_exhaustive] +pub enum JwtEncodingError { + #[error(transparent)] + Base64Decode(#[from] base64::DecodeError), + + #[error(transparent)] + SerdeJson(#[from] serde_json::Error), + + #[error("invalid compact form")] + InvalidCompactForm, +} + +#[allow(dead_code, reason = "Debug use only")] +#[derive(Debug)] +pub(crate) enum KeyType { + Ec(jose_jwk::EcCurves), + Rsa, + Oct, + Okp(jose_jwk::OkpCurves), + Unknown, +} + +impl From<&jose_jwk::Key> for KeyType { + fn from(key: &jose_jwk::Key) -> Self { + match key { + jose_jwk::Key::Ec(ec) => Self::Ec(ec.crv), + jose_jwk::Key::Rsa(_rsa) => Self::Rsa, + jose_jwk::Key::Oct(_oct) => Self::Oct, + jose_jwk::Key::Okp(okp) => Self::Okp(okp.crv), + _ => Self::Unknown, + } + } +} + #[cfg(test)] mod tests { use crate::RoleName; @@ -758,7 +850,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { Ok(vec![ AuthRule { id: "foo".to_owned(), diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 12451847b1..1dea4d2d73 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,9 +1,9 @@ use std::net::SocketAddr; -use anyhow::Context; use arc_swap::ArcSwapOption; use crate::{ + auth::backend::jwt::FetchAuthRulesError, compute::ConnCfg, context::RequestMonitoring, control_plane::{ @@ -53,11 +53,11 @@ impl FetchAuthRules for StaticAuthRules { &self, _ctx: &RequestMonitoring, _endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, FetchAuthRulesError> { let mappings = JWKS_ROLE_MAP.load(); let role_mappings = mappings .as_deref() - .context("JWKs settings for this role were not configured")?; + .ok_or(FetchAuthRulesError::RoleJwksNotConfigured)?; let mut rules = vec![]; for setting in &role_mappings.jwks { rules.push(AuthRule { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 96e1a787ed..7cf158bcd9 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -561,7 +561,8 @@ mod tests { &self, _ctx: &RequestMonitoring, _endpoint: crate::EndpointId, - ) -> anyhow::Result> { + ) -> Result, control_plane::errors::GetEndpointJwksError> + { unimplemented!() } diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index ea2eb79e2a..51cddec672 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -5,7 +5,8 @@ use super::{ AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, }; use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, intern::RoleNameInt, RoleName, + auth::backend::jwt::AuthRule, context::RequestMonitoring, + control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, }; use 
crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::IpPattern, cache::Cached}; @@ -120,7 +121,10 @@ impl Api { }) } - async fn do_get_endpoint_jwks(&self, endpoint: EndpointId) -> anyhow::Result> { + async fn do_get_endpoint_jwks( + &self, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { let (client, connection) = tokio_postgres::connect(self.endpoint.as_str(), tokio_postgres::NoTls).await?; @@ -224,7 +228,7 @@ impl super::Api for Api { &self, _ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result> { + ) -> Result, GetEndpointJwksError> { self.do_get_endpoint_jwks(endpoint).await } diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 6cc525a324..0a196fe2a3 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -6,7 +6,7 @@ use super::messages::{ControlPlaneError, MetricsAuxInfo}; use crate::{ auth::{ backend::{ - jwt::{AuthRule, FetchAuthRules}, + jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, ComputeCredentialKeys, ComputeUserInfo, }, IpPattern, @@ -44,7 +44,7 @@ pub(crate) mod errors { pub(crate) enum ApiError { /// Error returned by the console itself. #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(ControlPlaneError), + ControlPlane(Box), /// Various IO errors like broken pipe or malformed payload. #[error("{REQUEST_FAILED}: {0}")] @@ -90,7 +90,7 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &e { + Reason::Unknown => match &**e { ControlPlaneError { http_status_code: http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, @@ -246,6 +246,33 @@ pub(crate) mod errors { } } } + + #[derive(Debug, Error)] + pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ApiError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), + } } /// Auth secret which is managed by the cloud. @@ -342,7 +369,7 @@ pub(crate) trait Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result>; + ) -> Result, errors::GetEndpointJwksError>; /// Wake up the compute node and return the corresponding connection info. 
async fn wake_compute( @@ -401,7 +428,7 @@ impl Api for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result<Vec<AuthRule>> { + ) -> Result<Vec<AuthRule>, errors::GetEndpointJwksError> { match self { Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] @@ -583,7 +610,7 @@ impl FetchAuthRules for ControlPlaneBackend { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result<Vec<AuthRule>> { - self.get_endpoint_jwks(ctx, endpoint).await + ) -> Result<Vec<AuthRule>, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) } } diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index d01878741c..2487ce0e3f 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -9,7 +9,10 @@ use super::{ use crate::{ auth::backend::{jwt::AuthRule, ComputeUserInfo}, compute, - control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + control_plane::{ + errors::GetEndpointJwksError, + messages::{ColdStartInfo, EndpointJwksResponse, Reason}, + }, http, metrics::{CacheOutcome, Metrics}, rate_limiter::WakeComputeRateLimiter, @@ -17,7 +20,6 @@ use crate::{ }; use crate::{cache::Cached, context::RequestMonitoring}; use ::http::{header::AUTHORIZATION, HeaderName}; -use anyhow::bail; use futures::TryFutureExt; use std::{sync::Arc, time::Duration}; use tokio::time::Instant; @@ -137,14 +139,14 @@ impl Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result<Vec<AuthRule>> { + ) -> Result<Vec<AuthRule>, GetEndpointJwksError> { if !self .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) .await { - bail!("endpoint not found"); + return Err(GetEndpointJwksError::EndpointNotFound); } let request_id = ctx.session_id().to_string(); async { @@ -159,12 +161,17 @@ impl Api { .header(X_REQUEST_ID, &request_id) .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) .query(&[("session_id", ctx.session_id())]) - .build()?; + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; info!(url = request.url().as_str(), "sending http request"); let start = Instant::now(); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); - let response = self.endpoint.execute(request).await?; + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; drop(pause); info!(duration = ?start.elapsed(), "received http response"); @@ -330,7 +337,7 @@ impl super::Api for Api { &self, ctx: &RequestMonitoring, endpoint: EndpointId, - ) -> anyhow::Result<Vec<AuthRule>> { + ) -> Result<Vec<AuthRule>, GetEndpointJwksError> { self.do_get_endpoint_jwks(ctx, endpoint).await } @@ -348,7 +355,7 @@ impl super::Api for Api { let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(*c)) + WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -418,7 +425,7 @@ impl super::Api for Api { self.caches.node_info.insert_ttl( key, - Err(Box::new(err.clone())), + Err(err.clone()), Duration::from_secs(30), ); @@ -457,7 +464,7 @@ async fn parse_body<T: for<'a> serde::Deserialize<'a>>( body.http_status_code = status; warn!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(body)) + Err(ApiError::ControlPlane(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> {
diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 58fb36dba7..deb4d4a63f 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -492,30 +492,32 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: None, - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: None, + })); assert!(!err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } ConnectAction::WakeRetry => { - let err = control_plane::errors::ApiError::ControlPlane(ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - error: "TEST".into(), - status: Some(Status { - code: "error".into(), - message: "error".into(), - details: Details { - error_info: None, - retry_info: Some(control_plane::messages::RetryInfo { - retry_delay_ms: 1, - }), - user_facing_message: None, - }, - }), - }); + let err = + control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + http_status_code: StatusCode::BAD_REQUEST, + error: "TEST".into(), + status: Some(Status { + code: "error".into(), + message: "error".into(), + details: Details { + error_info: None, + retry_info: Some(control_plane::messages::RetryInfo { + retry_delay_ms: 1, + }), + user_facing_message: None, + }, + }), + })); assert!(err.could_retry()); Err(control_plane::errors::WakeComputeError::ApiError(err)) } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index ba674f5d0d..0d1527a2c1 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -79,7 +79,7 @@ fn report_error(e: &WakeComputeError, retry: bool) { Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match e { + Reason::Unknown => match **e { ControlPlaneError { http_status_code: StatusCode::LOCKED, ref error, From 614c3aef72ed595190801e8d77fe188e3cb13605 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 15 Oct 2024 17:18:52 +0300 Subject: [PATCH 005/239] Remove redundant code (#9373) ## Problem There is a double update of the resize cache in `put_rel_truncation`. Also, `page_server_request` contains a check that the fork is MAIN_FORKNUM, which 1. is incorrect (because VM/FSM pages are sharded in the same way as MAIN fork pages) and 2. is redundant, because `page_server_request` is never called for `get page` requests, so the first part of the OR condition is always true. ## Summary of changes Remove redundant code ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/pgdatadir_mapping.rs | 3 --- pgxn/neon/pagestore_smgr.c | 3 +-- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7aa313f031..900da5beab 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1545,9 +1545,6 @@ impl<'a> DatadirModification<'a> { // Update relation size cache self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update relation size cache - self.tline.set_cached_rel_size(rel, self.lsn, nblocks); - // Update logical database size. self.pending_nblocks -= old_size as i64 - nblocks as i64; } diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index f46df7f70a..cbb0e2ae6d 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -1092,8 +1092,7 @@ page_server_request(void const *req) * Current sharding model assumes that all metadata is present only at shard 0. * We still need to call get_shard_no() to check if shard map is up-to-date. */ - if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || - ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM) + if (((NeonRequest *) req)->tag != T_NeonGetPageRequest) { shard_no = 0; } From cf7a596a151487c1b3afafbe1eb2efab895326ea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 11:18:38 -0500 Subject: [PATCH 006/239] Generate sql_exporter config files with Jsonnet There are quite a few benefits to this approach: - Reduce config duplication - The two sql_exporter configs were super similar with just a few differences - Pull SQL queries into standalone files - That means we could run a SQL formatter on the file in the future - It also means access to syntax highlighting - In the future, run different queries for different PG versions - This is relevant because right now, we have queries that are failing on PG 17 due to catalog updates Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 19 + Dockerfile.build-tools | 1 + Makefile | 1 + compute/.gitignore | 5 + compute/Dockerfile.compute-node | 22 +- compute/Makefile | 35 ++ compute/etc/README.md | 17 + compute/etc/neon_collector.jsonnet | 43 +++ compute/etc/neon_collector.yml | 331 ------------------ .../etc/neon_collector_autoscaling.jsonnet | 11 + compute/etc/neon_collector_autoscaling.yml | 55 --- compute/etc/sql_exporter.jsonnet | 40 +++ compute/etc/sql_exporter.yml | 33 -- .../sql_exporter/checkpoints_req.libsonnet | 10 + compute/etc/sql_exporter/checkpoints_req.sql | 1 + .../sql_exporter/checkpoints_timed.libsonnet | 10 + .../etc/sql_exporter/checkpoints_timed.sql | 1 + .../compute_current_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_current_lsn.sql | 4 + .../compute_logical_snapshot_files.libsonnet | 12 + .../compute_logical_snapshot_files.sql | 7 + .../compute_receive_lsn.libsonnet | 10 + .../etc/sql_exporter/compute_receive_lsn.sql | 4 + .../compute_subscriptions_count.libsonnet | 12 + .../compute_subscriptions_count.sql | 1 + .../sql_exporter/connection_counts.libsonnet | 13 + .../etc/sql_exporter/connection_counts.sql | 1 + .../etc/sql_exporter/db_total_size.libsonnet | 10 + compute/etc/sql_exporter/db_total_size.sql | 1 + .../getpage_prefetch_discards_total.libsonnet | 9 + .../getpage_prefetch_misses_total.libsonnet | 9 + .../getpage_prefetch_requests_total.libsonnet | 9 + 
.../getpage_sync_requests_total.libsonnet | 9 + .../getpage_wait_seconds_bucket.libsonnet | 12 + .../getpage_wait_seconds_bucket.sql | 1 + .../getpage_wait_seconds_count.libsonnet | 9 + .../getpage_wait_seconds_sum.libsonnet | 9 + ...lfc_approximate_working_set_size.libsonnet | 12 + .../lfc_approximate_working_set_size.sql | 1 + ...ing_set_size_windows.autoscaling.libsonnet | 12 + ...e_working_set_size_windows.autoscaling.sql | 8 + ...oximate_working_set_size_windows.libsonnet | 12 + ...c_approximate_working_set_size_windows.sql | 8 + .../lfc_cache_size_limit.libsonnet | 10 + .../etc/sql_exporter/lfc_cache_size_limit.sql | 1 + compute/etc/sql_exporter/lfc_hits.libsonnet | 10 + compute/etc/sql_exporter/lfc_hits.sql | 1 + compute/etc/sql_exporter/lfc_misses.libsonnet | 10 + compute/etc/sql_exporter/lfc_misses.sql | 1 + compute/etc/sql_exporter/lfc_used.libsonnet | 10 + compute/etc/sql_exporter/lfc_used.sql | 1 + compute/etc/sql_exporter/lfc_writes.libsonnet | 10 + compute/etc/sql_exporter/lfc_writes.sql | 1 + .../logical_slot_restart_lsn.libsonnet | 15 + .../sql_exporter/logical_slot_restart_lsn.sql | 3 + .../sql_exporter/max_cluster_size.libsonnet | 10 + compute/etc/sql_exporter/max_cluster_size.sql | 1 + .../etc/sql_exporter/neon_perf_counters.sql | 13 + .../pageserver_disconnects_total.libsonnet | 9 + .../pageserver_requests_sent_total.libsonnet | 9 + .../pageserver_send_flushes_total.libsonnet | 9 + .../sql_exporter/pg_stats_userdb.libsonnet | 18 + compute/etc/sql_exporter/pg_stats_userdb.sql | 10 + .../replication_delay_bytes.libsonnet | 10 + .../sql_exporter/replication_delay_bytes.sql | 6 + .../replication_delay_seconds.libsonnet | 10 + .../replication_delay_seconds.sql | 5 + .../etc/sql_exporter/retained_wal.libsonnet | 12 + compute/etc/sql_exporter/retained_wal.sql | 5 + .../etc/sql_exporter/wal_is_lost.libsonnet | 12 + compute/etc/sql_exporter/wal_is_lost.sql | 7 + compute/etc/sql_exporter_autoscaling.yml | 33 -- 72 files changed, 635 insertions(+), 457 deletions(-) create mode 100644 compute/.gitignore create mode 100644 compute/Makefile create mode 100644 compute/etc/README.md create mode 100644 compute/etc/neon_collector.jsonnet delete mode 100644 compute/etc/neon_collector.yml create mode 100644 compute/etc/neon_collector_autoscaling.jsonnet delete mode 100644 compute/etc/neon_collector_autoscaling.yml create mode 100644 compute/etc/sql_exporter.jsonnet delete mode 100644 compute/etc/sql_exporter.yml create mode 100644 compute/etc/sql_exporter/checkpoints_req.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_req.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.libsonnet create mode 100644 compute/etc/sql_exporter/checkpoints_timed.sql create mode 100644 compute/etc/sql_exporter/compute_current_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_current_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshot_files.sql create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/compute_receive_lsn.sql create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.libsonnet create mode 100644 compute/etc/sql_exporter/compute_subscriptions_count.sql create mode 100644 compute/etc/sql_exporter/connection_counts.libsonnet create mode 100644 compute/etc/sql_exporter/connection_counts.sql create mode 100644 compute/etc/sql_exporter/db_total_size.libsonnet create mode 
100644 compute/etc/sql_exporter/db_total_size.sql create mode 100644 compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_cache_size_limit.sql create mode 100644 compute/etc/sql_exporter/lfc_hits.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_hits.sql create mode 100644 compute/etc/sql_exporter/lfc_misses.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_misses.sql create mode 100644 compute/etc/sql_exporter/lfc_used.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_used.sql create mode 100644 compute/etc/sql_exporter/lfc_writes.libsonnet create mode 100644 compute/etc/sql_exporter/lfc_writes.sql create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet create mode 100644 compute/etc/sql_exporter/logical_slot_restart_lsn.sql create mode 100644 compute/etc/sql_exporter/max_cluster_size.libsonnet create mode 100644 compute/etc/sql_exporter/max_cluster_size.sql create mode 100644 compute/etc/sql_exporter/neon_perf_counters.sql create mode 100644 compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.libsonnet create mode 100644 compute/etc/sql_exporter/pg_stats_userdb.sql create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_bytes.sql create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.libsonnet create mode 100644 compute/etc/sql_exporter/replication_delay_seconds.sql create mode 100644 compute/etc/sql_exporter/retained_wal.libsonnet create mode 100644 compute/etc/sql_exporter/retained_wal.sql create mode 100644 compute/etc/sql_exporter/wal_is_lost.libsonnet create mode 100644 compute/etc/sql_exporter/wal_is_lost.sql delete mode 100644 compute/etc/sql_exporter_autoscaling.yml diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 51f6975e63..c9a447626f 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -120,6 +120,25 @@ jobs: - name: Run mypy to check types run: poetry run mypy . 
+ check-codestyle-jsonnet: + needs: [ check-permissions, build-build-tools-image ] + runs-on: [ self-hosted, small ] + container: + image: ${{ needs.build-build-tools-image.outputs.image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Check Jsonnet code formatting + run: | + jsonnetfmt --test \ + $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 54e9134257..7cba1c8635 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -27,6 +27,7 @@ RUN set -e \ gnupg \ gzip \ jq \ + jsonnet \ libcurl4-openssl-dev \ libbz2-dev \ libffi-dev \ diff --git a/Makefile b/Makefile index 5e227ed3f5..33cfda2661 100644 --- a/Makefile +++ b/Makefile @@ -291,6 +291,7 @@ postgres-check: \ # This doesn't remove the effects of 'configure'. .PHONY: clean clean: postgres-clean neon-pg-clean-ext + $(MAKE) -C compute clean $(CARGO_CMD_PREFIX) cargo clean # This removes everything diff --git a/compute/.gitignore b/compute/.gitignore new file mode 100644 index 0000000000..70980d335a --- /dev/null +++ b/compute/.gitignore @@ -0,0 +1,5 @@ +# sql_exporter config files generated from Jsonnet +etc/neon_collector.yml +etc/neon_collector_autoscaling.yml +etc/sql_exporter.yml +etc/sql_exporter_autoscaling.yml diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 412c64eda4..13381b2901 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -349,7 +349,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # not version-specific -# doesn't use releases, last commit f3d82fd - Mar 2, 2023 +# doesn't use releases, last commit f3d82fd - Mar 2, 2023 RUN wget https://github.com/michelp/pgjwt/archive/f3d82fd30151e754e19ce5d6a06c71c20689ce3d.tar.gz -O pgjwt.tar.gz && \ echo "dae8ed99eebb7593b43013f6532d772b12dfecd55548d2673f2dfd0163f6d2b9 pgjwt.tar.gz" | sha256sum --check && \ mkdir pgjwt-src && cd pgjwt-src && tar xzf ../pgjwt.tar.gz --strip-components=1 -C . && \ @@ -1169,6 +1169,18 @@ RUN rm -r /usr/local/pgsql/include # if they were to be used by other libraries. 
RUN rm /usr/local/pgsql/lib/lib*.a +######################################################################################### +# +# Preprocess the sql_exporter configuration files +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor + +USER nonroot + +COPY --chown=nonroot compute compute + +RUN make -C compute ######################################################################################### # @@ -1287,10 +1299,10 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter -COPY --chmod=0644 compute/etc/sql_exporter.yml /etc/sql_exporter.yml -COPY --chmod=0644 compute/etc/neon_collector.yml /etc/neon_collector.yml -COPY --chmod=0644 compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml -COPY --chmod=0644 compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml +COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector_autoscaling.yml /etc/neon_collector_autoscaling.yml # Create remote extension download directory RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions diff --git a/compute/Makefile b/compute/Makefile new file mode 100644 index 0000000000..45fbfa6d5e --- /dev/null +++ b/compute/Makefile @@ -0,0 +1,35 @@ +jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) + +.PHONY: all +all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml + +neon_collector.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector.jsonnet + +neon_collector_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + etc/neon_collector_autoscaling.jsonnet + +sql_exporter.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector.yml \ + etc/sql_exporter.jsonnet + +sql_exporter_autoscaling.yml: $(jsonnet_files) + JSONNET_PATH=etc jsonnet \ + --output-file etc/$@ \ + --tla-str collector_file=neon_collector_autoscaling.yml \ + --tla-str application_name=sql_exporter_autoscaling \ + etc/sql_exporter.jsonnet + +.PHONY: clean +clean: + rm --force \ + etc/neon_collector.yml \ + etc/neon_collector_autoscaling.yml \ + etc/sql_exporter.yml \ + etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/README.md b/compute/etc/README.md new file mode 100644 index 0000000000..70b108146c --- /dev/null +++ b/compute/etc/README.md @@ -0,0 +1,17 @@ +# Compute Configuration + +These files are the configuration files for various other pieces of software +that will be running in the compute alongside Postgres. + +## `sql_exporter` + +### Adding a `sql_exporter` Metric + +We use `sql_exporter` to export various metrics from Postgres. In order to add +a metric, you will need to create two files: a `libsonnet` and a `sql` file. 
You +will then import the `libsonnet` file in one of the collector files, and the +`sql` file will be imported in the `libsonnet` file. + +In the event your statistic is an LSN, you may want to cast it to a `float8` +because Prometheus only supports floats. It's probably fine because `float8` can +store integers from `-2^53` to `+2^53` exactly. diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet new file mode 100644 index 0000000000..2031eb8c85 --- /dev/null +++ b/compute/etc/neon_collector.jsonnet @@ -0,0 +1,43 @@ +{ + collector_name: 'neon_collector', + metrics: [ + import 'sql_exporter/checkpoints_req.libsonnet', + import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_current_lsn.libsonnet', + import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_receive_lsn.libsonnet', + import 'sql_exporter/compute_subscriptions_count.libsonnet', + import 'sql_exporter/connection_counts.libsonnet', + import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', + import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', + import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_sync_requests_total.libsonnet', + import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', + import 'sql_exporter/getpage_wait_seconds_count.libsonnet', + import 'sql_exporter/getpage_wait_seconds_sum.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size.libsonnet', + import 'sql_exporter/lfc_approximate_working_set_size_windows.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + import 'sql_exporter/logical_slot_restart_lsn.libsonnet', + import 'sql_exporter/max_cluster_size.libsonnet', + import 'sql_exporter/pageserver_disconnects_total.libsonnet', + import 'sql_exporter/pageserver_requests_sent_total.libsonnet', + import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pg_stats_userdb.libsonnet', + import 'sql_exporter/replication_delay_bytes.libsonnet', + import 'sql_exporter/replication_delay_seconds.libsonnet', + import 'sql_exporter/retained_wal.libsonnet', + import 'sql_exporter/wal_is_lost.libsonnet', + ], + queries: [ + { + query_name: 'neon_perf_counters', + query: importstr 'sql_exporter/neon_perf_counters.sql', + }, + ], +} diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml deleted file mode 100644 index 92da0cdbdd..0000000000 --- a/compute/etc/neon_collector.yml +++ /dev/null @@ -1,331 +0,0 @@ -collector_name: neon_collector -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - 
select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: connection_counts - type: gauge - help: 'Connection counts' - key_labels: - - datname - - state - values: [count] - query: | - select datname, state, count(*) as count from pg_stat_activity where state <> '' group by datname, state; - -- metric_name: pg_stats_userdb - type: gauge - help: 'Stats for several oldest non-system dbs' - key_labels: - - datname - value_label: kind - values: - - db_size - - deadlocks - # Rows - - inserted - - updated - - deleted - # We export stats for 10 non-system database. Without this limit - # it is too easy to abuse the system by creating lots of databases. - query: | - select pg_database_size(datname) as db_size, deadlocks, - tup_inserted as inserted, tup_updated as updated, tup_deleted as deleted, - datname - from pg_stat_database - where datname IN ( - select datname - from pg_database - where datname <> 'postgres' and not datistemplate - order by oid - limit 10 - ); - -- metric_name: max_cluster_size - type: gauge - help: 'neon.max_cluster_size setting' - key_labels: - values: [max_cluster_size] - query: | - select setting::int as max_cluster_size from pg_settings where name = 'neon.max_cluster_size'; - -- metric_name: db_total_size - type: gauge - help: 'Size of all databases' - key_labels: - values: [total] - query: | - select sum(pg_database_size(datname)) as total from pg_database; - -- metric_name: getpage_wait_seconds_count - type: counter - help: 'Number of getpage requests' - values: [getpage_wait_seconds_count] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_sum - type: counter - help: 'Time spent in getpage requests' - values: [getpage_wait_seconds_sum] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_requests_total - type: counter - help: 'Number of getpage issued for prefetching' - values: [getpage_prefetch_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_sync_requests_total - type: counter - help: 'Number of synchronous getpage issued' - values: [getpage_sync_requests_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_misses_total - type: counter - help: 'Total number of readahead misses; consisting of either prefetches that don''t satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read' - values: [getpage_prefetch_misses_total] - query_ref: neon_perf_counters - -- metric_name: getpage_prefetch_discards_total - type: counter - help: 'Number of prefetch responses issued but not used' - values: [getpage_prefetch_discards_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_requests_sent_total - type: counter - help: 'Number of all requests sent to the pageserver (not just GetPage requests)' - values: [pageserver_requests_sent_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_disconnects_total - type: counter - help: 'Number of times that the connection to the pageserver was lost' - values: [pageserver_disconnects_total] - query_ref: neon_perf_counters - -- metric_name: pageserver_send_flushes_total - type: counter - help: 'Number of flushes to the pageserver connection' - values: 
[pageserver_send_flushes_total] - query_ref: neon_perf_counters - -- metric_name: getpage_wait_seconds_bucket - type: counter - help: 'Histogram buckets of getpage request latency' - key_labels: - - bucket_le - values: [value] - query_ref: getpage_wait_seconds_buckets - -# DEPRECATED -- metric_name: lfc_approximate_working_set_size - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: - values: [approximate_working_set_size] - query: | - select neon.approximate_working_set_size(false) as approximate_working_set_size; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration] - values: [size] - # NOTE: This is the "public" / "human-readable" version. Here, we supply a small selection - # of durations in a pretty-printed form. - query: | - select - x as duration, - neon.approximate_working_set_size_seconds(extract('epoch' from x::interval)::int) as size - from - (values ('5m'),('15m'),('1h')) as t (x); - -- metric_name: compute_current_lsn - type: gauge - help: 'Current LSN of the database' - key_labels: - values: [lsn] - query: | - select - case - when pg_catalog.pg_is_in_recovery() - then (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 - else (pg_current_wal_lsn() - '0/0')::FLOAT8 - end as lsn; - -- metric_name: compute_receive_lsn - type: gauge - help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication' - key_labels: - values: [lsn] - query: | - SELECT - CASE - WHEN pg_catalog.pg_is_in_recovery() - THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 - ELSE 0 - END AS lsn; - -- metric_name: replication_delay_bytes - type: gauge - help: 'Bytes between received and replayed LSN' - key_labels: - values: [replication_delay_bytes] - # We use a GREATEST call here because this calculation can be negative. - # The calculation is not atomic, meaning after we've gotten the receive - # LSN, the replay LSN may have advanced past the receive LSN we - # are using for the calculation. - query: | - SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; - -- metric_name: replication_delay_seconds - type: gauge - help: 'Time since last LSN was replayed' - key_labels: - values: [replication_delay_seconds] - query: | - SELECT - CASE - WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 - ELSE GREATEST (0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) - END AS replication_delay_seconds; - -- metric_name: checkpoints_req - type: gauge - help: 'Number of requested checkpoints' - key_labels: - values: [checkpoints_req] - query: | - SELECT checkpoints_req FROM pg_stat_bgwriter; - -- metric_name: checkpoints_timed - type: gauge - help: 'Number of scheduled checkpoints' - key_labels: - values: [checkpoints_timed] - query: | - SELECT checkpoints_timed FROM pg_stat_bgwriter; - -- metric_name: compute_logical_snapshot_files - type: gauge - help: 'Number of snapshot files in pg_logical/snapshot' - key_labels: - - timeline_id - values: [num_logical_snapshot_files] - query: | - SELECT - (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, - -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. These - -- temporary snapshot files are renamed to the actual snapshot files after they are - -- completely built. We only WAL-log the completely built snapshot files. 
- (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; - -# In all the below metrics, we cast LSNs to floats because Prometheus only supports floats. -# It's probably fine because float64 can store integers from -2^53 to +2^53 exactly. - -# Number of slots is limited by max_replication_slots, so collecting position for all of them shouldn't be bad. -- metric_name: logical_slot_restart_lsn - type: gauge - help: 'restart_lsn of logical slots' - key_labels: - - slot_name - values: [restart_lsn] - query: | - select slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn - from pg_replication_slots - where slot_type = 'logical'; - -- metric_name: compute_subscriptions_count - type: gauge - help: 'Number of logical replication subscriptions grouped by enabled/disabled' - key_labels: - - enabled - values: [subscriptions_count] - query: | - select subenabled::text as enabled, count(*) as subscriptions_count - from pg_subscription - group by subenabled; - -- metric_name: retained_wal - type: gauge - help: 'Retained WAL in inactive replication slots' - key_labels: - - slot_name - values: [retained_wal] - query: | - SELECT slot_name, pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal - FROM pg_replication_slots - WHERE active = false; - -- metric_name: wal_is_lost - type: gauge - help: 'Whether or not the replication slot wal_status is lost' - key_labels: - - slot_name - values: [wal_is_lost] - query: | - SELECT slot_name, - CASE WHEN wal_status = 'lost' THEN 1 ELSE 0 END AS wal_is_lost - FROM pg_replication_slots; - -queries: - - query_name: neon_perf_counters - query: | - WITH c AS ( - SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters - ) - SELECT d.* - FROM pg_catalog.jsonb_to_record((select jb from c)) as d( - getpage_wait_seconds_count numeric, - getpage_wait_seconds_sum numeric, - getpage_prefetch_requests_total numeric, - getpage_sync_requests_total numeric, - getpage_prefetch_misses_total numeric, - getpage_prefetch_discards_total numeric, - pageserver_requests_sent_total numeric, - pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric - ); - - - query_name: getpage_wait_seconds_buckets - query: | - SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/neon_collector_autoscaling.jsonnet b/compute/etc/neon_collector_autoscaling.jsonnet new file mode 100644 index 0000000000..e248172a3d --- /dev/null +++ b/compute/etc/neon_collector_autoscaling.jsonnet @@ -0,0 +1,11 @@ +{ + collector_name: 'neon_collector_autoscaling', + metrics: [ + import 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet', + import 'sql_exporter/lfc_cache_size_limit.libsonnet', + import 'sql_exporter/lfc_hits.libsonnet', + import 'sql_exporter/lfc_misses.libsonnet', + import 'sql_exporter/lfc_used.libsonnet', + import 'sql_exporter/lfc_writes.libsonnet', + ], +} diff --git a/compute/etc/neon_collector_autoscaling.yml b/compute/etc/neon_collector_autoscaling.yml deleted file mode 100644 index 5616264eba..0000000000 --- a/compute/etc/neon_collector_autoscaling.yml +++ /dev/null @@ -1,55 +0,0 @@ -collector_name: neon_collector_autoscaling -metrics: -- metric_name: lfc_misses - type: gauge - help: 'lfc_misses' - key_labels: - values: [lfc_misses] - query: | - select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses'; - -- metric_name: lfc_used - type: gauge - 
help: 'LFC chunks used (chunk = 1MB)' - key_labels: - values: [lfc_used] - query: | - select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used'; - -- metric_name: lfc_hits - type: gauge - help: 'lfc_hits' - key_labels: - values: [lfc_hits] - query: | - select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits'; - -- metric_name: lfc_writes - type: gauge - help: 'lfc_writes' - key_labels: - values: [lfc_writes] - query: | - select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes'; - -- metric_name: lfc_cache_size_limit - type: gauge - help: 'LFC cache size limit in bytes' - key_labels: - values: [lfc_cache_size_limit] - query: | - select pg_size_bytes(current_setting('neon.file_cache_size_limit')) as lfc_cache_size_limit; - -- metric_name: lfc_approximate_working_set_size_windows - type: gauge - help: 'Approximate working set size in pages of 8192 bytes' - key_labels: [duration_seconds] - values: [size] - # NOTE: This is the "internal" / "machine-readable" version. This outputs the working set - # size looking back 1..60 minutes, labeled with the number of minutes. - query: | - select - x::text as duration_seconds, - neon.approximate_working_set_size_seconds(x) as size - from - (select generate_series * 60 as x from generate_series(1, 60)) as t (x); diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet new file mode 100644 index 0000000000..1e3665ac47 --- /dev/null +++ b/compute/etc/sql_exporter.jsonnet @@ -0,0 +1,40 @@ +function(collector_file, application_name='sql_exporter') { + // Configuration for sql_exporter for autoscaling-agent + // Global defaults. + global: { + // If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. + scrape_timeout: '10s', + // Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. + scrape_timeout_offset: '500ms', + // Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. + min_interval: '0s', + // Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, + // as will concurrent scrapes. + max_connections: 1, + // Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should + // always be the same as max_connections. + max_idle_connections: 1, + // Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. + // If 0, connections are not closed due to a connection's age. + max_connection_lifetime: '5m', + }, + + // The target to monitor and the collectors to execute on it. + target: { + // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) + // the schema gets dropped or replaced to match the driver expected DSN format. + data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + + // Collectors (referenced by name) to execute on the target. + // Glob patterns are supported (see for syntax). + collectors: [ + 'neon_collector_autoscaling', + ], + }, + + // Collector files specifies a list of globs. One collector definition is read from each matching file. + // Glob patterns are supported (see for syntax). 
+ collector_files: [ + collector_file, + ], +} diff --git a/compute/etc/sql_exporter.yml b/compute/etc/sql_exporter.yml deleted file mode 100644 index 139d04468a..0000000000 --- a/compute/etc/sql_exporter.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). 
-collector_files: - - "neon_collector.yml" diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet new file mode 100644 index 0000000000..8697f8af3b --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_req', + type: 'gauge', + help: 'Number of requested checkpoints', + key_labels: null, + values: [ + 'checkpoints_req', + ], + query: importstr 'sql_exporter/checkpoints_req.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_req.sql b/compute/etc/sql_exporter/checkpoints_req.sql new file mode 100644 index 0000000000..eb8427c883 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.sql @@ -0,0 +1 @@ +SELECT checkpoints_req FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet new file mode 100644 index 0000000000..9f0b742400 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'checkpoints_timed', + type: 'gauge', + help: 'Number of scheduled checkpoints', + key_labels: null, + values: [ + 'checkpoints_timed', + ], + query: importstr 'sql_exporter/checkpoints_timed.sql', +} diff --git a/compute/etc/sql_exporter/checkpoints_timed.sql b/compute/etc/sql_exporter/checkpoints_timed.sql new file mode 100644 index 0000000000..c50853134c --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.sql @@ -0,0 +1 @@ +SELECT checkpoints_timed FROM pg_stat_bgwriter; diff --git a/compute/etc/sql_exporter/compute_current_lsn.libsonnet b/compute/etc/sql_exporter/compute_current_lsn.libsonnet new file mode 100644 index 0000000000..ccff161358 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_current_lsn', + type: 'gauge', + help: 'Current LSN of the database', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_current_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_current_lsn.sql b/compute/etc/sql_exporter/compute_current_lsn.sql new file mode 100644 index 0000000000..be02b8a094 --- /dev/null +++ b/compute/etc/sql_exporter/compute_current_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_replay_lsn() - '0/0')::FLOAT8 + ELSE (pg_current_wal_lsn() - '0/0')::FLOAT8 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet new file mode 100644 index 0000000000..212f079ccf --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_logical_snapshot_files', + type: 'gauge', + help: 'Number of snapshot files in pg_logical/snapshot', + key_labels: [ + 'timeline_id', + ], + values: [ + 'num_logical_snapshot_files', + ], + query: importstr 'sql_exporter/compute_logical_snapshot_files.sql', +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshot_files.sql b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql new file mode 100644 index 0000000000..f2454235b7 --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshot_files.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. 
+ -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COUNT(*) FROM pg_ls_dir('pg_logical/snapshots') AS name WHERE name LIKE '%.snap') AS num_logical_snapshot_files; diff --git a/compute/etc/sql_exporter/compute_receive_lsn.libsonnet b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet new file mode 100644 index 0000000000..eb68a77ec2 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_receive_lsn', + type: 'gauge', + help: 'Returns the last write-ahead log location that has been received and synced to disk by streaming replication', + key_labels: null, + values: [ + 'lsn', + ], + query: importstr 'sql_exporter/compute_receive_lsn.sql', +} diff --git a/compute/etc/sql_exporter/compute_receive_lsn.sql b/compute/etc/sql_exporter/compute_receive_lsn.sql new file mode 100644 index 0000000000..318b31ab41 --- /dev/null +++ b/compute/etc/sql_exporter/compute_receive_lsn.sql @@ -0,0 +1,4 @@ +SELECT CASE + WHEN pg_catalog.pg_is_in_recovery() THEN (pg_last_wal_receive_lsn() - '0/0')::FLOAT8 + ELSE 0 +END AS lsn; diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet new file mode 100644 index 0000000000..e1575da397 --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'compute_subscriptions_count', + type: 'gauge', + help: 'Number of logical replication subscriptions grouped by enabled/disabled', + key_labels: [ + 'enabled', + ], + values: [ + 'subscriptions_count', + ], + query: importstr 'sql_exporter/compute_subscriptions_count.sql', +} diff --git a/compute/etc/sql_exporter/compute_subscriptions_count.sql b/compute/etc/sql_exporter/compute_subscriptions_count.sql new file mode 100644 index 0000000000..50740cb5df --- /dev/null +++ b/compute/etc/sql_exporter/compute_subscriptions_count.sql @@ -0,0 +1 @@ +SELECT subenabled::text AS enabled, count(*) AS subscriptions_count FROM pg_subscription GROUP BY subenabled; diff --git a/compute/etc/sql_exporter/connection_counts.libsonnet b/compute/etc/sql_exporter/connection_counts.libsonnet new file mode 100644 index 0000000000..9f94db67a9 --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.libsonnet @@ -0,0 +1,13 @@ +{ + metric_name: 'connection_counts', + type: 'gauge', + help: 'Connection counts', + key_labels: [ + 'datname', + 'state', + ], + values: [ + 'count', + ], + query: importstr 'sql_exporter/connection_counts.sql', +} diff --git a/compute/etc/sql_exporter/connection_counts.sql b/compute/etc/sql_exporter/connection_counts.sql new file mode 100644 index 0000000000..6824480fdb --- /dev/null +++ b/compute/etc/sql_exporter/connection_counts.sql @@ -0,0 +1 @@ +SELECT datname, state, count(*) AS count FROM pg_stat_activity WHERE state <> '' GROUP BY datname, state; diff --git a/compute/etc/sql_exporter/db_total_size.libsonnet b/compute/etc/sql_exporter/db_total_size.libsonnet new file mode 100644 index 0000000000..6e08d5fb87 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'db_total_size', + type: 'gauge', + help: 'Size of all databases', + key_labels: null, + values: [ + 'total', + ], + query: importstr 'sql_exporter/db_total_size.sql', +} diff --git a/compute/etc/sql_exporter/db_total_size.sql b/compute/etc/sql_exporter/db_total_size.sql new 
file mode 100644 index 0000000000..9cbbdfd8a3 --- /dev/null +++ b/compute/etc/sql_exporter/db_total_size.sql @@ -0,0 +1 @@ +SELECT sum(pg_database_size(datname)) AS total FROM pg_database; diff --git a/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet new file mode 100644 index 0000000000..935e35d2e4 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_discards_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_discards_total', + type: 'counter', + help: 'Number of prefetch responses issued but not used', + values: [ + 'getpage_prefetch_discards_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet new file mode 100644 index 0000000000..b9a9632105 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_misses_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_misses_total', + type: 'counter', + help: "Total number of readahead misses; consisting of either prefetches that don't satisfy the LSN bounds once the prefetch got read by the backend, or cases where somehow no readahead was issued for the read", + values: [ + 'getpage_prefetch_misses_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet new file mode 100644 index 0000000000..75fdb6717b --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetch_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetch_requests_total', + type: 'counter', + help: 'Number of getpage issued for prefetching', + values: [ + 'getpage_prefetch_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet new file mode 100644 index 0000000000..f3a1e6b339 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_sync_requests_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_sync_requests_total', + type: 'counter', + help: 'Number of synchronous getpage issued', + values: [ + 'getpage_sync_requests_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..2adda2ad03 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'getpage_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of getpage request latency', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/getpage_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql new file mode 100644 index 0000000000..b4a6bc1560 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'getpage_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..d2326974fc --- /dev/null +++ 
b/compute/etc/sql_exporter/getpage_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_count', + type: 'counter', + help: 'Number of getpage requests', + values: [ + 'getpage_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..844c8419ff --- /dev/null +++ b/compute/etc/sql_exporter/getpage_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_wait_seconds_sum', + type: 'counter', + help: 'Time spent in getpage requests', + values: [ + 'getpage_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet new file mode 100644 index 0000000000..78859ce60d --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.libsonnet @@ -0,0 +1,12 @@ +// DEPRECATED + +{ + metric_name: 'lfc_approximate_working_set_size', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: null, + values: [ + 'approximate_working_set_size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql new file mode 100644 index 0000000000..de509ebb47 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size.sql @@ -0,0 +1 @@ +SELECT neon.approximate_working_set_size(false) AS approximate_working_set_size; diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet new file mode 100644 index 0000000000..a54deca467 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration_seconds', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql new file mode 100644 index 0000000000..35fa42c34c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.autoscaling.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "internal" / "machine-readable" version. This outputs the +-- working set size looking back 1..60 minutes, labeled with the number of +-- minutes. 
+ +SELECT + x::text as duration_seconds, + neon.approximate_working_set_size_seconds(x) AS size +FROM (SELECT generate_series * 60 AS x FROM generate_series(1, 60)) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet new file mode 100644 index 0000000000..4970bd2c7f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'lfc_approximate_working_set_size_windows', + type: 'gauge', + help: 'Approximate working set size in pages of 8192 bytes', + key_labels: [ + 'duration', + ], + values: [ + 'size', + ], + query: importstr 'sql_exporter/lfc_approximate_working_set_size_windows.sql', +} diff --git a/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql new file mode 100644 index 0000000000..46c7d1610c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_approximate_working_set_size_windows.sql @@ -0,0 +1,8 @@ +-- NOTE: This is the "public" / "human-readable" version. Here, we supply a +-- small selection of durations in a pretty-printed form. + +SELECT + x AS duration, + neon.approximate_working_set_size_seconds(extract('epoch' FROM x::interval)::int) AS size FROM ( + VALUES ('5m'), ('15m'), ('1h') + ) AS t (x); diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet new file mode 100644 index 0000000000..4cbbd76621 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_cache_size_limit', + type: 'gauge', + help: 'LFC cache size limit in bytes', + key_labels: null, + values: [ + 'lfc_cache_size_limit', + ], + query: importstr 'sql_exporter/lfc_cache_size_limit.sql', +} diff --git a/compute/etc/sql_exporter/lfc_cache_size_limit.sql b/compute/etc/sql_exporter/lfc_cache_size_limit.sql new file mode 100644 index 0000000000..378904c1fe --- /dev/null +++ b/compute/etc/sql_exporter/lfc_cache_size_limit.sql @@ -0,0 +1 @@ +SELECT pg_size_bytes(current_setting('neon.file_cache_size_limit')) AS lfc_cache_size_limit; diff --git a/compute/etc/sql_exporter/lfc_hits.libsonnet b/compute/etc/sql_exporter/lfc_hits.libsonnet new file mode 100644 index 0000000000..4a0b7671bf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_hits', + type: 'gauge', + help: 'lfc_hits', + key_labels: null, + values: [ + 'lfc_hits', + ], + query: importstr 'sql_exporter/lfc_hits.sql', +} diff --git a/compute/etc/sql_exporter/lfc_hits.sql b/compute/etc/sql_exporter/lfc_hits.sql new file mode 100644 index 0000000000..2e14f5c73c --- /dev/null +++ b/compute/etc/sql_exporter/lfc_hits.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_hits FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_hits'; diff --git a/compute/etc/sql_exporter/lfc_misses.libsonnet b/compute/etc/sql_exporter/lfc_misses.libsonnet new file mode 100644 index 0000000000..302998d04f --- /dev/null +++ b/compute/etc/sql_exporter/lfc_misses.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_misses', + type: 'gauge', + help: 'lfc_misses', + key_labels: null, + values: [ + 'lfc_misses', + ], + query: importstr 'sql_exporter/lfc_misses.sql', +} diff --git a/compute/etc/sql_exporter/lfc_misses.sql b/compute/etc/sql_exporter/lfc_misses.sql new file mode 100644 index 0000000000..27ed4ecf86 --- /dev/null +++ 
b/compute/etc/sql_exporter/lfc_misses.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_misses FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_misses'; diff --git a/compute/etc/sql_exporter/lfc_used.libsonnet b/compute/etc/sql_exporter/lfc_used.libsonnet new file mode 100644 index 0000000000..23891dadaf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_used', + type: 'gauge', + help: 'LFC chunks used (chunk = 1MB)', + key_labels: null, + values: [ + 'lfc_used', + ], + query: importstr 'sql_exporter/lfc_used.sql', +} diff --git a/compute/etc/sql_exporter/lfc_used.sql b/compute/etc/sql_exporter/lfc_used.sql new file mode 100644 index 0000000000..4f01545f30 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_used.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_used FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_used'; diff --git a/compute/etc/sql_exporter/lfc_writes.libsonnet b/compute/etc/sql_exporter/lfc_writes.libsonnet new file mode 100644 index 0000000000..6a22ee1dd9 --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'lfc_writes', + type: 'gauge', + help: 'lfc_writes', + key_labels: null, + values: [ + 'lfc_writes', + ], + query: importstr 'sql_exporter/lfc_writes.sql', +} diff --git a/compute/etc/sql_exporter/lfc_writes.sql b/compute/etc/sql_exporter/lfc_writes.sql new file mode 100644 index 0000000000..37c9abc9cf --- /dev/null +++ b/compute/etc/sql_exporter/lfc_writes.sql @@ -0,0 +1 @@ +SELECT lfc_value AS lfc_writes FROM neon.neon_lfc_stats WHERE lfc_key = 'file_cache_writes'; diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet new file mode 100644 index 0000000000..8ef31b5d8d --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.libsonnet @@ -0,0 +1,15 @@ +// Number of slots is limited by max_replication_slots, so collecting position +// for all of them shouldn't be bad. 
+ +{ + metric_name: 'logical_slot_restart_lsn', + type: 'gauge', + help: 'restart_lsn of logical slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'restart_lsn', + ], + query: importstr 'sql_exporter/logical_slot_restart_lsn.sql', +} diff --git a/compute/etc/sql_exporter/logical_slot_restart_lsn.sql b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql new file mode 100644 index 0000000000..1b1c038501 --- /dev/null +++ b/compute/etc/sql_exporter/logical_slot_restart_lsn.sql @@ -0,0 +1,3 @@ +SELECT slot_name, (restart_lsn - '0/0')::FLOAT8 as restart_lsn +FROM pg_replication_slots +WHERE slot_type = 'logical'; diff --git a/compute/etc/sql_exporter/max_cluster_size.libsonnet b/compute/etc/sql_exporter/max_cluster_size.libsonnet new file mode 100644 index 0000000000..1352fb77ee --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'max_cluster_size', + type: 'gauge', + help: 'neon.max_cluster_size setting', + key_labels: null, + values: [ + 'max_cluster_size', + ], + query: importstr 'sql_exporter/max_cluster_size.sql', +} diff --git a/compute/etc/sql_exporter/max_cluster_size.sql b/compute/etc/sql_exporter/max_cluster_size.sql new file mode 100644 index 0000000000..2d2355a9a7 --- /dev/null +++ b/compute/etc/sql_exporter/max_cluster_size.sql @@ -0,0 +1 @@ +SELECT setting::int AS max_cluster_size FROM pg_settings WHERE name = 'neon.max_cluster_size'; diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql new file mode 100644 index 0000000000..58998907a0 --- /dev/null +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -0,0 +1,13 @@ +WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) + +SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + getpage_wait_seconds_count numeric, + getpage_wait_seconds_sum numeric, + getpage_prefetch_requests_total numeric, + getpage_sync_requests_total numeric, + getpage_prefetch_misses_total numeric, + getpage_prefetch_discards_total numeric, + pageserver_requests_sent_total numeric, + pageserver_disconnects_total numeric, + pageserver_send_flushes_total numeric +); diff --git a/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet new file mode 100644 index 0000000000..5ad9ba078e --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_disconnects_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_disconnects_total', + type: 'counter', + help: 'Number of times that the connection to the pageserver was lost', + values: [ + 'pageserver_disconnects_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet new file mode 100644 index 0000000000..c191e2467f --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_requests_sent_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_requests_sent_total', + type: 'counter', + help: 'Number of all requests sent to the pageserver (not just GetPage requests)', + values: [ + 'pageserver_requests_sent_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet new file mode 100644 index 0000000000..9fa5f77758 --- /dev/null +++ 
b/compute/etc/sql_exporter/pageserver_send_flushes_total.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_send_flushes_total', + type: 'counter', + help: 'Number of flushes to the pageserver connection', + values: [ + 'pageserver_send_flushes_total', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.libsonnet b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet new file mode 100644 index 0000000000..46ea2f4192 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.libsonnet @@ -0,0 +1,18 @@ +{ + metric_name: 'pg_stats_userdb', + type: 'gauge', + help: 'Stats for several oldest non-system dbs', + key_labels: [ + 'datname', + ], + value_label: 'kind', + values: [ + 'db_size', + 'deadlocks', + // Rows + 'inserted', + 'updated', + 'deleted', + ], + query: importstr 'sql_exporter/pg_stats_userdb.sql', +} diff --git a/compute/etc/sql_exporter/pg_stats_userdb.sql b/compute/etc/sql_exporter/pg_stats_userdb.sql new file mode 100644 index 0000000000..00ada87370 --- /dev/null +++ b/compute/etc/sql_exporter/pg_stats_userdb.sql @@ -0,0 +1,10 @@ +-- We export stats for 10 non-system databases. Without this limit it is too +-- easy to abuse the system by creating lots of databases. + +SELECT pg_database_size(datname) AS db_size, deadlocks, tup_inserted AS inserted, + tup_updated AS updated, tup_deleted AS deleted, datname +FROM pg_stat_database +WHERE datname IN ( + SELECT datname FROM pg_database + WHERE datname <> 'postgres' AND NOT datistemplate ORDER BY oid LIMIT 10 +); diff --git a/compute/etc/sql_exporter/replication_delay_bytes.libsonnet b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet new file mode 100644 index 0000000000..3e5bb6af1f --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_bytes', + type: 'gauge', + help: 'Bytes between received and replayed LSN', + key_labels: null, + values: [ + 'replication_delay_bytes', + ], + query: importstr 'sql_exporter/replication_delay_bytes.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_bytes.sql b/compute/etc/sql_exporter/replication_delay_bytes.sql new file mode 100644 index 0000000000..60a6981acd --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_bytes.sql @@ -0,0 +1,6 @@ +-- We use a GREATEST call here because this calculation can be negative. The +-- calculation is not atomic, meaning after we've gotten the receive LSN, the +-- replay LSN may have advanced past the receive LSN we are using for the +-- calculation. 
+ +SELECT GREATEST(0, pg_wal_lsn_diff(pg_last_wal_receive_lsn(), pg_last_wal_replay_lsn())) AS replication_delay_bytes; diff --git a/compute/etc/sql_exporter/replication_delay_seconds.libsonnet b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet new file mode 100644 index 0000000000..d3f2c21b54 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'replication_delay_seconds', + type: 'gauge', + help: 'Time since last LSN was replayed', + key_labels: null, + values: [ + 'replication_delay_seconds', + ], + query: importstr 'sql_exporter/replication_delay_seconds.sql', +} diff --git a/compute/etc/sql_exporter/replication_delay_seconds.sql b/compute/etc/sql_exporter/replication_delay_seconds.sql new file mode 100644 index 0000000000..a76809ad74 --- /dev/null +++ b/compute/etc/sql_exporter/replication_delay_seconds.sql @@ -0,0 +1,5 @@ +SELECT + CASE + WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 + ELSE GREATEST(0, EXTRACT (EPOCH FROM now() - pg_last_xact_replay_timestamp())) + END AS replication_delay_seconds; diff --git a/compute/etc/sql_exporter/retained_wal.libsonnet b/compute/etc/sql_exporter/retained_wal.libsonnet new file mode 100644 index 0000000000..f9eff5faa5 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'retained_wal', + type: 'gauge', + help: 'Retained WAL in inactive replication slots', + key_labels: [ + 'slot_name', + ], + values: [ + 'retained_wal', + ], + query: importstr 'sql_exporter/retained_wal.sql', +} diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql new file mode 100644 index 0000000000..6c58359461 --- /dev/null +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -0,0 +1,5 @@ +SELECT + slot_name, + pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal +FROM pg_replication_slots +WHERE active = false; diff --git a/compute/etc/sql_exporter/wal_is_lost.libsonnet b/compute/etc/sql_exporter/wal_is_lost.libsonnet new file mode 100644 index 0000000000..3cd25f4b39 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'wal_is_lost', + type: 'gauge', + help: 'Whether or not the replication slot wal_status is lost', + key_labels: [ + 'slot_name', + ], + values: [ + 'wal_is_lost', + ], + query: importstr 'sql_exporter/wal_is_lost.sql', +} diff --git a/compute/etc/sql_exporter/wal_is_lost.sql b/compute/etc/sql_exporter/wal_is_lost.sql new file mode 100644 index 0000000000..5521270851 --- /dev/null +++ b/compute/etc/sql_exporter/wal_is_lost.sql @@ -0,0 +1,7 @@ +SELECT + slot_name, + CASE + WHEN wal_status = 'lost' THEN 1 + ELSE 0 + END AS wal_is_lost +FROM pg_replication_slots; diff --git a/compute/etc/sql_exporter_autoscaling.yml b/compute/etc/sql_exporter_autoscaling.yml deleted file mode 100644 index 044557233e..0000000000 --- a/compute/etc/sql_exporter_autoscaling.yml +++ /dev/null @@ -1,33 +0,0 @@ -# Configuration for sql_exporter for autoscaling-agent -# Global defaults. -global: - # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s. - scrape_timeout: 10s - # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first. - scrape_timeout_offset: 500ms - # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape. - min_interval: 0s - # Maximum number of open connections to any one target. 
Metric queries will run concurrently on multiple connections, - # as will concurrent scrapes. - max_connections: 1 - # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should - # always be the same as max_connections. - max_idle_connections: 1 - # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse. - # If 0, connections are not closed due to a connection's age. - max_connection_lifetime: 5m - -# The target to monitor and the collectors to execute on it. -target: - # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) - # the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' - - # Collectors (referenced by name) to execute on the target. - # Glob patterns are supported (see for syntax). - collectors: [neon_collector_autoscaling] - -# Collector files specifies a list of globs. One collector definition is read from each matching file. -# Glob patterns are supported (see for syntax). -collector_files: - - "neon_collector_autoscaling.yml" From f1eb7032569c35ec47806c5e736486508d559439 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 15 Oct 2024 16:35:21 -0400 Subject: [PATCH 007/239] fix(pageserver): use a buffer for basebackup; add aux basebackup metrics log (#9401) Our replication bench project is stuck because it is too slow to generate basebackup and it caused compute to disconnect. https://neondb.slack.com/archives/C03438W3FLZ/p1728330685012419 The compute timeout for waiting for basebackup is 10m (is it true?). Generating basebackup directly on pageserver takes ~3min. Therefore, I suspect it's because there are too many wasted round-trip time for writing the 10000+ snapshot aux files. Also, it is possible that the basebackup process takes too long time retrieving all aux files that it did not write anything over the wire protocol, causing a read timeout. Basebackup size is 800KB gzipped for that project and was 55MB tar before compression. ## Summary of changes * Potentially fix the issue by placing a write buffer for basebackup. * Log how many aux files did we read + the time spent on it. Signed-off-by: Alex Chi Z --- pageserver/src/basebackup.rs | 21 +++++++++++++++++---- pageserver/src/page_service.rs | 10 +++++++--- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index a32d09f3b3..975318419f 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -16,7 +16,7 @@ use fail::fail_point; use pageserver_api::key::Key; use postgres_ffi::pg_constants; use std::fmt::Write as FmtWrite; -use std::time::SystemTime; +use std::time::{Instant, SystemTime}; use tokio::io; use tokio::io::AsyncWrite; use tracing::*; @@ -352,12 +352,25 @@ where } } - for (path, content) in self + let start_time = Instant::now(); + let aux_files = self .timeline .list_aux_files(self.lsn, self.ctx) .await - .map_err(|e| BasebackupError::Server(e.into()))? 
- { + .map_err(|e| BasebackupError::Server(e.into()))?; + let aux_scan_time = start_time.elapsed(); + let aux_estimated_size = aux_files + .values() + .map(|content| content.len()) + .sum::(); + info!( + "Scanned {} aux files in {}ms, aux file content size = {}", + aux_files.len(), + aux_scan_time.as_millis(), + aux_estimated_size + ); + + for (path, content) in aux_files { if path.starts_with("pg_replslot") { let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 8fa6b9a7f0..afb2f92ff8 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -26,8 +26,8 @@ use std::str::FromStr; use std::sync::Arc; use std::time::SystemTime; use std::time::{Duration, Instant}; -use tokio::io::AsyncWriteExt; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::io::{AsyncWriteExt, BufWriter}; use tokio::task::JoinHandle; use tokio_util::sync::CancellationToken; use tracing::*; @@ -1137,10 +1137,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } else { - let mut writer = pgb.copyout_writer(); + let mut writer = BufWriter::new(pgb.copyout_writer()); if gzip { let mut encoder = GzipEncoder::with_quality( - writer, + &mut writer, // NOTE using fast compression because it's on the critical path // for compute startup. For an empty database, we get // <100KB with this method. The Level::Best compression method @@ -1175,6 +1175,10 @@ impl PageServerHandler { .await .map_err(map_basebackup_error)?; } + writer + .flush() + .await + .map_err(|e| map_basebackup_error(BasebackupError::Client(e)))?; } pgb.write_message_noflush(&BeMessage::CopyDone) From 18f4e5f10cd1eeaa5a5949f9a6130983691311d6 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Tue, 15 Oct 2024 23:13:31 +0200 Subject: [PATCH 008/239] Add newly added metrics from neondatabase/neon#9116 to exports (#9402) They weren't added in that PR, but should be available immediately on rollout as the neon extension already defaults to 1.5. 
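As a quick sanity check — a hedged example, not part of this patch — the newly exported counters can be listed straight from the view the exporter scrapes, assuming the neon extension (>= 1.5) is installed in the target database and using the column names (`metric`, `bucket_le`, `value`) already referenced by the exporter queries in this series:

```sql
-- Illustrative spot-check only (not part of the patch): list the newly added
-- counters as the neon extension reports them, so a missing exporter entry is
-- easy to notice. Assumes the neon extension (>= 1.5) is installed.
SELECT metric, bucket_le, value
FROM neon.neon_perf_counters
WHERE metric LIKE 'file_cache_%wait_seconds%'
   OR metric IN ('getpage_prefetches_buffered', 'pageserver_open_requests')
ORDER BY metric, bucket_le;
```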
--- compute/etc/neon_collector.jsonnet | 8 ++++++++ .../file_cache_read_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_read_wait_seconds_bucket.sql | 1 + .../file_cache_read_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_read_wait_seconds_sum.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_bucket.libsonnet | 12 ++++++++++++ .../file_cache_write_wait_seconds_bucket.sql | 1 + .../file_cache_write_wait_seconds_count.libsonnet | 9 +++++++++ .../file_cache_write_wait_seconds_sum.libsonnet | 9 +++++++++ .../getpage_prefetches_buffered.libsonnet | 9 +++++++++ compute/etc/sql_exporter/neon_perf_counters.sql | 8 +++++++- .../sql_exporter/pageserver_open_requests.libsonnet | 9 +++++++++ 12 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet create mode 100644 compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet create mode 100644 compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet create mode 100644 compute/etc/sql_exporter/pageserver_open_requests.libsonnet diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 2031eb8c85..8b43ebe7a3 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -9,9 +9,16 @@ import 'sql_exporter/compute_subscriptions_count.libsonnet', import 'sql_exporter/connection_counts.libsonnet', import 'sql_exporter/db_total_size.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_read_wait_seconds_sum.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_count.libsonnet', + import 'sql_exporter/file_cache_write_wait_seconds_sum.libsonnet', import 'sql_exporter/getpage_prefetch_discards_total.libsonnet', import 'sql_exporter/getpage_prefetch_misses_total.libsonnet', import 'sql_exporter/getpage_prefetch_requests_total.libsonnet', + import 'sql_exporter/getpage_prefetches_buffered.libsonnet', import 'sql_exporter/getpage_sync_requests_total.libsonnet', import 'sql_exporter/getpage_wait_seconds_bucket.libsonnet', import 'sql_exporter/getpage_wait_seconds_count.libsonnet', @@ -28,6 +35,7 @@ import 'sql_exporter/pageserver_disconnects_total.libsonnet', import 'sql_exporter/pageserver_requests_sent_total.libsonnet', import 'sql_exporter/pageserver_send_flushes_total.libsonnet', + import 'sql_exporter/pageserver_open_requests.libsonnet', import 'sql_exporter/pg_stats_userdb.libsonnet', import 'sql_exporter/replication_delay_bytes.libsonnet', import 'sql_exporter/replication_delay_seconds.libsonnet', diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..d13f657a7f --- /dev/null +++ 
b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_read_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC read operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_read_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql new file mode 100644 index 0000000000..09047bf0c4 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_read_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..aa028b0f5e --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_count', + type: 'counter', + help: 'Number of read operations in LFC', + values: [ + 'file_cache_read_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2547aabf3d --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_read_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_read_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC read operations', + values: [ + 'file_cache_read_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet new file mode 100644 index 0000000000..13dbc77f76 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.libsonnet @@ -0,0 +1,12 @@ +{ + metric_name: 'file_cache_write_wait_seconds_bucket', + type: 'counter', + help: 'Histogram buckets of LFC write operation latencies', + key_labels: [ + 'bucket_le', + ], + values: [ + 'value', + ], + query: importstr 'sql_exporter/file_cache_write_wait_seconds_bucket.sql', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql new file mode 100644 index 0000000000..d03613cf91 --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_bucket.sql @@ -0,0 +1 @@ +SELECT bucket_le, value FROM neon.neon_perf_counters WHERE metric = 'file_cache_write_wait_seconds_bucket'; diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet new file mode 100644 index 0000000000..6227d3193a --- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_count.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_count', + type: 'counter', + help: 'Number of write operations in LFC', + values: [ + 'file_cache_write_wait_seconds_count', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet new file mode 100644 index 0000000000..2acfe7f608 
--- /dev/null +++ b/compute/etc/sql_exporter/file_cache_write_wait_seconds_sum.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'file_cache_write_wait_seconds_sum', + type: 'counter', + help: 'Time spent in LFC write operations', + values: [ + 'file_cache_write_wait_seconds_sum', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet new file mode 100644 index 0000000000..8926d867c9 --- /dev/null +++ b/compute/etc/sql_exporter/getpage_prefetches_buffered.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'getpage_prefetches_buffered', + type: 'gauge', + help: 'Number of prefetched pages buffered in neon', + values: [ + 'getpage_prefetches_buffered', + ], + query_ref: 'neon_perf_counters', +} diff --git a/compute/etc/sql_exporter/neon_perf_counters.sql b/compute/etc/sql_exporter/neon_perf_counters.sql index 58998907a0..4a36f3bf2f 100644 --- a/compute/etc/sql_exporter/neon_perf_counters.sql +++ b/compute/etc/sql_exporter/neon_perf_counters.sql @@ -1,13 +1,19 @@ WITH c AS (SELECT pg_catalog.jsonb_object_agg(metric, value) jb FROM neon.neon_perf_counters) SELECT d.* FROM pg_catalog.jsonb_to_record((SELECT jb FROM c)) AS d( + file_cache_read_wait_seconds_count numeric, + file_cache_read_wait_seconds_sum numeric, + file_cache_write_wait_seconds_count numeric, + file_cache_write_wait_seconds_sum numeric, getpage_wait_seconds_count numeric, getpage_wait_seconds_sum numeric, getpage_prefetch_requests_total numeric, getpage_sync_requests_total numeric, getpage_prefetch_misses_total numeric, getpage_prefetch_discards_total numeric, + getpage_prefetches_buffered numeric, pageserver_requests_sent_total numeric, pageserver_disconnects_total numeric, - pageserver_send_flushes_total numeric + pageserver_send_flushes_total numeric, + pageserver_open_requests numeric ); diff --git a/compute/etc/sql_exporter/pageserver_open_requests.libsonnet b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet new file mode 100644 index 0000000000..dca89ea64a --- /dev/null +++ b/compute/etc/sql_exporter/pageserver_open_requests.libsonnet @@ -0,0 +1,9 @@ +{ + metric_name: 'pageserver_open_requests', + type: 'gauge', + help: 'Number of open requests to PageServer', + values: [ + 'pageserver_open_requests', + ], + query_ref: 'neon_perf_counters', +} From be5d6a69dc6a05d339235d00958eb9fea7b0e9f5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 16:30:31 -0500 Subject: [PATCH 009/239] Fix jsonnet_files wildcard Just a typo in a path. Signed-off-by: Tristan Partin --- compute/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/compute/Makefile b/compute/Makefile index 45fbfa6d5e..b407fc60be 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -1,4 +1,6 @@ -jsonnet_files = $(wildcard etc/*.jsonnet etc/*.libsonnet) +jsonnet_files = $(wildcard \ + etc/*.jsonnet \ + etc/sql_exporter/*.libsonnet) .PHONY: all all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml From 061ea0de7a9768716d941e2e3472f19e075a5ce5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 15 Oct 2024 20:01:13 -0500 Subject: [PATCH 010/239] Add jsonnetfmt targets This should make it a little bit easier for people wanting to check if their files are formated correctly. Has the added bonus of making the CI check simpler as well. 
Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 3 +-- compute/Makefile | 8 ++++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c9a447626f..faee1d89e1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -136,8 +136,7 @@ jobs: - name: Check Jsonnet code formatting run: | - jsonnetfmt --test \ - $(find . -type f -name '*.jsonnet' -o -name '*.libsonnet') + make -C compute jsonnetfmt-test # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. diff --git a/compute/Makefile b/compute/Makefile index b407fc60be..f8faa882ee 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -35,3 +35,11 @@ clean: etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ etc/sql_exporter_autoscaling.yml + +.PHONY: jsonnetfmt-test +jsonnetfmt-test: + jsonnetfmt --test $(jsonnet_files) + +.PHONY: jsonnetfmt-format +jsonnetfmt-format: + jsonnetfmt --in-place $(jsonnet_files) From bc6b8cee01cc4055332fef052c048856612bcbab Mon Sep 17 00:00:00 2001 From: Cihan Demirci <128653800+fcdm@users.noreply.github.com> Date: Wed, 16 Oct 2024 10:43:48 +0100 Subject: [PATCH 011/239] don't trigger workflows in two repos (#9340) https://github.com/neondatabase/cloud/issues/16723 --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index faee1d89e1..b669eaeb11 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1100,7 +1100,6 @@ jobs: run: | if [[ "$GITHUB_REF_NAME" == "main" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false - gh workflow --repo neondatabase/azure run deploy.yml -f dockerTag=${{needs.tag.outputs.build-tag}} elif [[ "$GITHUB_REF_NAME" == "release" ]]; then gh workflow --repo neondatabase/infra run deploy-dev.yml --ref main \ -f deployPgSniRouter=false \ From 89a65a9e5a30c7525d165d1a9c2675d05811bfcb Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 13:39:58 +0100 Subject: [PATCH 012/239] pageserver: improve handling of archival_config calls during Timeline shutdown (#9415) ## Problem In test `test_timeline_offloading`, we see failures like: ``` PageserverApiException: queue is in state Stopped ``` Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/main/11356917668/index.html#testresult/ff0e348a78a974ee/retries ## Summary of changes - Amend code paths that handle errors from RemoteTimelineClient to check for cancellation and emit the Cancelled error variant in these cases (will give clients a 503 to retry) - Remove the implicit `#[from]` for the Other error case, to make it harder to add code that accidentally squashes errors into this (500-equivalent) error variant. This would be neater if we made RemoteTimelineClient return a structured error instead of anyhow::Error, but that's a bigger refactor. I'm not sure if the test really intends to hit this path, but the error handling fix makes sense either way. 
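The mapping described above can be summarized with a small self-contained sketch (illustrative names only, not the pageserver's exact types or call sites): an error observed while the timeline is shutting down is reported as `Cancelled` (a retryable 503 for the client) instead of being squashed into the catch-all `Other` (500) variant, which is what the `tenant.rs` change below does with `TimelineArchivalError`.

```rust
use anyhow::anyhow;

// Sketch of the cancellation-aware error mapping; names are illustrative.
#[derive(Debug)]
enum ArchivalError {
    Cancelled,            // surfaced as 503, client may retry
    Other(anyhow::Error), // genuine internal error, surfaced as 500
}

fn map_archival_error(err: anyhow::Error, shutting_down: bool) -> ArchivalError {
    if shutting_down {
        ArchivalError::Cancelled
    } else {
        ArchivalError::Other(err)
    }
}

fn main() {
    let mapped = map_archival_error(anyhow!("queue is in state Stopped"), true);
    assert!(matches!(mapped, ArchivalError::Cancelled));
}
```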
--- pageserver/src/tenant.rs | 30 ++++++++++++++++++++++++------ 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 44d1bb74ca..20925c7fd6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -67,7 +67,7 @@ use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; use self::remote_timeline_client::upload::upload_index_part; -use self::remote_timeline_client::RemoteTimelineClient; +use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; @@ -632,7 +632,7 @@ pub enum TimelineArchivalError { AlreadyInProgress, #[error(transparent)] - Other(#[from] anyhow::Error), + Other(anyhow::Error), } impl Debug for TimelineArchivalError { @@ -1602,7 +1602,8 @@ impl Tenant { "failed to load remote timeline {} for tenant {}", timeline_id, self.tenant_shard_id ) - })?; + }) + .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); if let Some(timeline) = timelines.get(&timeline_id) { let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); @@ -1672,9 +1673,19 @@ impl Tenant { }; // Third part: upload new timeline archival state and block until it is present in S3 - let upload_needed = timeline + let upload_needed = match timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(new_state)?; + .schedule_index_upload_for_timeline_archival_state(new_state) + { + Ok(upload_needed) => upload_needed, + Err(e) => { + if timeline.cancel.is_cancelled() { + return Err(TimelineArchivalError::Cancelled); + } else { + return Err(TimelineArchivalError::Other(e)); + } + } + }; if upload_needed { info!("Uploading new state"); @@ -1685,7 +1696,14 @@ impl Tenant { tracing::warn!("reached timeout for waiting on upload queue"); return Err(TimelineArchivalError::Timeout); }; - v.map_err(|e| TimelineArchivalError::Other(anyhow::anyhow!(e)))?; + v.map_err(|e| match e { + WaitCompletionError::NotInitialized(e) => { + TimelineArchivalError::Other(anyhow::anyhow!(e)) + } + WaitCompletionError::UploadQueueShutDownOrStopped => { + TimelineArchivalError::Cancelled + } + })?; } Ok(()) } From f14e45f0cee38bfbbbf1141d486fdd8edfbcc2f2 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 15:01:56 +0200 Subject: [PATCH 013/239] proxy: format imports with nightly rustfmt (#9414) ```shell cargo +nightly fmt -p proxy -- -l --config imports_granularity=Module,group_imports=StdExternalCrate,reorder_imports=true ``` These rust-analyzer settings for VSCode should help retain this style: ```json "rust-analyzer.imports.group.enable": true, "rust-analyzer.imports.prefix": "crate", "rust-analyzer.imports.merge.glob": false, "rust-analyzer.imports.granularity.group": "module", "rust-analyzer.imports.granularity.enforce": true, ``` --- proxy/src/auth/backend/classic.rs | 19 +++-- proxy/src/auth/backend/console_redirect.rs | 21 +++-- proxy/src/auth/backend/hacks.rs | 19 +++-- proxy/src/auth/backend/jwt.rs | 39 +++++---- proxy/src/auth/backend/local.rs | 19 ++--- proxy/src/auth/backend/mod.rs | 61 ++++++-------- proxy/src/auth/credentials.rs | 25 +++--- proxy/src/auth/flow.rs | 25 +++--- proxy/src/auth/mod.rs | 12 +-- proxy/src/bin/local_proxy.rs | 50 ++++++------ proxy/src/bin/pg_sni_router.rs | 16 ++-- proxy/src/bin/proxy.rs | 51 +++++------- 
proxy/src/cache/endpoints.rs | 34 +++----- proxy/src/cache/project_info.rs | 27 +++---- proxy/src/cache/timed_lru.rs | 13 ++- proxy/src/cancellation.rs | 14 ++-- proxy/src/compute.rs | 30 ++++--- proxy/src/config.rs | 37 ++++----- proxy/src/console_redirect_proxy.rs | 29 +++---- proxy/src/context/mod.rs | 21 ++--- proxy/src/context/parquet.rs | 49 ++++++------ proxy/src/control_plane/messages.rs | 9 ++- proxy/src/control_plane/mgmt.rs | 10 +-- proxy/src/control_plane/provider/mock.rs | 39 ++++----- proxy/src/control_plane/provider/mod.rs | 47 +++++------ proxy/src/control_plane/provider/neon.rs | 42 +++++----- proxy/src/error.rs | 3 +- proxy/src/http/health_server.rs | 25 +++--- proxy/src/http/mod.rs | 19 +++-- proxy/src/intern.rs | 14 ++-- proxy/src/jemalloc.rs | 16 ++-- proxy/src/logging.rs | 16 ++-- proxy/src/metrics.rs | 8 +- proxy/src/protocol2.rs | 10 +-- proxy/src/proxy/connect_compute.rs | 29 ++++--- proxy/src/proxy/copy_bidirectional.rs | 9 ++- proxy/src/proxy/handshake.rs | 20 +++-- proxy/src/proxy/mod.rs | 36 ++++----- proxy/src/proxy/passthrough.rs | 14 ++-- proxy/src/proxy/retry.rs | 8 +- proxy/src/proxy/tests/mitm.rs | 3 +- proxy/src/proxy/tests/mod.rs | 22 ++--- proxy/src/proxy/wake_compute.rs | 11 +-- proxy/src/rate_limiter/leaky_bucket.rs | 6 +- proxy/src/rate_limiter/limit_algorithm.rs | 12 +-- .../src/rate_limiter/limit_algorithm/aimd.rs | 3 +- proxy/src/rate_limiter/limiter.rs | 24 +++--- proxy/src/rate_limiter/mod.rs | 4 +- proxy/src/redis/cancellation_publisher.rs | 7 +- .../connection_with_credentials_provider.rs | 9 +-- proxy/src/redis/notifications.rs | 17 ++-- proxy/src/sasl/messages.rs | 3 +- proxy/src/sasl/mod.rs | 5 +- proxy/src/sasl/stream.rs | 7 +- proxy/src/scram/countmin.rs | 4 +- proxy/src/scram/exchange.rs | 3 +- proxy/src/scram/messages.rs | 5 +- proxy/src/scram/mod.rs | 15 ++-- proxy/src/scram/pbkdf2.rs | 10 +-- proxy/src/scram/threadpool.rs | 32 +++----- proxy/src/serverless/backend.rs | 58 ++++++-------- proxy/src/serverless/cancel_set.rs | 8 +- proxy/src/serverless/conn_pool.rs | 44 +++++----- proxy/src/serverless/http_conn_pool.rs | 17 ++-- proxy/src/serverless/http_util.rs | 7 +- proxy/src/serverless/json.rs | 9 +-- proxy/src/serverless/local_conn_pool.rs | 25 +++--- proxy/src/serverless/mod.rs | 19 +++-- proxy/src/serverless/sql_over_http.rs | 80 ++++++------------- proxy/src/serverless/websocket.rs | 41 ++++------ proxy/src/stream.rs | 17 ++-- proxy/src/usage_metrics.rs | 41 +++++----- proxy/src/waiters.rs | 8 +- 73 files changed, 726 insertions(+), 835 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 94b84b6f00..de32a06e9e 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo}; -use crate::{ - auth::{self, backend::ComputeCredentialKeys, AuthFlow}, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - sasl, - stream::{PqStream, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::stream::{PqStream, Stream}; +use crate::{compute, sasl}; + pub(super) async fn authenticate( ctx: &RequestMonitoring, creds: ComputeUserInfo, diff --git 
a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 457410ec8c..255e1fed54 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,15 +1,3 @@ -use crate::{ - auth, - cache::Cached, - compute, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{self, provider::NodeInfo, CachedNodeInfo}, - error::{ReportableError, UserFacingError}, - proxy::connect_compute::ComputeConnectBackend, - stream::PqStream, - waiters, -}; use async_trait::async_trait; use pq_proto::BeMessage as Be; use thiserror::Error; @@ -18,6 +6,15 @@ use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; +use crate::cache::Cached; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::provider::NodeInfo; +use crate::control_plane::{self, CachedNodeInfo}; +use crate::error::{ReportableError, UserFacingError}; +use crate::proxy::connect_compute::ComputeConnectBackend; +use crate::stream::PqStream; +use crate::{auth, compute, waiters}; #[derive(Debug, Error)] pub(crate) enum WebAuthError { diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 749218d260..8ab8d5d37f 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,16 +1,15 @@ -use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; -use crate::{ - auth::{self, AuthFlow}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - stream::{self, Stream}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; +use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; +use crate::auth::{self, AuthFlow}; +use crate::config::AuthenticationConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::stream::{self, Stream}; + /// Compared to [SCRAM](crate::scram), cleartext password auth saves /// one round trip and *expensive* computations (>= 4096 HMAC iterations). /// These properties are benefical for serverless JS workers, so we diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 402e59fdb3..3f53ee24c3 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,22 +1,22 @@ -use std::{ - future::Future, - sync::Arc, - time::{Duration, SystemTime}, -}; +use std::future::Future; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; -use serde::{de::Visitor, Deserialize, Deserializer}; +use serde::de::Visitor; +use serde::{Deserialize, Deserializer}; use signature::Verifier; use thiserror::Error; use tokio::time::Instant; -use crate::{ - auth::backend::ComputeCredentialKeys, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, http::parse_json_body_with_limit, - intern::RoleNameInt, EndpointId, RoleName, -}; +use crate::auth::backend::ComputeCredentialKeys; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::http::parse_json_body_with_limit; +use crate::intern::RoleNameInt; +use crate::{EndpointId, RoleName}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -381,10 +381,8 @@ fn verify_rsa_signature( alg: &jose_jwa::Algorithm, ) -> Result<(), JwtError> { use jose_jwa::{Algorithm, Signing}; - use rsa::{ - pkcs1v15::{Signature, VerifyingKey}, - RsaPublicKey, - }; + use rsa::pkcs1v15::{Signature, VerifyingKey}; + use rsa::RsaPublicKey; let key = RsaPublicKey::try_from(key).map_err(JwtError::InvalidRsaKey)?; @@ -655,11 +653,9 @@ impl From<&jose_jwk::Key> for KeyType { #[cfg(test)] mod tests { - use crate::RoleName; - - use super::*; - - use std::{future::IntoFuture, net::SocketAddr, time::SystemTime}; + use std::future::IntoFuture; + use std::net::SocketAddr; + use std::time::SystemTime; use base64::URL_SAFE_NO_PAD; use bytes::Bytes; @@ -672,6 +668,9 @@ mod tests { use signature::Signer; use tokio::net::TcpListener; + use super::*; + use crate::RoleName; + fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); let pk = sk.public_key().into(); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 1dea4d2d73..e3995ac6c0 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -2,19 +2,14 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; -use crate::{ - auth::backend::jwt::FetchAuthRulesError, - compute::ConnCfg, - context::RequestMonitoring, - control_plane::{ - messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}, - NodeInfo, - }, - intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}, - EndpointId, -}; - use super::jwt::{AuthRule, FetchAuthRules}; +use crate::auth::backend::jwt::FetchAuthRulesError; +use crate::compute::ConnCfg; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; +use crate::control_plane::NodeInfo; +use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::EndpointId; pub struct LocalBackend { pub(crate) node_info: NodeInfo, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7cf158bcd9..a4db130b61 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -17,29 +17,22 @@ use tokio_postgres::config::AuthKeys; use tracing::{info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; -use crate::auth::{validate_password_and_exchange, AuthError}; +use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserInfoMaybeEndpoint}; use crate::cache::Cached; +use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{CachedRoleSecret, ControlPlaneBackend}; -use crate::control_plane::AuthSecret; +use crate::control_plane::provider::{ + CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +}; +use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{ - auth::{self, ComputeUserInfoMaybeEndpoint}, - config::AuthenticationConfig, - control_plane::{ - self, - provider::{CachedAllowedIps, CachedNodeInfo}, - Api, - }, - stream, -}; -use crate::{scram, EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; /// 
Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -500,34 +493,32 @@ impl ComputeConnectBackend for Backend<'_, ComputeCredentials> { #[cfg(test)] mod tests { - use std::{net::IpAddr, sync::Arc, time::Duration}; + use std::net::IpAddr; + use std::sync::Arc; + use std::time::Duration; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use once_cell::sync::Lazy; - use postgres_protocol::{ - authentication::sasl::{ChannelBinding, ScramSha256}, - message::{backend::Message as PgMessage, frontend}, - }; + use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; + use postgres_protocol::message::backend::Message as PgMessage; + use postgres_protocol::message::frontend; use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; - use crate::{ - auth::{backend::MaskedIp, ComputeUserInfoMaybeEndpoint, IpPattern}, - config::AuthenticationConfig, - context::RequestMonitoring, - control_plane::{ - self, - provider::{self, CachedAllowedIps, CachedRoleSecret}, - CachedNodeInfo, - }, - proxy::NeonOptions, - rate_limiter::{EndpointRateLimiter, RateBucketInfo}, - scram::{threadpool::ThreadPool, ServerSecret}, - stream::{PqStream, Stream}, - }; - - use super::{auth_quirks, jwt::JwkCache, AuthRateLimiter}; + use super::jwt::JwkCache; + use super::{auth_quirks, AuthRateLimiter}; + use crate::auth::backend::MaskedIp; + use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; + use crate::config::AuthenticationConfig; + use crate::context::RequestMonitoring; + use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; + use crate::control_plane::{self, CachedNodeInfo}; + use crate::proxy::NeonOptions; + use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; + use crate::scram::threadpool::ThreadPool; + use crate::scram::ServerSecret; + use crate::stream::{PqStream, Stream}; struct Auth { ips: Vec, diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index cba8601d14..fa6bc4c6f5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -1,20 +1,22 @@ //! User credentials used in authentication. -use crate::{ - auth::password_hack::parse_endpoint_param, - context::RequestMonitoring, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, SniKind}, - proxy::NeonOptions, - serverless::SERVERLESS_DRIVER_SNI, - EndpointId, RoleName, -}; +use std::collections::HashSet; +use std::net::IpAddr; +use std::str::FromStr; + use itertools::Itertools; use pq_proto::StartupMessageParams; -use std::{collections::HashSet, net::IpAddr, str::FromStr}; use thiserror::Error; use tracing::{info, warn}; +use crate::auth::password_hack::parse_endpoint_param; +use crate::context::RequestMonitoring; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, SniKind}; +use crate::proxy::NeonOptions; +use crate::serverless::SERVERLESS_DRIVER_SNI; +use crate::{EndpointId, RoleName}; + #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { #[error("Parameter '{0}' is missing in startup packet.")] @@ -249,10 +251,11 @@ fn project_name_valid(name: &str) -> bool { #[cfg(test)] mod tests { - use super::*; use serde_json::json; use ComputeUserInfoParseError::*; + use super::*; + #[test] fn parse_bare_minimum() -> anyhow::Result<()> { // According to postgresql, only `user` should be required. 
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9a5139dfb8..ccb17b66b9 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -1,21 +1,24 @@ //! Main authentication flow. -use super::{backend::ComputeCredentialKeys, AuthErrorImpl, PasswordHackPayload}; -use crate::{ - config::TlsServerEndPoint, - context::RequestMonitoring, - control_plane::AuthSecret, - intern::EndpointIdInt, - sasl, - scram::{self, threadpool::ThreadPool}, - stream::{PqStream, Stream}, -}; +use std::io; +use std::sync::Arc; + use postgres_protocol::authentication::sasl::{SCRAM_SHA_256, SCRAM_SHA_256_PLUS}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be}; -use std::{io, sync::Arc}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::backend::ComputeCredentialKeys; +use super::{AuthErrorImpl, PasswordHackPayload}; +use crate::config::TlsServerEndPoint; +use crate::context::RequestMonitoring; +use crate::control_plane::AuthSecret; +use crate::intern::EndpointIdInt; +use crate::sasl; +use crate::scram::threadpool::ThreadPool; +use crate::scram::{self}; +use crate::stream::{PqStream, Stream}; + /// Every authentication selector is supposed to implement this trait. pub(crate) trait AuthMethod { /// Any authentication selector should provide initial backend message diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 0c8686add2..ff97e6c35d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -14,15 +14,15 @@ pub(crate) use password_hack::parse_endpoint_param; use password_hack::PasswordHackPayload; mod flow; +use std::io; +use std::net::IpAddr; + pub(crate) use flow::*; +use thiserror::Error; use tokio::time::error::Elapsed; -use crate::{ - control_plane, - error::{ReportableError, UserFacingError}, -}; -use std::{io, net::IpAddr}; -use thiserror::Error; +use crate::control_plane; +use crate::error::{ReportableError, UserFacingError}; /// Convenience wrapper for the authentication error. 
pub(crate) type Result = std::result::Result; diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index c92ebbc51f..e6bc369d9a 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -1,41 +1,43 @@ -use std::{net::SocketAddr, pin::pin, str::FromStr, sync::Arc, time::Duration}; +use std::net::SocketAddr; +use std::pin::pin; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; use anyhow::{bail, ensure, Context}; use camino::{Utf8Path, Utf8PathBuf}; use compute_api::spec::LocalProxySpec; use dashmap::DashMap; use futures::future::Either; -use proxy::{ - auth::{ - self, - backend::{ - jwt::JwkCache, - local::{LocalBackend, JWKS_ROLE_MAP}, - }, - }, - cancellation::CancellationHandlerMain, - config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}, - control_plane::{ - locks::ApiLocks, - messages::{EndpointJwksResponse, JwksSettings}, - }, - http::health_server::AppMetrics, - intern::RoleNameInt, - metrics::{Metrics, ThreadPoolMetrics}, - rate_limiter::{BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo}, - scram::threadpool::ThreadPool, - serverless::{self, cancel_set::CancelSet, GlobalConnPoolOptions}, - RoleName, +use proxy::auth::backend::jwt::JwkCache; +use proxy::auth::backend::local::{LocalBackend, JWKS_ROLE_MAP}; +use proxy::auth::{self}; +use proxy::cancellation::CancellationHandlerMain; +use proxy::config::{self, AuthenticationConfig, HttpConfig, ProxyConfig, RetryConfig}; +use proxy::control_plane::locks::ApiLocks; +use proxy::control_plane::messages::{EndpointJwksResponse, JwksSettings}; +use proxy::http::health_server::AppMetrics; +use proxy::intern::RoleNameInt; +use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::rate_limiter::{ + BucketRateLimiter, EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, }; +use proxy::scram::threadpool::ThreadPool; +use proxy::serverless::cancel_set::CancelSet; +use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); use clap::Parser; -use tokio::{net::TcpListener, sync::Notify, task::JoinSet}; +use tokio::net::TcpListener; +use tokio::sync::Notify; +use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn}; -use utils::{pid_file, project_build_tag, project_git_version, sentry_init::init_sentry}; +use utils::sentry_init::init_sentry; +use utils::{pid_file, project_build_tag, project_git_version}; #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 53f1586abe..00eb830d98 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -5,25 +5,23 @@ /// the outside. Similar to an ingress controller for HTTPS. 
use std::{net::SocketAddr, sync::Arc}; +use anyhow::{anyhow, bail, ensure, Context}; +use clap::Arg; use futures::future::Either; +use futures::TryFutureExt; use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; -use rustls::pki_types::PrivateKeyDer; -use tokio::net::TcpListener; - -use anyhow::{anyhow, bail, ensure, Context}; -use clap::Arg; -use futures::TryFutureExt; use proxy::stream::{PqStream, Stream}; - +use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::TcpListener; use tokio_util::sync::CancellationToken; -use utils::{project_git_version, sentry_init::init_sentry}; - use tracing::{error, info, Instrument}; +use utils::project_git_version; +use utils::sentry_init::init_sentry; project_git_version!(GIT_VERSION); diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3c0e66dec3..96a71e69c6 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -1,3 +1,8 @@ +use std::net::SocketAddr; +use std::pin::pin; +use std::sync::Arc; + +use anyhow::bail; use aws_config::environment::EnvironmentVariableCredentialsProvider; use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; @@ -7,52 +12,34 @@ use aws_config::provider_config::ProviderConfig; use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; use aws_config::Region; use futures::future::Either; -use proxy::auth; use proxy::auth::backend::jwt::JwkCache; -use proxy::auth::backend::AuthRateLimiter; -use proxy::auth::backend::ConsoleRedirectBackend; -use proxy::auth::backend::MaybeOwned; -use proxy::cancellation::CancelMap; -use proxy::cancellation::CancellationHandler; -use proxy::config::remote_storage_from_toml; -use proxy::config::AuthenticationConfig; -use proxy::config::CacheOptions; -use proxy::config::HttpConfig; -use proxy::config::ProjectInfoCacheOptions; -use proxy::config::ProxyProtocolV2; +use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; +use proxy::cancellation::{CancelMap, CancellationHandler}; +use proxy::config::{ + self, remote_storage_from_toml, AuthenticationConfig, CacheOptions, HttpConfig, + ProjectInfoCacheOptions, ProxyConfig, ProxyProtocolV2, +}; use proxy::context::parquet::ParquetUploadArgs; -use proxy::control_plane; -use proxy::http; use proxy::http::health_server::AppMetrics; use proxy::metrics::Metrics; -use proxy::rate_limiter::EndpointRateLimiter; -use proxy::rate_limiter::LeakyBucketConfig; -use proxy::rate_limiter::RateBucketInfo; -use proxy::rate_limiter::WakeComputeRateLimiter; +use proxy::rate_limiter::{ + EndpointRateLimiter, LeakyBucketConfig, RateBucketInfo, WakeComputeRateLimiter, +}; use proxy::redis::cancellation_publisher::RedisPublisherClient; use proxy::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use proxy::redis::elasticache; -use proxy::redis::notifications; +use proxy::redis::{elasticache, notifications}; use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::GlobalConnPoolOptions; -use proxy::usage_metrics; - -use anyhow::bail; -use proxy::config::{self, ProxyConfig}; -use proxy::serverless; +use proxy::{auth, control_plane, http, serverless, usage_metrics}; use remote_storage::RemoteStorageConfig; -use std::net::SocketAddr; -use 
std::pin::pin; -use std::sync::Arc; use tokio::net::TcpListener; use tokio::sync::Mutex; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; -use tracing::info; -use tracing::warn; -use tracing::Instrument; -use utils::{project_build_tag, project_git_version, sentry_init::init_sentry}; +use tracing::{info, warn, Instrument}; +use utils::sentry_init::init_sentry; +use utils::{project_build_tag, project_git_version}; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 27121ce89e..82f3247fa7 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,31 +1,23 @@ -use std::{ - convert::Infallible, - sync::{ - atomic::{AtomicBool, Ordering}, - Arc, - }, - time::Duration, -}; +use std::convert::Infallible; +use std::sync::atomic::{AtomicBool, Ordering}; +use std::sync::Arc; +use std::time::Duration; use dashmap::DashSet; -use redis::{ - streams::{StreamReadOptions, StreamReadReply}, - AsyncCommands, FromRedisValue, Value, -}; +use redis::streams::{StreamReadOptions, StreamReadReply}; +use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; -use crate::{ - config::EndpointCacheConfig, - context::RequestMonitoring, - intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, - rate_limiter::GlobalRateLimiter, - redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider, - EndpointId, -}; +use crate::config::EndpointCacheConfig; +use crate::context::RequestMonitoring; +use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; +use crate::rate_limiter::GlobalRateLimiter; +use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use crate::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index b92cedb043..31d1dc96e7 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -1,9 +1,8 @@ -use std::{ - collections::HashSet, - convert::Infallible, - sync::{atomic::AtomicU64, Arc}, - time::Duration, -}; +use std::collections::HashSet; +use std::convert::Infallible; +use std::sync::atomic::AtomicU64; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use dashmap::DashMap; @@ -13,15 +12,12 @@ use tokio::sync::Mutex; use tokio::time::Instant; use tracing::{debug, info}; -use crate::{ - auth::IpPattern, - config::ProjectInfoCacheOptions, - control_plane::AuthSecret, - intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, - EndpointId, RoleName, -}; - use super::{Cache, Cached}; +use crate::auth::IpPattern; +use crate::config::ProjectInfoCacheOptions; +use crate::control_plane::AuthSecret; +use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; +use crate::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -371,7 +367,8 @@ impl Cache for ProjectInfoCacheImpl { #[cfg(test)] mod tests { use super::*; - use crate::{scram::ServerSecret, ProjectId}; + use crate::scram::ServerSecret; + use crate::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/cache/timed_lru.rs b/proxy/src/cache/timed_lru.rs index 5b08d74696..06eaeb9a30 100644 --- a/proxy/src/cache/timed_lru.rs +++ 
b/proxy/src/cache/timed_lru.rs @@ -1,9 +1,6 @@ -use std::{ - borrow::Borrow, - hash::Hash, - time::{Duration, Instant}, -}; -use tracing::debug; +use std::borrow::Borrow; +use std::hash::Hash; +use std::time::{Duration, Instant}; // This seems to make more sense than `lru` or `cached`: // @@ -15,8 +12,10 @@ use tracing::debug; // // On the other hand, `hashlink` has good download stats and appears to be maintained. use hashlink::{linked_hash_map::RawEntryMut, LruCache}; +use tracing::debug; -use super::{common::Cached, timed_lru, Cache}; +use super::common::Cached; +use super::{timed_lru, Cache}; /// An implementation of timed LRU cache with fixed capacity. /// Key properties: diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 71a2a16af8..db0970adcb 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -1,6 +1,8 @@ +use std::net::SocketAddr; +use std::sync::Arc; + use dashmap::DashMap; use pq_proto::CancelKeyData; -use std::{net::SocketAddr, sync::Arc}; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; @@ -8,12 +10,10 @@ use tokio_postgres::{CancelToken, NoTls}; use tracing::info; use uuid::Uuid; -use crate::{ - error::ReportableError, - metrics::{CancellationRequest, CancellationSource, Metrics}, - redis::cancellation_publisher::{ - CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, - }, +use crate::error::ReportableError; +use crate::metrics::{CancellationRequest, CancellationSource, Metrics}; +use crate::redis::cancellation_publisher::{ + CancellationPublisher, CancellationPublisherMut, RedisPublisherClient, }; pub type CancelMap = Arc>>; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 006804fcd4..212e82497f 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -1,25 +1,31 @@ -use crate::{ - auth::parse_endpoint_param, - cancellation::CancelClosure, - context::RequestMonitoring, - control_plane::{errors::WakeComputeError, messages::MetricsAuxInfo, provider::ApiLockError}, - error::{ReportableError, UserFacingError}, - metrics::{Metrics, NumDbConnectionsGuard}, - proxy::neon_option, - Host, -}; +use std::io; +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::Duration; + use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; -use rustls::{client::danger::ServerCertVerifier, pki_types::InvalidDnsNameError}; -use std::{io, net::SocketAddr, sync::Arc, time::Duration}; +use rustls::client::danger::ServerCertVerifier; +use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; +use crate::auth::parse_endpoint_param; +use crate::cancellation::CancelClosure; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::ApiLockError; +use crate::error::{ReportableError, UserFacingError}; +use crate::metrics::{Metrics, NumDbConnectionsGuard}; +use crate::proxy::neon_option; +use crate::Host; + pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] diff --git a/proxy/src/config.rs b/proxy/src/config.rs index c068fc50fb..2ec8c7adda 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -1,29 +1,27 @@ -use crate::{ - auth::backend::{jwt::JwkCache, AuthRateLimiter}, - 
control_plane::locks::ApiLocks, - rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}, - scram::threadpool::ThreadPool, - serverless::{cancel_set::CancelSet, GlobalConnPoolOptions}, - Host, -}; +use std::collections::{HashMap, HashSet}; +use std::str::FromStr; +use std::sync::Arc; +use std::time::Duration; + use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::{ - crypto::ring::sign, - pki_types::{CertificateDer, PrivateKeyDer}, -}; +use rustls::crypto::ring::sign; +use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; -use std::{ - collections::{HashMap, HashSet}, - str::FromStr, - sync::Arc, - time::Duration, -}; use tracing::{error, info}; use x509_parser::oid_registry; +use crate::auth::backend::jwt::JwkCache; +use crate::auth::backend::AuthRateLimiter; +use crate::control_plane::locks::ApiLocks; +use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig}; +use crate::scram::threadpool::ThreadPool; +use crate::serverless::cancel_set::CancelSet; +use crate::serverless::GlobalConnPoolOptions; +use crate::Host; + pub struct ProxyConfig { pub tls_config: Option, pub metric_collection: Option, @@ -692,9 +690,8 @@ impl FromStr for ConcurrencyLockOptions { #[cfg(test)] mod tests { - use crate::rate_limiter::Aimd; - use super::*; + use crate::rate_limiter::Aimd; #[test] fn test_parse_cache_options() -> anyhow::Result<()> { diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 9e17976720..81d1d70958 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -1,25 +1,22 @@ -use crate::auth::backend::ConsoleRedirectBackend; -use crate::config::{ProxyConfig, ProxyProtocolV2}; -use crate::proxy::{ - prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, -}; -use crate::{ - cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, -}; -use futures::TryFutureExt; use std::sync::Arc; + +use futures::TryFutureExt; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, Instrument}; +use crate::auth::backend::ConsoleRedirectBackend; +use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::proxy::passthrough::ProxyPassthrough; use crate::proxy::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, + prepare_client_connection, run_until_cancelled, ClientRequestError, ErrorSource, }; pub async fn task_main( diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 7fb4e7c698..e2d2c1b766 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -1,24 +1,25 @@ //! 
Connection request monitoring contexts +use std::net::IpAddr; + use chrono::Utc; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use smol_str::SmolStr; -use std::net::IpAddr; use tokio::sync::mpsc; -use tracing::{debug, field::display, info, info_span, Span}; +use tracing::field::display; +use tracing::{debug, info, info_span, Span}; use try_lock::TryLock; use uuid::Uuid; -use crate::{ - control_plane::messages::{ColdStartInfo, MetricsAuxInfo}, - error::ErrorKind, - intern::{BranchIdInt, ProjectIdInt}, - metrics::{ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting}, - DbName, EndpointId, RoleName, -}; - use self::parquet::RequestData; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::error::ErrorKind; +use crate::intern::{BranchIdInt, ProjectIdInt}; +use crate::metrics::{ + ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, +}; +use crate::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 9f6f83022e..b0ad0e4566 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -1,29 +1,28 @@ -use std::{sync::Arc, time::SystemTime}; +use std::sync::Arc; +use std::time::SystemTime; use anyhow::Context; -use bytes::{buf::Writer, BufMut, BytesMut}; +use bytes::buf::Writer; +use bytes::{BufMut, BytesMut}; use chrono::{Datelike, Timelike}; use futures::{Stream, StreamExt}; -use parquet::{ - basic::Compression, - file::{ - metadata::RowGroupMetaDataPtr, - properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}, - writer::SerializedFileWriter, - }, - record::RecordWriter, -}; +use parquet::basic::Compression; +use parquet::file::metadata::RowGroupMetaDataPtr; +use parquet::file::properties::{WriterProperties, WriterPropertiesPtr, DEFAULT_PAGE_SIZE}; +use parquet::file::writer::SerializedFileWriter; +use parquet::record::RecordWriter; use pq_proto::StartupMessageParams; use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel}; use serde::ser::SerializeMap; -use tokio::{sync::mpsc, time}; +use tokio::sync::mpsc; +use tokio::time; use tokio_util::sync::CancellationToken; use tracing::{debug, info, Span}; use utils::backoff; -use crate::{config::remote_storage_from_toml, context::LOG_CHAN_DISCONNECT}; - use super::{RequestMonitoringInner, LOG_CHAN}; +use crate::config::remote_storage_from_toml; +use crate::context::LOG_CHAN_DISCONNECT; #[derive(clap::Args, Clone, Debug)] pub struct ParquetUploadArgs { @@ -407,26 +406,26 @@ async fn upload_parquet( #[cfg(test)] mod tests { - use std::{net::Ipv4Addr, num::NonZeroUsize, sync::Arc}; + use std::net::Ipv4Addr; + use std::num::NonZeroUsize; + use std::sync::Arc; use camino::Utf8Path; use clap::Parser; use futures::{Stream, StreamExt}; use itertools::Itertools; - use parquet::{ - basic::{Compression, ZstdLevel}, - file::{ - properties::{WriterProperties, DEFAULT_PAGE_SIZE}, - reader::FileReader, - serialized_reader::SerializedFileReader, - }, - }; - use rand::{rngs::StdRng, Rng, SeedableRng}; + use parquet::basic::{Compression, ZstdLevel}; + use parquet::file::properties::{WriterProperties, DEFAULT_PAGE_SIZE}; + use parquet::file::reader::FileReader; + use parquet::file::serialized_reader::SerializedFileReader; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use remote_storage::{ GenericRemoteStorage, RemoteStorageConfig, RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, 
DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; - use tokio::{sync::mpsc, time}; + use tokio::sync::mpsc; + use tokio::time; use walkdir::WalkDir; use super::{worker_inner, ParquetConfig, ParquetUploadArgs, RequestData}; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 960bb5bc21..dae23f7c53 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -1,9 +1,9 @@ -use measured::FixedCardinalityLabel; -use serde::{Deserialize, Serialize}; use std::fmt::{self, Display}; -use crate::auth::IpPattern; +use measured::FixedCardinalityLabel; +use serde::{Deserialize, Serialize}; +use crate::auth::IpPattern; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt, RoleNameInt}; use crate::proxy::retry::CouldRetry; @@ -362,9 +362,10 @@ pub struct JwksSettings { #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + fn dummy_aux() -> serde_json::Value { json!({ "endpoint_id": "endpoint", diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 2c4b5a9b94..5ac3acd28a 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -1,16 +1,16 @@ -use crate::{ - control_plane::messages::{DatabaseInfo, KickSession}, - waiters::{self, Waiter, Waiters}, -}; +use std::convert::Infallible; + use anyhow::Context; use once_cell::sync::Lazy; use postgres_backend::{AuthType, PostgresBackend, PostgresBackendTCP, QueryError}; use pq_proto::{BeMessage, SINGLE_COL_ROWDESC}; -use std::convert::Infallible; use tokio::net::{TcpListener, TcpStream}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, Instrument}; +use crate::control_plane::messages::{DatabaseInfo, KickSession}; +use crate::waiters::{self, Waiter, Waiters}; + static CPLANE_WAITERS: Lazy> = Lazy::new(Default::default); /// Give caller an opportunity to wait for the cloud's reply. diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/provider/mock.rs index 51cddec672..fb061376e7 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -1,28 +1,29 @@ //! Mock console backend which relies on a user-provided postgres instance. 
-use super::{ - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, -}; -use crate::{ - auth::backend::jwt::AuthRule, context::RequestMonitoring, - control_plane::errors::GetEndpointJwksError, intern::RoleNameInt, RoleName, -}; -use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; -use crate::{auth::IpPattern, cache::Cached}; -use crate::{ - control_plane::{ - messages::MetricsAuxInfo, - provider::{CachedAllowedIps, CachedRoleSecret}, - }, - BranchId, EndpointId, ProjectId, -}; +use std::str::FromStr; +use std::sync::Arc; + use futures::TryFutureExt; -use std::{str::FromStr, sync::Arc}; use thiserror::Error; -use tokio_postgres::{config::SslMode, Client}; +use tokio_postgres::config::SslMode; +use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::auth::IpPattern; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::error::io_error; +use crate::intern::RoleNameInt; +use crate::url::ApiUrl; +use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; + #[derive(Debug, Error)] enum MockApiError { #[error("Failed to read password: {0}")] diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 0a196fe2a3..a4a330cd5f 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -2,39 +2,36 @@ pub mod mock; pub mod neon; -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::{ - auth::{ - backend::{ - jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}, - ComputeCredentialKeys, ComputeUserInfo, - }, - IpPattern, - }, - cache::{endpoints::EndpointsCache, project_info::ProjectInfoCacheImpl, Cached, TimedLru}, - compute, - config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}, - context::RequestMonitoring, - error::ReportableError, - intern::ProjectIdInt, - metrics::ApiLockMetrics, - rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}, - scram, EndpointCacheKey, EndpointId, -}; +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + use dashmap::DashMap; -use std::{hash::Hash, sync::Arc, time::Duration}; use tokio::time::Instant; use tracing::info; +use super::messages::{ControlPlaneError, MetricsAuxInfo}; +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::intern::ProjectIdInt; +use crate::metrics::ApiLockMetrics; +use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::{compute, scram, EndpointCacheKey, EndpointId}; + pub(crate) mod errors { - use crate::{ - control_plane::messages::{self, ControlPlaneError, Reason}, - 
error::{io_error, ErrorKind, ReportableError, UserFacingError}, - proxy::retry::CouldRetry, - }; use thiserror::Error; use super::ApiLockError; + use crate::control_plane::messages::{self, ControlPlaneError, Reason}; + use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; + use crate::proxy::retry::CouldRetry; /// A go-to error message which doesn't leak any detail. pub(crate) const REQUEST_FAILED: &str = "Console request failed"; diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index 2487ce0e3f..5d0692c7ca 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -1,31 +1,31 @@ //! Production console backend. -use super::{ - super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}, - errors::{ApiError, GetAuthInfoError, WakeComputeError}, - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, - NodeInfo, -}; -use crate::{ - auth::backend::{jwt::AuthRule, ComputeUserInfo}, - compute, - control_plane::{ - errors::GetEndpointJwksError, - messages::{ColdStartInfo, EndpointJwksResponse, Reason}, - }, - http, - metrics::{CacheOutcome, Metrics}, - rate_limiter::WakeComputeRateLimiter, - scram, EndpointCacheKey, EndpointId, -}; -use crate::{cache::Cached, context::RequestMonitoring}; -use ::http::{header::AUTHORIZATION, HeaderName}; +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; use futures::TryFutureExt; -use std::{sync::Arc, time::Duration}; use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; +use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; +use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; +use super::{ + ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, + NodeInfo, +}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::{compute, http, scram, EndpointCacheKey, EndpointId}; + const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 1cd4dc2c22..e71ed0c048 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,4 +1,5 @@ -use std::{error::Error as StdError, fmt, io}; +use std::error::Error as StdError; +use std::{fmt, io}; use measured::FixedCardinalityLabel; diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index d0352351d5..978ad9f761 100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,19 +1,18 @@ +use std::convert::Infallible; +use std::net::TcpListener; +use std::sync::{Arc, Mutex}; + use anyhow::{anyhow, bail}; -use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; -use measured::{text::BufferedTextEncoder, MetricGroup}; +use hyper0::header::CONTENT_TYPE; +use hyper0::{Body, Request, Response, StatusCode}; +use measured::text::BufferedTextEncoder; +use measured::MetricGroup; use metrics::NeonMetrics; -use std::{ - convert::Infallible, - net::TcpListener, - sync::{Arc, Mutex}, -}; use 
tracing::{info, info_span}; -use utils::http::{ - endpoint::{self, request_span}, - error::ApiError, - json::json_response, - RouterBuilder, RouterService, -}; +use utils::http::endpoint::{self, request_span}; +use utils::http::error::ApiError; +use utils::http::json::json_response; +use utils::http::{RouterBuilder, RouterService}; use crate::jemalloc; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index d8676d5b50..fd587e8f01 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -10,17 +10,15 @@ use anyhow::bail; use bytes::Bytes; use http_body_util::BodyExt; use hyper::body::Body; +pub(crate) use reqwest::{Request, Response}; +use reqwest_middleware::RequestBuilder; +pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; +pub(crate) use reqwest_retry::policies::ExponentialBackoff; +pub(crate) use reqwest_retry::RetryTransientMiddleware; use serde::de::DeserializeOwned; -pub(crate) use reqwest::{Request, Response}; -pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; -pub(crate) use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware}; - -use crate::{ - metrics::{ConsoleRequest, Metrics}, - url::ApiUrl, -}; -use reqwest_middleware::RequestBuilder; +use crate::metrics::{ConsoleRequest, Metrics}; +use crate::url::ApiUrl; /// This is the preferred way to create new http clients, /// because it takes care of observability (OpenTelemetry). @@ -142,9 +140,10 @@ pub(crate) async fn parse_json_body_with_limit( #[cfg(test)] mod tests { - use super::*; use reqwest::Client; + use super::*; + #[test] fn optional_query_params() -> anyhow::Result<()> { let url = "http://example.com".parse()?; diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 108420d7d7..09fd9657d0 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -1,6 +1,8 @@ -use std::{ - hash::BuildHasherDefault, marker::PhantomData, num::NonZeroUsize, ops::Index, sync::OnceLock, -}; +use std::hash::BuildHasherDefault; +use std::marker::PhantomData; +use std::num::NonZeroUsize; +use std::ops::Index; +use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; @@ -208,9 +210,8 @@ impl From for ProjectIdInt { mod tests { use std::sync::OnceLock; - use crate::intern::StringInterner; - use super::InternId; + use crate::intern::StringInterner; struct MyId; impl InternId for MyId { @@ -222,7 +223,8 @@ mod tests { #[test] fn push_many_strings() { - use rand::{rngs::StdRng, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::{Rng, SeedableRng}; use rand_distr::Zipf; let endpoint_dist = Zipf::new(500000, 0.8).unwrap(); diff --git a/proxy/src/jemalloc.rs b/proxy/src/jemalloc.rs index d307d80f4a..0fae78b60c 100644 --- a/proxy/src/jemalloc.rs +++ b/proxy/src/jemalloc.rs @@ -1,14 +1,12 @@ use std::marker::PhantomData; -use measured::{ - label::NoLabels, - metric::{ - gauge::GaugeState, group::Encoding, name::MetricNameEncoder, MetricEncoding, - MetricFamilyEncoding, MetricType, - }, - text::TextEncoder, - LabelGroup, MetricGroup, -}; +use measured::label::NoLabels; +use measured::metric::gauge::GaugeState; +use measured::metric::group::Encoding; +use measured::metric::name::MetricNameEncoder; +use measured::metric::{MetricEncoding, MetricFamilyEncoding, MetricType}; +use measured::text::TextEncoder; +use measured::{LabelGroup, MetricGroup}; use tikv_jemalloc_ctl::{config, epoch, epoch_mib, stats, version}; pub struct MetricRecorder { diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index a34eb820f8..11921867e4 
100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,14 +1,10 @@ use tracing::Subscriber; -use tracing_subscriber::{ - filter::{EnvFilter, LevelFilter}, - fmt::{ - format::{Format, Full}, - time::SystemTime, - FormatEvent, FormatFields, - }, - prelude::*, - registry::LookupSpan, -}; +use tracing_subscriber::filter::{EnvFilter, LevelFilter}; +use tracing_subscriber::fmt::format::{Format, Full}; +use tracing_subscriber::fmt::time::SystemTime; +use tracing_subscriber::fmt::{FormatEvent, FormatFields}; +use tracing_subscriber::prelude::*; +use tracing_subscriber::registry::LookupSpan; /// Initialize logging and OpenTelemetry tracing and exporter. /// diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 272723a1bc..542826e833 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -1,14 +1,16 @@ use std::sync::{Arc, OnceLock}; use lasso::ThreadedRodeo; +use measured::label::{ + FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet, +}; +use measured::metric::histogram::Thresholds; +use measured::metric::name::MetricName; use measured::{ - label::{FixedCardinalitySet, LabelGroupSet, LabelName, LabelSet, LabelValue, StaticLabelSet}, - metric::{histogram::Thresholds, name::MetricName}, Counter, CounterVec, FixedCardinalityLabel, Gauge, Histogram, HistogramVec, LabelGroup, MetricGroup, }; use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; - use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index 17764f78d1..ef2391cdd8 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,11 +1,9 @@ //! Proxy Protocol V2 implementation -use std::{ - io, - net::SocketAddr, - pin::Pin, - task::{Context, Poll}, -}; +use std::io; +use std::net::SocketAddr; +use std::pin::Pin; +use std::task::{Context, Poll}; use bytes::BytesMut; use pin_project_lite::pin_project; diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index aac7720890..8e9663626a 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,24 +1,23 @@ -use crate::{ - auth::backend::ComputeCredentialKeys, - compute::COULD_NOT_CONNECT, - compute::{self, PostgresConnection}, - config::RetryConfig, - context::RequestMonitoring, - control_plane::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, - error::ReportableError, - metrics::{ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType}, - proxy::{ - retry::{retry_after, should_retry, CouldRetry}, - wake_compute::wake_compute, - }, - Host, -}; use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; +use crate::auth::backend::ComputeCredentialKeys; +use crate::compute::{self, PostgresConnection, COULD_NOT_CONNECT}; +use crate::config::RetryConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; +use crate::error::ReportableError; +use crate::metrics::{ + ConnectOutcome, ConnectionFailureKind, Metrics, RetriesMetricGroup, RetryType, +}; +use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; +use crate::proxy::wake_compute::wake_compute; +use crate::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git 
a/proxy/src/proxy/copy_bidirectional.rs b/proxy/src/proxy/copy_bidirectional.rs index 4ebda013ac..91a3ceff75 100644 --- a/proxy/src/proxy/copy_bidirectional.rs +++ b/proxy/src/proxy/copy_bidirectional.rs @@ -1,11 +1,11 @@ -use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; -use tracing::info; - use std::future::poll_fn; use std::io; use std::pin::Pin; use std::task::{ready, Context, Poll}; +use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; +use tracing::info; + #[derive(Debug)] enum TransferState { Running(CopyBuffer), @@ -256,9 +256,10 @@ impl CopyBuffer { #[cfg(test)] mod tests { - use super::*; use tokio::io::AsyncWriteExt; + use super::*; + #[tokio::test] async fn test_client_to_compute() { let (mut client_client, mut client_proxy) = tokio::io::duplex(8); // Create a mock duplex stream diff --git a/proxy/src/proxy/handshake.rs b/proxy/src/proxy/handshake.rs index 5996b11c11..a67f1b8112 100644 --- a/proxy/src/proxy/handshake.rs +++ b/proxy/src/proxy/handshake.rs @@ -1,21 +1,19 @@ use bytes::Buf; +use pq_proto::framed::Framed; use pq_proto::{ - framed::Framed, BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, - StartupMessageParams, + BeMessage as Be, CancelKeyData, FeStartupPacket, ProtocolVersion, StartupMessageParams, }; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::{info, warn}; -use crate::{ - auth::endpoint_sni, - config::{TlsConfig, PG_ALPN_PROTOCOL}, - context::RequestMonitoring, - error::ReportableError, - metrics::Metrics, - proxy::ERR_INSECURE_CONNECTION, - stream::{PqStream, Stream, StreamUpgradeError}, -}; +use crate::auth::endpoint_sni; +use crate::config::{TlsConfig, PG_ALPN_PROTOCOL}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::Metrics; +use crate::proxy::ERR_INSECURE_CONNECTION; +use crate::stream::{PqStream, Stream, StreamUpgradeError}; #[derive(Error, Debug)] pub(crate) enum HandshakeError { diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index b2b5a7f43d..f646862caa 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -7,40 +7,32 @@ pub(crate) mod handshake; pub(crate) mod passthrough; pub(crate) mod retry; pub(crate) mod wake_compute; -pub use copy_bidirectional::copy_bidirectional_client_compute; -pub use copy_bidirectional::ErrorSource; +use std::sync::Arc; -use crate::config::ProxyProtocolV2; -use crate::{ - auth, - cancellation::{self, CancellationHandlerMain, CancellationHandlerMainInternal}, - compute, - config::{ProxyConfig, TlsConfig}, - context::RequestMonitoring, - error::ReportableError, - metrics::{Metrics, NumClientConnectionsGuard}, - protocol2::read_proxy_protocol, - proxy::handshake::{handshake, HandshakeData}, - rate_limiter::EndpointRateLimiter, - stream::{PqStream, Stream}, - EndpointCacheKey, -}; +pub use copy_bidirectional::{copy_bidirectional_client_compute, ErrorSource}; use futures::TryFutureExt; use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::{BeMessage as Be, StartupMessageParams}; use regex::Regex; use smol_str::{format_smolstr, SmolStr}; -use std::sync::Arc; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; use tracing::{error, info, warn, Instrument}; -use self::{ - connect_compute::{connect_to_compute, TcpMechanism}, - passthrough::ProxyPassthrough, -}; +use self::connect_compute::{connect_to_compute, TcpMechanism}; +use self::passthrough::ProxyPassthrough; +use crate::cancellation::{self, CancellationHandlerMain, 
CancellationHandlerMainInternal}; +use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::ReportableError; +use crate::metrics::{Metrics, NumClientConnectionsGuard}; +use crate::protocol2::read_proxy_protocol; +use crate::proxy::handshake::{handshake, HandshakeData}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::stream::{PqStream, Stream}; +use crate::{auth, compute, EndpointCacheKey}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; diff --git a/proxy/src/proxy/passthrough.rs b/proxy/src/proxy/passthrough.rs index 497cf4bfd5..e3b4730982 100644 --- a/proxy/src/proxy/passthrough.rs +++ b/proxy/src/proxy/passthrough.rs @@ -1,16 +1,14 @@ -use crate::{ - cancellation, - compute::PostgresConnection, - control_plane::messages::MetricsAuxInfo, - metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}, - stream::Stream, - usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}, -}; use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use utils::measured_stream::MeasuredStream; use super::copy_bidirectional::ErrorSource; +use crate::cancellation; +use crate::compute::PostgresConnection; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::{Direction, Metrics, NumClientConnectionsGuard, NumConnectionRequestsGuard}; +use crate::stream::Stream; +use crate::usage_metrics::{Ids, MetricCounterRecorder, USAGE_METRICS}; /// Forward bytes in both directions (client <-> compute). #[tracing::instrument(skip_all)] diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index 15895d37e6..d3f0c3e7d4 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -1,7 +1,11 @@ -use crate::{compute, config::RetryConfig}; -use std::{error::Error, io}; +use std::error::Error; +use std::io; + use tokio::time; +use crate::compute; +use crate::config::RetryConfig; + pub(crate) trait CouldRetry { /// Returns true if the error could be retried fn could_retry(&self) -> bool; diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index 33a2162bc7..df9f79a7e3 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -6,7 +6,6 @@ use std::fmt::Debug; -use super::*; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; use postgres_protocol::message::frontend; @@ -14,6 +13,8 @@ use tokio::io::{AsyncReadExt, DuplexStream}; use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; +use super::*; + enum Intercept { None, Methods, diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index deb4d4a63f..e50ae4bc93 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -4,6 +4,16 @@ mod mitm; use std::time::Duration; +use anyhow::{bail, Context}; +use async_trait::async_trait; +use http::StatusCode; +use retry::{retry_after, ShouldRetryWakeCompute}; +use rstest::rstest; +use rustls::pki_types; +use tokio_postgres::config::SslMode; +use tokio_postgres::tls::{MakeTlsConnect, NoTls}; +use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; + use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; @@ -18,15 +28,6 @@ use crate::control_plane::provider::{ use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; -use anyhow::{bail, Context}; -use async_trait::async_trait; -use http::StatusCode; 
-use retry::{retry_after, ShouldRetryWakeCompute}; -use rstest::rstest; -use rustls::pki_types; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; /// Generate a set of TLS certificates: CA + server. fn generate_certs( @@ -336,7 +337,8 @@ async fn scram_auth_mock() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), Scram::mock())); - use rand::{distributions::Alphanumeric, Rng}; + use rand::distributions::Alphanumeric; + use rand::Rng; let password: String = rand::thread_rng() .sample_iter(&Alphanumeric) .take(rand::random::() as usize) diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 0d1527a2c1..9dfa485fa4 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,16 +1,17 @@ +use hyper::StatusCode; +use tracing::{error, info, warn}; + +use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; +use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::{ControlPlaneError, Reason}; -use crate::control_plane::{errors::WakeComputeError, provider::CachedNodeInfo}; +use crate::control_plane::provider::CachedNodeInfo; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper::StatusCode; -use tracing::{error, info, warn}; - -use super::connect_compute::ComputeConnectBackend; pub(crate) async fn wake_compute( num_retries: &mut u32, diff --git a/proxy/src/rate_limiter/leaky_bucket.rs b/proxy/src/rate_limiter/leaky_bucket.rs index bf4d85f2e4..45f9630dde 100644 --- a/proxy/src/rate_limiter/leaky_bucket.rs +++ b/proxy/src/rate_limiter/leaky_bucket.rs @@ -1,7 +1,5 @@ -use std::{ - hash::Hash, - sync::atomic::{AtomicUsize, Ordering}, -}; +use std::hash::Hash; +use std::sync::atomic::{AtomicUsize, Ordering}; use ahash::RandomState; use dashmap::DashMap; diff --git a/proxy/src/rate_limiter/limit_algorithm.rs b/proxy/src/rate_limiter/limit_algorithm.rs index 25607b7e10..16c398f303 100644 --- a/proxy/src/rate_limiter/limit_algorithm.rs +++ b/proxy/src/rate_limiter/limit_algorithm.rs @@ -1,10 +1,12 @@ //! Algorithms for controlling concurrency limits. 
+use std::pin::pin; +use std::sync::Arc; +use std::time::Duration; + use parking_lot::Mutex; -use std::{pin::pin, sync::Arc, time::Duration}; -use tokio::{ - sync::Notify, - time::{error::Elapsed, Instant}, -}; +use tokio::sync::Notify; +use tokio::time::error::Elapsed; +use tokio::time::Instant; use self::aimd::Aimd; diff --git a/proxy/src/rate_limiter/limit_algorithm/aimd.rs b/proxy/src/rate_limiter/limit_algorithm/aimd.rs index 86b56e38fb..5332a5184f 100644 --- a/proxy/src/rate_limiter/limit_algorithm/aimd.rs +++ b/proxy/src/rate_limiter/limit_algorithm/aimd.rs @@ -60,12 +60,11 @@ impl LimitAlgorithm for Aimd { mod tests { use std::time::Duration; + use super::*; use crate::rate_limiter::limit_algorithm::{ DynamicLimiter, RateLimitAlgorithm, RateLimiterConfig, }; - use super::*; - #[tokio::test(start_paused = true)] async fn increase_decrease() { let config = RateLimiterConfig { diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index be529f174d..5de64c2254 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -1,17 +1,14 @@ -use std::{ - borrow::Cow, - collections::hash_map::RandomState, - hash::{BuildHasher, Hash}, - sync::{ - atomic::{AtomicUsize, Ordering}, - Mutex, - }, -}; +use std::borrow::Cow; +use std::collections::hash_map::RandomState; +use std::hash::{BuildHasher, Hash}; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::Mutex; use anyhow::bail; use dashmap::DashMap; use itertools::Itertools; -use rand::{rngs::StdRng, Rng, SeedableRng}; +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; use tokio::time::{Duration, Instant}; use tracing::info; @@ -243,14 +240,17 @@ impl BucketRateLimiter { #[cfg(test)] mod tests { - use std::{hash::BuildHasherDefault, time::Duration}; + use std::hash::BuildHasherDefault; + use std::time::Duration; use rand::SeedableRng; use rustc_hash::FxHasher; use tokio::time; use super::{BucketRateLimiter, WakeComputeRateLimiter}; - use crate::{intern::EndpointIdInt, rate_limiter::RateBucketInfo, EndpointId}; + use crate::intern::EndpointIdInt; + use crate::rate_limiter::RateBucketInfo; + use crate::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/rate_limiter/mod.rs b/proxy/src/rate_limiter/mod.rs index 6e38f89458..3ae2ecaf8f 100644 --- a/proxy/src/rate_limiter/mod.rs +++ b/proxy/src/rate_limiter/mod.rs @@ -2,13 +2,11 @@ mod leaky_bucket; mod limit_algorithm; mod limiter; +pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; #[cfg(test)] pub(crate) use limit_algorithm::aimd::Aimd; - pub(crate) use limit_algorithm::{ DynamicLimiter, Outcome, RateLimitAlgorithm, RateLimiterConfig, Token, }; pub(crate) use limiter::GlobalRateLimiter; - -pub use leaky_bucket::{EndpointRateLimiter, LeakyBucketConfig, LeakyBucketRateLimiter}; pub use limiter::{BucketRateLimiter, RateBucketInfo, WakeComputeRateLimiter}; diff --git a/proxy/src/redis/cancellation_publisher.rs b/proxy/src/redis/cancellation_publisher.rs index 95bdfc0965..0000246971 100644 --- a/proxy/src/redis/cancellation_publisher.rs +++ b/proxy/src/redis/cancellation_publisher.rs @@ -5,13 +5,10 @@ use redis::AsyncCommands; use tokio::sync::Mutex; use uuid::Uuid; +use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; +use super::notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}; use crate::rate_limiter::{GlobalRateLimiter, RateBucketInfo}; -use super::{ - connection_with_credentials_provider::ConnectionWithCredentialsProvider, - 
notifications::{CancelSession, Notification, PROXY_CHANNEL_NAME}, -}; - pub trait CancellationPublisherMut: Send + Sync + 'static { #[allow(async_fn_in_trait)] async fn try_publish( diff --git a/proxy/src/redis/connection_with_credentials_provider.rs b/proxy/src/redis/connection_with_credentials_provider.rs index ccd48f1481..82139ea1d5 100644 --- a/proxy/src/redis/connection_with_credentials_provider.rs +++ b/proxy/src/redis/connection_with_credentials_provider.rs @@ -1,10 +1,9 @@ -use std::{sync::Arc, time::Duration}; +use std::sync::Arc; +use std::time::Duration; use futures::FutureExt; -use redis::{ - aio::{ConnectionLike, MultiplexedConnection}, - ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult, -}; +use redis::aio::{ConnectionLike, MultiplexedConnection}; +use redis::{ConnectionInfo, IntoConnectionInfo, RedisConnectionInfo, RedisResult}; use tokio::task::JoinHandle; use tracing::{debug, error, info, warn}; diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index c3af6740cb..e56c5a3414 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -1,4 +1,5 @@ -use std::{convert::Infallible, sync::Arc}; +use std::convert::Infallible; +use std::sync::Arc; use futures::StreamExt; use pq_proto::CancelKeyData; @@ -8,12 +9,10 @@ use tokio_util::sync::CancellationToken; use uuid::Uuid; use super::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::{ - cache::project_info::ProjectInfoCache, - cancellation::{CancelMap, CancellationHandler}, - intern::{ProjectIdInt, RoleNameInt}, - metrics::{Metrics, RedisErrors, RedisEventsCount}, -}; +use crate::cache::project_info::ProjectInfoCache; +use crate::cancellation::{CancelMap, CancellationHandler}; +use crate::intern::{ProjectIdInt, RoleNameInt}; +use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; const CPLANE_CHANNEL_NAME: &str = "neondb-proxy-ws-updates"; pub(crate) const PROXY_CHANNEL_NAME: &str = "neondb-proxy-to-proxy-updates"; @@ -269,10 +268,10 @@ where #[cfg(test)] mod tests { - use crate::{ProjectId, RoleName}; + use serde_json::json; use super::*; - use serde_json::json; + use crate::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { diff --git a/proxy/src/sasl/messages.rs b/proxy/src/sasl/messages.rs index 6c9a42b2db..1373dfba3d 100644 --- a/proxy/src/sasl/messages.rs +++ b/proxy/src/sasl/messages.rs @@ -1,8 +1,9 @@ //! Definitions for SASL messages. -use crate::parse::{split_at_const, split_cstr}; use pq_proto::{BeAuthenticationSaslMessage, BeMessage}; +use crate::parse::{split_at_const, split_cstr}; + /// SASL-specific payload of [`PasswordMessage`](pq_proto::FeMessage::PasswordMessage). #[derive(Debug)] pub(crate) struct FirstMessage<'a> { diff --git a/proxy/src/sasl/mod.rs b/proxy/src/sasl/mod.rs index 0a36694359..f0181b404f 100644 --- a/proxy/src/sasl/mod.rs +++ b/proxy/src/sasl/mod.rs @@ -10,13 +10,14 @@ mod channel_binding; mod messages; mod stream; -use crate::error::{ReportableError, UserFacingError}; use std::io; -use thiserror::Error; pub(crate) use channel_binding::ChannelBinding; pub(crate) use messages::FirstMessage; pub(crate) use stream::{Outcome, SaslStream}; +use thiserror::Error; + +use crate::error::{ReportableError, UserFacingError}; /// Fine-grained auth errors help in writing tests. 
#[derive(Error, Debug)] diff --git a/proxy/src/sasl/stream.rs b/proxy/src/sasl/stream.rs index b6becd28e1..f1c916daa2 100644 --- a/proxy/src/sasl/stream.rs +++ b/proxy/src/sasl/stream.rs @@ -1,11 +1,14 @@ //! Abstraction for the string-oriented SASL protocols. -use super::{messages::ServerMessage, Mechanism}; -use crate::stream::PqStream; use std::io; + use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; +use super::messages::ServerMessage; +use super::Mechanism; +use crate::stream::PqStream; + /// Abstracts away all peculiarities of the libpq's protocol. pub(crate) struct SaslStream<'a, S> { /// The underlying stream. diff --git a/proxy/src/scram/countmin.rs b/proxy/src/scram/countmin.rs index 64ee0135e1..87ab6e0d5f 100644 --- a/proxy/src/scram/countmin.rs +++ b/proxy/src/scram/countmin.rs @@ -69,7 +69,9 @@ impl CountMinSketch { #[cfg(test)] mod tests { - use rand::{rngs::StdRng, seq::SliceRandom, Rng, SeedableRng}; + use rand::rngs::StdRng; + use rand::seq::SliceRandom; + use rand::{Rng, SeedableRng}; use super::CountMinSketch; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index afb5604666..493295c938 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -209,7 +209,8 @@ impl sasl::Mechanism for Exchange<'_> { type Output = super::ScramKey; fn exchange(mut self, input: &str) -> sasl::Result> { - use {sasl::Step, ExchangeState}; + use sasl::Step; + use ExchangeState; match &self.state { ExchangeState::Initial(init) => { match init.transition(self.secret, &self.tls_server_end_point, input)? { diff --git a/proxy/src/scram/messages.rs b/proxy/src/scram/messages.rs index fd9e77764c..5ee3a51352 100644 --- a/proxy/src/scram/messages.rs +++ b/proxy/src/scram/messages.rs @@ -1,11 +1,12 @@ //! Definitions for SCRAM messages. +use std::fmt; +use std::ops::Range; + use super::base64_decode_array; use super::key::{ScramKey, SCRAM_KEY_LEN}; use super::signature::SignatureBuilder; use crate::sasl::ChannelBinding; -use std::fmt; -use std::ops::Range; /// Faithfully taken from PostgreSQL. 
pub(crate) const SCRAM_RAW_NONCE_LEN: usize = 18; diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index d058f1c3f8..97644b6282 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -16,10 +16,9 @@ mod signature; pub mod threadpool; pub(crate) use exchange::{exchange, Exchange}; +use hmac::{Hmac, Mac}; pub(crate) use key::ScramKey; pub(crate) use secret::ServerSecret; - -use hmac::{Hmac, Mac}; use sha2::{Digest, Sha256}; const SCRAM_SHA_256: &str = "SCRAM-SHA-256"; @@ -59,13 +58,11 @@ fn sha256<'a>(parts: impl IntoIterator) -> [u8; 32] { #[cfg(test)] mod tests { - use crate::{ - intern::EndpointIdInt, - sasl::{Mechanism, Step}, - EndpointId, - }; - - use super::{threadpool::ThreadPool, Exchange, ServerSecret}; + use super::threadpool::ThreadPool; + use super::{Exchange, ServerSecret}; + use crate::intern::EndpointIdInt; + use crate::sasl::{Mechanism, Step}; + use crate::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/pbkdf2.rs b/proxy/src/scram/pbkdf2.rs index 4cf76c8452..9c559e9082 100644 --- a/proxy/src/scram/pbkdf2.rs +++ b/proxy/src/scram/pbkdf2.rs @@ -1,7 +1,6 @@ -use hmac::{ - digest::{consts::U32, generic_array::GenericArray}, - Hmac, Mac, -}; +use hmac::digest::consts::U32; +use hmac::digest::generic_array::GenericArray; +use hmac::{Hmac, Mac}; use sha2::Sha256; pub(crate) struct Pbkdf2 { @@ -66,10 +65,11 @@ impl Pbkdf2 { #[cfg(test)] mod tests { - use super::Pbkdf2; use pbkdf2::pbkdf2_hmac_array; use sha2::Sha256; + use super::Pbkdf2; + #[test] fn works() { let salt = b"sodium chloride"; diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index c027a0cd20..cc1b69fcf9 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -4,28 +4,21 @@ //! 1. Fairness per endpoint. //! 2. Yield support for high iteration counts. 
-use std::{ - cell::RefCell, - future::Future, - pin::Pin, - sync::{ - atomic::{AtomicUsize, Ordering}, - Arc, Weak, - }, - task::{Context, Poll}, -}; +use std::cell::RefCell; +use std::future::Future; +use std::pin::Pin; +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Weak}; +use std::task::{Context, Poll}; use futures::FutureExt; -use rand::Rng; -use rand::{rngs::SmallRng, SeedableRng}; - -use crate::{ - intern::EndpointIdInt, - metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}, - scram::countmin::CountMinSketch, -}; +use rand::rngs::SmallRng; +use rand::{Rng, SeedableRng}; use super::pbkdf2::Pbkdf2; +use crate::intern::EndpointIdInt; +use crate::metrics::{ThreadPoolMetrics, ThreadPoolWorkerId}; +use crate::scram::countmin::CountMinSketch; pub struct ThreadPool { runtime: Option, @@ -195,9 +188,8 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { - use crate::EndpointId; - use super::*; + use crate::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 927854897f..a180c4c2ed 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -1,42 +1,34 @@ -use std::{io, sync::Arc, time::Duration}; +use std::io; +use std::sync::Arc; +use std::time::Duration; use async_trait::async_trait; use hyper_util::rt::{TokioExecutor, TokioIo, TokioTimer}; -use p256::{ecdsa::SigningKey, elliptic_curve::JwkEcKey}; +use p256::ecdsa::SigningKey; +use p256::elliptic_curve::JwkEcKey; use rand::rngs::OsRng; use tokio::net::{lookup_host, TcpStream}; -use tracing::{debug, field::display, info}; +use tracing::field::display; +use tracing::{debug, info}; -use crate::{ - auth::{ - self, - backend::{local::StaticAuthRules, ComputeCredentials, ComputeUserInfo}, - check_peer_addr_is_in_list, AuthError, - }, - compute, - config::ProxyConfig, - context::RequestMonitoring, - control_plane::{ - errors::{GetAuthInfoError, WakeComputeError}, - locks::ApiLocks, - provider::ApiLockError, - CachedNodeInfo, - }, - error::{ErrorKind, ReportableError, UserFacingError}, - intern::EndpointIdInt, - proxy::{ - connect_compute::ConnectMechanism, - retry::{CouldRetry, ShouldRetryWakeCompute}, - }, - rate_limiter::EndpointRateLimiter, - EndpointId, Host, -}; - -use super::{ - conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}, - http_conn_pool::{self, poll_http2_client}, - local_conn_pool::{self, LocalClient, LocalConnPool}, -}; +use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use crate::auth::backend::local::StaticAuthRules; +use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; +use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::provider::ApiLockError; +use crate::control_plane::CachedNodeInfo; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::intern::EndpointIdInt; +use crate::proxy::connect_compute::ConnectMechanism; +use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; +use crate::rate_limiter::EndpointRateLimiter; +use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc, diff --git a/proxy/src/serverless/cancel_set.rs 
b/proxy/src/serverless/cancel_set.rs index 7659745473..6db986f1f7 100644 --- a/proxy/src/serverless/cancel_set.rs +++ b/proxy/src/serverless/cancel_set.rs @@ -1,10 +1,8 @@ //! A set for cancelling random http connections -use std::{ - hash::{BuildHasher, BuildHasherDefault}, - num::NonZeroUsize, - time::Duration, -}; +use std::hash::{BuildHasher, BuildHasherDefault}; +use std::num::NonZeroUsize; +use std::time::Duration; use indexmap::IndexMap; use parking_lot::Mutex; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 2e576e0ded..aa869ff1c0 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,33 +1,31 @@ +use std::collections::HashMap; +use std::fmt; +use std::ops::Deref; +use std::pin::pin; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + use dashmap::DashMap; -use futures::{future::poll_fn, Future}; +use futures::future::poll_fn; +use futures::Future; use parking_lot::RwLock; use rand::Rng; use smallvec::SmallVec; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; -use std::{ - fmt, - task::{ready, Poll}, -}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; +use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use super::backend::HttpConnError; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use tracing::{debug, error, warn, Span}; -use tracing::{info, info_span, Instrument}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -724,13 +722,13 @@ impl Drop for Client { #[cfg(test)] mod tests { - use std::{mem, sync::atomic::AtomicBool}; - - use crate::{ - proxy::NeonOptions, serverless::cancel_set::CancelSet, BranchId, EndpointId, ProjectId, - }; + use std::mem; + use std::sync::atomic::AtomicBool; use super::*; + use crate::proxy::NeonOptions; + use crate::serverless::cancel_set::CancelSet; + use crate::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 6d61536f1a..9b6bc98557 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,22 +1,21 @@ +use std::collections::VecDeque; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; + use dashmap::DashMap; use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::collections::VecDeque; -use std::sync::atomic::{self, AtomicUsize}; -use std::{sync::Arc, sync::Weak}; use tokio::net::TcpStream; +use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool::ConnInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use 
crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, EndpointCacheKey}; - -use tracing::{debug, error}; -use tracing::{info, info_span, Instrument}; - -use super::conn_pool::ConnInfo; +use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index c1c5764d17..c0208d4f68 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -1,12 +1,11 @@ //! Things stolen from `libs/utils/src/http` to add hyper 1.0 compatibility //! Will merge back in at some point in the future. -use bytes::Bytes; - use anyhow::Context; +use bytes::Bytes; use http::{Response, StatusCode}; -use http_body_util::{combinators::BoxBody, BodyExt, Full}; - +use http_body_util::combinators::BoxBody; +use http_body_util::{BodyExt, Full}; use serde::Serialize; use utils::http::error::ApiError; diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 9f328a0e1d..8c56d317cc 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,7 +1,5 @@ -use serde_json::Map; -use serde_json::Value; -use tokio_postgres::types::Kind; -use tokio_postgres::types::Type; +use serde_json::{Map, Value}; +use tokio_postgres::types::{Kind, Type}; use tokio_postgres::Row; // @@ -256,9 +254,10 @@ fn _pg_array_parse( #[cfg(test)] mod tests { - use super::*; use serde_json::json; + use super::*; + #[test] fn test_atomic_types_to_pg_params() { let json = vec![Value::Bool(true), Value::Bool(false)]; diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 4ab14ad35f..5df37a8762 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,28 +1,31 @@ -use futures::{future::poll_fn, Future}; +use std::collections::HashMap; +use std::pin::pin; +use std::sync::{Arc, Weak}; +use std::task::{ready, Poll}; +use std::time::Duration; + +use futures::future::poll_fn; +use futures::Future; use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; use serde_json::value::RawValue; use signature::Signer; -use std::task::{ready, Poll}; -use std::{collections::HashMap, pin::pin, sync::Arc, sync::Weak, time::Duration}; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_util::sync::CancellationToken; - -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::Metrics; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{context::RequestMonitoring, DbName, RoleName}; - -use tracing::{error, warn, Span}; -use tracing::{info, info_span, Instrument}; +use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; use super::conn_pool::{ClientInnerExt, ConnInfo}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; +use crate::metrics::Metrics; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{DbName, RoleName}; struct ConnPoolEntry { conn: ClientInner, diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3131adada4..3ed3b6c845 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -12,12 +12,15 @@ mod 
local_conn_pool; mod sql_over_http; mod websocket; +use std::net::{IpAddr, SocketAddr}; +use std::pin::{pin, Pin}; +use std::sync::Arc; + +use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; pub use conn_pool::GlobalConnPoolOptions; - -use anyhow::Context; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -29,9 +32,13 @@ use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; use tokio::io::{AsyncRead, AsyncWrite}; +use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; use tokio_rustls::TlsAcceptor; +use tokio_util::sync::CancellationToken; use tokio_util::task::TaskTracker; +use tracing::{info, warn, Instrument}; +use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; use crate::config::ProxyConfig; @@ -43,14 +50,6 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; use crate::serverless::http_util::{api_error_into_response, json_response}; -use std::net::{IpAddr, SocketAddr}; -use std::pin::{pin, Pin}; -use std::sync::Arc; -use tokio::net::{TcpListener, TcpStream}; -use tokio_util::sync::CancellationToken; -use tracing::{info, warn, Instrument}; -use utils::http::error::ApiError; - pub(crate) const SERVERLESS_DRIVER_SNI: &str = "api"; pub async fn task_main( diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index cf3324926c..3d8a2adef1 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -2,77 +2,43 @@ use std::pin::pin; use std::sync::Arc; use bytes::Bytes; -use futures::future::select; -use futures::future::try_join; -use futures::future::Either; -use futures::StreamExt; -use futures::TryFutureExt; +use futures::future::{select, try_join, Either}; +use futures::{StreamExt, TryFutureExt}; use http::header::AUTHORIZATION; use http::Method; use http_body_util::combinators::BoxBody; -use http_body_util::BodyExt; -use http_body_util::Full; -use hyper::body::Body; -use hyper::body::Incoming; -use hyper::header; -use hyper::http::HeaderName; -use hyper::http::HeaderValue; -use hyper::Response; -use hyper::StatusCode; -use hyper::{HeaderMap, Request}; +use http_body_util::{BodyExt, Full}; +use hyper::body::{Body, Incoming}; +use hyper::http::{HeaderName, HeaderValue}; +use hyper::{header, HeaderMap, Request, Response, StatusCode}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time; -use tokio_postgres::error::DbError; -use tokio_postgres::error::ErrorPosition; -use tokio_postgres::error::SqlState; -use tokio_postgres::GenericClient; -use tokio_postgres::IsolationLevel; -use tokio_postgres::NoTls; -use tokio_postgres::ReadyForQueryStatus; -use tokio_postgres::Transaction; +use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; +use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; -use tracing::error; -use tracing::info; +use tracing::{error, info}; use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; -use crate::auth::backend::ComputeCredentialKeys; -use crate::auth::backend::ComputeUserInfo; -use crate::auth::endpoint_sni; -use crate::auth::ComputeUserInfoParseError; -use crate::config::AuthenticationConfig; -use crate::config::HttpConfig; -use crate::config::ProxyConfig; -use crate::config::TlsConfig; -use 
crate::context::RequestMonitoring; -use crate::error::ErrorKind; -use crate::error::ReportableError; -use crate::error::UserFacingError; -use crate::metrics::HttpDirection; -use crate::metrics::Metrics; -use crate::proxy::run_until_cancelled; -use crate::proxy::NeonOptions; -use crate::serverless::backend::HttpConnError; -use crate::usage_metrics::MetricCounter; -use crate::usage_metrics::MetricCounterRecorder; -use crate::DbName; -use crate::RoleName; - -use super::backend::LocalProxyConnError; -use super::backend::PoolingBackend; -use super::conn_pool; -use super::conn_pool::AuthData; -use super::conn_pool::ConnInfo; -use super::conn_pool::ConnInfoWithAuth; +use super::backend::{LocalProxyConnError, PoolingBackend}; +use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; use super::http_util::json_response; -use super::json::json_to_pg_text; -use super::json::pg_text_row_to_json; -use super::json::JsonConversionError; -use super::local_conn_pool; +use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; +use super::{conn_pool, local_conn_pool}; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; +use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; +use crate::context::RequestMonitoring; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::{HttpDirection, Metrics}; +use crate::proxy::{run_until_cancelled, NeonOptions}; +use crate::serverless::backend::HttpConnError; +use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; +use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index f5a692cf40..ba36116c2c 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -1,13 +1,7 @@ -use crate::proxy::ErrorSource; -use crate::{ - cancellation::CancellationHandlerMain, - config::ProxyConfig, - context::RequestMonitoring, - error::{io_error, ReportableError}, - metrics::Metrics, - proxy::{handle_client, ClientMode}, - rate_limiter::EndpointRateLimiter, -}; +use std::pin::Pin; +use std::sync::Arc; +use std::task::{ready, Context, Poll}; + use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; @@ -15,15 +9,17 @@ use futures::{Sink, Stream}; use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; - -use std::{ - pin::Pin, - sync::Arc, - task::{ready, Context, Poll}, -}; use tokio::io::{self, AsyncBufRead, AsyncRead, AsyncWrite, ReadBuf}; use tracing::warn; +use crate::cancellation::CancellationHandlerMain; +use crate::config::ProxyConfig; +use crate::context::RequestMonitoring; +use crate::error::{io_error, ReportableError}; +use crate::metrics::Metrics; +use crate::proxy::{handle_client, ClientMode, ErrorSource}; +use crate::rate_limiter::EndpointRateLimiter; + pin_project! { /// This is a wrapper around a [`WebSocketStream`] that /// implements [`AsyncRead`] and [`AsyncWrite`]. 
@@ -184,14 +180,11 @@ mod tests { use framed_websockets::WebSocketServer; use futures::{SinkExt, StreamExt}; - use tokio::{ - io::{duplex, AsyncReadExt, AsyncWriteExt}, - task::JoinSet, - }; - use tokio_tungstenite::{ - tungstenite::{protocol::Role, Message}, - WebSocketStream, - }; + use tokio::io::{duplex, AsyncReadExt, AsyncWriteExt}; + use tokio::task::JoinSet; + use tokio_tungstenite::tungstenite::protocol::Role; + use tokio_tungstenite::tungstenite::Message; + use tokio_tungstenite::WebSocketStream; use super::WebSocketRw; diff --git a/proxy/src/stream.rs b/proxy/src/stream.rs index e2fc73235e..89df48c5d3 100644 --- a/proxy/src/stream.rs +++ b/proxy/src/stream.rs @@ -1,19 +1,20 @@ -use crate::config::TlsServerEndPoint; -use crate::error::{ErrorKind, ReportableError, UserFacingError}; -use crate::metrics::Metrics; -use bytes::BytesMut; - -use pq_proto::framed::{ConnectionError, Framed}; -use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; -use rustls::ServerConfig; use std::pin::Pin; use std::sync::Arc; use std::{io, task}; + +use bytes::BytesMut; +use pq_proto::framed::{ConnectionError, Framed}; +use pq_proto::{BeMessage, FeMessage, FeStartupPacket, ProtocolError}; +use rustls::ServerConfig; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; use tokio_rustls::server::TlsStream; use tracing::debug; +use crate::config::TlsServerEndPoint; +use crate::error::{ErrorKind, ReportableError, UserFacingError}; +use crate::metrics::Metrics; + /// Stream wrapper which implements libpq's protocol. /// /// NOTE: This object deliberately doesn't implement [`AsyncRead`] diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index ee36ed462d..c5384c0b0e 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -1,36 +1,33 @@ //! Periodically collect proxy consumption metrics //! and push them to a HTTP endpoint. 
-use crate::{ - config::{MetricBackupCollectionConfig, MetricCollectionConfig}, - context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, - http, - intern::{BranchIdInt, EndpointIdInt}, -}; +use std::convert::Infallible; +use std::pin::pin; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering}; +use std::sync::Arc; +use std::time::Duration; + use anyhow::Context; use async_compression::tokio::write::GzipEncoder; use bytes::Bytes; use chrono::{DateTime, Datelike, Timelike, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; -use dashmap::{mapref::entry::Entry, DashMap}; +use dashmap::mapref::entry::Entry; +use dashmap::DashMap; use futures::future::select; use once_cell::sync::Lazy; use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel}; use serde::{Deserialize, Serialize}; -use std::{ - convert::Infallible, - pin::pin, - sync::{ - atomic::{AtomicU64, AtomicUsize, Ordering}, - Arc, - }, - time::Duration, -}; use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::{error, info, instrument, trace, warn}; use utils::backoff; use uuid::{NoContext, Timestamp}; +use crate::config::{MetricBackupCollectionConfig, MetricCollectionConfig}; +use crate::context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}; +use crate::http; +use crate::intern::{BranchIdInt, EndpointIdInt}; + const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const HTTP_REPORTING_REQUEST_TIMEOUT: Duration = Duration::from_secs(10); @@ -485,19 +482,23 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use super::*; + use std::sync::{Arc, Mutex}; - use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; use http_body_util::BodyExt; - use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper::body::Incoming; + use hyper::server::conn::http1; + use hyper::service::service_fn; + use hyper::{Request, Response}; use hyper_util::rt::TokioIo; - use std::sync::{Arc, Mutex}; use tokio::net::TcpListener; use url::Url; + use super::*; + use crate::{http, BranchId, EndpointId}; + #[tokio::test] async fn metrics() { type Report = EventChunk<'static, Event>; diff --git a/proxy/src/waiters.rs b/proxy/src/waiters.rs index 86d0f9e8b2..7e07f6a2af 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -1,8 +1,9 @@ +use std::pin::Pin; +use std::task; + use hashbrown::HashMap; use parking_lot::Mutex; use pin_project_lite::pin_project; -use std::pin::Pin; -use std::task; use thiserror::Error; use tokio::sync::oneshot; @@ -99,9 +100,10 @@ impl std::future::Future for Waiter<'_, T> { #[cfg(test)] mod tests { - use super::*; use std::sync::Arc; + use super::*; + #[tokio::test] async fn test_waiter() -> anyhow::Result<()> { let waiters = Arc::new(Waiters::default()); From d490ad23e0948b7c49098638ffc669774c61049e Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 16 Oct 2024 14:04:17 +0100 Subject: [PATCH 014/239] storcon: use the same trace fields for reconciler and results (#9410) ## Problem The reconciler use `seq`, but processing of results uses `sequence`. Order is different too. It makes it annoying to read logs. 
## Summary of Changes Use the same tracing fields in both --- storage_controller/src/service.rs | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index cedee54534..25e1fb5e1f 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -1074,8 +1074,9 @@ impl Service { /// the observed state of the tenant such that subsequent calls to [`TenantShard::get_reconcile_needed`] /// will indicate that reconciliation is not needed. #[instrument(skip_all, fields( - tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), - sequence=%result.sequence + seq=%result.sequence, + tenant_id=%result.tenant_shard_id.tenant_id, + shard_id=%result.tenant_shard_id.shard_slug(), ))] fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); From d6281cbe65db6959e83c6d8abb44c0a3184e8b97 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 16 Oct 2024 15:27:46 +0100 Subject: [PATCH 015/239] tests: stabilize test_timelines_parallel_endpoints (#9413) ## Problem This test would get failures like `command failed: Found no timeline id for branch name 'branch_8'` It's because neon_local is being invoked concurrently for branch creation, which is unsafe (they'll step on each others' JSON writes) Example failure: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9410/11363051979/index.html#testresult/5ddc56c640f5422b/retries ## Summary of changes - Don't do branch creation concurrently with endpoint creation via neon_local --- test_runner/regress/test_tenants.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 4a16535941..03cb79fc1d 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -19,6 +19,7 @@ from fixtures.metrics import ( parse_metrics, ) from fixtures.neon_fixtures import ( + Endpoint, NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn, @@ -490,8 +491,8 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): n_threads = 16 barrier = threading.Barrier(n_threads) - def test_timeline(branch_name: str, timeline_id: TimelineId): - endpoint = env.endpoints.create_start(branch_name) + def test_timeline(branch_name: str, timeline_id: TimelineId, endpoint: Endpoint): + endpoint.start() endpoint.stop() # Use a barrier to make sure we restart endpoints at the same time barrier.wait() @@ -502,8 +503,12 @@ def test_timelines_parallel_endpoints(neon_simple_env: NeonEnv): for i in range(0, n_threads): branch_name = f"branch_{i}" timeline_id = env.create_branch(branch_name) - w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id]) + endpoint = env.endpoints.create(branch_name) + w = threading.Thread(target=test_timeline, args=[branch_name, timeline_id, endpoint]) workers.append(w) + + # Only start the restarts once we're done creating all timelines & endpoints + for w in workers: w.start() for w in workers: From 3140c14d608e79d792518d9d9144460b6ff01b0f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:28:55 +0200 Subject: [PATCH 016/239] Remove allow(clippy::unknown_lints) (#9416) the lint stabilized in 1.80. 
--- pageserver/src/tenant/timeline.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 8f098d0e82..1992dee930 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3092,7 +3092,6 @@ impl Timeline { } impl Timeline { - #[allow(unknown_lints)] // doc_lazy_continuation is still a new lint #[allow(clippy::doc_lazy_continuation)] /// Get the data needed to reconstruct all keys in the provided keyspace /// From 9668601f4666bd82cee653800433ce66a4d9fb21 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 16 Oct 2024 15:29:23 +0100 Subject: [PATCH 017/239] Add support of extensions for v17 (part 2) (#9389) - plv8 3.2.3 - HypoPG 1.4.1 - pgtap 1.3.3 - timescaledb 2.17.0 - pg_hint_plan 17_1_7_0 - rdkit Release_2024_09_1 - pg_uuidv7 1.6.0 - wal2json 2.6 - pg_ivm 1.9 - pg_partman 5.1.0 update support of extensions for v14-v16: - HypoPG 1.4.0 -> 1.4.1 - pgtap 1.2.0 -> 1.3.3 - plpgsql_check 2.5.3 -> 2.7.11 - pg_uuidv7 1.0.1 -> 1.6.0 - wal2json 2.5 -> 2.6 - pg_ivm 1.7 -> 1.9 - pg_partman 5.0.1 -> 5.1.0 --- compute/Dockerfile.compute-node | 182 ++++++++++++++++++++------------ 1 file changed, 114 insertions(+), 68 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 13381b2901..f05039f8b7 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -18,13 +18,14 @@ RUN case $DEBIAN_VERSION in \ # Version-specific installs for Bullseye (PG14-PG16): # The h3_pg extension needs a cmake 3.20+, but Debian bullseye has 3.18. # Install newer version (3.25) from backports. + # libstdc++-10-dev is required for plv8 bullseye) \ echo "deb http://deb.debian.org/debian bullseye-backports main" > /etc/apt/sources.list.d/bullseye-backports.list; \ - VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports"; \ + VERSION_INSTALLS="cmake/bullseye-backports cmake-data/bullseye-backports libstdc++-10-dev"; \ ;; \ # Version-specific installs for Bookworm (PG17): bookworm) \ - VERSION_INSTALLS="cmake"; \ + VERSION_INSTALLS="cmake libstdc++-12-dev"; \ ;; \ *) \ echo "Unknown Debian version ${DEBIAN_VERSION}" && exit 1 \ @@ -227,18 +228,33 @@ FROM build-deps AS plv8-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt update && \ +RUN apt update && \ apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +# plv8 3.2.3 supports v17 +# last release v3.2.3 - Sep 7, 2024 +# +# clone the repo instead of downloading the release tarball because plv8 has submodule dependencies +# and the release tarball doesn't include them +# +# Use new version only for v17 +# because since v3.2, plv8 doesn't include plcoffee and plls extensions +ENV PLV8_TAG=v3.2.3 + +RUN case "${PG_VERSION}" in \ + "v17") \ + export PLV8_TAG=v3.2.3 \ + ;; \ + "v14" | "v15" | "v16") \ + export PLV8_TAG=v3.1.10 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.10.tar.gz -O plv8.tar.gz && \ - echo "7096c3290928561f0d4901b7a52794295dc47f6303102fae3f8e42dd575ad97d plv8.tar.gz" | sha256sum --check && \ - mkdir plv8-src && cd plv8-src && tar xzf ../plv8.tar.gz --strip-components=1 -C . 
&& \ + git clone --recurse-submodules --depth 1 --branch ${PLV8_TAG} https://github.com/plv8/plv8.git plv8-src && \ + tar -czf plv8.tar.gz --exclude .git plv8-src && \ + cd plv8-src && \ # generate and copy upgrade scripts mkdir -p upgrade && ./generate_upgrade.sh 3.1.10 && \ cp upgrade/* /usr/local/pgsql/share/extension/ && \ @@ -248,8 +264,17 @@ RUN case "${PG_VERSION}" in "v17") \ find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \ # don't break computes with installed old version of plv8 cd /usr/local/pgsql/lib/ && \ - ln -s plv8-3.1.10.so plv8-3.1.5.so && \ - ln -s plv8-3.1.10.so plv8-3.1.8.so && \ + case "${PG_VERSION}" in \ + "v17") \ + ln -s plv8-3.2.3.so plv8-3.1.8.so && \ + ln -s plv8-3.2.3.so plv8-3.1.5.so && \ + ln -s plv8-3.2.3.so plv8-3.1.10.so \ + ;; \ + "v14" | "v15" | "v16") \ + ln -s plv8-3.1.10.so plv8-3.1.5.so && \ + ln -s plv8-3.1.10.so plv8-3.1.8.so \ + ;; \ + esac && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control @@ -327,6 +352,9 @@ COPY compute/patches/pgvector.patch /pgvector.patch # By default, pgvector Makefile uses `-march=native`. We don't want that, # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. +# +# v17 is not supported yet because of upstream issue +# https://github.com/pgvector/pgvector/issues/669 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -366,11 +394,10 @@ FROM build-deps AS hypopg-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.0.tar.gz -O hypopg.tar.gz && \ - echo "0821011743083226fc9b813c1f2ef5897a91901b57b6bea85a78e466187c6819 hypopg.tar.gz" | sha256sum --check && \ +# HypoPG 1.4.1 supports v17 +# last release 1.4.1 - Apr 28, 2024 +RUN wget https://github.com/HypoPG/hypopg/archive/refs/tags/1.4.1.tar.gz -O hypopg.tar.gz && \ + echo "9afe6357fd389d8d33fad81703038ce520b09275ec00153c6c89282bcdedd6bc hypopg.tar.gz" | sha256sum --check && \ mkdir hypopg-src && cd hypopg-src && tar xzf ../hypopg.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -407,6 +434,9 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch +# maybe version-specific +# support for v17 is unknown +# last release 1.3.13 - Sep 19, 2022 RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ @@ -428,11 +458,10 @@ FROM build-deps AS pgtap-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/theory/pgtap/archive/refs/tags/v1.2.0.tar.gz -O pgtap.tar.gz && \ - echo "9c7c3de67ea41638e14f06da5da57bac6f5bd03fea05c165a0ec862205a5c052 pgtap.tar.gz" | sha256sum --check && \ +# pgtap 1.3.3 supports v17 +# last release v1.3.3 - Apr 8, 2024 +RUN wget https://github.com/theory/pgtap/archive/refs/tags/v1.3.3.tar.gz -O pgtap.tar.gz && \ + echo "325ea79d0d2515bce96bce43f6823dcd3effbd6c54cb2a4d6c2384fffa3a14c7 pgtap.tar.gz" | sha256sum --check && \ mkdir pgtap-src && cd pgtap-src && tar xzf ../pgtap.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ @@ -505,11 +534,10 @@ FROM build-deps AS plpgsql-check-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.5.3.tar.gz -O plpgsql_check.tar.gz && \ - echo "6631ec3e7fb3769eaaf56e3dfedb829aa761abf163d13dba354b4c218508e1c0 plpgsql_check.tar.gz" | sha256sum --check && \ +# plpgsql_check v2.7.11 supports v17 +# last release v2.7.11 - Sep 16, 2024 +RUN wget https://github.com/okbob/plpgsql_check/archive/refs/tags/v2.7.11.tar.gz -O plpgsql_check.tar.gz && \ + echo "208933f8dbe8e0d2628eb3851e9f52e6892b8e280c63700c0f1ce7883625d172 plpgsql_check.tar.gz" | sha256sum --check && \ mkdir plpgsql_check-src && cd plpgsql_check-src && tar xzf ../plpgsql_check.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -527,18 +555,19 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +RUN case "${PG_VERSION}" in \ "v14" | "v15") \ export TIMESCALEDB_VERSION=2.10.1 \ export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \ ;; \ - *) \ + "v16") \ export TIMESCALEDB_VERSION=2.13.0 \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ + "v17") \ + export TIMESCALEDB_VERSION=2.17.0 \ + export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \ @@ -561,10 +590,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - case "${PG_VERSION}" in \ +# version-specific, has separate releases for each version +RUN case "${PG_VERSION}" in \ "v14") \ export PG_HINT_PLAN_VERSION=14_1_4_1 \ export PG_HINT_PLAN_CHECKSUM=c3501becf70ead27f70626bce80ea401ceac6a77e2083ee5f3ff1f1444ec1ad1 \ @@ -578,7 +605,8 @@ RUN case "${PG_VERSION}" in "v17") \ export PG_HINT_PLAN_CHECKSUM=fc85a9212e7d2819d4ae4ac75817481101833c3cfa9f0fe1f980984e12347d00 \ ;; \ "v17") \ - echo "TODO: PG17 pg_hint_plan support" && exit 0 \ + export PG_HINT_PLAN_VERSION=17_1_7_0 \ + export PG_HINT_PLAN_CHECKSUM=06dd306328c67a4248f48403c50444f30959fb61ebe963248dbc2afb396fe600 \ ;; \ *) \ echo "Export the valid PG_HINT_PLAN_VERSION variable" && exit 1 \ @@ -602,6 +630,10 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# 1.6.4 available, supports v17 +# This is an experimental extension that we do not support on prod yet. +# !Do not remove! +# We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -623,23 +655,37 @@ FROM build-deps AS rdkit-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - apt-get update && \ +RUN apt-get update && \ apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ libboost-system1.74-dev \ - libeigen3-dev + libeigen3-dev \ + libboost-all-dev +# rdkit Release_2024_09_1 supports v17 +# last release Release_2024_09_1 - Sep 27, 2024 +# +# Use new version only for v17 +# because Release_2024_09_1 has some backward incompatible changes +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ +RUN case "${PG_VERSION}" in \ + "v17") \ + export RDKIT_VERSION=Release_2024_09_1 \ + export RDKIT_CHECKSUM=034c00d6e9de323506834da03400761ed8c3721095114369d06805409747a60f \ + ;; \ + "v14" | "v15" | "v16") \ + export RDKIT_VERSION=Release_2023_03_3 \ + export RDKIT_CHECKSUM=bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/rdkit/rdkit/archive/refs/tags/Release_2023_03_3.tar.gz -O rdkit.tar.gz && \ - echo "bdbf9a2e6988526bfeb8c56ce3cdfe2998d60ac289078e2215374288185e8c8d rdkit.tar.gz" | sha256sum --check && \ + wget https://github.com/rdkit/rdkit/archive/refs/tags/${RDKIT_VERSION}.tar.gz -O rdkit.tar.gz && \ + echo "${RDKIT_CHECKSUM} rdkit.tar.gz" | sha256sum --check && \ mkdir rdkit-src && cd rdkit-src && tar xzf ../rdkit.tar.gz --strip-components=1 -C . && \ cmake \ -D RDK_BUILD_CAIRO_SUPPORT=OFF \ @@ -678,12 +724,11 @@ FROM build-deps AS pg-uuidv7-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# not version-specific +# last release v1.6.0 - Oct 9, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.0.1.tar.gz -O pg_uuidv7.tar.gz && \ - echo "0d0759ab01b7fb23851ecffb0bce27822e1868a4a5819bfd276101c716637a7a pg_uuidv7.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/fboulnois/pg_uuidv7/archive/refs/tags/v1.6.0.tar.gz -O pg_uuidv7.tar.gz && \ + echo "0fa6c710929d003f6ce276a7de7a864e9d1667b2d78be3dc2c07f2409eb55867 pg_uuidv7.tar.gz" | sha256sum --check && \ mkdir pg_uuidv7-src && cd pg_uuidv7-src && tar xzf ../pg_uuidv7.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -754,6 +799,8 @@ RUN case "${PG_VERSION}" in \ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is our extension, support stopped in favor of pgvector +# TODO: deprecate it ARG PG_VERSION ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ @@ -780,6 +827,8 @@ FROM build-deps AS pg-anon-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# This is an experimental extension, never got to real production. +# !Do not remove! It can be present in shared_preload_libraries and compute will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in "v17") \ echo "postgresql_anonymizer does not yet support PG17" && exit 0;; \ @@ -946,13 +995,12 @@ FROM build-deps AS wal2json-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# wal2json wal2json_2_6 supports v17 +# last release wal2json_2_6 - Apr 25, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update wal2json to 2.6+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \ - echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \ - mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \ +RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_6.tar.gz -O wal2json.tar.gz && \ + echo "18b4bdec28c74a8fc98a11c72de38378a760327ef8e5e42e975b0029eb96ba0d wal2json.tar.gz" | sha256sum --check && \ + mkdir wal2json-src && cd wal2json-src && tar xzf ../wal2json.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install @@ -966,12 +1014,11 @@ FROM build-deps AS pg-ivm-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# pg_ivm v1.9 supports v17 +# last release v1.9 - Jul 31 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "We'll need to update pg_ivm to 1.9+ for pg17 support" && exit 0;; \ - esac && \ - wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \ - echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.9.tar.gz -O pg_ivm.tar.gz && \ + echo "59e15722939f274650abf637f315dd723c87073496ca77236b044cb205270d8b pg_ivm.tar.gz" | sha256sum --check && \ mkdir pg_ivm-src && cd pg_ivm-src && tar xzf ../pg_ivm.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -987,12 +1034,11 @@ FROM build-deps AS pg-partman-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ +# should support v17 https://github.com/pgpartman/pg_partman/discussions/693 +# last release 5.1.0 Apr 2, 2024 ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_partman doesn't support PG17 yet" && exit 0;; \ - esac && \ - wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.0.1.tar.gz -O pg_partman.tar.gz && \ - echo "75b541733a9659a6c90dbd40fccb904a630a32880a6e3044d0c4c5f4c8a65525 pg_partman.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz -O pg_partman.tar.gz && \ + echo "3e3a27d7ff827295d5c55ef72f07a49062d6204b3cb0b9a048645d6db9f3cb9f pg_partman.tar.gz" | sha256sum --check && \ mkdir pg_partman-src && cd pg_partman-src && tar xzf ../pg_partman.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From 55b246085ea30341f2479ecfadff374a5487e74d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 16:47:17 +0200 Subject: [PATCH 018/239] Activate timelines during unoffload (#9399) The current code has forgotten to activate timelines during unoffload, leading to inability to receive the basebackup, due to the timeline still being in loading state. ``` stderr: command failed: compute startup failed: failed to get basebackup@0/0 from pageserver postgresql://no_user@localhost:15014 Caused by: 0: db error: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading 1: ERROR: Not found: Timeline 508546c79b2b16a84ab609fdf966e0d3/bfc18c24c4b837ecae5dbb5216c80fce is not active, state: Loading ``` Therefore, also activate the timeline during unoffloading. 
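For illustration only, a minimal stand-alone sketch of the guard this change adds (the `Timeline` type and its state handling are stubbed here; the real call in `Tenant::unoffload_timeline` also passes the tenant, broker client, background-jobs gate and request context, as shown in the hunk below):

```
// Toy model of the fix: after the offloaded timeline has been re-attached,
// activate it unless it is already broken or stopping, so that basebackup
// requests no longer find it in state `Loading`.
#[derive(PartialEq)]
enum TimelineState {
    Loading,
    Active,
    Broken,
    Stopping,
}

struct Timeline {
    state: TimelineState,
}

impl Timeline {
    fn is_broken(&self) -> bool {
        self.state == TimelineState::Broken
    }
    fn is_stopping(&self) -> bool {
        self.state == TimelineState::Stopping
    }
    fn activate(&mut self) {
        // stubbed: the real method also needs the tenant, broker client and ctx
        self.state = TimelineState::Active;
    }
}

fn finish_unoffload(timeline: &mut Timeline) {
    if !(timeline.is_broken() || timeline.is_stopping()) {
        timeline.activate();
    }
}

fn main() {
    let mut tl = Timeline { state: TimelineState::Loading };
    finish_unoffload(&mut tl);
    assert!(matches!(tl.state, TimelineState::Active));
}
```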
Part of #8088 --- pageserver/src/http/routes.rs | 7 +++- pageserver/src/tenant.rs | 40 +++++++++++++------- test_runner/regress/test_timeline_archive.py | 17 +++++++++ 3 files changed, 50 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dd403c1cef..36a6ed427b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -720,7 +720,12 @@ async fn timeline_archival_config_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state, ctx) + .apply_timeline_archival_config( + timeline_id, + request_data.state, + state.broker_client.clone(), + ctx, + ) .await?; Ok::<_, ApiError>(()) } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 20925c7fd6..689982ddd4 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1554,6 +1554,7 @@ impl Tenant { async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); @@ -1605,25 +1606,37 @@ impl Tenant { }) .map_err(TimelineArchivalError::Other)?; let timelines = self.timelines.lock().unwrap(); - if let Some(timeline) = timelines.get(&timeline_id) { - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } - info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) - } else { + let Some(timeline) = timelines.get(&timeline_id) else { warn!("timeline not available directly after attach"); - Err(TimelineArchivalError::Other(anyhow::anyhow!( + return Err(TimelineArchivalError::Other(anyhow::anyhow!( "timeline not available directly after attach" - ))) + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); } + + // Activate the timeline (if it makes sense) + if !(timeline.is_broken() || timeline.is_stopping()) { + let background_jobs_can_start = None; + timeline.activate( + self.clone(), + broker_client.clone(), + background_jobs_can_start, + &ctx, + ); + } + + info!("timeline unoffloading complete"); + Ok(Arc::clone(timeline)) } pub(crate) async fn apply_timeline_archival_config( self: &Arc, timeline_id: TimelineId, new_state: TimelineArchivalState, + broker_client: storage_broker::BrokerClientChannel, ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); @@ -1664,12 +1677,13 @@ impl Tenant { Some(Arc::clone(timeline)) }; - // Second part: unarchive timeline (if needed) + // Second part: unoffload timeline (if needed) let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { timeline } else { // Turn offloaded timeline into a non-offloaded one - self.unoffload_timeline(timeline_id, ctx).await? + self.unoffload_timeline(timeline_id, broker_client, ctx) + .await? }; // Third part: upload new timeline archival state and block until it is present in S3 @@ -3354,7 +3368,7 @@ impl Tenant { /// Populate all Timelines' `GcInfo` with information about their children. 
We do not set the /// PITR cutoffs here, because that requires I/O: this is done later, before GC, by [`Self::refresh_gc_info_internal`] /// - /// Subsequently, parent-child relationships are updated incrementally during timeline creation/deletion. + /// Subsequently, parent-child relationships are updated incrementally inside [`Timeline::new`] and [`Timeline::drop`]. fn initialize_gc_info( &self, timelines: &std::sync::MutexGuard>>, diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 971cc57a1c..ffaed5e130 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -136,6 +136,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" ) + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1000)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -197,4 +208,10 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) assert leaf_detail["is_archived"] is False + with env.endpoints.create_start( + "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + assert sum == sum_again + assert not timeline_offloaded(initial_timeline_id) From 8a114e3aeda7a2e321fa4524335c1748448cae07 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:19:45 -0400 Subject: [PATCH 019/239] refactor(pageserver): upgrade remote_storage to use hyper1 (#9405) part of https://github.com/neondatabase/neon/issues/9255 ## Summary of changes Upgrade remote_storage crate to use hyper1. Hyper0 is used when providing the streaming HTTP body to the s3 SDK, and it is refactored to use hyper1. 
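A rough sketch of the body adaptation (the conversion calls and imports mirror the hunk below; the stream bounds and crate feature flags are assumptions and may differ slightly from the real `remote_storage` code):

```
// hyper 1.x bodies are streams of `Frame`s rather than plain byte chunks,
// so each Ok(Bytes) from the upload stream is wrapped in a data frame and
// then converted into an SdkBody for the AWS S3 client.
use aws_smithy_types::body::SdkBody;
use aws_smithy_types::byte_stream::ByteStream;
use bytes::Bytes;
use futures::stream::Stream;
use futures_util::StreamExt;
use http_body_util::StreamBody;
use hyper::body::Frame;

fn upload_body(
    from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
) -> ByteStream {
    let body = StreamBody::new(from.map(|chunk| chunk.map(Frame::data)));
    ByteStream::new(SdkBody::from_body_1_x(body))
}
```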
Signed-off-by: Alex Chi Z --- Cargo.lock | 3 ++- libs/remote_storage/Cargo.toml | 3 ++- libs/remote_storage/src/s3_bucket.rs | 8 +++++--- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7e772814ec..6b212bac2e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4648,9 +4648,10 @@ dependencies = [ "camino-tempfile", "futures", "futures-util", + "http-body-util", "http-types", "humantime-serde", - "hyper 0.14.30", + "hyper 1.4.1", "itertools 0.10.5", "metrics", "once_cell", diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index be4d61f009..1816825bda 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper0 = { workspace = true, features = ["stream"] } +hyper = { workspace = true, features = ["client"] } futures.workspace = true serde.workspace = true serde_json.workspace = true @@ -36,6 +36,7 @@ azure_storage.workspace = true azure_storage_blobs.workspace = true futures-util.workspace = true http-types.workspace = true +http-body-util.workspace = true itertools.workspace = true sync_wrapper = { workspace = true, features = ["futures"] } diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index f950f2886c..cde32df402 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,13 +28,15 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_body_util::StreamBody; use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper0::Body; +use futures_util::StreamExt; +use hyper::body::Frame; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -710,8 +712,8 @@ impl RemoteStorage for S3Bucket { let started_at = start_measuring_requests(kind); - let body = Body::wrap_stream(from); - let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body)); + let body = StreamBody::new(from.map(|x| x.map(Frame::data))); + let bytes_stream = ByteStream::new(SdkBody::from_body_1_x(body)); let upload = self .client From ed694732e707b15592991902c89f5078935ec177 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 16 Oct 2024 19:10:49 +0200 Subject: [PATCH 020/239] proxy: merge AuthError and AuthErrorImpl (#9418) Since GetAuthInfoError now boxes the ControlPlaneError message the variant is not big anymore and AuthError is 32 bytes. 
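The size argument is the usual enum-layout one; an illustrative stand-alone example (not taken from this patch):

```
// An enum is at least as large as its largest variant, so boxing the big
// payload keeps the whole enum small. That is what lets the Box<AuthErrorImpl>
// wrapper be dropped without blowing up the size of AuthError.
#[allow(dead_code)]
enum Unboxed {
    Big([u8; 256]),
    Small(u8),
}

#[allow(dead_code)]
enum Boxed {
    Big(Box<[u8; 256]>),
    Small(u8),
}

fn main() {
    assert!(std::mem::size_of::<Boxed>() < std::mem::size_of::<Unboxed>());
    println!(
        "unboxed: {} bytes, boxed: {} bytes",
        std::mem::size_of::<Unboxed>(),
        std::mem::size_of::<Boxed>(),
    );
}
```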
--- proxy/src/auth/flow.rs | 10 +++--- proxy/src/auth/mod.rs | 78 ++++++++++++++++++------------------------ 2 files changed, 39 insertions(+), 49 deletions(-) diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index ccb17b66b9..6294549ff6 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -9,7 +9,7 @@ use tokio::io::{AsyncRead, AsyncWrite}; use tracing::info; use super::backend::ComputeCredentialKeys; -use super::{AuthErrorImpl, PasswordHackPayload}; +use super::{AuthError, PasswordHackPayload}; use crate::config::TlsServerEndPoint; use crate::context::RequestMonitoring; use crate::control_plane::AuthSecret; @@ -117,14 +117,14 @@ impl AuthFlow<'_, S, PasswordHack> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let payload = PasswordHackPayload::parse(password) // If we ended up here and the payload is malformed, it means that // the user neither enabled SNI nor resorted to any other method // for passing the project name we rely on. We should show them // the most helpful error message and point to the documentation. - .ok_or(AuthErrorImpl::MissingEndpointName)?; + .ok_or(AuthError::MissingEndpointName)?; Ok(payload) } @@ -136,7 +136,7 @@ impl AuthFlow<'_, S, CleartextPassword> { let msg = self.stream.read_password_message().await?; let password = msg .strip_suffix(&[0]) - .ok_or(AuthErrorImpl::MalformedPassword("missing terminator"))?; + .ok_or(AuthError::MalformedPassword("missing terminator"))?; let outcome = validate_password_and_exchange( &self.state.pool, @@ -166,7 +166,7 @@ impl AuthFlow<'_, S, Scram<'_>> { // Initial client message contains the chosen auth method's name. let msg = self.stream.read_password_message().await?; let sasl = sasl::FirstMessage::parse(&msg) - .ok_or(AuthErrorImpl::MalformedPassword("bad sasl message"))?; + .ok_or(AuthError::MalformedPassword("bad sasl message"))?; // Currently, the only supported SASL method is SCRAM. if !scram::METHODS.contains(&sasl.method) { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index ff97e6c35d..7a373dd825 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -29,7 +29,7 @@ pub(crate) type Result = std::result::Result; /// Common authentication error. 
#[derive(Debug, Error)] -pub(crate) enum AuthErrorImpl { +pub(crate) enum AuthError { #[error(transparent)] Web(#[from] backend::WebAuthError), @@ -78,80 +78,70 @@ pub(crate) enum AuthErrorImpl { ConfirmationTimeout(humantime::Duration), } -#[derive(Debug, Error)] -#[error(transparent)] -pub(crate) struct AuthError(Box); - impl AuthError { pub(crate) fn bad_auth_method(name: impl Into>) -> Self { - AuthErrorImpl::BadAuthMethod(name.into()).into() + AuthError::BadAuthMethod(name.into()) } pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthErrorImpl::AuthFailed(user.into()).into() + AuthError::AuthFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { - AuthErrorImpl::IpAddressNotAllowed(ip).into() + AuthError::IpAddressNotAllowed(ip) } pub(crate) fn too_many_connections() -> Self { - AuthErrorImpl::TooManyConnections.into() + AuthError::TooManyConnections } pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self.0.as_ref(), AuthErrorImpl::AuthFailed(_)) + matches!(self, AuthError::AuthFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { - AuthErrorImpl::UserTimeout(elapsed).into() + AuthError::UserTimeout(elapsed) } pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { - AuthErrorImpl::ConfirmationTimeout(timeout).into() - } -} - -impl> From for AuthError { - fn from(e: E) -> Self { - Self(Box::new(e.into())) + AuthError::ConfirmationTimeout(timeout) } } impl UserFacingError for AuthError { fn to_string_client(&self) -> String { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.to_string_client(), - AuthErrorImpl::GetAuthInfo(e) => e.to_string_client(), - AuthErrorImpl::Sasl(e) => e.to_string_client(), - AuthErrorImpl::AuthFailed(_) => self.to_string(), - AuthErrorImpl::BadAuthMethod(_) => self.to_string(), - AuthErrorImpl::MalformedPassword(_) => self.to_string(), - AuthErrorImpl::MissingEndpointName => self.to_string(), - AuthErrorImpl::Io(_) => "Internal error".to_string(), - AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), - AuthErrorImpl::TooManyConnections => self.to_string(), - AuthErrorImpl::UserTimeout(_) => self.to_string(), - AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), + match self { + Self::Web(e) => e.to_string_client(), + Self::GetAuthInfo(e) => e.to_string_client(), + Self::Sasl(e) => e.to_string_client(), + Self::AuthFailed(_) => self.to_string(), + Self::BadAuthMethod(_) => self.to_string(), + Self::MalformedPassword(_) => self.to_string(), + Self::MissingEndpointName => self.to_string(), + Self::Io(_) => "Internal error".to_string(), + Self::IpAddressNotAllowed(_) => self.to_string(), + Self::TooManyConnections => self.to_string(), + Self::UserTimeout(_) => self.to_string(), + Self::ConfirmationTimeout(_) => self.to_string(), } } } impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { - match self.0.as_ref() { - AuthErrorImpl::Web(e) => e.get_error_kind(), - AuthErrorImpl::GetAuthInfo(e) => e.get_error_kind(), - AuthErrorImpl::Sasl(e) => e.get_error_kind(), - AuthErrorImpl::AuthFailed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::BadAuthMethod(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MalformedPassword(_) => crate::error::ErrorKind::User, - AuthErrorImpl::MissingEndpointName => crate::error::ErrorKind::User, - AuthErrorImpl::Io(_) => crate::error::ErrorKind::ClientDisconnect, - AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, - AuthErrorImpl::TooManyConnections => 
crate::error::ErrorKind::RateLimit, - AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, - AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + match self { + Self::Web(e) => e.get_error_kind(), + Self::GetAuthInfo(e) => e.get_error_kind(), + Self::Sasl(e) => e.get_error_kind(), + Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::BadAuthMethod(_) => crate::error::ErrorKind::User, + Self::MalformedPassword(_) => crate::error::ErrorKind::User, + Self::MissingEndpointName => crate::error::ErrorKind::User, + Self::Io(_) => crate::error::ErrorKind::ClientDisconnect, + Self::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::UserTimeout(_) => crate::error::ErrorKind::User, + Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } From 0551cfb6a74258537255af18428b0345f24f2702 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 16 Oct 2024 20:04:56 +0200 Subject: [PATCH 021/239] Fix beta clippy warnings (#9419) ``` warning: first doc comment paragraph is too long --> compute_tools/src/installed_extensions.rs:35:1 | 35 | / /// Connect to every database (see list_dbs above) and get the list of installed extensions. 36 | | /// Same extension can be installed in multiple databases with different versions, 37 | | /// we only keep the highest and lowest version across all databases. | |_ | = help: for further information visit https://rust-lang.github.io/rust-clippy/master/index.html#too_long_first_doc_paragraph = note: `#[warn(clippy::too_long_first_doc_paragraph)]` on by default help: add an empty line | 35 ~ /// Connect to every database (see list_dbs above) and get the list of installed extensions. 36 + /// | ``` --- compute_tools/src/installed_extensions.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 3d8b22a8a3..72578b1f34 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -33,6 +33,7 @@ fn list_dbs(client: &mut Client) -> Result> { } /// Connect to every database (see list_dbs above) and get the list of installed extensions. +/// /// Same extension can be installed in multiple databases with different versions, /// we only keep the highest and lowest version across all databases. pub async fn get_installed_extensions(connstr: Url) -> Result { From 409a286eaa6f030494c8914fcaa36dcc7d6496d1 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 13:08:40 -0500 Subject: [PATCH 022/239] Fix typo in sql_exporter generator Bad copy-paste seemingly. This manifested itself as a failure to start for the sql_exporter, and was just dying on loop in staging. A future PR will have E2E testing of sql_exporter. Signed-off-by: Tristan Partin --- compute/etc/sql_exporter.jsonnet | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 1e3665ac47..640e2ac38d 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). 
collectors: [ - 'neon_collector_autoscaling', + 'neon_collector', ], }, From e0fa6bcf1a9a33929cfcfd0cefada739a8fe6fea Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 16 Oct 2024 14:46:33 -0500 Subject: [PATCH 023/239] Fix some sql_exporter metrics for PG 17 Checkpointer related statistics moved from pg_stat_bgwriter to pg_stat_checkpointer, so we need to adjust our queries accordingly. Signed-off-by: Tristan Partin --- compute/Dockerfile.compute-node | 3 ++- compute/Makefile | 6 ++++-- compute/etc/sql_exporter/checkpoints_req.17.sql | 1 + .../etc/sql_exporter/checkpoints_req.libsonnet | 7 ++++++- .../etc/sql_exporter/checkpoints_timed.17.sql | 1 + .../etc/sql_exporter/checkpoints_timed.libsonnet | 7 ++++++- compute/jsonnet/neon.libsonnet | 16 ++++++++++++++++ 7 files changed, 36 insertions(+), 5 deletions(-) create mode 100644 compute/etc/sql_exporter/checkpoints_req.17.sql create mode 100644 compute/etc/sql_exporter/checkpoints_timed.17.sql create mode 100644 compute/jsonnet/neon.libsonnet diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index f05039f8b7..b0ce7c1718 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -1221,12 +1221,13 @@ RUN rm /usr/local/pgsql/lib/lib*.a # ######################################################################################### FROM $REPOSITORY/$IMAGE:$TAG AS sql_exporter_preprocessor +ARG PG_VERSION USER nonroot COPY --chown=nonroot compute compute -RUN make -C compute +RUN make PG_VERSION="${PG_VERSION}" -C compute ######################################################################################### # diff --git a/compute/Makefile b/compute/Makefile index f8faa882ee..e4f08a223c 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -6,13 +6,15 @@ jsonnet_files = $(wildcard \ all: neon_collector.yml neon_collector_autoscaling.yml sql_exporter.yml sql_exporter_autoscaling.yml neon_collector.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector.jsonnet neon_collector_autoscaling.yml: $(jsonnet_files) - JSONNET_PATH=etc jsonnet \ + JSONNET_PATH=jsonnet:etc jsonnet \ --output-file etc/$@ \ + --ext-str pg_version=$(PG_VERSION) \ etc/neon_collector_autoscaling.jsonnet sql_exporter.yml: $(jsonnet_files) diff --git a/compute/etc/sql_exporter/checkpoints_req.17.sql b/compute/etc/sql_exporter/checkpoints_req.17.sql new file mode 100644 index 0000000000..a4b946e8e2 --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_req.17.sql @@ -0,0 +1 @@ +SELECT num_requested AS checkpoints_req FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_req.libsonnet b/compute/etc/sql_exporter/checkpoints_req.libsonnet index 8697f8af3b..e5d9753507 100644 --- a/compute/etc/sql_exporter/checkpoints_req.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_req.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + { metric_name: 'checkpoints_req', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_req', ], - query: importstr 'sql_exporter/checkpoints_req.sql', + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/etc/sql_exporter/checkpoints_timed.17.sql b/compute/etc/sql_exporter/checkpoints_timed.17.sql new file mode 100644 index 
0000000000..0d86ddb3ea --- /dev/null +++ b/compute/etc/sql_exporter/checkpoints_timed.17.sql @@ -0,0 +1 @@ +SELECT num_timed AS checkpoints_timed FROM pg_stat_checkpointer; diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet index 9f0b742400..0ba0080188 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -1,3 +1,8 @@ +local neon = import 'neon.libsonnet'; + +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; + { metric_name: 'checkpoints_timed', type: 'gauge', @@ -6,5 +11,5 @@ values: [ 'checkpoints_timed', ], - query: importstr 'sql_exporter/checkpoints_timed.sql', + query: if neon.PG_MAJORVERSION_NUM < 17 then pg_stat_bgwriter else pg_stat_checkpointer, } diff --git a/compute/jsonnet/neon.libsonnet b/compute/jsonnet/neon.libsonnet new file mode 100644 index 0000000000..583b631c58 --- /dev/null +++ b/compute/jsonnet/neon.libsonnet @@ -0,0 +1,16 @@ +local MIN_SUPPORTED_VERSION = 14; +local MAX_SUPPORTED_VERSION = 17; +local SUPPORTED_VERSIONS = std.range(MIN_SUPPORTED_VERSION, MAX_SUPPORTED_VERSION); + +# If we receive the pg_version with a leading "v", ditch it. +local pg_version = std.strReplace(std.extVar('pg_version'), 'v', ''); +local pg_version_num = std.parseInt(pg_version); + +assert std.setMember(pg_version_num, SUPPORTED_VERSIONS) : + std.format('%s is an unsupported Postgres version: %s', + [pg_version, std.toString(SUPPORTED_VERSIONS)]); + +{ + PG_MAJORVERSION: pg_version, + PG_MAJORVERSION_NUM: pg_version_num, +} From 67d5d98b1960c7f7b88d1f9860cd9672411cb815 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 16 Oct 2024 21:47:53 +0200 Subject: [PATCH 024/239] readme: fix build instructions for debian 12 (#9371) We need libprotobuf-dev for some of the `/usr/include/google/protobuf/...*.proto` referenced by our protobuf decls. --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfc63b4708..e68ef70bdf 100644 --- a/README.md +++ b/README.md @@ -31,7 +31,7 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev +libprotobuf-dev libcurl4-openssl-dev openssl python3-poetry lsof libicu-dev ``` * On Fedora, these packages are needed: ```bash From 934dbb61f557477512b3cf5c98e9930e5745d87e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 17 Oct 2024 08:04:57 +0300 Subject: [PATCH 025/239] Check access_count in lfc_evict (#9407) ## Problem See https://neondb.slack.com/archives/C033A2WE6BZ/p1729007738526309?thread_ts=1722942856.987979&cid=C033A2WE6BZ When replica receives WAL record which target page is not present in shared buffer, we evict this page from LFC. If all pages from the LFC chunk are evicted, then chunk is moved to the beginning of LRU least to force it reuse. Unfortunately access_count is not checked and if the entry is access at this moment then this operation can cause LRU list corruption. ## Summary of changes Check `access_count` in `lfc_evict` ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. 
- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist Co-authored-by: Konstantin Knizhnik --- pgxn/neon/file_cache.c | 45 ++++++++++++++++++++++-------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index bbea5a8b0d..70b250d394 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -617,31 +617,34 @@ lfc_evict(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno) /* remove the page from the cache */ entry->bitmap[chunk_offs >> 5] &= ~(1 << (chunk_offs & (32 - 1))); - /* - * If the chunk has no live entries, we can position the chunk to be - * recycled first. - */ - if (entry->bitmap[chunk_offs >> 5] == 0) + if (entry->access_count == 0) { - bool has_remaining_pages = false; - - for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) - { - if (entry->bitmap[i] != 0) - { - has_remaining_pages = true; - break; - } - } - /* - * Put the entry at the position that is first to be reclaimed when we - * have no cached pages remaining in the chunk + * If the chunk has no live entries, we can position the chunk to be + * recycled first. */ - if (!has_remaining_pages) + if (entry->bitmap[chunk_offs >> 5] == 0) { - dlist_delete(&entry->list_node); - dlist_push_head(&lfc_ctl->lru, &entry->list_node); + bool has_remaining_pages = false; + + for (int i = 0; i < CHUNK_BITMAP_SIZE; i++) + { + if (entry->bitmap[i] != 0) + { + has_remaining_pages = true; + break; + } + } + + /* + * Put the entry at the position that is first to be reclaimed when we + * have no cached pages remaining in the chunk + */ + if (!has_remaining_pages) + { + dlist_delete(&entry->list_node); + dlist_push_head(&lfc_ctl->lru, &entry->list_node); + } } } From db68e822355a4ef8ac9e3363d90bb9a2bd0e6dad Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Oct 2024 10:06:02 +0100 Subject: [PATCH 026/239] storage_scrubber: fixes to garbage commands (#9409) ## Problem While running `find-garbage` and `purge-garbage`, I encountered two things that needed updating: - Console API may omit `user_id` since org accounts were added - When we cut over to using GenericRemoteStorage, the object listings we do during purge did not get proper retry handling, so could easily fail on usual S3 errors, and make the whole process drop out. ...and one bug: - We had a `.unwrap` which expects that after finding an object in a tenant path, a listing in that path will always return objects. This is not true, because a pageserver might be deleting the path at the same time as we scan it. ## Summary of changes - When listing objects during purge, use backoff::retry - Make `user_id` an `Option` - Handle the case where a tenant's objects go away during find-garbage. 
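
For context on the retry wrapping adopted below: `backoff::retry` resolves to `None` only when the cancellation token it is given fires, so passing a freshly created token that is never cancelled makes the `.expect("dummy cancellation token")` in the new code safe, and the trailing `?` then propagates any listing error once retries are exhausted. A minimal sketch of the call shape, with a placeholder `do_list` closure standing in for the real `GenericRemoteStorage::list` call and `MAX_RETRIES` reused as in the diff:

```rust
// Sketch only: retry a remote object listing with backoff.
// `do_list` is a stand-in for the real GenericRemoteStorage::list call.
let cancel = CancellationToken::new();
let listing = backoff::retry(
    || do_list(&cancel),        // rebuilds a fresh future for every attempt
    |_err| false,               // is_permanent: `false` => every error is retried
    3,                          // warn threshold before logging retries loudly
    MAX_RETRIES as u32,         // give up after this many attempts
    "list objects",             // description used in retry log messages
    &cancel,                    // retry() returns None only if this token fires
)
.await
.expect("token is never cancelled")?; // then `?` surfaces the final listing error
```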
--- storage_scrubber/src/cloud_admin_api.rs | 2 +- storage_scrubber/src/garbage.rs | 65 ++++++++++++++++--------- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index 70b108cf23..7b82a0b116 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -138,7 +138,7 @@ pub struct ProjectData { pub name: String, pub region_id: String, pub platform_id: String, - pub user_id: String, + pub user_id: Option, pub pageserver_id: Option, #[serde(deserialize_with = "from_nullable_id")] pub tenant: TenantId, diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index d53611ed6e..a0040ada08 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -16,13 +16,13 @@ use remote_storage::{GenericRemoteStorage, ListingMode, ListingObject, RemotePat use serde::{Deserialize, Serialize}; use tokio_stream::StreamExt; use tokio_util::sync::CancellationToken; -use utils::id::TenantId; +use utils::{backoff, id::TenantId}; use crate::{ cloud_admin_api::{CloudAdminApiClient, MaybeDeleted, ProjectData}, init_remote, list_objects_with_retries, metadata_stream::{stream_tenant_timelines, stream_tenants}, - BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, + BucketConfig, ConsoleConfig, NodeKind, TenantShardTimelineId, TraversingDepth, MAX_RETRIES, }; #[derive(Serialize, Deserialize, Debug)] @@ -250,13 +250,16 @@ async fn find_garbage_inner( &target.tenant_root(&tenant_shard_id), ) .await?; - let object = tenant_objects.keys.first().unwrap(); - if object.key.get_path().as_str().ends_with("heatmap-v1.json") { - tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); - garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); - continue; + if let Some(object) = tenant_objects.keys.first() { + if object.key.get_path().as_str().ends_with("heatmap-v1.json") { + tracing::info!("Tenant {tenant_shard_id}: is missing in console and is only a heatmap (known historic deletion bug)"); + garbage.append_buggy(GarbageEntity::Tenant(tenant_shard_id)); + continue; + } else { + tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + } } else { - tracing::info!("Tenant {tenant_shard_id} is missing in console and contains one object: {}", object.key); + tracing::info!("Tenant {tenant_shard_id} is missing in console appears to have been deleted while we ran"); } } else { // A console-unknown tenant with timelines: check if these timelines only contain initdb.tar.zst, from the initial @@ -406,14 +409,17 @@ pub async fn get_tenant_objects( // TODO: apply extra validation based on object modification time. Don't purge // tenants where any timeline's index_part.json has been touched recently. 
- let list = s3_client - .list( - Some(&tenant_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || s3_client.list(Some(&tenant_root), ListingMode::NoDelimiter, None, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "get_tenant_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; Ok(list.keys) } @@ -424,14 +430,25 @@ pub async fn get_timeline_objects( tracing::debug!("Listing objects in timeline {ttid}"); let timeline_root = super::remote_timeline_path_id(&ttid); - let list = s3_client - .list( - Some(&timeline_root), - ListingMode::NoDelimiter, - None, - &CancellationToken::new(), - ) - .await?; + let cancel = CancellationToken::new(); + let list = backoff::retry( + || { + s3_client.list( + Some(&timeline_root), + ListingMode::NoDelimiter, + None, + &cancel, + ) + }, + |_| false, + 3, + MAX_RETRIES as u32, + "get_timeline_objects", + &cancel, + ) + .await + .expect("dummy cancellation token")?; + Ok(list.keys) } From 22d8834474d1f619b6ed351fd80033b4a064bb21 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 17 Oct 2024 13:38:24 +0300 Subject: [PATCH 027/239] proxy: move the connection pools to separate file (#9398) First PR for #9284 Start unification of the client and connection pool interfaces: - Exclude the 'global_connections_count' out from the get_conn_entry() - Move remote connection pools to the conn_pool_lib as a reference - Unify clients among all the conn pools --- proxy/src/serverless/backend.rs | 13 +- proxy/src/serverless/conn_pool.rs | 593 ++---------------------- proxy/src/serverless/conn_pool_lib.rs | 562 ++++++++++++++++++++++ proxy/src/serverless/http_conn_pool.rs | 50 +- proxy/src/serverless/local_conn_pool.rs | 111 ++--- proxy/src/serverless/mod.rs | 5 +- proxy/src/serverless/sql_over_http.rs | 15 +- 7 files changed, 709 insertions(+), 640 deletions(-) create mode 100644 proxy/src/serverless/conn_pool_lib.rs diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index a180c4c2ed..82e81dbcfe 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -11,8 +11,9 @@ use tokio::net::{lookup_host, TcpStream}; use tracing::field::display; use tracing::{debug, info}; -use super::conn_pool::{poll_client, Client, ConnInfo, GlobalConnPool}; -use super::http_conn_pool::{self, poll_http2_client}; +use super::conn_pool::poll_client; +use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; +use super::http_conn_pool::{self, poll_http2_client, Send}; use super::local_conn_pool::{self, LocalClient, LocalConnPool}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; @@ -31,7 +32,7 @@ use crate::rate_limiter::EndpointRateLimiter; use crate::{compute, EndpointId, Host}; pub(crate) struct PoolingBackend { - pub(crate) http_conn_pool: Arc, + pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, pub(crate) config: &'static ProxyConfig, @@ -199,7 +200,7 @@ impl PoolingBackend { &self, ctx: &RequestMonitoring, conn_info: ConnInfo, - ) -> Result { + ) -> Result, HttpConnError> { info!("pool: looking for an existing connection"); if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); @@ -481,7 +482,7 @@ impl ConnectMechanism for TokioMechanism { } struct HyperMechanism { - pool: Arc, + pool: Arc>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -491,7 +492,7 @@ struct HyperMechanism { 
#[async_trait] impl ConnectMechanism for HyperMechanism { - type Connection = http_conn_pool::Client; + type Connection = http_conn_pool::Client; type ConnectError = HttpConnError; type Error = HttpConnError; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index aa869ff1c0..b97c656510 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -1,31 +1,29 @@ -use std::collections::HashMap; use std::fmt; -use std::ops::Deref; use std::pin::pin; -use std::sync::atomic::{self, AtomicUsize}; use std::sync::{Arc, Weak}; use std::task::{ready, Poll}; -use std::time::Duration; -use dashmap::DashMap; use futures::future::poll_fn; use futures::Future; -use parking_lot::RwLock; -use rand::Rng; use smallvec::SmallVec; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use tracing::{debug, error, info, info_span, warn, Instrument, Span}; +use tracing::{error, info, info_span, warn, Instrument}; -use super::backend::HttpConnError; -use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; -use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; -use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{DbName, EndpointCacheKey, RoleName}; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; + +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; + +#[cfg(test)] +use { + super::conn_pool_lib::GlobalConnPoolOptions, + crate::auth::backend::ComputeUserInfo, + std::{sync::atomic, time::Duration}, +}; #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { @@ -33,34 +31,12 @@ pub(crate) struct ConnInfoWithAuth { pub(crate) auth: AuthData, } -#[derive(Debug, Clone)] -pub(crate) struct ConnInfo { - pub(crate) user_info: ComputeUserInfo, - pub(crate) dbname: DbName, -} - #[derive(Debug, Clone)] pub(crate) enum AuthData { Password(SmallVec<[u8; 16]>), Jwt(String), } -impl ConnInfo { - // hm, change to hasher to avoid cloning? - pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { - (self.dbname.clone(), self.user_info.user.clone()) - } - - pub(crate) fn endpoint_cache_key(&self) -> Option { - // We don't want to cache http connections for ephemeral endpoints. - if self.user_info.options.is_ephemeral() { - None - } else { - Some(self.user_info.endpoint_cache_key()) - } - } -} - impl fmt::Display for ConnInfo { // use custom display to avoid logging password fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -75,402 +51,6 @@ impl fmt::Display for ConnInfo { } } -struct ConnPoolEntry { - conn: ClientInner, - _last_access: std::time::Instant, -} - -// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool -// Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, - total_conns: usize, - max_conns: usize, - _guard: HttpEndpointPoolsGuard<'static>, - global_connections_count: Arc, - global_pool_size_max_conns: usize, -} - -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { - let Self { - pools, - total_conns, - global_connections_count, - .. 
- } = self; - pools.get_mut(&db_user).and_then(|pool_entries| { - pool_entries.get_conn_entry(total_conns, global_connections_count.clone()) - }) - } - - fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { - let Self { - pools, - total_conns, - global_connections_count, - .. - } = self; - if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.conn_id != conn_id); - let new_len = pool.conns.len(); - let removed = old_len - new_len; - if removed > 0 { - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - } - *total_conns -= removed; - removed > 0 - } else { - false - } - } - - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { - let conn_id = client.conn_id; - - if client.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); - return; - } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool - .read() - .global_connections_count - .load(atomic::Ordering::Relaxed) - >= global_max_conn - { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); - return; - } - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < pool.max_conns { - let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - pool.global_connections_count - .fetch_add(1, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .inc(); - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - } -} - -impl Drop for EndpointConnPool { - fn drop(&mut self) { - if self.total_conns > 0 { - self.global_connections_count - .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(self.total_conns as i64); - } - } -} - -pub(crate) struct DbUserConnPool { - conns: Vec>, -} - -impl Default for DbUserConnPool { - fn default() -> Self { - Self { conns: Vec::new() } - } -} - -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { - let old_len = self.conns.len(); - - self.conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = self.conns.len(); - let removed = old_len - new_len; - *conns -= removed; - removed - } - - fn get_conn_entry( - &mut self, - conns: &mut usize, - global_connections_count: Arc, - ) -> Option> { - let mut removed = self.clear_closed_clients(conns); - let conn = self.conns.pop(); - if conn.is_some() { - *conns -= 1; - removed += 1; - } - global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - conn - } -} - -pub(crate) struct GlobalConnPool { - // endpoint -> per-endpoint 
connection pool - // - // That should be a fairly conteded map, so return reference to the per-endpoint - // pool as early as possible and release the lock. - global_pool: DashMap>>>, - - /// Number of endpoint-connection pools - /// - /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. - /// That seems like far too much effort, so we're using a relaxed increment counter instead. - /// It's only used for diagnostics. - global_pool_size: AtomicUsize, - - /// Total number of connections in the pool - global_connections_count: Arc, - - config: &'static crate::config::HttpConfig, -} - -#[derive(Debug, Clone, Copy)] -pub struct GlobalConnPoolOptions { - // Maximum number of connections per one endpoint. - // Can mix different (dbname, username) connections. - // When running out of free slots for a particular endpoint, - // falls back to opening a new connection for each request. - pub max_conns_per_endpoint: usize, - - pub gc_epoch: Duration, - - pub pool_shards: usize, - - pub idle_timeout: Duration, - - pub opt_in: bool, - - // Total number of connections in the pool. - pub max_total_conns: usize, -} - -impl GlobalConnPool { - pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { - let shards = config.pool_options.pool_shards; - Arc::new(Self { - global_pool: DashMap::with_shard_amount(shards), - global_pool_size: AtomicUsize::new(0), - config, - global_connections_count: Arc::new(AtomicUsize::new(0)), - }) - } - - #[cfg(test)] - pub(crate) fn get_global_connections_count(&self) -> usize { - self.global_connections_count - .load(atomic::Ordering::Relaxed) - } - - pub(crate) fn get_idle_timeout(&self) -> Duration { - self.config.pool_options.idle_timeout - } - - pub(crate) fn shutdown(&self) { - // drops all strong references to endpoint-pools - self.global_pool.clear(); - } - - pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { - let epoch = self.config.pool_options.gc_epoch; - let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); - loop { - interval.tick().await; - - let shard = rng.gen_range(0..self.global_pool.shards().len()); - self.gc(shard); - } - } - - fn gc(&self, shard: usize) { - debug!(shard, "pool: performing epoch reclamation"); - - // acquire a random shard lock - let mut shard = self.global_pool.shards()[shard].write(); - - let timer = Metrics::get() - .proxy - .http_pool_reclaimation_lag_seconds - .start_timer(); - let current_len = shard.len(); - let mut clients_removed = 0; - shard.retain(|endpoint, x| { - // if the current endpoint pool is unique (no other strong or weak references) - // then it is currently not in use by any connections. - if let Some(pool) = Arc::get_mut(x.get_mut()) { - let EndpointConnPool { - pools, total_conns, .. - } = pool.get_mut(); - - // ensure that closed clients are removed - for db_pool in pools.values_mut() { - clients_removed += db_pool.clear_closed_clients(total_conns); - } - - // we only remove this pool if it has no active connections - if *total_conns == 0 { - info!("pool: discarding pool for endpoint {endpoint}"); - return false; - } - } - - true - }); - - let new_len = shard.len(); - drop(shard); - timer.observe(); - - // Do logging outside of the lock. 
- if clients_removed > 0 { - let size = self - .global_connections_count - .fetch_sub(clients_removed, atomic::Ordering::Relaxed) - - clients_removed; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(clients_removed as i64); - info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); - } - let removed = current_len - new_len; - - if removed > 0 { - let global_pool_size = self - .global_pool_size - .fetch_sub(removed, atomic::Ordering::Relaxed) - - removed; - info!("pool: performed global pool gc. size now {global_pool_size}"); - } - } - - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = conn_info.endpoint_cache_key() else { - return Ok(None); - }; - - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(client) = client { - if client.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner.get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - client.session.send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } - - fn get_or_create_endpoint_pool( - self: &Arc, - endpoint: &EndpointCacheKey, - ) -> Arc>> { - // fast path - if let Some(pool) = self.global_pool.get(endpoint) { - return pool.clone(); - } - - // slow path - let new_pool = Arc::new(RwLock::new(EndpointConnPool { - pools: HashMap::new(), - total_conns: 0, - max_conns: self.config.pool_options.max_conns_per_endpoint, - _guard: Metrics::get().proxy.http_endpoint_pools.guard(), - global_connections_count: self.global_connections_count.clone(), - global_pool_size_max_conns: self.config.pool_options.max_total_conns, - })); - - // find or create a pool for this endpoint - let mut created = false; - let pool = self - .global_pool - .entry(endpoint.clone()) - .or_insert_with(|| { - created = true; - new_pool - }) - .clone(); - - // log new global pool size - if created { - let global_pool_size = self - .global_pool_size - .fetch_add(1, atomic::Ordering::Relaxed) - + 1; - info!( - "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" - ); - } - - pool - } -} - pub(crate) fn poll_client( global_pool: Arc>, ctx: &RequestMonitoring, @@ -574,7 +154,7 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInner { + let inner = ClientInnerRemote { inner: client, session: tx, cancel, @@ -584,7 +164,7 @@ pub(crate) fn poll_client( Client::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInnerRemote { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -592,131 +172,36 @@ struct ClientInner { conn_id: uuid::Uuid, } -impl Drop for ClientInner { - fn drop(&mut self) { - // on client drop, tell the conn to 
shut down - self.cancel.cancel(); +impl ClientInnerRemote { + pub(crate) fn inner_mut(&mut self) -> &mut C { + &mut self.inner } -} -pub(crate) trait ClientInnerExt: Sync + Send + 'static { - fn is_closed(&self) -> bool; - fn get_process_id(&self) -> i32; -} - -impl ClientInnerExt for tokio_postgres::Client { - fn is_closed(&self) -> bool { - self.is_closed() + pub(crate) fn inner(&self) -> &C { + &self.inner + } + + pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender { + &mut self.session + } + + pub(crate) fn aux(&self) -> &MetricsAuxInfo { + &self.aux + } + + pub(crate) fn get_conn_id(&self) -> uuid::Uuid { + self.conn_id } - fn get_process_id(&self) -> i32 { - self.get_process_id() - } -} -impl ClientInner { pub(crate) fn is_closed(&self) -> bool { self.inner.is_closed() } } -impl Client { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) - } -} - -pub(crate) struct Client { - span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>>, -} - -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>>, -} - -impl Client { - pub(self) fn new( - inner: ClientInner, - conn_info: ConnInfo, - pool: Weak>>, - ) -> Self { - Self { - inner: Some(inner), - span: Span::current(), - conn_info, - pool, - } - } - pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut inner.inner, Discard { conn_info, pool }) - } -} - -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is not idle"); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - -impl Deref for Client { - type Target = C; - - fn deref(&self) -> &Self::Target { - &self - .inner - .as_ref() - .expect("client inner should not be removed") - .inner - } -} - -impl Client { - fn do_drop(&mut self) -> Option { - let conn_info = self.conn_info.clone(); - let client = self - .inner - .take() - .expect("client inner should not be removed"); - if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); - // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for Client { +impl Drop for ClientInnerRemote { fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); - } + // on client drop, tell the conn to shut down + self.cancel.cancel(); } } @@ -745,12 +230,12 @@ mod tests { } } - fn create_inner() -> ClientInner { + fn create_inner() -> ClientInnerRemote { create_inner_with(MockClient::new(false)) } - fn create_inner_with(client: MockClient) -> ClientInner { - ClientInner { + fn create_inner_with(client: MockClient) -> ClientInnerRemote { + ClientInnerRemote { inner: client, session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), cancel: 
CancellationToken::new(), @@ -797,7 +282,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.inner().1.discard(); + client.inner_mut().1.discard(); // Discard should not add the connection from the pool. assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs new file mode 100644 index 0000000000..6e964ce878 --- /dev/null +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -0,0 +1,562 @@ +use dashmap::DashMap; +use parking_lot::RwLock; +use rand::Rng; +use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration}; +use std::{ + ops::Deref, + sync::atomic::{self, AtomicUsize}, +}; +use tokio_postgres::ReadyForQueryStatus; + +use crate::control_plane::messages::ColdStartInfo; +use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; +use crate::{ + auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, +}; + +use super::conn_pool::ClientInnerRemote; +use tracing::info; +use tracing::{debug, Span}; + +use super::backend::HttpConnError; + +#[derive(Debug, Clone)] +pub(crate) struct ConnInfo { + pub(crate) user_info: ComputeUserInfo, + pub(crate) dbname: DbName, +} + +impl ConnInfo { + // hm, change to hasher to avoid cloning? + pub(crate) fn db_and_user(&self) -> (DbName, RoleName) { + (self.dbname.clone(), self.user_info.user.clone()) + } + + pub(crate) fn endpoint_cache_key(&self) -> Option { + // We don't want to cache http connections for ephemeral endpoints. + if self.user_info.options.is_ephemeral() { + None + } else { + Some(self.user_info.endpoint_cache_key()) + } + } +} + +pub(crate) struct ConnPoolEntry { + pub(crate) conn: ClientInnerRemote, + pub(crate) _last_access: std::time::Instant, +} + +// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool +// Number of open connections is limited by the `max_conns_per_endpoint`. +pub(crate) struct EndpointConnPool { + pools: HashMap<(DbName, RoleName), DbUserConnPool>, + total_conns: usize, + max_conns: usize, + _guard: HttpEndpointPoolsGuard<'static>, + global_connections_count: Arc, + global_pool_size_max_conns: usize, +} + +impl EndpointConnPool { + fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { + let Self { + pools, + total_conns, + global_connections_count, + .. + } = self; + pools.get_mut(&db_user).and_then(|pool_entries| { + let (entry, removed) = pool_entries.get_conn_entry(total_conns); + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + entry + }) + } + + pub(crate) fn remove_client( + &mut self, + db_user: (DbName, RoleName), + conn_id: uuid::Uuid, + ) -> bool { + let Self { + pools, + total_conns, + global_connections_count, + .. 
+ } = self; + if let Some(pool) = pools.get_mut(&db_user) { + let old_len = pool.conns.len(); + pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id); + let new_len = pool.conns.len(); + let removed = old_len - new_len; + if removed > 0 { + global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + } + *total_conns -= removed; + removed > 0 + } else { + false + } + } + + pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerRemote) { + let conn_id = client.get_conn_id(); + + if client.is_closed() { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + return; + } + + let global_max_conn = pool.read().global_pool_size_max_conns; + if pool + .read() + .global_connections_count + .load(atomic::Ordering::Relaxed) + >= global_max_conn + { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + return; + } + + // return connection to the pool + let mut returned = false; + let mut per_db_size = 0; + let total_conns = { + let mut pool = pool.write(); + + if pool.total_conns < pool.max_conns { + let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); + pool_entries.conns.push(ConnPoolEntry { + conn: client, + _last_access: std::time::Instant::now(), + }); + + returned = true; + per_db_size = pool_entries.conns.len(); + + pool.total_conns += 1; + pool.global_connections_count + .fetch_add(1, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); + } + + pool.total_conns + }; + + // do logging outside of the mutex + if returned { + info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + } else { + info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + } + } +} + +impl Drop for EndpointConnPool { + fn drop(&mut self) { + if self.total_conns > 0 { + self.global_connections_count + .fetch_sub(self.total_conns, atomic::Ordering::Relaxed); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(self.total_conns as i64); + } + } +} + +pub(crate) struct DbUserConnPool { + pub(crate) conns: Vec>, +} + +impl Default for DbUserConnPool { + fn default() -> Self { + Self { conns: Vec::new() } + } +} + +impl DbUserConnPool { + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { + let old_len = self.conns.len(); + + self.conns.retain(|conn| !conn.conn.is_closed()); + + let new_len = self.conns.len(); + let removed = old_len - new_len; + *conns -= removed; + removed + } + + pub(crate) fn get_conn_entry( + &mut self, + conns: &mut usize, + ) -> (Option>, usize) { + let mut removed = self.clear_closed_clients(conns); + let conn = self.conns.pop(); + if conn.is_some() { + *conns -= 1; + removed += 1; + } + + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(removed as i64); + + (conn, removed) + } +} + +pub(crate) struct GlobalConnPool { + // endpoint -> per-endpoint connection pool + // + // That should be a fairly conteded map, so return reference to the per-endpoint + // pool as early as possible and release the lock. + global_pool: DashMap>>>, + + /// Number of endpoint-connection pools + /// + /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each. 
+ /// That seems like far too much effort, so we're using a relaxed increment counter instead. + /// It's only used for diagnostics. + global_pool_size: AtomicUsize, + + /// Total number of connections in the pool + global_connections_count: Arc, + + config: &'static crate::config::HttpConfig, +} + +#[derive(Debug, Clone, Copy)] +pub struct GlobalConnPoolOptions { + // Maximum number of connections per one endpoint. + // Can mix different (dbname, username) connections. + // When running out of free slots for a particular endpoint, + // falls back to opening a new connection for each request. + pub max_conns_per_endpoint: usize, + + pub gc_epoch: Duration, + + pub pool_shards: usize, + + pub idle_timeout: Duration, + + pub opt_in: bool, + + // Total number of connections in the pool. + pub max_total_conns: usize, +} + +impl GlobalConnPool { + pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { + let shards = config.pool_options.pool_shards; + Arc::new(Self { + global_pool: DashMap::with_shard_amount(shards), + global_pool_size: AtomicUsize::new(0), + config, + global_connections_count: Arc::new(AtomicUsize::new(0)), + }) + } + + #[cfg(test)] + pub(crate) fn get_global_connections_count(&self) -> usize { + self.global_connections_count + .load(atomic::Ordering::Relaxed) + } + + pub(crate) fn get_idle_timeout(&self) -> Duration { + self.config.pool_options.idle_timeout + } + + pub(crate) fn shutdown(&self) { + // drops all strong references to endpoint-pools + self.global_pool.clear(); + } + + pub(crate) async fn gc_worker(&self, mut rng: impl Rng) { + let epoch = self.config.pool_options.gc_epoch; + let mut interval = tokio::time::interval(epoch / (self.global_pool.shards().len()) as u32); + loop { + interval.tick().await; + + let shard = rng.gen_range(0..self.global_pool.shards().len()); + self.gc(shard); + } + } + + pub(crate) fn gc(&self, shard: usize) { + debug!(shard, "pool: performing epoch reclamation"); + + // acquire a random shard lock + let mut shard = self.global_pool.shards()[shard].write(); + + let timer = Metrics::get() + .proxy + .http_pool_reclaimation_lag_seconds + .start_timer(); + let current_len = shard.len(); + let mut clients_removed = 0; + shard.retain(|endpoint, x| { + // if the current endpoint pool is unique (no other strong or weak references) + // then it is currently not in use by any connections. + if let Some(pool) = Arc::get_mut(x.get_mut()) { + let EndpointConnPool { + pools, total_conns, .. + } = pool.get_mut(); + + // ensure that closed clients are removed + for db_pool in pools.values_mut() { + clients_removed += db_pool.clear_closed_clients(total_conns); + } + + // we only remove this pool if it has no active connections + if *total_conns == 0 { + info!("pool: discarding pool for endpoint {endpoint}"); + return false; + } + } + + true + }); + + let new_len = shard.len(); + drop(shard); + timer.observe(); + + // Do logging outside of the lock. + if clients_removed > 0 { + let size = self + .global_connections_count + .fetch_sub(clients_removed, atomic::Ordering::Relaxed) + - clients_removed; + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .dec_by(clients_removed as i64); + info!("pool: performed global pool gc. removed {clients_removed} clients, total number of clients in pool is {size}"); + } + let removed = current_len - new_len; + + if removed > 0 { + let global_pool_size = self + .global_pool_size + .fetch_sub(removed, atomic::Ordering::Relaxed) + - removed; + info!("pool: performed global pool gc. 
size now {global_pool_size}"); + } + } + + pub(crate) fn get_or_create_endpoint_pool( + self: &Arc, + endpoint: &EndpointCacheKey, + ) -> Arc>> { + // fast path + if let Some(pool) = self.global_pool.get(endpoint) { + return pool.clone(); + } + + // slow path + let new_pool = Arc::new(RwLock::new(EndpointConnPool { + pools: HashMap::new(), + total_conns: 0, + max_conns: self.config.pool_options.max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count: self.global_connections_count.clone(), + global_pool_size_max_conns: self.config.pool_options.max_total_conns, + })); + + // find or create a pool for this endpoint + let mut created = false; + let pool = self + .global_pool + .entry(endpoint.clone()) + .or_insert_with(|| { + created = true; + new_pool + }) + .clone(); + + // log new global pool size + if created { + let global_pool_size = self + .global_pool_size + .fetch_add(1, atomic::Ordering::Relaxed) + + 1; + info!( + "pool: created new pool for '{endpoint}', global pool size now {global_pool_size}" + ); + } + + pool + } + + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner().get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + client.session().send(ctx.session_id())?; + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } +} + +impl Client { + pub(crate) fn new( + inner: ClientInnerRemote, + conn_info: ConnInfo, + pool: Weak>>, + ) -> Self { + Self { + inner: Some(inner), + span: Span::current(), + conn_info, + pool, + } + } + + pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner = inner.as_mut().expect("client inner should not be removed"); + let inner_ref = inner.inner_mut(); + (inner_ref, Discard { conn_info, pool }) + } + + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux(); + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + + pub(crate) fn do_drop(&mut self) -> Option { + let conn_info = self.conn_info.clone(); + let client = self + .inner + .take() + .expect("client inner should not be removed"); + if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { + let current_span = self.span.clone(); + // return connection to the pool + return Some(move || { + let _span = current_span.enter(); + EndpointConnPool::put(&conn_pool, &conn_info, client); + }); + } + None + } +} + +pub(crate) struct Client { + span: 
Span, + inner: Option>, + conn_info: ConnInfo, + pool: Weak>>, +} + +impl Drop for Client { + fn drop(&mut self) { + if let Some(drop) = self.do_drop() { + tokio::task::spawn_blocking(drop); + } + } +} + +impl Deref for Client { + type Target = C; + + fn deref(&self) -> &Self::Target { + self.inner + .as_ref() + .expect("client inner should not be removed") + .inner() + } +} + +pub(crate) trait ClientInnerExt: Sync + Send + 'static { + fn is_closed(&self) -> bool; + fn get_process_id(&self) -> i32; +} + +impl ClientInnerExt for tokio_postgres::Client { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + self.get_process_id() + } +} + +pub(crate) struct Discard<'a, C: ClientInnerExt> { + conn_info: &'a ConnInfo, + pool: &'a mut Weak>>, +} + +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is not idle"); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 9b6bc98557..79bb19328f 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -10,11 +10,12 @@ use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; -use super::conn_pool::ConnInfo; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; @@ -22,15 +23,15 @@ pub(crate) type Connect = http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] -struct ConnPoolEntry { - conn: Send, +pub(crate) struct ConnPoolEntry { + conn: C, conn_id: uuid::Uuid, aux: MetricsAuxInfo, } // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. -pub(crate) struct EndpointConnPool { +pub(crate) struct EndpointConnPool { // TODO(conrad): // either we should open more connections depending on stream count // (not exposed by hyper, need our own counter) @@ -40,13 +41,13 @@ pub(crate) struct EndpointConnPool { // seems somewhat redundant though. // // Probably we should run a semaphore and just the single conn. TBD. - conns: VecDeque, + conns: VecDeque>, _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, } -impl EndpointConnPool { - fn get_conn_entry(&mut self) -> Option { +impl EndpointConnPool { + fn get_conn_entry(&mut self) -> Option> { let Self { conns, .. 
} = self; loop { @@ -81,7 +82,7 @@ impl EndpointConnPool { } } -impl Drop for EndpointConnPool { +impl Drop for EndpointConnPool { fn drop(&mut self) { if !self.conns.is_empty() { self.global_connections_count @@ -95,12 +96,12 @@ impl Drop for EndpointConnPool { } } -pub(crate) struct GlobalConnPool { +pub(crate) struct GlobalConnPool { // endpoint -> per-endpoint connection pool // // That should be a fairly conteded map, so return reference to the per-endpoint // pool as early as possible and release the lock. - global_pool: DashMap>>, + global_pool: DashMap>>>, /// Number of endpoint-connection pools /// @@ -115,7 +116,7 @@ pub(crate) struct GlobalConnPool { config: &'static crate::config::HttpConfig, } -impl GlobalConnPool { +impl GlobalConnPool { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { let shards = config.pool_options.pool_shards; Arc::new(Self { @@ -210,7 +211,7 @@ impl GlobalConnPool { self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Option { + ) -> Option> { let endpoint = conn_info.endpoint_cache_key()?; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); let client = endpoint_pool.write().get_conn_entry()?; @@ -228,7 +229,7 @@ impl GlobalConnPool { fn get_or_create_endpoint_pool( self: &Arc, endpoint: &EndpointCacheKey, - ) -> Arc> { + ) -> Arc>> { // fast path if let Some(pool) = self.global_pool.get(endpoint) { return pool.clone(); @@ -268,14 +269,14 @@ impl GlobalConnPool { } pub(crate) fn poll_http2_client( - global_pool: Arc, + global_pool: Arc>, ctx: &RequestMonitoring, conn_info: &ConnInfo, client: Send, connection: Connect, conn_id: uuid::Uuid, aux: MetricsAuxInfo, -) -> Client { +) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let session_id = ctx.session_id(); @@ -322,13 +323,13 @@ pub(crate) fn poll_http2_client( Client::new(client, aux) } -pub(crate) struct Client { - pub(crate) inner: Send, +pub(crate) struct Client { + pub(crate) inner: C, aux: MetricsAuxInfo, } -impl Client { - pub(self) fn new(inner: Send, aux: MetricsAuxInfo) -> Self { +impl Client { + pub(self) fn new(inner: C, aux: MetricsAuxInfo) -> Self { Self { inner, aux } } @@ -339,3 +340,14 @@ impl Client { }) } } + +impl ClientInnerExt for Send { + fn is_closed(&self) -> bool { + self.is_closed() + } + + fn get_process_id(&self) -> i32 { + // ideally throw something meaningful + -1 + } +} diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 5df37a8762..c4fdd00f78 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -20,11 +20,12 @@ use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument, Span}; use super::backend::HttpConnError; -use super::conn_pool::{ClientInnerExt, ConnInfo}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; + use crate::{DbName, RoleName}; struct ConnPoolEntry { @@ -362,7 +363,7 @@ pub(crate) fn poll_client( LocalClient::new(inner, conn_info, pool_clone) } -struct ClientInner { +pub(crate) struct ClientInner { inner: C, session: tokio::sync::watch::Sender, cancel: CancellationToken, @@ -387,13 +388,24 @@ impl ClientInner { } } -impl LocalClient { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - 
USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) +impl ClientInner { + pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { + self.jti += 1; + let token = resign_jwt(&self.key, payload, self.jti)?; + + // initiates the auth session + self.inner.simple_query("discard all").await?; + self.inner + .query( + "select auth.jwt_session_init($1)", + &[&token as &(dyn ToSql + Sync)], + ) + .await?; + + let pid = self.inner.get_process_id(); + info!(pid, jti = self.jti, "user session state init"); + + Ok(()) } } @@ -422,6 +434,18 @@ impl LocalClient { pool, } } + + pub(crate) fn client_inner(&mut self) -> (&mut ClientInner, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner_m = inner.as_mut().expect("client inner should not be removed"); + (inner_m, Discard { conn_info, pool }) + } + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, @@ -434,33 +458,6 @@ impl LocalClient { } } -impl LocalClient { - pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { - let inner = self - .inner - .as_mut() - .expect("client inner should not be removed"); - - inner.jti += 1; - let token = resign_jwt(&inner.key, payload, inner.jti)?; - - // initiates the auth session - inner.inner.simple_query("discard all").await?; - inner - .inner - .query( - "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], - ) - .await?; - - let pid = inner.inner.get_process_id(); - info!(pid, jti = inner.jti, "user session state init"); - - Ok(()) - } -} - /// implements relatively efficient in-place json object key upserting /// /// only supports top-level keys @@ -524,24 +521,15 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt } -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!( - "local_pool: throwing away connection '{conn_info}' because connection is not idle" - ); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - impl LocalClient { + pub(crate) fn metrics(&self) -> Arc { + let aux = &self.inner.as_ref().unwrap().aux; + USAGE_METRICS.register(Ids { + endpoint_id: aux.endpoint_id, + branch_id: aux.branch_id, + }) + } + fn do_drop(&mut self) -> Option { let conn_info = self.conn_info.clone(); let client = self @@ -568,6 +556,23 @@ impl Drop for LocalClient { } } +impl Discard<'_, C> { + pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { + let conn_info = &self.conn_info; + if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { + info!( + "local_pool: throwing away connection '{conn_info}' because connection is not idle" + ); + } + } + pub(crate) fn discard(&mut self) { + let conn_info = &self.conn_info; + if std::mem::take(self.pool).strong_count() > 0 { + info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); + } + } +} + #[cfg(test)] mod tests { use p256::ecdsa::SigningKey; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 3ed3b6c845..29ff7b9d91 100644 --- a/proxy/src/serverless/mod.rs +++ 
b/proxy/src/serverless/mod.rs @@ -5,6 +5,7 @@ mod backend; pub mod cancel_set; mod conn_pool; +mod conn_pool_lib; mod http_conn_pool; mod http_util; mod json; @@ -20,7 +21,7 @@ use anyhow::Context; use async_trait::async_trait; use atomic_take::AtomicTake; use bytes::Bytes; -pub use conn_pool::GlobalConnPoolOptions; +pub use conn_pool_lib::GlobalConnPoolOptions; use futures::future::{select, Either}; use futures::TryFutureExt; use http::{Method, Response, StatusCode}; @@ -65,7 +66,7 @@ pub async fn task_main( } let local_pool = local_conn_pool::LocalConnPool::new(&config.http_config); - let conn_pool = conn_pool::GlobalConnPool::new(&config.http_config); + let conn_pool = conn_pool_lib::GlobalConnPool::new(&config.http_config); { let conn_pool = Arc::clone(&conn_pool); tokio::spawn(async move { diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 3d8a2adef1..bb5eb390a6 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -25,10 +25,11 @@ use urlencoding; use utils::http::error::ApiError; use super::backend::{LocalProxyConnError, PoolingBackend}; -use super::conn_pool::{AuthData, ConnInfo, ConnInfoWithAuth}; +use super::conn_pool::{AuthData, ConnInfoWithAuth}; +use super::conn_pool_lib::{self, ConnInfo}; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; -use super::{conn_pool, local_conn_pool}; +use super::local_conn_pool; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; @@ -37,6 +38,7 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; + use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::{DbName, RoleName}; @@ -607,7 +609,8 @@ async fn handle_db_inner( let client = match keys.keys { ComputeCredentialKeys::JwtPayload(payload) if is_local_proxy => { let mut client = backend.connect_to_local_postgres(ctx, conn_info).await?; - client.set_jwt_session(&payload).await?; + let (cli_inner, _dsc) = client.client_inner(); + cli_inner.set_jwt_session(&payload).await?; Client::Local(client) } _ => { @@ -1021,12 +1024,12 @@ async fn query_to_json( } enum Client { - Remote(conn_pool::Client), + Remote(conn_pool_lib::Client), Local(local_conn_pool::LocalClient), } enum Discard<'a> { - Remote(conn_pool::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), Local(local_conn_pool::Discard<'a, tokio_postgres::Client>), } @@ -1041,7 +1044,7 @@ impl Client { fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { match self { Client::Remote(client) => { - let (c, d) = client.inner(); + let (c, d) = client.inner_mut(); (c, Discard::Remote(d)) } Client::Local(local_client) => { From 35e7d91bc9eb07c8ef70acef5e224c9b9e78a0ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 17 Oct 2024 14:07:58 +0200 Subject: [PATCH 028/239] Add config variable for timeline offloading (#9421) Adds a configuration variable for timeline offloading support. The added pageserver-global config option controls whether the pageserver automatically offloads timelines during compaction. 
Therefore, already offloaded timelines are not affected by this, nor is the manual testing endpoint. This allows the rollout of timeline offloading to be driven by the storage team. Part of #8088 --- libs/pageserver_api/src/config.rs | 2 ++ pageserver/src/config.rs | 5 +++++ pageserver/src/tenant.rs | 3 ++- pageserver/src/tenant/timeline.rs | 1 + test_runner/regress/test_timeline_archive.py | 4 ++++ 5 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 24474d4840..896a5d8069 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -102,6 +102,7 @@ pub struct ConfigToml { pub ingest_batch_size: u64, pub max_vectored_read_bytes: MaxVectoredReadBytes, pub image_compression: ImageCompressionAlgorithm, + pub timeline_offloading: bool, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, @@ -385,6 +386,7 @@ impl Default for ConfigToml { NonZeroUsize::new(DEFAULT_MAX_VECTORED_READ_BYTES).unwrap(), )), image_compression: (DEFAULT_IMAGE_COMPRESSION), + timeline_offloading: false, ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, virtual_file_io_mode: None, diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 8db78285e4..06d4326459 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -164,6 +164,9 @@ pub struct PageServerConf { pub image_compression: ImageCompressionAlgorithm, + /// Whether to offload archived timelines automatically + pub timeline_offloading: bool, + /// How many bytes of ephemeral layer content will we allow per kilobyte of RAM. When this /// is exceeded, we start proactively closing ephemeral layers to limit the total amount /// of ephemeral data. @@ -321,6 +324,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, l0_flush, virtual_file_io_mode, @@ -364,6 +368,7 @@ impl PageServerConf { ingest_batch_size, max_vectored_read_bytes, image_compression, + timeline_offloading, ephemeral_bytes_per_memory_kb, // ------------------------------------------------------------ diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 689982ddd4..baa2365658 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2187,7 +2187,8 @@ impl Tenant { .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; - let can_offload = can_offload && has_no_unoffloaded_children; + let can_offload = + can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading; if (is_active, can_offload) == (false, false) { None } else { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1992dee930..2b4f949c76 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1565,6 +1565,7 @@ impl Timeline { } /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. + /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
pub(crate) fn can_offload(&self) -> bool { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index ffaed5e130..85e1077fd5 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -119,6 +119,10 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): @pytest.mark.parametrize("manual_offload", [False, True]) def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: bool): + if not manual_offload: + # (automatic) timeline offloading defaults to false for now + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() From 8b479381403cd2be8f7bc7eba69d5074735d8924 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 17 Oct 2024 13:37:21 +0100 Subject: [PATCH 029/239] Add support of extensions for v17 (part 3) (#9430) - pgvector 7.4 update support of extensions for v14-v16: - pgvector 7.2 -> 7.4 --- compute/Dockerfile.compute-node | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index b0ce7c1718..45c1fd9f38 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -353,13 +353,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# v17 is not supported yet because of upstream issue -# https://github.com/pgvector/pgvector/issues/669 -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.2.tar.gz -O pgvector.tar.gz && \ - echo "617fba855c9bcb41a2a9bc78a78567fd2e147c72afd5bf9d37b31b9591632b30 pgvector.tar.gz" | sha256sum --check && \ +# vector 0.7.4 supports v17 +# last release v0.7.4 - Aug 5, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ + echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From a7c05686ccbebc856b0ce389a9fa60d2bddbeea6 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 17 Oct 2024 17:20:42 +0300 Subject: [PATCH 030/239] test_runner: Update the README.md to build neon with 'testing' (#9437) Without having the '--features testing' in the cargo build the proxy won't start causing tests to fail. --- test_runner/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/README.md b/test_runner/README.md index e087241c1f..55d8d2faa9 100644 --- a/test_runner/README.md +++ b/test_runner/README.md @@ -6,7 +6,7 @@ Prerequisites: - Correctly configured Python, see [`/docs/sourcetree.md`](/docs/sourcetree.md#using-python) - Neon and Postgres binaries - See the root [README.md](/README.md) for build directions - If you want to test tests with test-only APIs, you would need to add `--features testing` to Rust code build commands. + To run tests you need to add `--features testing` to Rust code build commands. 
For convenience, repository cargo config contains `build_testing` alias, that serves as a subcommand, adding the required feature flags. Usage example: `cargo build_testing --release` is equivalent to `cargo build --features testing --release` - Tests can be run from the git tree; or see the environment variables From f3a3eefd26284776ab3179116374009ec537ab11 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Oct 2024 10:29:53 -0400 Subject: [PATCH 031/239] feat(pageserver): do space check before gc-compaction (#9250) part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes gc-compaction may take a lot of disk space, and if it does, the caller should do a partial gc-compaction. This patch adds space check for the compaction job. --------- Signed-off-by: Alex Chi Z --- pageserver/src/disk_usage_eviction_task.rs | 11 +---- pageserver/src/statvfs.rs | 16 ++++++++ pageserver/src/tenant/storage_layer/layer.rs | 4 ++ pageserver/src/tenant/timeline/compaction.rs | 42 ++++++++++++++++++++ 4 files changed, 63 insertions(+), 10 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a58fa2c0b1..7ab2ba8742 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -1218,16 +1218,7 @@ mod filesystem_level_usage { let stat = Statvfs::get(tenants_dir, mock_config) .context("statvfs failed, presumably directory got unlinked")?; - // https://unix.stackexchange.com/a/703650 - let blocksize = if stat.fragment_size() > 0 { - stat.fragment_size() - } else { - stat.block_size() - }; - - // use blocks_available (b_avail) since, pageserver runs as unprivileged user - let avail_bytes = stat.blocks_available() * blocksize; - let total_bytes = stat.blocks() * blocksize; + let (avail_bytes, total_bytes) = stat.get_avail_total_bytes(); Ok(Usage { config, diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 5a6f6e5176..205605bc86 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -53,6 +53,22 @@ impl Statvfs { Statvfs::Mock(stat) => stat.block_size, } } + + /// Get the available and total bytes on the filesystem. + pub fn get_avail_total_bytes(&self) -> (u64, u64) { + // https://unix.stackexchange.com/a/703650 + let blocksize = if self.fragment_size() > 0 { + self.fragment_size() + } else { + self.block_size() + }; + + // use blocks_available (b_avail) since, pageserver runs as unprivileged user + let avail_bytes = self.blocks_available() * blocksize; + let total_bytes = self.blocks() * blocksize; + + (avail_bytes, total_bytes) + } } pub mod mock { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index bbb21b180e..f29a33bae6 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -341,6 +341,10 @@ impl Layer { Ok(()) } + pub(crate) async fn needs_download(&self) -> Result, std::io::Error> { + self.0.needs_download().await + } + /// Assuming the layer is already downloaded, returns a guard which will prohibit eviction /// while the guard exists. 
/// diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8b9ace1e5b..5588363330 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -29,6 +29,7 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; +use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::filter_iterator::FilterIterator; @@ -1691,6 +1692,45 @@ impl Timeline { unreachable!("key retention is empty") } + /// Check how much space is left on the disk + async fn check_available_space(self: &Arc) -> anyhow::Result { + let tenants_dir = self.conf.tenants_path(); + + let stat = Statvfs::get(&tenants_dir, None) + .context("statvfs failed, presumably directory got unlinked")?; + + let (avail_bytes, _) = stat.get_avail_total_bytes(); + + Ok(avail_bytes) + } + + /// Check if the compaction can proceed safely without running out of space. We assume the size + /// upper bound of the produced files of a compaction job is the same as all layers involved in + /// the compaction. Therefore, we need `2 * layers_to_be_compacted_size` at least to do a + /// compaction. + async fn check_compaction_space( + self: &Arc, + layer_selection: &[Layer], + ) -> anyhow::Result<()> { + let available_space = self.check_available_space().await?; + let mut remote_layer_size = 0; + let mut all_layer_size = 0; + for layer in layer_selection { + let needs_download = layer.needs_download().await?; + if needs_download.is_some() { + remote_layer_size += layer.layer_desc().file_size; + } + all_layer_size += layer.layer_desc().file_size; + } + let allocated_space = (available_space as f64 * 0.8) as u64; /* reserve 20% space for other tasks */ + if all_layer_size /* space needed for newly-generated file */ + remote_layer_size /* space for downloading layers */ > allocated_space + { + return Err(anyhow!("not enough space for compaction: available_space={}, allocated_space={}, all_layer_size={}, remote_layer_size={}, required_space={}", + available_space, allocated_space, all_layer_size, remote_layer_size, all_layer_size + remote_layer_size)); + } + Ok(()) + } + /// An experimental compaction building block that combines compaction with garbage collection. /// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1806,6 +1846,8 @@ impl Timeline { lowest_retain_lsn ); + self.check_compaction_space(&layer_selection).await?; + // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) From 4c9835f4a3065648c2d6ecd721664b88557aca0f Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 17 Oct 2024 16:34:51 +0200 Subject: [PATCH 032/239] storage_controller: delete stale shards when deleting tenant (#9333) ## Problem Tenant deletion only removes the current shards from remote storage. Any stale parent shards (before splits) will be left behind. These shards are kept since child shards may reference data from the parent until new image layers are generated. 
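To make the fix concrete: all of a tenant's shards, including stale parents, share the bare tenant ID as a key prefix, so a NoDelimiter prefix delete on that ID sweeps them all up. A minimal sketch of that matching rule (the shard-suffix spelling and path layout here are illustrative, not taken from this patch):

```rust
/// Sketch only: `delete_prefix()` lists with NoDelimiter, so any key that
/// merely starts with the prefix is matched, stale parent shards included.
fn swept_by_tenant_prefix(key: &str, tenant_id: &str) -> bool {
    key.starts_with(&format!("tenants/{tenant_id}"))
}

fn main() {
    let tenant_id = "1f359dd625e519a1a4e8d7509690f6fc"; // made-up tenant ID
    let keys = [
        // stale unsharded parent left behind by an earlier split
        format!("tenants/{tenant_id}/timelines/some-timeline/some-layer"),
        // a current shard of the same tenant (suffix format assumed)
        format!("tenants/{tenant_id}-0004/timelines/some-timeline/some-layer"),
    ];
    assert!(keys.iter().all(|k| swept_by_tenant_prefix(k, tenant_id)));
}
```

This is also why the test added below checks emptiness with `assert_prefix_empty(..., delimiter="")`, so that the assertion spans every shard prefix of the tenant at once.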
## Summary of changes * Document a special case for pageserver tenant deletion that deletes all shards in remote storage when given an unsharded tenant ID, as well as any unsharded tenant data. * Pass an unsharded tenant ID to delete all remote storage under the tenant ID prefix. * Split out `RemoteStorage::delete_prefix()` to delete a bucket prefix, with additional test coverage. * Add a `delimiter` argument to `asset_prefix_empty()` to support partial prefix matches (i.e. all shards starting with a given tenant ID). --- libs/remote_storage/src/lib.rs | 53 +++++- libs/remote_storage/tests/common/tests.rs | 206 ++++++++++++++++++++++ pageserver/src/tenant/mgr.rs | 71 +++----- storage_controller/src/service.rs | 73 ++++---- test_runner/fixtures/pageserver/utils.py | 15 +- test_runner/regress/test_tenant_delete.py | 55 ++++++ 6 files changed, 376 insertions(+), 97 deletions(-) diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index c6466237bf..719608dd5f 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -19,7 +19,12 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc, + collections::HashMap, + fmt::Debug, + num::NonZeroU32, + ops::Bound, + pin::{pin, Pin}, + sync::Arc, time::SystemTime, }; @@ -28,6 +33,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use bytes::Bytes; use futures::{stream::Stream, StreamExt}; +use itertools::Itertools as _; use serde::{Deserialize, Serialize}; use tokio::sync::Semaphore; use tokio_util::sync::CancellationToken; @@ -261,7 +267,7 @@ pub trait RemoteStorage: Send + Sync + 'static { max_keys: Option, cancel: &CancellationToken, ) -> Result { - let mut stream = std::pin::pin!(self.list_streaming(prefix, mode, max_keys, cancel)); + let mut stream = pin!(self.list_streaming(prefix, mode, max_keys, cancel)); let mut combined = stream.next().await.expect("At least one item required")?; while let Some(list) = stream.next().await { let list = list?; @@ -324,6 +330,35 @@ pub trait RemoteStorage: Send + Sync + 'static { cancel: &CancellationToken, ) -> anyhow::Result<()>; + /// Deletes all objects matching the given prefix. + /// + /// NB: this uses NoDelimiter and will match partial prefixes. For example, the prefix /a/b will + /// delete /a/b, /a/b/*, /a/bc, /a/bc/*, etc. + /// + /// If the operation fails because of timeout or cancellation, the root cause of the error will + /// be set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went + /// through. + async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + let mut stream = + pin!(self.list_streaming(Some(prefix), ListingMode::NoDelimiter, None, cancel)); + while let Some(result) = stream.next().await { + let keys = match result { + Ok(listing) if listing.keys.is_empty() => continue, + Ok(listing) => listing.keys.into_iter().map(|o| o.key).collect_vec(), + Err(DownloadError::Cancelled) => return Err(TimeoutOrCancel::Cancel.into()), + Err(DownloadError::Timeout) => return Err(TimeoutOrCancel::Timeout.into()), + Err(err) => return Err(err.into()), + }; + tracing::info!("Deleting {} keys from remote storage", keys.len()); + self.delete_objects(&keys, cancel).await?; + } + Ok(()) + } + /// Copy a remote object inside a bucket from one path to another. 
async fn copy( &self, @@ -488,6 +523,20 @@ impl GenericRemoteStorage> { } } + /// See [`RemoteStorage::delete_prefix`] + pub async fn delete_prefix( + &self, + prefix: &RemotePath, + cancel: &CancellationToken, + ) -> anyhow::Result<()> { + match self { + Self::LocalFs(s) => s.delete_prefix(prefix, cancel).await, + Self::AwsS3(s) => s.delete_prefix(prefix, cancel).await, + Self::AzureBlob(s) => s.delete_prefix(prefix, cancel).await, + Self::Unreliable(s) => s.delete_prefix(prefix, cancel).await, + } + } + /// See [`RemoteStorage::copy`] pub async fn copy_object( &self, diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index e6f33fc3f8..d5da1d48e9 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -199,6 +199,138 @@ async fn list_no_delimiter_works( Ok(()) } +/// Tests that giving a partial prefix returns all matches (e.g. "/foo" yields "/foobar/baz"), +/// but only with NoDelimiter. +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn list_partial_prefix( + ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs, +) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + // Prefix "fold" should match all "folder{i}" directories with NoDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "fold" matches nothing with WithDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("fold")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "" matches everything. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!(&objects, &ctx.remote_blobs); + + // Prefix "" matches nothing with WithDelimiter. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("")?), + ListingMode::WithDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "foo" matches nothing. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + // Prefix "folder2/blob" matches. + let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/blob")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + let expect: HashSet<_> = ctx + .remote_blobs + .iter() + .filter(|o| o.get_path().starts_with("folder2")) + .cloned() + .collect(); + assert_eq!(&objects, &expect); + + // Prefix "folder2/foo" matches nothing. 
+ let objects: HashSet<_> = test_client + .list( + Some(&RemotePath::from_string("folder2/foo")?), + ListingMode::NoDelimiter, + None, + &cancel, + ) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert!(objects.is_empty()); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { @@ -265,6 +397,80 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<( Ok(()) } +/// Tests that delete_prefix() will delete all objects matching a prefix, including +/// partial prefixes (i.e. "/foo" matches "/foobar"). +#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)] +#[tokio::test] +async fn delete_prefix(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> { + let ctx = match ctx { + MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx, + MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()), + MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => { + anyhow::bail!("S3 init failed: {e:?}") + } + }; + + let cancel = CancellationToken::new(); + let test_client = Arc::clone(&ctx.enabled.client); + + /// Asserts that the S3 listing matches the given paths. + macro_rules! assert_list { + ($expect:expr) => {{ + let listing = test_client + .list(None, ListingMode::NoDelimiter, None, &cancel) + .await? + .keys + .into_iter() + .map(|o| o.key) + .collect(); + assert_eq!($expect, listing); + }}; + } + + // We start with the full set of uploaded files. + let mut expect = ctx.remote_blobs.clone(); + + // Deleting a non-existing prefix should do nothing. + test_client + .delete_prefix(&RemotePath::from_string("xyz")?, &cancel) + .await?; + assert_list!(expect); + + // Prefixes are case-sensitive. + test_client + .delete_prefix(&RemotePath::from_string("Folder")?, &cancel) + .await?; + assert_list!(expect); + + // Deleting a path which overlaps with an existing object should do nothing. We pick the first + // path in the set as our common prefix. + let path = expect.iter().next().expect("empty set").clone().join("xyz"); + test_client.delete_prefix(&path, &cancel).await?; + assert_list!(expect); + + // Deleting an exact path should work. We pick the first path in the set. + let path = expect.iter().next().expect("empty set").clone(); + test_client.delete_prefix(&path, &cancel).await?; + expect.remove(&path); + assert_list!(expect); + + // Deleting a prefix should delete all matching objects. + test_client + .delete_prefix(&RemotePath::from_string("folder0/blob_")?, &cancel) + .await?; + expect.retain(|p| !p.get_path().as_str().starts_with("folder0/")); + assert_list!(expect); + + // Deleting a common prefix should delete all objects. 
+ test_client + .delete_prefix(&RemotePath::from_string("fold")?, &cancel) + .await?; + expect.clear(); + assert_list!(expect); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 9d9852c525..0567f8f3a7 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -11,6 +11,7 @@ use pageserver_api::shard::{ }; use pageserver_api::upcall_api::ReAttachResponseTenant; use rand::{distributions::Alphanumeric, Rng}; +use remote_storage::TimeoutOrCancel; use std::borrow::Cow; use std::cmp::Ordering; use std::collections::{BTreeMap, HashMap, HashSet}; @@ -1350,47 +1351,17 @@ impl TenantManager { } } - async fn delete_tenant_remote( - &self, - tenant_shard_id: TenantShardId, - ) -> Result<(), DeleteTenantError> { - let remote_path = remote_tenant_path(&tenant_shard_id); - let mut keys_stream = self.resources.remote_storage.list_streaming( - Some(&remote_path), - remote_storage::ListingMode::NoDelimiter, - None, - &self.cancel, - ); - while let Some(chunk) = keys_stream.next().await { - let keys = match chunk { - Ok(listing) => listing.keys, - Err(remote_storage::DownloadError::Cancelled) => { - return Err(DeleteTenantError::Cancelled) - } - Err(remote_storage::DownloadError::NotFound) => return Ok(()), - Err(other) => return Err(DeleteTenantError::Other(anyhow::anyhow!(other))), - }; - - if keys.is_empty() { - tracing::info!("Remote storage already deleted"); - } else { - tracing::info!("Deleting {} keys from remote storage", keys.len()); - let keys = keys.into_iter().map(|o| o.key).collect::>(); - self.resources - .remote_storage - .delete_objects(&keys, &self.cancel) - .await?; - } - } - - Ok(()) - } - /// If a tenant is attached, detach it. Then remove its data from remote storage. /// /// A tenant is considered deleted once it is gone from remote storage. It is the caller's /// responsibility to avoid trying to attach the tenant again or use it any way once deletion /// has started: this operation is not atomic, and must be retried until it succeeds. + /// + /// As a special case, if an unsharded tenant ID is given for a sharded tenant, it will remove + /// all tenant shards in remote storage (removing all paths with the tenant prefix). The storage + /// controller uses this to purge all remote tenant data, including any stale parent shards that + /// may remain after splits. Ideally, this special case would be handled elsewhere. See: + /// . pub(crate) async fn delete_tenant( &self, tenant_shard_id: TenantShardId, @@ -1442,25 +1413,29 @@ impl TenantManager { // in 500 responses to delete requests. // - We keep the `SlotGuard` during this I/O, so that if a concurrent delete request comes in, it will // 503/retry, rather than kicking off a wasteful concurrent deletion. - match backoff::retry( - || async move { self.delete_tenant_remote(tenant_shard_id).await }, - |e| match e { - DeleteTenantError::Cancelled => true, - DeleteTenantError::SlotError(_) => { - unreachable!("Remote deletion doesn't touch slots") - } - _ => false, + // NB: this also deletes partial prefixes, i.e. a path will delete all + // _/* objects. See method comment for why. 
+ backoff::retry( + || async move { + self.resources + .remote_storage + .delete_prefix(&remote_tenant_path(&tenant_shard_id), &self.cancel) + .await }, + |_| false, // backoff::retry handles cancellation 1, 3, &format!("delete_tenant[tenant_shard_id={tenant_shard_id}]"), &self.cancel, ) .await - { - Some(r) => r, - None => Err(DeleteTenantError::Cancelled), - } + .unwrap_or(Err(TimeoutOrCancel::Cancel.into())) + .map_err(|err| { + if TimeoutOrCancel::caused_by_cancel(&err) { + return DeleteTenantError::Cancelled; + } + DeleteTenantError::Other(err) + }) } #[instrument(skip_all, fields(tenant_id=%tenant.get_tenant_shard_id().tenant_id, shard_id=%tenant.get_tenant_shard_id().shard_slug(), new_shard_count=%new_shard_count.literal()))] diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 25e1fb5e1f..ab2c3b5e48 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -2862,17 +2862,12 @@ impl Service { let _tenant_lock = trace_exclusive_lock(&self.tenant_op_locks, tenant_id, TenantOperations::Delete).await; - // Detach all shards - let (detach_waiters, shard_ids, node) = { - let mut shard_ids = Vec::new(); + // Detach all shards. This also deletes local pageserver shard data. + let (detach_waiters, node) = { let mut detach_waiters = Vec::new(); let mut locked = self.inner.write().unwrap(); let (nodes, tenants, scheduler) = locked.parts_mut(); - for (tenant_shard_id, shard) in - tenants.range_mut(TenantShardId::tenant_range(tenant_id)) - { - shard_ids.push(*tenant_shard_id); - + for (_, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Update the tenant's intent to remove all attachments shard.policy = PlacementPolicy::Detached; shard @@ -2892,7 +2887,7 @@ impl Service { let node = nodes .get(&node_id) .expect("Pageservers may not be deleted while lock is active"); - (detach_waiters, shard_ids, node.clone()) + (detach_waiters, node.clone()) }; // This reconcile wait can fail in a few ways: @@ -2907,38 +2902,34 @@ impl Service { self.await_waiters(detach_waiters, RECONCILE_TIMEOUT) .await?; - let locations = shard_ids - .into_iter() - .map(|s| (s, node.clone())) - .collect::>(); - let results = self.tenant_for_shards_api( - locations, - |tenant_shard_id, client| async move { client.tenant_delete(tenant_shard_id).await }, - 1, - 3, - RECONCILE_TIMEOUT, - &self.cancel, - ) - .await; - for result in results { - match result { - Ok(StatusCode::ACCEPTED) => { - // This should never happen: we waited for detaches to finish above - return Err(ApiError::InternalServerError(anyhow::anyhow!( - "Unexpectedly still attached on {}", - node - ))); - } - Ok(_) => {} - Err(mgmt_api::Error::Cancelled) => { - return Err(ApiError::ShuttingDown); - } - Err(e) => { - // This is unexpected: remote deletion should be infallible, unless the object store - // at large is unavailable. - tracing::error!("Error deleting via node {}: {e}", node); - return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); - } + // Delete the entire tenant (all shards) from remote storage via a random pageserver. + // Passing an unsharded tenant ID will cause the pageserver to remove all remote paths with + // the tenant ID prefix, including all shards (even possibly stale ones). 
+ match node + .with_client_retries( + |client| async move { + client + .tenant_delete(TenantShardId::unsharded(tenant_id)) + .await + }, + &self.config.jwt_token, + 1, + 3, + RECONCILE_TIMEOUT, + &self.cancel, + ) + .await + .unwrap_or(Err(mgmt_api::Error::Cancelled)) + { + Ok(_) => {} + Err(mgmt_api::Error::Cancelled) => { + return Err(ApiError::ShuttingDown); + } + Err(e) => { + // This is unexpected: remote deletion should be infallible, unless the object store + // at large is unavailable. + tracing::error!("Error deleting via node {node}: {e}"); + return Err(ApiError::InternalServerError(anyhow::anyhow!(e))); } } diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 377a95fbeb..4c4306be9e 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -303,9 +303,10 @@ def assert_prefix_empty( remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None, allowed_postfix: Optional[str] = None, + delimiter: str = "/", ) -> None: assert remote_storage is not None - response = list_prefix(remote_storage, prefix) + response = list_prefix(remote_storage, prefix, delimiter) keys = response["KeyCount"] objects: list[ObjectTypeDef] = response.get("Contents", []) common_prefixes = response.get("CommonPrefixes", []) @@ -338,16 +339,18 @@ def assert_prefix_empty( if not (allowed_postfix.endswith(key)): filtered_count += 1 - assert ( - filtered_count == 0 - ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}" + assert filtered_count == 0, f"remote prefix {prefix} is not empty: {objects}" # remote_storage must not be None, but that's easier for callers to make mypy happy -def assert_prefix_not_empty(remote_storage: Optional[RemoteStorage], prefix: Optional[str] = None): +def assert_prefix_not_empty( + remote_storage: Optional[RemoteStorage], + prefix: Optional[str] = None, + delimiter: str = "/", +): assert remote_storage is not None response = list_prefix(remote_storage, prefix) - assert response["KeyCount"] != 0, f"remote dir with prefix {prefix} is empty: {response}" + assert response["KeyCount"] != 0, f"remote prefix {prefix} is empty: {response}" def list_prefix( diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 294c1248c5..f486327445 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -20,6 +20,7 @@ from fixtures.pageserver.utils import ( ) from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.utils import run_pg_bench_small, wait_until +from fixtures.workload import Workload from requests.exceptions import ReadTimeout from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -404,3 +405,57 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder cloud_admin_api_token=cloud_admin_token, ) assert healthy + + +def test_tenant_delete_stale_shards(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): + """ + Deleting a tenant should also delete any stale (pre-split) shards from remote storage. + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + + # Create an unsharded tenant. + tenant_id, timeline_id = env.create_tenant() + + # Write some data. 
+ workload = Workload(env, tenant_id, timeline_id, branch_name="main") + workload.init() + workload.write_rows(256) + workload.validate() + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + ) + + # Upload a heatmap as well. + env.pageserver.http_client().tenant_heatmap_upload(tenant_id) + + # Split off a few shards, in two rounds. + env.storage_controller.tenant_shard_split(tenant_id, shard_count=4) + env.storage_controller.tenant_shard_split(tenant_id, shard_count=16) + + # Delete the tenant. This should also delete data for the unsharded and count=4 parents. + env.storage_controller.pageserver_api().tenant_delete(tenant_id=tenant_id) + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(tenant_id))), + delimiter="", # match partial prefixes, i.e. all shards + ) + + dirs = list(env.pageserver.tenant_dir(None).glob(f"{tenant_id}*")) + assert dirs == [], f"found tenant directories: {dirs}" + + # The initial tenant created by the test harness should still be there. + # Only the tenant we deleted should be removed. + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix="/".join(("tenants", str(env.initial_tenant))), + ) + dirs = list(env.pageserver.tenant_dir(None).glob(f"{env.initial_tenant}*")) + assert dirs != [], "missing initial tenant directory" + + env.stop() From 299cde899b7b9a31723508afdf7b9e0f0be13912 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 17 Oct 2024 17:19:18 +0200 Subject: [PATCH 033/239] safekeeper: flush WAL on compute disconnect (#9436) ## Problem In #9259, we found that the `check_safekeepers_synced` fast path could result in a lower basebackup LSN than the `flush_lsn` reported by Safekeepers in `VoteResponse`, causing the compute to panic once on startup. This would happen if the Safekeeper had unflushed WAL records due to a compute disconnect. The `TIMELINE_STATUS` query would report a `flush_lsn` below these unflushed records, while `VoteResponse` would flush the WAL and report the advanced `flush_lsn`. See https://github.com/neondatabase/neon/issues/9259#issuecomment-2410849032. ## Summary of changes Flush the WAL if the compute disconnects during WAL processing. --- safekeeper/src/receive_wal.rs | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index e35f806e90..2a9ca85299 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -498,21 +498,18 @@ impl WalAcceptor { // we will send keepalives by replying to these requests once per second. let mut next_keepalive = Instant::now(); - loop { - let opt_msg = self.msg_rx.recv().await; - if opt_msg.is_none() { - return Ok(()); // chan closed, streaming terminated - } - let mut next_msg = opt_msg.unwrap(); - + while let Some(mut next_msg) = self.msg_rx.recv().await { // Update walreceiver state in shmem for reporting. if let ProposerAcceptorMessage::Elected(_) = &next_msg { walreceiver_guard.get().status = WalReceiverStatus::Streaming; } let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // loop through AppendRequest's while it's readily available to - // write as many WAL as possible without fsyncing + // Loop through AppendRequests while available to write as many WAL records as + // possible without fsyncing. 
+ // + // Make sure the WAL is flushed before returning, see: + // https://github.com/neondatabase/neon/issues/9259 // // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. // Otherwise, we might end up in a situation where we read a message, but don't @@ -522,7 +519,7 @@ impl WalAcceptor { if let Some(reply) = self.tli.process_msg(&noflush_msg).await? { if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, flush WAL and return on next send/recv } } @@ -531,11 +528,13 @@ impl WalAcceptor { break; } + // continue pulling AppendRequests if available match self.msg_rx.try_recv() { Ok(msg) => next_msg = msg, Err(TryRecvError::Empty) => break, - Err(TryRecvError::Disconnected) => return Ok(()), // chan closed, streaming terminated - } + // on disconnect, flush WAL and return on next send/recv + Err(TryRecvError::Disconnected) => break, + }; } // flush all written WAL to the disk @@ -555,5 +554,6 @@ impl WalAcceptor { next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + Ok(()) } } From 858867c62771e7f24c3d33820a8ca87c5f4f146f Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 17 Oct 2024 16:35:19 +0100 Subject: [PATCH 034/239] Add logging of installed_extensions (#9438) Simple PR to log installed_extensions statistics. in the following format: ``` 2024-10-17T13:53:02.860595Z INFO [NEON_EXT_STAT] {"extensions":[{"extname":"plpgsql","versions":["1.0"],"n_databases":2},{"extname":"neon","versions":["1.5"],"n_databases":1}]} ``` --- compute_tools/src/compute.rs | 28 +++++------------------ compute_tools/src/installed_extensions.rs | 21 +++++++++++++++++ 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 285be56264..6aec008f3a 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -34,6 +34,7 @@ use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use crate::installed_extensions::get_installed_extensions_sync; use crate::local_proxy; use crate::logger::inlinify; use crate::pg_helpers::*; @@ -1121,6 +1122,11 @@ impl ComputeNode { self.pg_reload_conf()?; } self.post_apply_config()?; + + let connstr = self.connstr.clone(); + thread::spawn(move || { + get_installed_extensions_sync(connstr).context("get_installed_extensions") + }); } let startup_end_time = Utc::now(); @@ -1484,28 +1490,6 @@ LIMIT 100", info!("Pageserver config changed"); } } - - // Gather info about installed extensions - pub fn get_installed_extensions(&self) -> Result<()> { - let connstr = self.connstr.clone(); - - let rt = tokio::runtime::Builder::new_current_thread() - .enable_all() - .build() - .expect("failed to create runtime"); - let result = rt - .block_on(crate::installed_extensions::get_installed_extensions( - connstr, - )) - .expect("failed to get installed extensions"); - - info!( - "{}", - serde_json::to_string(&result).expect("failed to serialize extensions list") - ); - - Ok(()) - } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 72578b1f34..877f99bff7 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,6 +1,7 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; use std::collections::HashMap; use std::collections::HashSet; +use 
tracing::info; use url::Url; use anyhow::Result; @@ -79,3 +80,23 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result<()> { + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + let result = rt + .block_on(crate::installed_extensions::get_installed_extensions( + connstr, + )) + .expect("failed to get installed extensions"); + + info!( + "[NEON_EXT_STAT] {}", + serde_json::to_string(&result).expect("failed to serialize extensions list") + ); + + Ok(()) +} From 63b3491c1b489487e9d94b8499f401cd57e12290 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 17 Oct 2024 12:22:44 -0400 Subject: [PATCH 035/239] refactor(pageserver): remove aux v1 code path (#9424) Part of the aux v1 retirement https://github.com/neondatabase/neon/issues/8623 ## Summary of changes Remove write/read path for aux v1, but keeping the config item and the index part field for now. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/models.rs | 2 - pageserver/src/http/routes.rs | 32 -- pageserver/src/pgdatadir_mapping.rs | 323 +++------------ pageserver/src/tenant.rs | 380 +----------------- .../src/tenant/remote_timeline_client.rs | 14 +- .../tenant/remote_timeline_client/index.rs | 4 - pageserver/src/tenant/timeline.rs | 51 +-- pageserver/src/tenant/timeline/delete.rs | 2 - pageserver/src/walredo/apply_neon.rs | 71 +--- test_runner/regress/test_aux_files.py | 78 ---- 10 files changed, 60 insertions(+), 897 deletions(-) delete mode 100644 test_runner/regress/test_aux_files.py diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 3ec9cac2c3..5b0b6bebe3 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -743,8 +743,6 @@ pub struct TimelineInfo { // Forward compatibility: a previous version of the pageserver will receive a JSON. serde::Deserialize does // not deny unknown fields by default so it's safe to set the field to some value, though it won't be // read. 
- /// The last aux file policy being used on this timeline - pub last_aux_file_policy: Option, pub is_archived: Option, } diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 36a6ed427b..e6663ef56f 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -18,7 +18,6 @@ use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; use pageserver_api::models::virtual_file::IoMode; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; use pageserver_api::models::ListAuxFilesRequest; @@ -474,8 +473,6 @@ async fn build_timeline_info_common( is_archived: Some(is_archived), walreceiver_status, - - last_aux_file_policy: timeline.last_aux_file_policy.load(), }; Ok(info) } @@ -2399,31 +2396,6 @@ async fn post_tracing_event_handler( json_response(StatusCode::OK, ()) } -async fn force_aux_policy_switch_handler( - mut r: Request, - _cancel: CancellationToken, -) -> Result, ApiError> { - check_permission(&r, None)?; - let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?; - let timeline_id: TimelineId = parse_request_param(&r, "timeline_id")?; - let policy: AuxFilePolicy = json_request(&mut r).await?; - - let state = get_state(&r); - - let tenant = state - .tenant_manager - .get_attached_tenant_shard(tenant_shard_id)?; - tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - let timeline = - active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) - .await?; - timeline - .do_switch_aux_policy(policy) - .map_err(ApiError::InternalServerError)?; - - json_response(StatusCode::OK, ()) -} - async fn put_io_engine_handler( mut r: Request, _cancel: CancellationToken, @@ -3136,10 +3108,6 @@ pub fn make_router( ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) - .put( - "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", - |r| api_handler(r, force_aux_policy_switch_handler), - ) .get("/v1/utilization", |r| api_handler(r, get_utilization)) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/ingest_aux_files", diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 900da5beab..f2a11e65c1 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -22,7 +22,6 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -33,7 +32,7 @@ use std::ops::ControlFlow; use std::ops::Range; use strum::IntoEnumIterator; use tokio_util::sync::CancellationToken; -use tracing::{debug, info, trace, warn}; +use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; @@ -677,21 +676,6 @@ impl Timeline { self.get(CHECKPOINT_KEY, lsn, ctx).await } - async fn list_aux_files_v1( - &self, - lsn: Lsn, - ctx: &RequestContext, - ) -> Result, PageReconstructError> { - match self.get(AUX_FILES_KEY, lsn, ctx).await { - Ok(buf) => Ok(AuxFilesDirectory::des(&buf)?.files), - Err(e) => { - // This is expected: 
historical databases do not have the key. - debug!("Failed to get info about AUX files: {}", e); - Ok(HashMap::new()) - } - } - } - async fn list_aux_files_v2( &self, lsn: Lsn, @@ -722,10 +706,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result<(), PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - if let Some(AuxFilePolicy::V2) | Some(AuxFilePolicy::CrossValidation) = current_policy { - self.list_aux_files_v2(lsn, ctx).await?; - } + self.list_aux_files_v2(lsn, ctx).await?; Ok(()) } @@ -734,51 +715,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result, PageReconstructError> { - let current_policy = self.last_aux_file_policy.load(); - match current_policy { - Some(AuxFilePolicy::V1) => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - let empty_str = if res.is_empty() { ", empty" } else { "" }; - warn!( - "this timeline is using deprecated aux file policy V1 (policy=v1{empty_str})" - ); - Ok(res) - } - None => { - let res = self.list_aux_files_v1(lsn, ctx).await?; - if !res.is_empty() { - warn!("this timeline is using deprecated aux file policy V1 (policy=None)"); - } - Ok(res) - } - Some(AuxFilePolicy::V2) => self.list_aux_files_v2(lsn, ctx).await, - Some(AuxFilePolicy::CrossValidation) => { - let v1_result = self.list_aux_files_v1(lsn, ctx).await; - let v2_result = self.list_aux_files_v2(lsn, ctx).await; - match (v1_result, v2_result) { - (Ok(v1), Ok(v2)) => { - if v1 != v2 { - tracing::error!( - "unmatched aux file v1 v2 result:\nv1 {v1:?}\nv2 {v2:?}" - ); - return Err(PageReconstructError::Other(anyhow::anyhow!( - "unmatched aux file v1 v2 result" - ))); - } - Ok(v1) - } - (Ok(_), Err(v2)) => { - tracing::error!("aux file v1 returns Ok while aux file v2 returns an err"); - Err(v2) - } - (Err(v1), Ok(_)) => { - tracing::error!("aux file v2 returns Ok while aux file v1 returns an err"); - Err(v1) - } - (Err(_), Err(v2)) => Err(v2), - } - } - } + self.list_aux_files_v2(lsn, ctx).await } pub(crate) async fn get_replorigins( @@ -954,9 +891,6 @@ impl Timeline { result.add_key(CONTROLFILE_KEY); result.add_key(CHECKPOINT_KEY); - if self.get(AUX_FILES_KEY, lsn, ctx).await.is_ok() { - result.add_key(AUX_FILES_KEY); - } // Add extra keyspaces in the test cases. Some test cases write keys into the storage without // creating directory keys. These test cases will add such keyspaces into `extra_test_dense_keyspace` @@ -1166,9 +1100,6 @@ impl<'a> DatadirModification<'a> { self.pending_directory_entries.push((DirectoryKind::Db, 0)); self.put(DBDIR_KEY, Value::Image(buf.into())); - // Create AuxFilesDirectory - self.init_aux_dir()?; - let buf = if self.tline.pg_version >= 17 { TwoPhaseDirectoryV17::ser(&TwoPhaseDirectoryV17 { xids: HashSet::new(), @@ -1347,9 +1278,6 @@ impl<'a> DatadirModification<'a> { // 'true', now write the updated 'dbdirs' map back. 
let buf = DbDirectory::ser(&dbdir)?; self.put(DBDIR_KEY, Value::Image(buf.into())); - - // Create AuxFilesDirectory as well - self.init_aux_dir()?; } if r.is_none() { // Create RelDirectory @@ -1726,200 +1654,60 @@ impl<'a> DatadirModification<'a> { Ok(()) } - pub fn init_aux_dir(&mut self) -> anyhow::Result<()> { - if let AuxFilePolicy::V2 = self.tline.get_switch_aux_file_policy() { - return Ok(()); - } - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: HashMap::new(), - })?; - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, 0)); - self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - Ok(()) - } - pub async fn put_file( &mut self, path: &str, content: &[u8], ctx: &RequestContext, ) -> anyhow::Result<()> { - let switch_policy = self.tline.get_switch_aux_file_policy(); - - let policy = { - let current_policy = self.tline.last_aux_file_policy.load(); - // Allowed switch path: - // * no aux files -> v1/v2/cross-validation - // * cross-validation->v2 - - let current_policy = if current_policy.is_none() { - // This path will only be hit once per tenant: we will decide the final policy in this code block. - // The next call to `put_file` will always have `last_aux_file_policy != None`. - let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn); - let aux_files_key_v1 = self.tline.list_aux_files_v1(lsn, ctx).await?; - if aux_files_key_v1.is_empty() { - None - } else { - warn!("this timeline is using deprecated aux file policy V1 (detected existing v1 files)"); - self.tline.do_switch_aux_policy(AuxFilePolicy::V1)?; - Some(AuxFilePolicy::V1) - } - } else { - current_policy - }; - - if AuxFilePolicy::is_valid_migration_path(current_policy, switch_policy) { - self.tline.do_switch_aux_policy(switch_policy)?; - info!(current=?current_policy, next=?switch_policy, "switching aux file policy"); - switch_policy - } else { - // This branch handles non-valid migration path, and the case that switch_policy == current_policy. - // And actually, because the migration path always allow unspecified -> *, this unwrap_or will never be hit. - current_policy.unwrap_or(AuxFilePolicy::default_tenant_config()) - } + let key = aux_file::encode_aux_file_key(path); + // retrieve the key from the engine + let old_val = match self.get(key, ctx).await { + Ok(val) => Some(val), + Err(PageReconstructError::MissingKey(_)) => None, + Err(e) => return Err(e.into()), }; - - if let AuxFilePolicy::V2 | AuxFilePolicy::CrossValidation = policy { - let key = aux_file::encode_aux_file_key(path); - // retrieve the key from the engine - let old_val = match self.get(key, ctx).await { - Ok(val) => Some(val), - Err(PageReconstructError::MissingKey(_)) => None, - Err(e) => return Err(e.into()), - }; - let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { - aux_file::decode_file_value(old_val)? + let files: Vec<(&str, &[u8])> = if let Some(ref old_val) = old_val { + aux_file::decode_file_value(old_val)? 
+ } else { + Vec::new() + }; + let mut other_files = Vec::with_capacity(files.len()); + let mut modifying_file = None; + for file @ (p, content) in files { + if path == p { + assert!( + modifying_file.is_none(), + "duplicated entries found for {}", + path + ); + modifying_file = Some(content); } else { - Vec::new() - }; - let mut other_files = Vec::with_capacity(files.len()); - let mut modifying_file = None; - for file @ (p, content) in files { - if path == p { - assert!( - modifying_file.is_none(), - "duplicated entries found for {}", - path - ); - modifying_file = Some(content); - } else { - other_files.push(file); - } + other_files.push(file); } - let mut new_files = other_files; - match (modifying_file, content.is_empty()) { - (Some(old_content), false) => { - self.tline - .aux_file_size_estimator - .on_update(old_content.len(), content.len()); - new_files.push((path, content)); - } - (Some(old_content), true) => { - self.tline - .aux_file_size_estimator - .on_remove(old_content.len()); - // not adding the file key to the final `new_files` vec. - } - (None, false) => { - self.tline.aux_file_size_estimator.on_add(content.len()); - new_files.push((path, content)); - } - (None, true) => warn!("removing non-existing aux file: {}", path), - } - let new_val = aux_file::encode_file_value(&new_files)?; - self.put(key, Value::Image(new_val.into())); } - - if let AuxFilePolicy::V1 | AuxFilePolicy::CrossValidation = policy { - let file_path = path.to_string(); - let content = if content.is_empty() { - None - } else { - Some(Bytes::copy_from_slice(content)) - }; - - let n_files; - let mut aux_files = self.tline.aux_files.lock().await; - if let Some(mut dir) = aux_files.dir.take() { - // We already updated aux files in `self`: emit a delta and update our latest value. - dir.upsert(file_path.clone(), content.clone()); - n_files = dir.files.len(); - if aux_files.n_deltas == MAX_AUX_FILE_DELTAS { - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - aux_files.n_deltas = 0; - } else { - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { file_path, content }), - ); - aux_files.n_deltas += 1; - } - aux_files.dir = Some(dir); - } else { - // Check if the AUX_FILES_KEY is initialized - match self.get(AUX_FILES_KEY, ctx).await { - Ok(dir_bytes) => { - let mut dir = AuxFilesDirectory::des(&dir_bytes)?; - // Key is already set, we may append a delta - self.put( - AUX_FILES_KEY, - Value::WalRecord(NeonWalRecord::AuxFile { - file_path: file_path.clone(), - content: content.clone(), - }), - ); - dir.upsert(file_path, content); - n_files = dir.files.len(); - aux_files.dir = Some(dir); - } - Err( - e @ (PageReconstructError::Cancelled - | PageReconstructError::AncestorLsnTimeout(_)), - ) => { - // Important that we do not interpret a shutdown error as "not found" and thereby - // reset the map. - return Err(e.into()); - } - // Note: we added missing key error variant in https://github.com/neondatabase/neon/pull/7393 but - // the original code assumes all other errors are missing keys. Therefore, we keep the code path - // the same for now, though in theory, we should only match the `MissingKey` variant. - Err( - e @ (PageReconstructError::Other(_) - | PageReconstructError::WalRedo(_) - | PageReconstructError::MissingKey(_)), - ) => { - // Key is missing, we must insert an image as the basis for subsequent deltas. 
- - if !matches!(e, PageReconstructError::MissingKey(_)) { - let e = utils::error::report_compact_sources(&e); - tracing::warn!("treating error as if it was a missing key: {}", e); - } - - let mut dir = AuxFilesDirectory { - files: HashMap::new(), - }; - dir.upsert(file_path, content); - self.put( - AUX_FILES_KEY, - Value::Image(Bytes::from( - AuxFilesDirectory::ser(&dir).context("serialize")?, - )), - ); - n_files = 1; - aux_files.dir = Some(dir); - } - } + let mut new_files = other_files; + match (modifying_file, content.is_empty()) { + (Some(old_content), false) => { + self.tline + .aux_file_size_estimator + .on_update(old_content.len(), content.len()); + new_files.push((path, content)); } - - self.pending_directory_entries - .push((DirectoryKind::AuxFiles, n_files)); + (Some(old_content), true) => { + self.tline + .aux_file_size_estimator + .on_remove(old_content.len()); + // not adding the file key to the final `new_files` vec. + } + (None, false) => { + self.tline.aux_file_size_estimator.on_add(content.len()); + new_files.push((path, content)); + } + (None, true) => warn!("removing non-existing aux file: {}", path), } + let new_val = aux_file::encode_file_value(&new_files)?; + self.put(key, Value::Image(new_val.into())); Ok(()) } @@ -2089,12 +1877,6 @@ impl<'a> DatadirModification<'a> { self.tline.get(key, lsn, ctx).await } - /// Only used during unit tests, force putting a key into the modification. - #[cfg(test)] - pub(crate) fn put_for_test(&mut self, key: Key, val: Value) { - self.put(key, val); - } - fn put(&mut self, key: Key, val: Value) { if Self::is_data_key(&key) { self.put_data(key.to_compact(), val) @@ -2212,21 +1994,6 @@ struct RelDirectory { rels: HashSet<(Oid, u8)>, } -#[derive(Debug, Serialize, Deserialize, Default, PartialEq)] -pub(crate) struct AuxFilesDirectory { - pub(crate) files: HashMap, -} - -impl AuxFilesDirectory { - pub(crate) fn upsert(&mut self, key: String, value: Option) { - if let Some(value) = value { - self.files.insert(key, value); - } else { - self.files.remove(&key); - } - } -} - #[derive(Debug, Serialize, Deserialize)] struct RelSizeEntry { nblocks: u32, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index baa2365658..1066d165cd 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -20,7 +20,6 @@ use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; use pageserver_api::models; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::LsnLease; use pageserver_api::models::TimelineArchivalState; use pageserver_api::models::TimelineState; @@ -800,7 +799,6 @@ impl Tenant { index_part: Option, metadata: TimelineMetadata, ancestor: Option>, - last_aux_file_policy: Option, _ctx: &RequestContext, ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; @@ -811,10 +809,6 @@ impl Tenant { ancestor.clone(), resources, CreateTimelineCause::Load, - // This could be derived from ancestor branch + index part. Though the only caller of `timeline_init_and_sync` is `load_remote_timeline`, - // there will potentially be other caller of this function in the future, and we don't know whether `index_part` or `ancestor` takes precedence. - // Therefore, we pass this field explicitly for now, and remove it once we fully migrate to aux file v2. 
- last_aux_file_policy, )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -829,10 +823,6 @@ impl Tenant { if let Some(index_part) = index_part.as_ref() { timeline.remote_client.init_upload_queue(index_part)?; - - timeline - .last_aux_file_policy - .store(index_part.last_aux_file_policy()); } else { // No data on the remote storage, but we have local metadata file. We can end up // here with timeline_create being interrupted before finishing index part upload. @@ -1403,15 +1393,12 @@ impl Tenant { None }; - let last_aux_file_policy = index_part.last_aux_file_policy(); - self.timeline_init_and_sync( timeline_id, resources, Some(index_part), remote_metadata, ancestor, - last_aux_file_policy, ctx, ) .await @@ -1824,7 +1811,6 @@ impl Tenant { create_guard, initdb_lsn, None, - None, ) .await } @@ -3032,7 +3018,6 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, - last_aux_file_policy: Option, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -3061,7 +3046,6 @@ impl Tenant { resources, pg_version, state, - last_aux_file_policy, self.attach_wal_lag_cooldown.clone(), self.cancel.child_token(), ); @@ -3720,7 +3704,6 @@ impl Tenant { timeline_create_guard, start_lsn + 1, Some(Arc::clone(src_timeline)), - src_timeline.last_aux_file_policy.load(), ) .await?; @@ -3914,7 +3897,6 @@ impl Tenant { timeline_create_guard, pgdata_lsn, None, - None, ) .await?; @@ -3986,7 +3968,6 @@ impl Tenant { create_guard: TimelineCreateGuard<'a>, start_lsn: Lsn, ancestor: Option>, - last_aux_file_policy: Option, ) -> anyhow::Result> { let tenant_shard_id = self.tenant_shard_id; @@ -4002,7 +3983,6 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, - last_aux_file_policy, ) .context("Failed to create timeline data structure")?; @@ -4600,7 +4580,6 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; @@ -4609,7 +4588,7 @@ mod tests { use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_FILES_KEY, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use rand::{thread_rng, Rng}; @@ -4618,7 +4597,6 @@ mod tests { use tests::timeline::{GetVectoredError, ShutdownMode}; use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; use timeline::{DeltaLayerTestDesc, GcInfo}; - use utils::bin_ser::BeSer; use utils::id::TenantId; static TEST_KEY: Lazy = @@ -6422,16 +6400,9 @@ mod tests { } #[tokio::test] - async fn test_branch_copies_dirty_aux_file_flag() { - let harness = TenantHarness::create("test_branch_copies_dirty_aux_file_flag") - .await - .unwrap(); + async fn test_aux_file_e2e() { + let harness = TenantHarness::create("test_aux_file_e2e").await.unwrap(); - // the default aux file policy to switch is v2 if not set by the admins - assert_eq!( - harness.tenant_conf.switch_aux_file_policy, - AuxFilePolicy::default_tenant_config() - ); let (tenant, ctx) = harness.load().await; let mut lsn = Lsn(0x08); @@ -6441,9 +6412,6 @@ mod tests { .await .unwrap(); - // no aux file is written at this point, so the persistent flag should be unset - assert_eq!(tline.last_aux_file_policy.load(), None); - { lsn += 8; let mut 
modification = tline.begin_modification(lsn); @@ -6454,30 +6422,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "aux file is written with switch_aux_file_policy unset (which is v2), so we should use v2 there" - ); - // we can read everything from the storage let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( @@ -6495,12 +6439,6 @@ mod tests { modification.commit(&ctx).await.unwrap(); } - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "keep v2 storage format when new files are written" - ); - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!( files.get("pg_logical/mappings/test2"), @@ -6512,321 +6450,9 @@ mod tests { .await .unwrap(); - // child copies the last flag even if that is not on remote storage yet - assert_eq!(child.get_switch_aux_file_policy(), AuxFilePolicy::V2); - assert_eq!(child.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - let files = child.list_aux_files(lsn, &ctx).await.unwrap(); assert_eq!(files.get("pg_logical/mappings/test1"), None); assert_eq!(files.get("pg_logical/mappings/test2"), None); - - // even if we crash here without flushing parent timeline with it's new - // last_aux_file_policy we are safe, because child was never meant to access ancestor's - // files. the ancestor can even switch back to V1 because of a migration safely. 
- } - - #[tokio::test] - async fn aux_file_policy_switch() { - let mut harness = TenantHarness::create("aux_file_policy_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::CrossValidation; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // there is no tenant manager to pass the configuration through, so lets mimic it - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V2, - "wanted state has been updated" - ); - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::CrossValidation), - "dirty index_part.json reflected state is yet to be updated" - ); - - // we can still read the auxfile v1 before we ingest anything new - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")), - "cross validation writes to both v1 and v2 so this should be available in v2" - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - - // mimic again by trying to flip it from V2 to V1 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V1), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"third", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.get_switch_aux_file_policy(), - AuxFilePolicy::V1, - "wanted state has been updated again, even if invalid request" - ); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "ingesting a file should apply the wanted switch state when applicable" - ); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - 
Some(&bytes::Bytes::from_static(b"third")) - ); - - // mimic again by trying to flip it from from V1 to V2 (not switched to while ingesting a file) - tenant.set_new_location_config( - AttachedTenantConf::try_from(LocationConf::attached_single( - TenantConfOpt { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, - tenant.generation, - &pageserver_api::models::ShardParameters::default(), - )) - .unwrap(), - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test3", b"last", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!(tline.get_switch_aux_file_policy(), AuxFilePolicy::V2); - - assert_eq!(tline.last_aux_file_policy.load(), Some(AuxFilePolicy::V2)); - - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"third")) - ); - assert_eq!( - files.get("pg_logical/mappings/test3"), - Some(&bytes::Bytes::from_static(b"last")) - ); - } - - #[tokio::test] - async fn aux_file_policy_force_switch() { - let mut harness = TenantHarness::create("aux_file_policy_force_switch") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V1; - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - tline.do_switch_aux_policy(AuxFilePolicy::V2).unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V2), - "dirty index_part.json reflected state is yet to be updated" - ); - - // lose all data from v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!(files.get("pg_logical/mappings/test1"), None); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test2", b"second", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - // read data ingested in v2 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test2"), - Some(&bytes::Bytes::from_static(b"second")) - ); - // lose all data from v1 - assert_eq!(files.get("pg_logical/mappings/test1"), None); - } - - #[tokio::test] - async fn aux_file_policy_auto_detect() { - let mut harness = TenantHarness::create("aux_file_policy_auto_detect") - .await - .unwrap(); - harness.tenant_conf.switch_aux_file_policy = AuxFilePolicy::V2; // set to cross-validation mode - let (tenant, ctx) = harness.load().await; - - let mut lsn = Lsn(0x08); - - let tline: Arc = tenant - .create_test_timeline(TIMELINE_ID, lsn, DEFAULT_PG_VERSION, &ctx) - .await - .unwrap(); - - assert_eq!( - tline.last_aux_file_policy.load(), - None, - "no aux file is written so it should be unset" - ); - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - let buf = AuxFilesDirectory::ser(&AuxFilesDirectory { - files: vec![( - "test_file".to_string(), - 
Bytes::copy_from_slice(b"test_file"), - )] - .into_iter() - .collect(), - }) - .unwrap(); - modification.put_for_test(AUX_FILES_KEY, Value::Image(Bytes::from(buf))); - modification.commit(&ctx).await.unwrap(); - } - - { - lsn += 8; - let mut modification = tline.begin_modification(lsn); - modification - .put_file("pg_logical/mappings/test1", b"first", &ctx) - .await - .unwrap(); - modification.commit(&ctx).await.unwrap(); - } - - assert_eq!( - tline.last_aux_file_policy.load(), - Some(AuxFilePolicy::V1), - "keep using v1 because there are aux files writting with v1" - ); - - // we can still read the auxfile v1 - let files = tline.list_aux_files(lsn, &ctx).await.unwrap(); - assert_eq!( - files.get("pg_logical/mappings/test1"), - Some(&bytes::Bytes::from_static(b"first")) - ); - assert_eq!( - files.get("test_file"), - Some(&bytes::Bytes::from_static(b"test_file")) - ); } #[tokio::test] diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1f9ae40af5..5e9702bd3d 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -187,7 +187,7 @@ use camino::Utf8Path; use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; -use pageserver_api::models::{AuxFilePolicy, TimelineArchivalState}; +use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; @@ -628,18 +628,6 @@ impl RemoteTimelineClient { Ok(()) } - /// Launch an index-file upload operation in the background, with only the `aux_file_policy` flag updated. - pub(crate) fn schedule_index_upload_for_aux_file_policy_update( - self: &Arc, - last_aux_file_policy: Option, - ) -> anyhow::Result<()> { - let mut guard = self.upload_queue.lock().unwrap(); - let upload_queue = guard.initialized_mut()?; - upload_queue.dirty.last_aux_file_policy = last_aux_file_policy; - self.schedule_index_upload(upload_queue)?; - Ok(()) - } - /// Launch an index-file upload operation in the background, with only the `archived_at` field updated. /// /// Returns whether it is required to wait for the queue to be empty to ensure that the change is uploaded, diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index c51ff54919..3a74a4ed11 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -133,10 +133,6 @@ impl IndexPart { pub(crate) fn example() -> Self { Self::empty(TimelineMetadata::example()) } - - pub(crate) fn last_aux_file_policy(&self) -> Option { - self.last_aux_file_policy - } } /// Metadata gathered for each of the layer files. 
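Aside on the hunks above: with `AuxFilesDirectory` and the `last_aux_file_policy` plumbing gone, aux files are always stored under per-file v2 keys, and their total size is tracked incrementally by the size estimator. A minimal, self-contained sketch of that bookkeeping (the `SizeEstimator` type and the `apply_aux_update` helper below are illustrative stand-ins, not code from this patch):

```rust
// Illustrative sketch: incremental aux-file size accounting, mirroring the
// on_add/on_update/on_remove branches in the pgdatadir_mapping.rs hunk above.

#[derive(Default, Debug)]
struct SizeEstimator {
    // Rough running total of aux file bytes; the real estimator feeds metrics.
    bytes: i64,
}

impl SizeEstimator {
    fn on_add(&mut self, new_len: usize) {
        self.bytes += new_len as i64;
    }
    fn on_remove(&mut self, old_len: usize) {
        self.bytes -= old_len as i64;
    }
    fn on_update(&mut self, old_len: usize, new_len: usize) {
        self.bytes += new_len as i64 - old_len as i64;
    }
}

/// Apply one aux-file write. `existing` is the previous content of `path` (if any);
/// an empty `content` is treated as a deletion. Returns the surviving entries.
fn apply_aux_update(
    estimator: &mut SizeEstimator,
    mut kept: Vec<(String, Vec<u8>)>,
    path: String,
    content: Vec<u8>,
    existing: Option<Vec<u8>>,
) -> Vec<(String, Vec<u8>)> {
    match (existing, content.is_empty()) {
        // Overwriting an existing file: adjust the estimate by the size delta.
        (Some(old), false) => {
            estimator.on_update(old.len(), content.len());
            kept.push((path, content));
        }
        // Empty content for an existing file means delete: drop it from the set.
        (Some(old), true) => estimator.on_remove(old.len()),
        // Brand-new file: count its full size.
        (None, false) => {
            estimator.on_add(content.len());
            kept.push((path, content));
        }
        // Deleting a file that was never written: nothing to account for.
        (None, true) => eprintln!("removing non-existing aux file: {path}"),
    }
    kept
}

fn main() {
    let mut est = SizeEstimator::default();
    let files = apply_aux_update(
        &mut est,
        Vec::new(),
        "pg_logical/mappings/test1".into(),
        b"first".to_vec(),
        None,
    );
    let files = apply_aux_update(
        &mut est,
        files,
        "pg_logical/mappings/test1".into(),
        b"second, longer".to_vec(),
        Some(b"first".to_vec()),
    );
    println!("{} aux file(s), ~{} bytes tracked", files.len(), est.bytes);
}
```

In the patch itself this accounting lives in the aux-file write path of `pgdatadir_mapping.rs`, where the surviving entries are re-encoded with `aux_file::encode_file_value` and written back as a single image value.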
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2b4f949c76..d67a139dfa 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -28,9 +28,9 @@ use pageserver_api::{ }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ - AtomicAuxFilePolicy, AuxFilePolicy, CompactionAlgorithm, CompactionAlgorithmSettings, - DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, - InMemoryLayerInfo, LayerMapInfo, LsnLease, TimelineState, + CompactionAlgorithm, CompactionAlgorithmSettings, DownloadRemoteLayersTaskInfo, + DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy, InMemoryLayerInfo, LayerMapInfo, + LsnLease, TimelineState, }, reltag::BlockNumber, shard::{ShardIdentity, ShardNumber, TenantShardId}, @@ -98,12 +98,12 @@ use crate::{ use crate::{ metrics::ScanLatencyOngoingRecording, tenant::timeline::logical_size::CurrentLogicalSize, }; -use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; -use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use crate::{ - pgdatadir_mapping::{AuxFilesDirectory, DirectoryKind}, + pgdatadir_mapping::DirectoryKind, virtual_file::{MaybeFatalIo, VirtualFile}, }; +use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind}; +use crate::{pgdatadir_mapping::MAX_AUX_FILE_V2_DELTAS, tenant::storage_layer::PersistentLayerKey}; use pageserver_api::config::tenant_conf_defaults::DEFAULT_PITR_INTERVAL; use crate::config::PageServerConf; @@ -206,11 +206,6 @@ pub struct TimelineResources { pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } -pub(crate) struct AuxFilesState { - pub(crate) dir: Option, - pub(crate) n_deltas: usize, -} - /// The relation size cache caches relation sizes at the end of the timeline. It speeds up WAL /// ingestion considerably, because WAL ingestion needs to check on most records if the record /// implicitly extends the relation. At startup, `complete_as_of` is initialized to the current end @@ -413,15 +408,9 @@ pub struct Timeline { timeline_get_throttle: Arc>, - /// Keep aux directory cache to avoid it's reconstruction on each update - pub(crate) aux_files: tokio::sync::Mutex, - /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, - /// Indicate whether aux file v2 storage is enabled. - pub(crate) last_aux_file_policy: AtomicAuxFilePolicy, - /// Some test cases directly place keys into the timeline without actually modifying the directory /// keys (i.e., DB_DIR). The test cases creating such keys will put the keyspaces here, so that /// these keys won't get garbage-collected during compaction/GC. 
This field only modifies the dense @@ -2012,14 +2001,6 @@ impl Timeline { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length_for_ts) } - pub(crate) fn get_switch_aux_file_policy(&self) -> AuxFilePolicy { - let tenant_conf = self.tenant_conf.load(); - tenant_conf - .tenant_conf - .switch_aux_file_policy - .unwrap_or(self.conf.default_tenant_conf.switch_aux_file_policy) - } - pub(crate) fn get_lazy_slru_download(&self) -> bool { let tenant_conf = self.tenant_conf.load(); tenant_conf @@ -2152,7 +2133,6 @@ impl Timeline { resources: TimelineResources, pg_version: u32, state: TimelineState, - aux_file_policy: Option, attach_wal_lag_cooldown: Arc>, cancel: CancellationToken, ) -> Arc { @@ -2282,15 +2262,8 @@ impl Timeline { timeline_get_throttle: resources.timeline_get_throttle, - aux_files: tokio::sync::Mutex::new(AuxFilesState { - dir: None, - n_deltas: 0, - }), - aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), - last_aux_file_policy: AtomicAuxFilePolicy::new(aux_file_policy), - #[cfg(test)] extra_test_dense_keyspace: ArcSwap::new(Arc::new(KeySpace::default())), @@ -2301,10 +2274,6 @@ impl Timeline { attach_wal_lag_cooldown, }; - if aux_file_policy == Some(AuxFilePolicy::V1) { - warn!("this timeline is using deprecated aux file policy V1 (when loading the timeline)"); - } - result.repartition_threshold = result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE; @@ -4479,14 +4448,6 @@ impl Timeline { ) -> Result<(), detach_ancestor::Error> { detach_ancestor::complete(self, tenant, attempt, ctx).await } - - /// Switch aux file policy and schedule upload to the index part. - pub(crate) fn do_switch_aux_policy(&self, policy: AuxFilePolicy) -> anyhow::Result<()> { - self.last_aux_file_policy.store(Some(policy)); - self.remote_client - .schedule_index_upload_for_aux_file_policy_update(Some(policy))?; - Ok(()) - } } impl Drop for Timeline { diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 305c5758cc..71b9e4e288 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -283,8 +283,6 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. 
CreateTimelineCause::Delete, - // Aux file policy is not needed for deletion, assuming deletion does not read aux keyspace - None, ) .context("create_timeline_struct")?; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index facf01004c..c067787f97 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,7 @@ -use crate::pgdatadir_mapping::AuxFilesDirectory; use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; -use bytes::{BufMut, BytesMut}; +use bytes::BytesMut; use pageserver_api::key::Key; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; @@ -13,7 +12,6 @@ use postgres_ffi::v14::nonrelfile_utils::{ }; use postgres_ffi::BLCKSZ; use tracing::*; -use utils::bin_ser::BeSer; use utils::lsn::Lsn; /// Can this request be served by neon redo functions @@ -236,13 +234,9 @@ pub(crate) fn apply_in_neon( LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid); } } - NeonWalRecord::AuxFile { file_path, content } => { - let mut dir = AuxFilesDirectory::des(page)?; - dir.upsert(file_path.clone(), content.clone()); - - page.clear(); - let mut writer = page.writer(); - dir.ser_into(&mut writer)?; + NeonWalRecord::AuxFile { .. } => { + // No-op: this record will never be created in aux v2. + warn!("AuxFile record should not be created in aux v2"); } #[cfg(test)] NeonWalRecord::Test { @@ -250,6 +244,7 @@ pub(crate) fn apply_in_neon( clear, will_init, } => { + use bytes::BufMut; if *will_init { assert!(*clear, "init record must be clear to ensure correctness"); } @@ -261,59 +256,3 @@ pub(crate) fn apply_in_neon( } Ok(()) } - -#[cfg(test)] -mod test { - use bytes::Bytes; - use pageserver_api::key::AUX_FILES_KEY; - - use super::*; - use std::collections::HashMap; - - /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile - #[test] - fn apply_aux_file_deltas() -> anyhow::Result<()> { - let base_dir = AuxFilesDirectory { - files: HashMap::from([ - ("two".to_string(), Bytes::from_static(b"content0")), - ("three".to_string(), Bytes::from_static(b"contentX")), - ]), - }; - let base_image = AuxFilesDirectory::ser(&base_dir)?; - - let deltas = vec![ - // Insert - NeonWalRecord::AuxFile { - file_path: "one".to_string(), - content: Some(Bytes::from_static(b"content1")), - }, - // Update - NeonWalRecord::AuxFile { - file_path: "two".to_string(), - content: Some(Bytes::from_static(b"content99")), - }, - // Delete - NeonWalRecord::AuxFile { - file_path: "three".to_string(), - content: None, - }, - ]; - - let file_path = AUX_FILES_KEY; - let mut page = BytesMut::from_iter(base_image); - - for record in deltas { - apply_in_neon(&record, Lsn(8), file_path, &mut page)?; - } - - let reconstructed = AuxFilesDirectory::des(&page)?; - let expect = HashMap::from([ - ("one".to_string(), Bytes::from_static(b"content1")), - ("two".to_string(), Bytes::from_static(b"content99")), - ]); - - assert_eq!(reconstructed.files, expect); - - Ok(()) - } -} diff --git a/test_runner/regress/test_aux_files.py b/test_runner/regress/test_aux_files.py deleted file mode 100644 index 91d674d0db..0000000000 --- a/test_runner/regress/test_aux_files.py +++ /dev/null @@ -1,78 +0,0 @@ -from __future__ import annotations - -from fixtures.log_helper import log -from fixtures.neon_fixtures import ( - AuxFileStore, - NeonEnvBuilder, - logical_replication_sync, -) - - -def test_aux_v2_config_switch(neon_env_builder: NeonEnvBuilder, vanilla_pg): - env = neon_env_builder.init_start() - 
endpoint = env.endpoints.create_start("main") - client = env.pageserver.http_client() - - tenant_id = env.initial_tenant - timeline_id = env.initial_timeline - - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = AuxFileStore.V2 - client.set_tenant_config(tenant_id, tenant_config) - # aux file v2 is enabled on the write path, so for now, it should be unset (or null) - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)["last_aux_file_policy"] - is None - ) - - pg_conn = endpoint.connect() - cur = pg_conn.cursor() - - cur.execute("create table t(pk integer primary key, payload integer)") - cur.execute( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120));" - ) - cur.execute("create publication pub1 for table t, replication_example") - - # now start subscriber, aux files will be created at this point. TODO: find better ways of testing aux files (i.e., neon_test_utils) - # instead of going through the full logical replication process. - vanilla_pg.start() - vanilla_pg.safe_psql("create table t(pk integer primary key, payload integer)") - vanilla_pg.safe_psql( - "CREATE TABLE replication_example(id SERIAL PRIMARY KEY, somedata int, text varchar(120), testcolumn1 int, testcolumn2 int, testcolumn3 int);" - ) - connstr = endpoint.connstr().replace("'", "''") - log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") - vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - - # Wait logical replication channel to be established - logical_replication_sync(vanilla_pg, endpoint) - vanilla_pg.stop() - endpoint.stop() - - with env.pageserver.http_client() as client: - # aux file v2 flag should be enabled at this point - assert ( - client.timeline_detail(tenant_id, timeline_id)["last_aux_file_policy"] - == AuxFileStore.V2 - ) - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(tenant_id).effective_config - tenant_config["switch_aux_file_policy"] = "V1" - client.set_tenant_config(tenant_id, tenant_config) - # the flag should still be enabled - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) - env.pageserver.restart() - with env.pageserver.http_client() as client: - # aux file v2 flag should be persisted - assert ( - client.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)[ - "last_aux_file_policy" - ] - == AuxFileStore.V2 - ) From 24398bf0600223fb74fb3aa33ca4e4374209f84d Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 17 Oct 2024 19:02:24 +0100 Subject: [PATCH 036/239] pageserver: detect & warn on loading an old index which is probably the result of a bad generation (#9383) ## Problem The pageserver generally trusts the storage controller/control plane to give it valid generations. However, sometimes it should be obvious that a generation is bad, and for defense in depth we should detect that on the pageserver. This PR is part 1 of 2: 1. in this PR we detect and warn on such situations, but do not block starting up the tenant. Once we have confidence that the check is not firing unexpectedly in the field 2. 
part 2 of 2 will introduce a condition that refuses to start a tenant in this situation, and a test for that (maybe, if we can figure out how to spoof an ancient mtime)

Related: #6951

## Summary of changes

- When loading an index older than 2 weeks, log an INFO message noting that we will check for other indices
- When loading an index older than 2 weeks _and_ a newer-generation index exists, log a warning.

--- pageserver/src/http/routes.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 45 ++++++++++++++++++- .../tenant/remote_timeline_client/download.rs | 11 ++--- 3 files changed, 51 insertions(+), 7 deletions(-)
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e6663ef56f..8f928fd81b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2251,7 +2251,7 @@ async fn tenant_scan_remote_handler( %timeline_id)) .await { - Ok((index_part, index_generation)) => { + Ok((index_part, index_generation, _index_mtime)) => { tracing::info!("Found timeline {tenant_shard_id}/{timeline_id} metadata (gen {index_generation:?}, {} layers, {} consistent LSN)", index_part.layer_metadata.len(), index_part.metadata.disk_consistent_lsn()); generation = std::cmp::max(generation, index_generation);
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 5e9702bd3d..450084aca2 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -505,7 +505,7 @@ impl RemoteTimelineClient { }, ); - let (index_part, _index_generation) = download::download_index_part( + let (index_part, index_generation, index_last_modified) = download::download_index_part( &self.storage_impl, &self.tenant_shard_id, &self.timeline_id, @@ -519,6 +519,49 @@ impl RemoteTimelineClient { ) .await?; + // Defense in depth: monotonicity of generation numbers is an important correctness guarantee, so when we see a very + // old index, we do extra checks in case this is the result of backward time-travel of the generation number (e.g. + // in case of a bug in the service that issues generation numbers). Indices are allowed to be old, but we expect that + // when we load an old index we are loading the _latest_ index: if we are asked to load an old index and there is + // also a newer index available, that is surprising. + const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600); + let index_age = index_last_modified.elapsed().unwrap_or_else(|e| { + if e.duration() > Duration::from_secs(5) { + // We only warn if the S3 clock and our local clock are >5s out: because this is a low resolution + // timestamp, it is common to be out by at least 1 second. + tracing::warn!("Index has modification time in the future: {e}"); + } + Duration::ZERO + }); + if index_age > INDEX_AGE_CHECKS_THRESHOLD { + tracing::info!( + ?index_generation, + age = index_age.as_secs_f64(), + "Loaded an old index, checking for other indices..." + ); + + // Find the highest-generation index + let (_latest_index_part, latest_index_generation, latest_index_mtime) = + download::download_index_part( + &self.storage_impl, + &self.tenant_shard_id, + &self.timeline_id, + Generation::MAX, + cancel, + ) + .await?; + + if latest_index_generation > index_generation { + // Unexpected! Why are we loading such an old index if a more recent one exists?
+ tracing::warn!( + ?index_generation, + ?latest_index_generation, + ?latest_index_mtime, + "Found a newer index while loading an old one" + ); + } + } + if index_part.deleted_at.is_some() { Ok(MaybeDeletedIndexPart::Deleted(index_part)) } else { diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 692e4d3096..b5d4b0f0bb 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::time::SystemTime; use anyhow::{anyhow, Context}; use camino::{Utf8Path, Utf8PathBuf}; @@ -343,10 +344,10 @@ async fn do_download_index_part( timeline_id: &TimelineId, index_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - let index_part_bytes = download_retry_forever( + let (index_part_bytes, index_part_mtime) = download_retry_forever( || async { let download = storage .download(&remote_path, &DownloadOpts::default(), cancel) @@ -359,7 +360,7 @@ async fn do_download_index_part( tokio::io::copy_buf(&mut stream, &mut bytes).await?; - Ok(bytes) + Ok((bytes, download.last_modified)) }, &format!("download {remote_path:?}"), cancel, @@ -370,7 +371,7 @@ async fn do_download_index_part( .with_context(|| format!("deserialize index part file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok((index_part, index_generation)) + Ok((index_part, index_generation, index_part_mtime)) } /// index_part.json objects are suffixed with a generation number, so we cannot @@ -385,7 +386,7 @@ pub(crate) async fn download_index_part( timeline_id: &TimelineId, my_generation: Generation, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation), DownloadError> { +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { From 928d98b6dcb57ae22a3da18fc6786b90c8dcae0a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 17 Oct 2024 21:25:51 +0200 Subject: [PATCH 037/239] Update Rust to 1.82.0 and mold to 2.34.0 (#9445) We keep the practice of keeping the compiler up to date, pointing to the latest release. This is done by many other projects in the Rust ecosystem as well. [Release notes](https://github.com/rust-lang/rust/blob/master/RELEASES.md#version-1820-2024-10-17). Also update mold. [release notes for 2.34.0](https://github.com/rui314/mold/releases/tag/v2.34.0), [release notes for 2.34.1](https://github.com/rui314/mold/releases/tag/v2.34.1). Prior update was in #8939. 
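Circling back to the index-age check added in #9383 above: the logic reduces to "if the loaded index is older than two weeks, look up the newest generation and warn if it is newer than ours", with a small allowance for clock skew on the object-store mtime. A condensed, runnable sketch of that decision path; `Generation` and the `fetch_latest_generation` closure are illustrative stand-ins for the pageserver's real types and download path:

```rust
use std::time::{Duration, SystemTime};

/// Indices older than this trigger an "is there something newer?" lookup.
const INDEX_AGE_CHECKS_THRESHOLD: Duration = Duration::from_secs(14 * 24 * 3600);

/// Stand-in for the pageserver's generation number type.
#[derive(Debug, PartialEq, PartialOrd)]
struct Generation(u32);

fn check_index_age(
    loaded_generation: &Generation,
    index_mtime: SystemTime,
    fetch_latest_generation: impl Fn() -> Generation,
) {
    // Tolerate small clock skew between object storage and the local clock:
    // an mtime slightly in the future is treated as "age zero".
    let index_age = index_mtime.elapsed().unwrap_or_else(|e| {
        if e.duration() > Duration::from_secs(5) {
            eprintln!("index has modification time in the future: {e}");
        }
        Duration::ZERO
    });

    if index_age <= INDEX_AGE_CHECKS_THRESHOLD {
        return; // Recent index: nothing suspicious, skip the extra lookup.
    }

    // Old index: check whether a newer generation exists before trusting it silently.
    let latest = fetch_latest_generation();
    if latest > *loaded_generation {
        eprintln!(
            "loaded old index {loaded_generation:?} although a newer one exists: {latest:?}"
        );
    }
}

fn main() {
    // An index last written ~30 days ago, while generation 7 exists remotely.
    let mtime = SystemTime::now() - Duration::from_secs(30 * 24 * 3600);
    check_index_age(&Generation(3), mtime, || Generation(7));
}
```

In the patch the lookup is `download::download_index_part(..., Generation::MAX, ...)`, which returns the newest index together with its generation and mtime.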
--- Dockerfile.build-tools | 6 +++--- rust-toolchain.toml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Dockerfile.build-tools b/Dockerfile.build-tools index 7cba1c8635..f05c60661c 100644 --- a/Dockerfile.build-tools +++ b/Dockerfile.build-tools @@ -72,7 +72,7 @@ RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/ && mv s5cmd /usr/local/bin/s5cmd # LLVM -ENV LLVM_VERSION=18 +ENV LLVM_VERSION=19 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \ && echo "deb http://apt.llvm.org/${DEBIAN_VERSION}/ llvm-toolchain-${DEBIAN_VERSION}-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \ && apt update \ @@ -99,7 +99,7 @@ RUN curl "https://awscli.amazonaws.com/awscli-exe-linux-$(uname -m).zip" -o "aws && rm awscliv2.zip # Mold: A Modern Linker -ENV MOLD_VERSION=v2.33.0 +ENV MOLD_VERSION=v2.34.1 RUN set -e \ && git clone https://github.com/rui314/mold.git \ && mkdir mold/build \ @@ -192,7 +192,7 @@ WORKDIR /home/nonroot # Rust # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`) -ENV RUSTC_VERSION=1.81.0 +ENV RUSTC_VERSION=1.82.0 ENV RUSTUP_HOME="/home/nonroot/.rustup" ENV PATH="/home/nonroot/.cargo/bin:${PATH}" ARG RUSTFILT_VERSION=0.2.1 diff --git a/rust-toolchain.toml b/rust-toolchain.toml index 3c5d0b12a6..92b7929c7f 100644 --- a/rust-toolchain.toml +++ b/rust-toolchain.toml @@ -1,5 +1,5 @@ [toolchain] -channel = "1.81.0" +channel = "1.82.0" profile = "default" # The default profile includes rustc, rust-std, cargo, rust-docs, rustfmt and clippy. # https://rust-lang.github.io/rustup/concepts/profiles.html From d762ad0883f204dee1b15729db8a6a3d6d5497e5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 17 Oct 2024 20:45:37 +0100 Subject: [PATCH 038/239] update rustls (#9396) The forever ongoing effort of juggling multiple versions of rustls :3 now with new crypto library aws-lc. Because of dependencies, it is currently impossible to not have both ring and aws-lc in the dep tree, therefore our only options are not updating rustls or having both crypto backends enabled... 
According to benchmarks run by the rustls maintainer, aws-lc is faster than ring in some cases too , so it's not without its upsides, --- Cargo.lock | 220 +++++++++++++----- Cargo.toml | 12 +- libs/postgres_backend/tests/simple_select.rs | 29 ++- proxy/src/bin/pg_sni_router.rs | 10 +- proxy/src/compute.rs | 30 ++- proxy/src/config.rs | 14 +- proxy/src/proxy/tests/mod.rs | 51 ++-- .../src/scan_safekeeper_metadata.rs | 22 +- workspace_hack/Cargo.toml | 11 +- 9 files changed, 276 insertions(+), 123 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 6b212bac2e..ad29fa4634 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "asn1-rs" -version = "0.5.2" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7f6fd5ddaf0351dff5b8da21b2fb4ff8e08ddd02857f0bf69c47639106c0fff0" +checksum = "5493c3bedbacf7fd7382c6346bbd66687d12bbaad3a89a2d2c303ee6cf20b048" dependencies = [ "asn1-rs-derive", "asn1-rs-impl", @@ -164,25 +164,25 @@ dependencies = [ [[package]] name = "asn1-rs-derive" -version = "0.4.0" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "726535892e8eae7e70657b4c8ea93d26b8553afb1ce617caee529ef96d7dee6c" +checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", "synstructure", ] [[package]] name = "asn1-rs-impl" -version = "0.1.0" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2777730b2039ac0f95f093556e61b6d26cebed5393ca6f152717777cec3a42ed" +checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", + "syn 2.0.52", ] [[package]] @@ -310,6 +310,33 @@ dependencies = [ "zeroize", ] +[[package]] +name = "aws-lc-rs" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070" +dependencies = [ + "aws-lc-sys", + "mirai-annotations", + "paste", + "zeroize", +] + +[[package]] +name = "aws-lc-sys" +version = "0.21.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" +dependencies = [ + "bindgen 0.69.5", + "cc", + "cmake", + "dunce", + "fs_extra", + "libc", + "paste", +] + [[package]] name = "aws-runtime" version = "1.4.3" @@ -595,7 +622,7 @@ dependencies = [ "once_cell", "pin-project-lite", "pin-utils", - "rustls 0.21.11", + "rustls 0.21.12", "tokio", "tracing", ] @@ -915,6 +942,29 @@ dependencies = [ "serde", ] +[[package]] +name = "bindgen" +version = "0.69.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" +dependencies = [ + "bitflags 2.4.1", + "cexpr", + "clang-sys", + "itertools 0.10.5", + "lazy_static", + "lazycell", + "log", + "prettyplease", + "proc-macro2", + "quote", + "regex", + "rustc-hash", + "shlex", + "syn 2.0.52", + "which", +] + [[package]] name = "bindgen" version = "0.70.1" @@ -924,7 +974,7 @@ dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", - "itertools 0.12.1", + "itertools 0.10.5", "log", "prettyplease", "proc-macro2", @@ -1038,12 +1088,13 @@ checksum = "37b2a672a2cb129a2e41c10b1224bb368f9f37a2b16b612598138befd7b37eb5" [[package]] name = "cc" -version = "1.0.83" +version = "1.1.30" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f1174fb0b6ec23863f8b971027804a42614e347eafb0a95bf0b12cdae21fc4d0" +checksum = "b16803a61b81d9eabb7eae2588776c4c1e584b738ede45fdbb4c972cec1e9945" dependencies = [ "jobserver", "libc", + "shlex", ] [[package]] @@ -1169,6 +1220,15 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +[[package]] +name = "cmake" +version = "0.1.51" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" +dependencies = [ + "cc", +] + [[package]] name = "colorchoice" version = "1.0.0" @@ -1624,9 +1684,9 @@ dependencies = [ [[package]] name = "der-parser" -version = "8.2.0" +version = "9.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbd676fbbab537128ef0278adb5576cf363cff6aa22a7b24effe97347cfab61e" +checksum = "5cd0a5c643689626bec213c4d8bd4d96acc8ffdb4ad4bb6bc16abf27d5f4b553" dependencies = [ "asn1-rs", "displaydoc", @@ -1755,6 +1815,12 @@ dependencies = [ "syn 2.0.52", ] +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + [[package]] name = "dyn-clone" version = "1.0.14" @@ -2059,6 +2125,12 @@ dependencies = [ "tokio-util", ] +[[package]] +name = "fs_extra" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" + [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2412,6 +2484,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "hostname" version = "0.4.0" @@ -2581,7 +2662,7 @@ dependencies = [ "http 0.2.9", "hyper 0.14.30", "log", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-native-certs 0.6.2", "tokio", "tokio-rustls 0.24.0", @@ -2801,9 +2882,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "jobserver" -version = "0.1.26" +version = "0.1.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +checksum = "48d1dbcbbeb6a7fec7e059840aa538bd62aaccf972c7346c4d9d2059312853d0" dependencies = [ "libc", ] @@ -2907,6 +2988,12 @@ dependencies = [ "spin", ] +[[package]] +name = "lazycell" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" + [[package]] name = "libc" version = "0.2.150" @@ -3137,6 +3224,12 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "mirai-annotations" +version = "1.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" + [[package]] name = "multimap" version = "0.8.3" @@ -3356,9 +3449,9 @@ dependencies = [ [[package]] name = "oid-registry" -version = "0.6.1" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9bedf36ffb6ba96c2eb7144ef6270557b52e54b20c0a8e1eb2ff99a6c6959bff" +checksum = 
"a8d8034d9489cdaf79228eb9f6a3b8d7bb32ba00d6645ebd48eef4077ceb5bd9" dependencies = [ "asn1-rs", ] @@ -4053,14 +4146,14 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pemfile 2.1.1", "serde", "thiserror", "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-util", "tracing", ] @@ -4082,7 +4175,7 @@ name = "postgres_ffi" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "bytes", "crc32c", "env_logger", @@ -4219,7 +4312,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.12.1", + "itertools 0.10.5", "log", "multimap", "once_cell", @@ -4239,7 +4332,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.12.1", + "itertools 0.10.5", "proc-macro2", "quote", "syn 2.0.52", @@ -4327,8 +4420,8 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", "serde", @@ -4345,7 +4438,7 @@ dependencies = [ "tokio", "tokio-postgres", "tokio-postgres-rustls", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "tokio-tungstenite", "tokio-util", "tracing", @@ -4509,12 +4602,13 @@ dependencies = [ [[package]] name = "rcgen" -version = "0.12.1" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "48406db8ac1f3cbc7dcdb56ec355343817958a356ff430259bb07baf7607e1e1" +checksum = "54077e1872c46788540de1ea3d7f4ccb1983d12f9aa909b234468676c1a36779" dependencies = [ "pem", "ring", + "rustls-pki-types", "time", "yasna", ] @@ -4693,7 +4787,7 @@ dependencies = [ "once_cell", "percent-encoding", "pin-project-lite", - "rustls 0.21.11", + "rustls 0.21.12", "rustls-pemfile 1.0.2", "serde", "serde_json", @@ -4991,9 +5085,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.21.11" +version = "0.21.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7fecbfb7b1444f477b345853b1fce097a2c6fb637b2bfb87e6bc5db0f043fae4" +checksum = "3f56a14d1f48b391359b22f731fd4bd7e43c97f3c50eee276f3aa09c94784d3e" dependencies = [ "log", "ring", @@ -5021,6 +5115,7 @@ version = "0.23.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" dependencies = [ + "aws-lc-rs", "log", "once_cell", "ring", @@ -5089,9 +5184,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.3.1" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5ede67b28608b4c60685c7d54122d4400d90f62b40caee7700e700380a390fa8" +checksum = "16f1201b3c9a7ee8039bcadc17b7e605e2945b27eee7631788c1bd2b0643674b" [[package]] name = "rustls-webpki" @@ -5109,6 +5204,7 @@ version = "0.102.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" dependencies = [ + "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5312,7 +5408,7 @@ checksum = "00421ed8fa0c995f07cde48ba6c89e80f2b312f74ff637326f392fbfd23abe02" dependencies = [ "httpdate", "reqwest 0.12.4", - "rustls 0.21.11", + "rustls 0.21.12", "sentry-backtrace", "sentry-contexts", "sentry-core", @@ -5807,8 +5903,8 @@ dependencies = [ "postgres_ffi", 
"remote_storage", "reqwest 0.12.4", - "rustls 0.22.4", - "rustls-native-certs 0.7.0", + "rustls 0.23.7", + "rustls-native-certs 0.8.0", "serde", "serde_json", "storage_controller_client", @@ -5930,14 +6026,13 @@ checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" [[package]] name = "synstructure" -version = "0.12.6" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f36bdaa60a83aca3921b5259d5400cbf5e90fc51931376a9bd4a0eb79aa7210f" +checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 1.0.109", - "unicode-xid", + "syn 2.0.52", ] [[package]] @@ -6236,16 +6331,15 @@ dependencies = [ [[package]] name = "tokio-postgres-rustls" -version = "0.11.1" +version = "0.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ea13f22eda7127c827983bdaf0d7fff9df21c8817bab02815ac277a21143677" +checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ - "futures", "ring", - "rustls 0.22.4", + "rustls 0.23.7", "tokio", "tokio-postgres", - "tokio-rustls 0.25.0", + "tokio-rustls 0.26.0", "x509-certificate", ] @@ -6255,7 +6349,7 @@ version = "0.24.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e0d409377ff5b1e3ca6437aa86c1eb7d40c134bfec254e44c830defa92669db5" dependencies = [ - "rustls 0.21.11", + "rustls 0.21.12", "tokio", ] @@ -6678,16 +6772,15 @@ checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" [[package]] name = "ureq" -version = "2.9.7" +version = "2.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d11a831e3c0b56e438a28308e7c810799e3c118417f342d30ecec080105395cd" +checksum = "b74fc6b57825be3373f7054754755f03ac3a8f5d70015ccad699ba2029956f4a" dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.22.4", + "rustls 0.23.7", "rustls-pki-types", - "rustls-webpki 0.102.2", "url", "webpki-roots 0.26.1", ] @@ -6876,7 +6969,7 @@ name = "walproposer" version = "0.1.0" dependencies = [ "anyhow", - "bindgen", + "bindgen 0.70.1", "postgres_ffi", "utils", ] @@ -7051,6 +7144,18 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "which" +version = "4.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" +dependencies = [ + "either", + "home", + "once_cell", + "rustix", +] + [[package]] name = "whoami" version = "1.5.1" @@ -7295,7 +7400,6 @@ dependencies = [ "digest", "either", "fail", - "futures", "futures-channel", "futures-executor", "futures-io", @@ -7311,7 +7415,7 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.12.1", + "itertools 0.10.5", "lazy_static", "libc", "log", @@ -7332,6 +7436,8 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", + "rustls 0.23.7", + "rustls-webpki 0.102.2", "scopeguard", "serde", "serde_json", @@ -7340,7 +7446,6 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 1.0.109", "syn 2.0.52", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", @@ -7348,6 +7453,7 @@ dependencies = [ "time-macros", "tokio", "tokio-postgres", + "tokio-rustls 0.26.0", "tokio-stream", "tokio-util", "toml_edit", @@ -7383,9 +7489,9 @@ dependencies = [ [[package]] name = "x509-parser" -version = "0.15.0" +version = "0.16.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bab0c2f54ae1d92f4fcb99c0b7ccf0b1e3451cbd395e5f115ccbdbcb18d4f634" +checksum = "fcbc162f30700d6f3f82a24bf7cc62ffe7caea42c0b2cba8bf7f3ae50cf51f69" dependencies = [ "asn1-rs", "data-encoding", diff --git a/Cargo.toml b/Cargo.toml index a1a974b33b..4c6a24ecde 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -142,7 +142,7 @@ reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.22" +rustls = "0.23" rustls-pemfile = "2" scopeguard = "1.1" sysinfo = "0.29.2" @@ -172,8 +172,8 @@ tikv-jemalloc-ctl = "0.5" tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" -tokio-postgres-rustls = "0.11.0" -tokio-rustls = "0.25" +tokio-postgres-rustls = "0.12.0" +tokio-rustls = "0.26" tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } @@ -192,8 +192,8 @@ url = "2.2" urlencoding = "2.1" uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] } walkdir = "2.3.2" -rustls-native-certs = "0.7" -x509-parser = "0.15" +rustls-native-certs = "0.8" +x509-parser = "0.16" whoami = "1.5.1" ## TODO replace this with tracing @@ -244,7 +244,7 @@ workspace_hack = { version = "0.1", path = "./workspace_hack/" } ## Build dependencies criterion = "0.5.1" -rcgen = "0.12" +rcgen = "0.13" rstest = "0.18" camino-tempfile = "1.0.2" tonic-build = "0.12" diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 900083ea7f..9d3031d699 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -2,6 +2,7 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; +use rustls::crypto::aws_lc_rs; use std::io::Cursor; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -92,10 +93,13 @@ static CERT: Lazy> = Lazy::new(|| { async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; - let server_cfg = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![CERT.clone()], KEY.clone_key()) - .unwrap(); + let server_cfg = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_no_client_auth() + .with_single_cert(vec![CERT.clone()], KEY.clone_key()) + .unwrap(); let tls_config = Some(Arc::new(server_cfg)); let pgbackend = PostgresBackend::new(server_sock, AuthType::Trust, tls_config).expect("pgbackend creation"); @@ -105,13 +109,16 @@ async fn simple_select_ssl() { pgbackend.run(&mut handler, &CancellationToken::new()).await }); - let client_cfg = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(CERT.clone()).unwrap(); - store - }) - .with_no_client_auth(); + let client_cfg = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(CERT.clone()).unwrap(); + store + }) + .with_no_client_auth(); let mut make_tls_connect = tokio_postgres_rustls::MakeRustlsConnect::new(client_cfg); let tls_connect = >::make_tls_connect( &mut make_tls_connect, diff --git 
a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 00eb830d98..13b7fdd40a 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -15,6 +15,7 @@ use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; +use rustls::crypto::aws_lc_rs; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; @@ -104,10 +105,11 @@ async fn main() -> anyhow::Result<()> { let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) + let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new( + aws_lc_rs::default_provider(), + )) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_single_cert(cert_chain, key)? .into(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 212e82497f..a7c2cab4a1 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,6 +8,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; +use rustls::crypto::aws_lc_rs; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -38,6 +39,9 @@ pub(crate) enum ConnectionError { #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), + #[error("Couldn't load native TLS certificates: {0:?}")] + TlsCertificateError(Vec), + #[error("{COULD_NOT_CONNECT}: {0}")] TlsError(#[from] InvalidDnsNameError), @@ -84,6 +88,7 @@ impl ReportableError for ConnectionError { } ConnectionError::Postgres(_) => crate::error::ErrorKind::Compute, ConnectionError::CouldNotConnect(_) => crate::error::ErrorKind::Compute, + ConnectionError::TlsCertificateError(_) => crate::error::ErrorKind::Service, ConnectionError::TlsError(_) => crate::error::ErrorKind::Compute, ConnectionError::WakeComputeError(e) => e.get_error_kind(), ConnectionError::TooManyConnectionAttempts(e) => e.get_error_kind(), @@ -293,12 +298,20 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder() + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") .dangerous() .with_custom_certificate_verifier(verifier) } else { - let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - rustls::ClientConfig::builder().with_root_certificates(root_store) + let root_store = TLS_ROOTS + .get_or_try_init(load_certs) + .map_err(ConnectionError::TlsCertificateError)? 
+ .clone(); + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .expect("aws_lc_rs should support the default protocol versions") + .with_root_certificates(root_store) }; let client_config = client_config.with_no_client_auth(); @@ -359,10 +372,15 @@ fn filtered_options(params: &StartupMessageParams) -> Option { Some(options) } -fn load_certs() -> Result, io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> Result, Vec> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + return Err(der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2ec8c7adda..0d5ebd88f9 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::ring::sign; +use rustls::crypto::aws_lc_rs::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; use tracing::{error, info}; @@ -126,12 +126,12 @@ pub fn configure_tls( let cert_resolver = Arc::new(cert_resolver); // allow TLS 1.2 to be compatible with older client libraries - let mut config = rustls::ServerConfig::builder_with_protocol_versions(&[ - &rustls::version::TLS13, - &rustls::version::TLS12, - ]) - .with_no_client_auth() - .with_cert_resolver(cert_resolver.clone()); + let mut config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("aws_lc_rs should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_cert_resolver(cert_resolver.clone()); config.alpn_protocols = vec![PG_ALPN_PROTOCOL.to_vec()]; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index e50ae4bc93..88175d73b1 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -9,6 +9,7 @@ use async_trait::async_trait; use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; +use rustls::crypto::aws_lc_rs; use rustls::pki_types; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; @@ -38,25 +39,27 @@ fn generate_certs( pki_types::CertificateDer<'static>, pki_types::PrivateKeyDer<'static>, )> { - let ca = rcgen::Certificate::from_params({ + let ca_key = rcgen::KeyPair::generate()?; + let ca = { let mut params = rcgen::CertificateParams::default(); params.is_ca = rcgen::IsCa::Ca(rcgen::BasicConstraints::Unconstrained); - params - })?; + params.self_signed(&ca_key)? + }; - let cert = rcgen::Certificate::from_params({ - let mut params = rcgen::CertificateParams::new(vec![hostname.into()]); + let cert_key = rcgen::KeyPair::generate()?; + let cert = { + let mut params = rcgen::CertificateParams::new(vec![hostname.into()])?; params.distinguished_name = rcgen::DistinguishedName::new(); params .distinguished_name .push(rcgen::DnType::CommonName, common_name); - params - })?; + params.signed_by(&cert_key, &ca, &ca_key)? 
+ }; Ok(( - pki_types::CertificateDer::from(ca.serialize_der()?), - pki_types::CertificateDer::from(cert.serialize_der_with_signer(&ca)?), - pki_types::PrivateKeyDer::Pkcs8(cert.serialize_private_key_der().into()), + ca.der().clone(), + cert.der().clone(), + pki_types::PrivateKeyDer::Pkcs8(cert_key.serialize_der().into()), )) } @@ -90,10 +93,13 @@ fn generate_tls_config<'a>( let (ca, cert, key) = generate_certs(hostname, common_name)?; let tls_config = { - let config = rustls::ServerConfig::builder() - .with_no_client_auth() - .with_single_cert(vec![cert.clone()], key.clone_key())? - .into(); + let config = + rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? + .with_no_client_auth() + .with_single_cert(vec![cert.clone()], key.clone_key())? + .into(); let mut cert_resolver = CertResolver::new(); cert_resolver.add_cert(key, vec![cert], true)?; @@ -108,13 +114,16 @@ fn generate_tls_config<'a>( }; let client_config = { - let config = rustls::ClientConfig::builder() - .with_root_certificates({ - let mut store = rustls::RootCertStore::empty(); - store.add(ca)?; - store - }) - .with_no_client_auth(); + let config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? + .with_root_certificates({ + let mut store = rustls::RootCertStore::empty(); + store.add(ca)?; + store + }) + .with_no_client_auth(); ClientConfig { config, hostname } }; diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 15f3665fac..6c312d0036 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -1,10 +1,12 @@ use std::{collections::HashSet, str::FromStr, sync::Arc}; +use anyhow::{bail, Context}; use futures::stream::{StreamExt, TryStreamExt}; use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; +use rustls::crypto::aws_lc_rs; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; @@ -231,10 +233,15 @@ async fn check_timeline( }) } -fn load_certs() -> Result, std::io::Error> { - let der_certs = rustls_native_certs::load_native_certs()?; +fn load_certs() -> anyhow::Result> { + let der_certs = rustls_native_certs::load_native_certs(); + + if !der_certs.errors.is_empty() { + bail!("could not load native tls certs: {:?}", der_certs.errors); + } + let mut store = rustls::RootCertStore::empty(); - store.add_parsable_certificates(der_certs); + store.add_parsable_certificates(der_certs.certs); Ok(Arc::new(store)) } static TLS_ROOTS: OnceCell> = OnceCell::new(); @@ -248,9 +255,12 @@ async fn load_timelines_from_db( // Use rustls (Neon requires TLS) let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); - let client_config = rustls::ClientConfig::builder() - .with_root_certificates(root_store) - .with_no_client_auth(); + let client_config = + rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + .with_safe_default_protocol_versions() + .context("aws_lc_rs should support the default protocol versions")? 
+ .with_root_certificates(root_store) + .with_no_client_auth(); let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); let (client, connection) = tokio_postgres::connect(&dump_db_connstr, tls_connector).await?; // The connection object performs the actual communication with the database, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 1347d6ddff..28c51b8ac1 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -32,7 +32,6 @@ deranged = { version = "0.3", default-features = false, features = ["powerfmt", digest = { version = "0.10", features = ["mac", "oid", "std"] } either = { version = "1" } fail = { version = "0.5", default-features = false, features = ["failpoints"] } -futures = { version = "0.3" } futures-channel = { version = "0.3", features = ["sink"] } futures-executor = { version = "0.3" } futures-io = { version = "0.3" } @@ -48,7 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools = { version = "0.10" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -66,6 +65,8 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +rustls = { version = "0.23", features = ["ring"] } +rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } @@ -79,6 +80,7 @@ tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } +tokio-rustls = { version = "0.26", features = ["ring"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } @@ -104,7 +106,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.12" } +itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } @@ -122,8 +124,7 @@ 
regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } serde = { version = "1", features = ["alloc", "derive"] } -syn-dff4ba8e3ae991db = { package = "syn", version = "1", features = ["extra-traits", "full", "visit"] } -syn-f595c2ba2a3f28df = { package = "syn", version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } +syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_edit = { version = "0.22", features = ["serde"] } zstd = { version = "0.13" } From b8304f90d6ad9a5f118a59ac392b3330495827d3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 18 Oct 2024 10:27:50 +0100 Subject: [PATCH 039/239] 2024 oct new clippy lints (#9448) Fixes new lints from `cargo +nightly clippy` (`clippy 0.1.83 (798fb83f 2024-10-16)`) --- compute_tools/src/extension_server.rs | 2 +- .../pageserver_api/src/models/partitioning.rs | 6 ++-- libs/postgres_backend/src/lib.rs | 3 +- libs/pq_proto/src/lib.rs | 2 +- libs/tenant_size_model/src/svg.rs | 2 +- libs/tracing-utils/src/http.rs | 2 +- libs/utils/src/lsn.rs | 2 +- libs/utils/src/poison.rs | 4 +-- libs/utils/src/shard.rs | 2 +- libs/utils/src/simple_rcu.rs | 4 +-- libs/utils/src/sync/heavier_once_cell.rs | 4 +-- libs/utils/src/tracing_span_assert.rs | 10 +++---- pageserver/compaction/src/helpers.rs | 10 +++---- pageserver/src/consumption_metrics/upload.rs | 2 +- pageserver/src/disk_usage_eviction_task.rs | 2 +- pageserver/src/metrics.rs | 4 +-- pageserver/src/statvfs.rs | 2 +- pageserver/src/tenant/block_io.rs | 4 +-- pageserver/src/tenant/disk_btree.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 2 +- .../src/tenant/secondary/heatmap_uploader.rs | 1 - pageserver/src/tenant/storage_layer.rs | 2 +- .../src/tenant/storage_layer/delta_layer.rs | 3 +- .../src/tenant/storage_layer/image_layer.rs | 3 +- pageserver/src/tenant/storage_layer/layer.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 2 +- .../tenant/storage_layer/merge_iterator.rs | 8 +++--- pageserver/src/tenant/vectored_blob_io.rs | 21 +++----------- pageserver/src/virtual_file.rs | 4 +-- proxy/src/auth/credentials.rs | 2 +- proxy/src/config.rs | 2 +- proxy/src/context/parquet.rs | 2 +- proxy/src/intern.rs | 2 +- proxy/src/lib.rs | 6 +--- proxy/src/proxy/tests/mod.rs | 10 +++---- proxy/src/scram/exchange.rs | 4 --- proxy/src/serverless/conn_pool.rs | 12 ++++---- proxy/src/serverless/conn_pool_lib.rs | 28 +++++++++---------- proxy/src/serverless/http_conn_pool.rs | 3 +- proxy/src/serverless/json.rs | 6 ++-- proxy/src/serverless/local_conn_pool.rs | 3 +- proxy/src/serverless/sql_over_http.rs | 1 - proxy/src/usage_metrics.rs | 10 +++---- proxy/src/waiters.rs | 2 +- safekeeper/src/timeline.rs | 6 ++-- 45 files changed, 92 insertions(+), 124 deletions(-) diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs index 6ef7e0837f..da2d107b54 100644 --- a/compute_tools/src/extension_server.rs +++ b/compute_tools/src/extension_server.rs @@ -107,7 +107,7 @@ pub fn get_pg_version(pgbin: &str) -> String { // pg_config --version returns a (platform specific) human readable string // such as "PostgreSQL 15.4". We parse this to v14/v15/v16 etc. 
let human_version = get_pg_config("--version", pgbin); - return parse_pg_version(&human_version).to_string(); + parse_pg_version(&human_version).to_string() } fn parse_pg_version(human_version: &str) -> &str { diff --git a/libs/pageserver_api/src/models/partitioning.rs b/libs/pageserver_api/src/models/partitioning.rs index f6644be635..69832b9a0d 100644 --- a/libs/pageserver_api/src/models/partitioning.rs +++ b/libs/pageserver_api/src/models/partitioning.rs @@ -16,7 +16,7 @@ impl serde::Serialize for Partitioning { { pub struct KeySpace<'a>(&'a crate::keyspace::KeySpace); - impl<'a> serde::Serialize for KeySpace<'a> { + impl serde::Serialize for KeySpace<'_> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -44,7 +44,7 @@ impl serde::Serialize for Partitioning { pub struct WithDisplay<'a, T>(&'a T); -impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { +impl serde::Serialize for WithDisplay<'_, T> { fn serialize(&self, serializer: S) -> std::result::Result where S: serde::Serializer, @@ -55,7 +55,7 @@ impl<'a, T: std::fmt::Display> serde::Serialize for WithDisplay<'a, T> { pub struct KeyRange<'a>(&'a std::ops::Range); -impl<'a> serde::Serialize for KeyRange<'a> { +impl serde::Serialize for KeyRange<'_> { fn serialize(&self, serializer: S) -> Result where S: serde::Serializer, diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 085540e7b9..9d274b25e6 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -921,12 +921,11 @@ impl PostgresBackendReader { /// A futures::AsyncWrite implementation that wraps all data written to it in CopyData /// messages. /// - pub struct CopyDataWriter<'a, IO> { pgb: &'a mut PostgresBackend, } -impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, IO> { +impl AsyncWrite for CopyDataWriter<'_, IO> { fn poll_write( self: Pin<&mut Self>, cx: &mut std::task::Context<'_>, diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index a01191bd5d..9ffaaba584 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -727,7 +727,7 @@ pub const SQLSTATE_INTERNAL_ERROR: &[u8; 5] = b"XX000"; pub const SQLSTATE_ADMIN_SHUTDOWN: &[u8; 5] = b"57P01"; pub const SQLSTATE_SUCCESSFUL_COMPLETION: &[u8; 5] = b"00000"; -impl<'a> BeMessage<'a> { +impl BeMessage<'_> { /// Serialize `message` to the given `buf`. 
/// Apart from smart memory managemet, BytesMut is good here as msg len /// precedes its body and it is handy to write it down first and then fill diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs index 0de2890bb4..25ebb1c3d8 100644 --- a/libs/tenant_size_model/src/svg.rs +++ b/libs/tenant_size_model/src/svg.rs @@ -97,7 +97,7 @@ pub fn draw_svg( Ok(result) } -impl<'a> SvgDraw<'a> { +impl SvgDraw<'_> { fn calculate_svg_layout(&mut self) { // Find x scale let segments = &self.storage.segments; diff --git a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index e6fdf9be45..2168beee88 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -82,7 +82,7 @@ where fn extract_remote_context(headers: &HeaderMap) -> opentelemetry::Context { struct HeaderExtractor<'a>(&'a HeaderMap); - impl<'a> opentelemetry::propagation::Extractor for HeaderExtractor<'a> { + impl opentelemetry::propagation::Extractor for HeaderExtractor<'_> { fn get(&self, key: &str) -> Option<&str> { self.0.get(key).and_then(|value| value.to_str().ok()) } diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 06d5c27ebf..3ec2c130bd 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -37,7 +37,7 @@ impl<'de> Deserialize<'de> for Lsn { is_human_readable_deserializer: bool, } - impl<'de> Visitor<'de> for LsnVisitor { + impl Visitor<'_> for LsnVisitor { type Value = Lsn; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/libs/utils/src/poison.rs b/libs/utils/src/poison.rs index c3e2fba20c..ab9ebb3c5a 100644 --- a/libs/utils/src/poison.rs +++ b/libs/utils/src/poison.rs @@ -73,7 +73,7 @@ impl Poison { /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error. pub struct Guard<'a, T>(&'a mut Poison); -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { pub fn data(&self) -> &T { &self.0.data } @@ -94,7 +94,7 @@ impl<'a, T> Guard<'a, T> { } } -impl<'a, T> Drop for Guard<'a, T> { +impl Drop for Guard<'_, T> { fn drop(&mut self) { match self.0.state { State::Clean => { diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index d146010b41..782cddc599 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -164,7 +164,7 @@ impl TenantShardId { } } -impl<'a> std::fmt::Display for ShardSlug<'a> { +impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( f, diff --git a/libs/utils/src/simple_rcu.rs b/libs/utils/src/simple_rcu.rs index 01750b2aef..6700f86e4a 100644 --- a/libs/utils/src/simple_rcu.rs +++ b/libs/utils/src/simple_rcu.rs @@ -152,7 +152,7 @@ pub struct RcuWriteGuard<'a, V> { inner: RwLockWriteGuard<'a, RcuInner>, } -impl<'a, V> Deref for RcuWriteGuard<'a, V> { +impl Deref for RcuWriteGuard<'_, V> { type Target = V; fn deref(&self) -> &V { @@ -160,7 +160,7 @@ impl<'a, V> Deref for RcuWriteGuard<'a, V> { } } -impl<'a, V> RcuWriteGuard<'a, V> { +impl RcuWriteGuard<'_, V> { /// /// Store a new value. The new value will be written to the Rcu immediately, /// and will be immediately seen by any `read` calls that start afterwards. 
diff --git a/libs/utils/src/sync/heavier_once_cell.rs b/libs/utils/src/sync/heavier_once_cell.rs index dc711fb028..66c2065554 100644 --- a/libs/utils/src/sync/heavier_once_cell.rs +++ b/libs/utils/src/sync/heavier_once_cell.rs @@ -219,7 +219,7 @@ impl<'a, T> CountWaitingInitializers<'a, T> { } } -impl<'a, T> Drop for CountWaitingInitializers<'a, T> { +impl Drop for CountWaitingInitializers<'_, T> { fn drop(&mut self) { self.0.initializers.fetch_sub(1, Ordering::Relaxed); } @@ -250,7 +250,7 @@ impl std::ops::DerefMut for Guard<'_, T> { } } -impl<'a, T> Guard<'a, T> { +impl Guard<'_, T> { /// Take the current value, and a new permit for it's deinitialization. /// /// The permit will be on a semaphore part of the new internal value, and any following diff --git a/libs/utils/src/tracing_span_assert.rs b/libs/utils/src/tracing_span_assert.rs index d24c81ad0b..add2fa7920 100644 --- a/libs/utils/src/tracing_span_assert.rs +++ b/libs/utils/src/tracing_span_assert.rs @@ -184,23 +184,23 @@ mod tests { struct MemoryIdentity<'a>(&'a dyn Extractor); - impl<'a> MemoryIdentity<'a> { + impl MemoryIdentity<'_> { fn as_ptr(&self) -> *const () { self.0 as *const _ as *const () } } - impl<'a> PartialEq for MemoryIdentity<'a> { + impl PartialEq for MemoryIdentity<'_> { fn eq(&self, other: &Self) -> bool { self.as_ptr() == other.as_ptr() } } - impl<'a> Eq for MemoryIdentity<'a> {} - impl<'a> Hash for MemoryIdentity<'a> { + impl Eq for MemoryIdentity<'_> {} + impl Hash for MemoryIdentity<'_> { fn hash(&self, state: &mut H) { self.as_ptr().hash(state); } } - impl<'a> fmt::Debug for MemoryIdentity<'a> { + impl fmt::Debug for MemoryIdentity<'_> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{:p}: {}", self.as_ptr(), self.0.id()) } diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 8ed1d16082..9dbb6ecedf 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -133,7 +133,7 @@ enum LazyLoadLayer<'a, E: CompactionJobExecutor> { Loaded(VecDeque<>::DeltaEntry<'a>>), Unloaded(&'a E::DeltaLayer), } -impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { +impl LazyLoadLayer<'_, E> { fn min_key(&self) -> E::Key { match self { Self::Loaded(entries) => entries.front().unwrap().key(), @@ -147,23 +147,23 @@ impl<'a, E: CompactionJobExecutor> LazyLoadLayer<'a, E> { } } } -impl<'a, E: CompactionJobExecutor> PartialOrd for LazyLoadLayer<'a, E> { +impl PartialOrd for LazyLoadLayer<'_, E> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a, E: CompactionJobExecutor> Ord for LazyLoadLayer<'a, E> { +impl Ord for LazyLoadLayer<'_, E> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { // reverse order so that we get a min-heap (other.min_key(), other.min_lsn()).cmp(&(self.min_key(), self.min_lsn())) } } -impl<'a, E: CompactionJobExecutor> PartialEq for LazyLoadLayer<'a, E> { +impl PartialEq for LazyLoadLayer<'_, E> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == std::cmp::Ordering::Equal } } -impl<'a, E: CompactionJobExecutor> Eq for LazyLoadLayer<'a, E> {} +impl Eq for LazyLoadLayer<'_, E> {} type LoadFuture<'a, E> = BoxFuture<'a, anyhow::Result>>; diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 0325ee403a..1eb25d337b 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -198,7 +198,7 @@ fn serialize_in_chunks<'a>( } } - impl<'a> ExactSizeIterator for 
Iter<'a> {} + impl ExactSizeIterator for Iter<'_> {} let buffer = bytes::BytesMut::new(); let inner = input.chunks(chunk_size); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 7ab2ba8742..ca44fbe6ae 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -654,7 +654,7 @@ impl std::fmt::Debug for EvictionCandidate { let ts = chrono::DateTime::::from(self.last_activity_ts); let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true); struct DisplayIsDebug<'a, T>(&'a T); - impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> { + impl std::fmt::Debug for DisplayIsDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}", self.0) } diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index b76efa5b48..3e824b59fb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1189,7 +1189,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { op: SmgrQueryType, } -impl<'a, 'c> Drop for GlobalAndPerTimelineHistogramTimer<'a, 'c> { +impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { fn drop(&mut self) { let elapsed = self.start.elapsed(); let ex_throttled = self @@ -1560,7 +1560,7 @@ impl BasebackupQueryTime { } } -impl<'a, 'c> BasebackupQueryTimeOngoingRecording<'a, 'c> { +impl BasebackupQueryTimeOngoingRecording<'_, '_> { pub(crate) fn observe(self, res: &Result) { let elapsed = self.start.elapsed(); let ex_throttled = self diff --git a/pageserver/src/statvfs.rs b/pageserver/src/statvfs.rs index 205605bc86..4e8be58d58 100644 --- a/pageserver/src/statvfs.rs +++ b/pageserver/src/statvfs.rs @@ -90,7 +90,7 @@ pub mod mock { let used_bytes = walk_dir_disk_usage(tenants_dir, name_filter.as_deref()).unwrap(); // round it up to the nearest block multiple - let used_blocks = (used_bytes + (blocksize - 1)) / blocksize; + let used_blocks = used_bytes.div_ceil(*blocksize); if used_blocks > *total_blocks { panic!( diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 3afa3a86b9..1c82e5454d 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -50,13 +50,13 @@ impl From> for BlockLease<'static> { } #[cfg(test)] -impl<'a> From> for BlockLease<'a> { +impl From> for BlockLease<'_> { fn from(value: std::sync::Arc<[u8; PAGE_SZ]>) -> Self { BlockLease::Arc(value) } } -impl<'a> Deref for BlockLease<'a> { +impl Deref for BlockLease<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 0107b0ac7e..b302cbc975 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -131,7 +131,7 @@ struct OnDiskNode<'a, const L: usize> { values: &'a [u8], } -impl<'a, const L: usize> OnDiskNode<'a, L> { +impl OnDiskNode<'_, L> { /// /// Interpret a PAGE_SZ page as a node. 
/// diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 450084aca2..14b894d17c 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2182,7 +2182,7 @@ pub(crate) struct UploadQueueAccessor<'a> { inner: std::sync::MutexGuard<'a, UploadQueue>, } -impl<'a> UploadQueueAccessor<'a> { +impl UploadQueueAccessor<'_> { pub(crate) fn latest_uploaded_index_part(&self) -> &IndexPart { match &*self.inner { UploadQueue::Initialized(x) => &x.clean.0, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index 0aad5bf392..e680fd705b 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -108,7 +108,6 @@ impl scheduler::Completion for WriteComplete { /// when we last did a write. We only populate this after doing at least one /// write for a tenant -- this avoids holding state for tenants that have /// uploads disabled. - struct UploaderTenantState { // This Weak only exists to enable culling idle instances of this type // when the Tenant has been deallocated. diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 99bd0ece57..a229b59560 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -705,7 +705,7 @@ pub mod tests { /// Useful with `Key`, which has too verbose `{:?}` for printing multiple layers. struct RangeDisplayDebug<'a, T: std::fmt::Display>(&'a Range); -impl<'a, T: std::fmt::Display> std::fmt::Debug for RangeDisplayDebug<'a, T> { +impl std::fmt::Debug for RangeDisplayDebug<'_, T> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!(f, "{}..{}", self.0.start, self.0.end) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 8be7d7876f..d1079876f8 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -529,8 +529,7 @@ impl DeltaLayerWriterInner { key_end: Key, ctx: &RequestContext, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; let mut file = self.blob_writer.into_inner(ctx).await?; diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index de8155f455..6c1a943470 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -828,8 +828,7 @@ impl ImageLayerWriterInner { ctx: &RequestContext, end_key: Option, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { - let index_start_blk = - ((self.blob_writer.size() + PAGE_SZ as u64 - 1) / PAGE_SZ as u64) as u32; + let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; // Calculate compression ratio let compressed_size = self.blob_writer.size() - PAGE_SZ as u64; // Subtract PAGE_SZ for header diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f29a33bae6..38a7cd09af 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -978,7 +978,7 @@ impl LayerInner { let timeline = self .timeline 
.upgrade() - .ok_or_else(|| DownloadError::TimelineShutdown)?; + .ok_or(DownloadError::TimelineShutdown)?; // count cancellations, which currently remain largely unexpected let init_cancelled = scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled()); diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index ffe7ca5f3e..8e750e1187 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -339,7 +339,7 @@ impl<'de> serde::Deserialize<'de> for LayerName { struct LayerNameVisitor; -impl<'de> serde::de::Visitor<'de> for LayerNameVisitor { +impl serde::de::Visitor<'_> for LayerNameVisitor { type Value = LayerName; fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 0831fd9530..f91e27241d 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -99,21 +99,21 @@ impl<'a> PeekableLayerIterRef<'a> { } } -impl<'a> std::cmp::PartialEq for IteratorWrapper<'a> { +impl std::cmp::PartialEq for IteratorWrapper<'_> { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl<'a> std::cmp::Eq for IteratorWrapper<'a> {} +impl std::cmp::Eq for IteratorWrapper<'_> {} -impl<'a> std::cmp::PartialOrd for IteratorWrapper<'a> { +impl std::cmp::PartialOrd for IteratorWrapper<'_> { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl<'a> std::cmp::Ord for IteratorWrapper<'a> { +impl std::cmp::Ord for IteratorWrapper<'_> { fn cmp(&self, other: &Self) -> std::cmp::Ordering { use std::cmp::Ordering; let a = self.peek_next_key_lsn_value(); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 792c769b4f..0c03791034 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -73,7 +73,7 @@ impl<'a> BufView<'a> { } } -impl<'a> Deref for BufView<'a> { +impl Deref for BufView<'_> { type Target = [u8]; fn deref(&self) -> &Self::Target { @@ -84,7 +84,7 @@ impl<'a> Deref for BufView<'a> { } } -impl<'a> AsRef<[u8]> for BufView<'a> { +impl AsRef<[u8]> for BufView<'_> { fn as_ref(&self) -> &[u8] { match self { BufView::Slice(slice) => slice, @@ -196,11 +196,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { max_read_size: Option, } -/// Computes x / d rounded up. -fn div_round_up(x: usize, d: usize) -> usize { - (x + (d - 1)) / d -} - impl ChunkedVectoredReadBuilder { const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. 
@@ -220,7 +215,7 @@ impl ChunkedVectoredReadBuilder { .expect("First insertion always succeeds"); let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); + let end_blk_no = (end_offset as usize).div_ceil(Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, @@ -248,7 +243,7 @@ impl ChunkedVectoredReadBuilder { pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); let start_blk_no = start as usize / Self::CHUNK_SIZE; - let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); + let end_blk_no = (end as usize).div_ceil(Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { @@ -975,12 +970,4 @@ mod tests { round_trip_test_compressed(&blobs, true).await?; Ok(()) } - - #[test] - fn test_div_round_up() { - const CHUNK_SIZE: usize = 512; - assert_eq!(1, div_round_up(200, CHUNK_SIZE)); - assert_eq!(1, div_round_up(CHUNK_SIZE, CHUNK_SIZE)); - assert_eq!(2, div_round_up(CHUNK_SIZE + 1, CHUNK_SIZE)); - } } diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index d260116b38..5a364b7aaf 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -724,9 +724,9 @@ impl VirtualFileInner { *handle_guard = handle; - return Ok(FileGuard { + Ok(FileGuard { slot_guard: slot_guard.downgrade(), - }); + }) } pub fn remove(self) { diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index fa6bc4c6f5..465e427f7c 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -193,7 +193,7 @@ impl<'de> serde::de::Deserialize<'de> for IpPattern { D: serde::Deserializer<'de>, { struct StrVisitor; - impl<'de> serde::de::Visitor<'de> for StrVisitor { + impl serde::de::Visitor<'_> for StrVisitor { type Value = IpPattern; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 0d5ebd88f9..3baa7ec751 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -558,7 +558,7 @@ pub struct RetryConfig { } impl RetryConfig { - /// Default options for RetryConfig. + // Default options for RetryConfig. /// Total delay for 5 retries with 200ms base delay and 2 backoff factor is about 6s. 
pub const CONNECT_TO_COMPUTE_DEFAULT_VALUES: &'static str = diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b0ad0e4566..3432ac5ff6 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -104,7 +104,7 @@ struct Options<'a> { options: &'a StartupMessageParams, } -impl<'a> serde::Serialize for Options<'a> { +impl serde::Serialize for Options<'_> { fn serialize(&self, s: S) -> Result where S: serde::Serializer, diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 09fd9657d0..49aab917e4 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -55,7 +55,7 @@ impl std::ops::Deref for InternedString { impl<'de, Id: InternId> serde::de::Deserialize<'de> for InternedString { fn deserialize>(d: D) -> Result { struct Visitor(PhantomData); - impl<'de, Id: InternId> serde::de::Visitor<'de> for Visitor { + impl serde::de::Visitor<'_> for Visitor { type Value = InternedString; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 74bc778a36..a7b3d45c95 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -76,11 +76,7 @@ ) )] // List of temporarily allowed lints to unblock beta/nightly. -#![allow( - unknown_lints, - // TODO: 1.82: Add `use` where necessary and remove from this list. - impl_trait_overcaptures, -)] +#![allow(unknown_lints)] use std::convert::Infallible; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 88175d73b1..3f54b0661b 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -73,11 +73,11 @@ impl ClientConfig<'_> { self, ) -> anyhow::Result< impl tokio_postgres::tls::TlsConnect< - S, - Error = impl std::fmt::Debug, - Future = impl Send, - Stream = RustlsStream, - >, + S, + Error = impl std::fmt::Debug + use, + Future = impl Send + use, + Stream = RustlsStream, + > + use, > { let mut mk = MakeRustlsConnect::new(self.config); let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; diff --git a/proxy/src/scram/exchange.rs b/proxy/src/scram/exchange.rs index 493295c938..6a13f645a5 100644 --- a/proxy/src/scram/exchange.rs +++ b/proxy/src/scram/exchange.rs @@ -218,16 +218,12 @@ impl sasl::Mechanism for Exchange<'_> { self.state = ExchangeState::SaltSent(sent); Ok(Step::Continue(self, msg)) } - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Success(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } ExchangeState::SaltSent(sent) => { match sent.transition(self.secret, &self.tls_server_end_point, input)? 
{ Step::Success(keys, msg) => Ok(Step::Success(keys, msg)), - #[allow(unreachable_patterns)] // TODO: 1.82: simply drop this match - Step::Continue(x, _) => match x {}, Step::Failure(msg) => Ok(Step::Failure(msg)), } } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index b97c656510..8401e3a1c9 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -11,13 +11,6 @@ use tokio_postgres::tls::NoTlsStream; use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; - -use crate::context::RequestMonitoring; -use crate::control_plane::messages::MetricsAuxInfo; -use crate::metrics::Metrics; - -use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; - #[cfg(test)] use { super::conn_pool_lib::GlobalConnPoolOptions, @@ -25,6 +18,11 @@ use { std::{sync::atomic, time::Duration}, }; +use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::MetricsAuxInfo; +use crate::metrics::Metrics; + #[derive(Debug, Clone)] pub(crate) struct ConnInfoWithAuth { pub(crate) conn_info: ConnInfo, diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 6e964ce878..844730194d 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -1,25 +1,23 @@ +use std::collections::HashMap; +use std::ops::Deref; +use std::sync::atomic::{self, AtomicUsize}; +use std::sync::{Arc, Weak}; +use std::time::Duration; + use dashmap::DashMap; use parking_lot::RwLock; use rand::Rng; -use std::{collections::HashMap, sync::Arc, sync::Weak, time::Duration}; -use std::{ - ops::Deref, - sync::atomic::{self, AtomicUsize}, -}; use tokio_postgres::ReadyForQueryStatus; +use tracing::{debug, info, Span}; +use super::backend::HttpConnError; +use super::conn_pool::ClientInnerRemote; +use crate::auth::backend::ComputeUserInfo; +use crate::context::RequestMonitoring; use crate::control_plane::messages::ColdStartInfo; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{ - auth::backend::ComputeUserInfo, context::RequestMonitoring, DbName, EndpointCacheKey, RoleName, -}; - -use super::conn_pool::ClientInnerRemote; -use tracing::info; -use tracing::{debug, Span}; - -use super::backend::HttpConnError; +use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { @@ -482,7 +480,7 @@ impl Client { }) } - pub(crate) fn do_drop(&mut self) -> Option { + pub(crate) fn do_drop(&mut self) -> Option> { let conn_info = self.conn_info.clone(); let client = self .inner diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 79bb19328f..363e397976 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -10,12 +10,11 @@ use rand::Rng; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; +use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; - -use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; diff 
--git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 8c56d317cc..569e2da571 100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -155,10 +155,10 @@ fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result Result { - _pg_array_parse(pg_array, elem_type, false).map(|(v, _)| v) + pg_array_parse_inner(pg_array, elem_type, false).map(|(v, _)| v) } -fn _pg_array_parse( +fn pg_array_parse_inner( pg_array: &str, elem_type: &Type, nested: bool, @@ -211,7 +211,7 @@ fn _pg_array_parse( '{' if !quote => { level += 1; if level > 1 { - let (res, off) = _pg_array_parse(&pg_array[i..], elem_type, true)?; + let (res, off) = pg_array_parse_inner(&pg_array[i..], elem_type, true)?; entries.push(res); for _ in 0..off - 1 { pg_array_chr.next(); diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index c4fdd00f78..a01afd2820 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -25,7 +25,6 @@ use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; - use crate::{DbName, RoleName}; struct ConnPoolEntry { @@ -530,7 +529,7 @@ impl LocalClient { }) } - fn do_drop(&mut self) -> Option { + fn do_drop(&mut self) -> Option> { let conn_info = self.conn_info.clone(); let client = self .inner diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index bb5eb390a6..6fbb044669 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,7 +38,6 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; - use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; use crate::{DbName, RoleName}; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index c5384c0b0e..f944d5aec3 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -375,7 +375,7 @@ pub async fn task_backup( let now = Utc::now(); collect_metrics_backup_iteration( &USAGE_METRICS.backup_endpoints, - &storage, + storage.as_ref(), &hostname, prev, now, @@ -395,7 +395,7 @@ pub async fn task_backup( #[instrument(skip_all)] async fn collect_metrics_backup_iteration( endpoints: &DashMap, FastHasher>, - storage: &Option, + storage: Option<&GenericRemoteStorage>, hostname: &str, prev: DateTime, now: DateTime, @@ -446,7 +446,7 @@ async fn collect_metrics_backup_iteration( } async fn upload_events_chunk( - storage: &Option, + storage: Option<&GenericRemoteStorage>, chunk: EventChunk<'_, Event>, remote_path: &RemotePath, cancel: &CancellationToken, @@ -577,10 +577,10 @@ mod tests { // counter is unregistered assert!(metrics.endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; assert!(!metrics.backup_endpoints.is_empty()); - collect_metrics_backup_iteration(&metrics.backup_endpoints, &None, "foo", now, now, 1000) + collect_metrics_backup_iteration(&metrics.backup_endpoints, None, "foo", now, now, 1000) .await; // backup counter is unregistered after the second iteration assert!(metrics.backup_endpoints.is_empty()); diff --git a/proxy/src/waiters.rs 
b/proxy/src/waiters.rs index 7e07f6a2af..330e73f02f 100644 --- a/proxy/src/waiters.rs +++ b/proxy/src/waiters.rs @@ -73,7 +73,7 @@ struct DropKey<'a, T> { registry: &'a Waiters, } -impl<'a, T> Drop for DropKey<'a, T> { +impl Drop for DropKey<'_, T> { fn drop(&mut self) { self.registry.0.lock().remove(&self.key); } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 3494b0b764..41b9490088 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -122,7 +122,7 @@ impl<'a> WriteGuardSharedState<'a> { } } -impl<'a> Deref for WriteGuardSharedState<'a> { +impl Deref for WriteGuardSharedState<'_> { type Target = SharedState; fn deref(&self) -> &Self::Target { @@ -130,13 +130,13 @@ impl<'a> Deref for WriteGuardSharedState<'a> { } } -impl<'a> DerefMut for WriteGuardSharedState<'a> { +impl DerefMut for WriteGuardSharedState<'_> { fn deref_mut(&mut self) -> &mut Self::Target { &mut self.guard } } -impl<'a> Drop for WriteGuardSharedState<'a> { +impl Drop for WriteGuardSharedState<'_> { fn drop(&mut self) { let term_flush_lsn = TermLsn::from((self.guard.sk.last_log_term(), self.guard.sk.flush_lsn())); From 24654b8eee8706e8ae98948733a28b56df83536b Mon Sep 17 00:00:00 2001 From: Jere Vaara Date: Fri, 18 Oct 2024 13:25:45 +0300 Subject: [PATCH 040/239] compute_ctl: Add endpoint that allows setting role grants (#9395) This PR introduces a `/grants` endpoint which allows setting specific `privileges` to certain `role` for a certain `schema`. Related to #9344 Together these endpoints will be used to configure JWT extension and set correct usage to its schema to specific roles that will need them. --------- Co-authored-by: Conrad Ludgate --- compute_tools/src/compute.rs | 43 ++++++++++++ compute_tools/src/http/api.rs | 48 ++++++++++++- compute_tools/src/http/openapi_spec.yaml | 89 ++++++++++++++++++++++++ libs/compute_api/src/lib.rs | 1 + libs/compute_api/src/privilege.rs | 35 ++++++++++ libs/compute_api/src/requests.rs | 13 +++- libs/compute_api/src/responses.rs | 13 +++- test_runner/fixtures/endpoint/http.py | 8 +++ test_runner/regress/test_role_grants.py | 41 +++++++++++ 9 files changed, 287 insertions(+), 4 deletions(-) create mode 100644 libs/compute_api/src/privilege.rs create mode 100644 test_runner/regress/test_role_grants.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 6aec008f3a..11fee73f03 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -15,6 +15,7 @@ use std::time::Instant; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use compute_api::spec::PgIdent; use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -25,6 +26,7 @@ use tracing::{debug, error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; +use compute_api::privilege::Privilege; use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; @@ -1373,6 +1375,47 @@ LIMIT 100", download_size } + pub async fn set_role_grants( + &self, + db_name: &PgIdent, + schema_name: &PgIdent, + privileges: &[Privilege], + role_name: &PgIdent, + ) -> Result<()> { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + // 
TODO: support other types of grants apart from schemas? + let query = format!( + "GRANT {} ON SCHEMA {} TO {}", + privileges + .iter() + // should not be quoted as it's part of the command. + // is already sanitized so it's ok + .map(|p| p.as_str()) + .collect::>() + .join(", "), + // quote the schema and role name as identifiers to sanitize them. + schema_name.pg_quote(), + role_name.pg_quote(), + ); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + + Ok(()) + } + #[tokio::main] pub async fn prepare_preload_libraries( &self, diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 79e6158081..133ab9f5af 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,8 +9,10 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use compute_api::requests::ConfigurationRequest; -use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError}; +use compute_api::requests::{ConfigurationRequest, SetRoleGrantsRequest}; +use compute_api::responses::{ + ComputeStatus, ComputeStatusResponse, GenericAPIError, SetRoleGrantsResponse, +}; use anyhow::Result; use hyper::header::CONTENT_TYPE; @@ -165,6 +167,48 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /grants POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for set_role_grants request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + + let res = compute + .set_role_grants( + &request.database, + &request.schema, + &request.privileges, + &request.role, + ) + .await; + match res { + Ok(()) => render_json(Body::from( + serde_json::to_string(&SetRoleGrantsResponse { + database: request.database, + schema: request.schema, + role: request.role, + privileges: request.privileges, + }) + .unwrap(), + )), + Err(e) => render_json_error( + &format!("could not grant role privileges to the schema: {e}"), + // TODO: can we filter on role/schema not found errors + // and return appropriate error code? + StatusCode::INTERNAL_SERVER_ERROR, + ), + } + } + // get the list of installed extensions // currently only used in python tests // TODO: call it from cplane diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index e9fa66b323..73dbdc3ee9 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -127,6 +127,41 @@ paths: schema: $ref: "#/components/schemas/GenericError" + /grants: + post: + tags: + - Grants + summary: Apply grants to the database. + description: "" + operationId: setRoleGrants + requestBody: + description: Grants request. + required: true + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsRequest" + responses: + 200: + description: Grants applied. + content: + application/json: + schema: + $ref: "#/components/schemas/SetRoleGrantsResponse" + 412: + description: | + Compute is not in the right state for processing the request. 
+ content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error occurred during grants application. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /check_writability: post: tags: @@ -427,6 +462,60 @@ components: n_databases: type: integer + SetRoleGrantsRequest: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges to set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + + SetRoleGrantsResponse: + type: object + required: + - database + - schema + - privileges + - role + properties: + database: + type: string + description: Database name. + example: "neondb" + schema: + type: string + description: Schema name. + example: "public" + privileges: + type: array + items: + type: string + description: List of privileges set. + example: ["SELECT", "INSERT"] + role: + type: string + description: Role name. + example: "neon" + # # Errors # diff --git a/libs/compute_api/src/lib.rs b/libs/compute_api/src/lib.rs index 210a52d089..f4f3d92fc6 100644 --- a/libs/compute_api/src/lib.rs +++ b/libs/compute_api/src/lib.rs @@ -1,5 +1,6 @@ #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] +pub mod privilege; pub mod requests; pub mod responses; pub mod spec; diff --git a/libs/compute_api/src/privilege.rs b/libs/compute_api/src/privilege.rs new file mode 100644 index 0000000000..dc0d870946 --- /dev/null +++ b/libs/compute_api/src/privilege.rs @@ -0,0 +1,35 @@ +#[derive(Debug, Clone, serde::Deserialize, serde::Serialize)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Select, + Insert, + Update, + Delete, + Truncate, + References, + Trigger, + Usage, + Create, + Connect, + Temporary, + Execute, +} + +impl Privilege { + pub fn as_str(&self) -> &'static str { + match self { + Privilege::Select => "SELECT", + Privilege::Insert => "INSERT", + Privilege::Update => "UPDATE", + Privilege::Delete => "DELETE", + Privilege::Truncate => "TRUNCATE", + Privilege::References => "REFERENCES", + Privilege::Trigger => "TRIGGER", + Privilege::Usage => "USAGE", + Privilege::Create => "CREATE", + Privilege::Connect => "CONNECT", + Privilege::Temporary => "TEMPORARY", + Privilege::Execute => "EXECUTE", + } + } +} diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index 5896c7dc65..fbc7577dd9 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,6 +1,9 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. 
-use crate::spec::ComputeSpec; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, PgIdent}, +}; use serde::Deserialize; /// Request of the /configure API @@ -12,3 +15,11 @@ use serde::Deserialize; pub struct ConfigurationRequest { pub spec: ComputeSpec, } + +#[derive(Deserialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 5023fce003..fadf524273 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -6,7 +6,10 @@ use std::fmt::Display; use chrono::{DateTime, Utc}; use serde::{Deserialize, Serialize, Serializer}; -use crate::spec::{ComputeSpec, Database, Role}; +use crate::{ + privilege::Privilege, + spec::{ComputeSpec, Database, PgIdent, Role}, +}; #[derive(Serialize, Debug, Deserialize)] pub struct GenericAPIError { @@ -168,3 +171,11 @@ pub struct InstalledExtension { pub struct InstalledExtensions { pub extensions: Vec, } + +#[derive(Clone, Debug, Default, Serialize)] +pub struct SetRoleGrantsResponse { + pub database: PgIdent, + pub schema: PgIdent, + pub privileges: Vec, + pub role: PgIdent, +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index 26895df8a6..e7b014b4a9 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -28,3 +28,11 @@ class EndpointHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/installed_extensions") res.raise_for_status() return res.json() + + def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): + res = self.post( + f"http://localhost:{self.port}/grants", + json={"database": database, "schema": schema, "role": role, "privileges": privileges}, + ) + res.raise_for_status() + return res.json() diff --git a/test_runner/regress/test_role_grants.py b/test_runner/regress/test_role_grants.py new file mode 100644 index 0000000000..b2251875f0 --- /dev/null +++ b/test_runner/regress/test_role_grants.py @@ -0,0 +1,41 @@ +import psycopg2 +from fixtures.neon_fixtures import NeonEnv + + +def test_role_grants(neon_simple_env: NeonEnv): + """basic test for the endpoint that grants permissions for a role against a schema""" + + env = neon_simple_env + + env.create_branch("test_role_grants") + + endpoint = env.endpoints.create_start("test_role_grants") + + endpoint.safe_psql("CREATE DATABASE test_role_grants") + endpoint.safe_psql("CREATE SCHEMA IF NOT EXISTS test_schema", dbname="test_role_grants") + endpoint.safe_psql("CREATE ROLE test_role WITH LOGIN", dbname="test_role_grants") + + # confirm we do not yet have access + pg_conn = endpoint.connect(dbname="test_role_grants", user="test_role") + with pg_conn.cursor() as cur: + try: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + raise ValueError("create table should not succeed") + except psycopg2.errors.InsufficientPrivilege: + pass + except BaseException as e: + raise e + + client = endpoint.http_client() + res = client.set_role_grants( + "test_role_grants", "test_role", "test_schema", ["CREATE", "USAGE"] + ) + + # confirm we have access + with pg_conn.cursor() as cur: + cur.execute('CREATE TABLE "test_schema"."test_table" (id integer primary key)') + cur.execute('INSERT INTO "test_schema"."test_table" (id) VALUES (1)') + cur.execute('SELECT id from "test_schema"."test_table"') + res = cur.fetchall() + + assert res == 
[(1,)], "select should not succeed" From b7173b1ef05f694f3fa7968dadc4a298ea6d66e8 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 18 Oct 2024 11:29:23 +0100 Subject: [PATCH 041/239] storcon: fix case where we might fail to send compute notifications after two opposite migrations (#9435) ## Problem If we migrate A->B, then B->A, and the notification of A->B fails, then we might have retained state that makes us think "A" is the last state we sent to the compute hook, whereas when we migrate B->A we should really be sending a fresh notification in case our earlier failed notification has actually mutated the remote compute config. Closes: #9417 ## Summary of changes - Add a reproducer for the bug (`test_storage_controller_compute_hook_revert`) - Refactor compute hook code to represent remote state with `ComputeRemoteState` which stores a boolean for whether the compute has fully applied the change as well as the request that the compute accepted. - The actual bug fix: after sending a compute notification, if we got a 423 response then update our ComputeRemoteState to reflect that we have mutated the remote state. This way, when we later try and notify for our historic location, we will properly see that as a change and send the notification. Co-authored-by: Vlad Lazar --- storage_controller/src/compute_hook.rs | 80 ++++++++--- .../regress/test_storage_controller.py | 127 ++++++++++++++++-- 2 files changed, 183 insertions(+), 24 deletions(-) diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index bafae1f551..b63a322b87 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -28,7 +28,7 @@ struct UnshardedComputeHookTenant { node_id: NodeId, // Must hold this lock to send a notification. - send_lock: Arc>>, + send_lock: Arc>>, } struct ShardedComputeHookTenant { stripe_size: ShardStripeSize, @@ -38,7 +38,22 @@ struct ShardedComputeHookTenant { // Must hold this lock to send a notification. The contents represent // the last successfully sent notification, and are used to coalesce multiple // updates by only sending when there is a chance since our last successful send. - send_lock: Arc>>, + send_lock: Arc>>, +} + +/// Represents our knowledge of the compute's state: we can update this when we get a +/// response from a notify API call, which tells us what has been applied. +/// +/// Should be wrapped in an Option<>, as we cannot always know the remote state. +#[derive(PartialEq, Eq, Debug)] +struct ComputeRemoteState { + // The request body which was acked by the compute + request: ComputeHookNotifyRequest, + + // Whether the cplane indicated that the state was applied to running computes, or just + // persisted. 
In the Neon control plane, this is the difference between a 423 response (meaning + // persisted but not applied), and a 2xx response (both persisted and applied) + applied: bool, } enum ComputeHookTenant { @@ -64,7 +79,7 @@ impl ComputeHookTenant { } } - fn get_send_lock(&self) -> &Arc>> { + fn get_send_lock(&self) -> &Arc>> { match self { Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock, Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock, @@ -188,11 +203,11 @@ enum MaybeSendResult { Transmit( ( ComputeHookNotifyRequest, - tokio::sync::OwnedMutexGuard>, + tokio::sync::OwnedMutexGuard>, ), ), // Something requires sending, but you must wait for a current sender then call again - AwaitLock(Arc>>), + AwaitLock(Arc>>), // Nothing requires sending Noop, } @@ -201,7 +216,7 @@ impl ComputeHookTenant { fn maybe_send( &self, tenant_id: TenantId, - lock: Option>>, + lock: Option>>, ) -> MaybeSendResult { let locked = match lock { Some(already_locked) => already_locked, @@ -257,11 +272,22 @@ impl ComputeHookTenant { tracing::info!("Tenant isn't yet ready to emit a notification"); MaybeSendResult::Noop } - Some(request) if Some(&request) == locked.as_ref() => { - // No change from the last value successfully sent + Some(request) + if Some(&request) == locked.as_ref().map(|s| &s.request) + && locked.as_ref().map(|s| s.applied).unwrap_or(false) => + { + tracing::info!( + "Skipping notification because remote state already matches ({:?})", + &request + ); + // No change from the last value successfully sent, and our state indicates that the last + // value sent was fully applied on the control plane side. MaybeSendResult::Noop } - Some(request) => MaybeSendResult::Transmit((request, locked)), + Some(request) => { + // Our request differs from the last one sent, or the last one sent was not fully applied on the compute side + MaybeSendResult::Transmit((request, locked)) + } } } } @@ -550,10 +576,28 @@ impl ComputeHook { }) }; - if result.is_ok() { - // Before dropping the send lock, stash the request we just sent so that - // subsequent callers can avoid redundantly re-sending the same thing. - *send_lock_guard = Some(request); + match result { + Ok(_) => { + // Before dropping the send lock, stash the request we just sent so that + // subsequent callers can avoid redundantly re-sending the same thing. + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: true, + }); + } + Err(NotifyError::Busy) => { + // Busy result means that the server responded and has stored the new configuration, + // but was not able to fully apply it to the compute + *send_lock_guard = Some(ComputeRemoteState { + request, + applied: false, + }); + } + Err(_) => { + // General error case: we can no longer know the remote state, so clear it. This will result in + // the logic in maybe_send recognizing that we should call the hook again. 
+ *send_lock_guard = None; + } } result } @@ -707,7 +751,10 @@ pub(crate) mod tests { assert!(request.stripe_size.is_none()); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); // Try asking again: this should be a no-op @@ -750,7 +797,10 @@ pub(crate) mod tests { assert_eq!(request.stripe_size, Some(ShardStripeSize(32768))); // Simulate successful send - *guard = Some(request); + *guard = Some(ComputeRemoteState { + request, + applied: true, + }); drop(guard); Ok(()) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 1dcc37c407..a4e293da9e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -576,6 +576,14 @@ def test_storage_controller_compute_hook( env.storage_controller.consistency_check() +NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" +NOTIFY_FAILURE_LOGS = [ + ".*Failed to notify compute.*", + ".*Reconcile error.*Cancelled", + ".*Reconcile error.*Control plane tenant busy", +] + + def test_storage_controller_stuck_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, @@ -620,15 +628,8 @@ def test_storage_controller_stuck_compute_hook( dest_pageserver = env.get_pageserver(dest_ps_id) shard_0_id = TenantShardId(tenant_id, 0, 0) - NOTIFY_BLOCKED_LOG = ".*Live migration blocked.*" - env.storage_controller.allowed_errors.extend( - [ - NOTIFY_BLOCKED_LOG, - ".*Failed to notify compute.*", - ".*Reconcile error.*Cancelled", - ".*Reconcile error.*Control plane tenant busy", - ] - ) + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: # We expect the controller to hit the 423 (locked) and retry. Migration shouldn't complete until that @@ -719,6 +720,114 @@ def test_storage_controller_stuck_compute_hook( env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_compute_hook_revert( + httpserver: HTTPServer, + neon_env_builder: NeonEnvBuilder, + httpserver_listen_address, +): + """ + 'revert' in the sense of a migration which gets reversed shortly after, as may happen during + a rolling upgrade. + + This is a reproducer for https://github.com/neondatabase/neon/issues/9417 + + The buggy behavior was that when the compute hook gave us errors, we assumed our last successfully + sent state was still in effect, so when migrating back to the original pageserver we didn't bother + notifying of that. This is wrong because even a failed request might mutate the state on the server. + """ + + # We will run two pageserver to migrate and check that the storage controller sends notifications + # when migrating. 
+ neon_env_builder.num_pageservers = 2 + (host, port) = httpserver_listen_address + neon_env_builder.control_plane_compute_hook_api = f"http://{host}:{port}/notify" + + # Set up fake HTTP notify endpoint + notifications = [] + + handle_params = {"status": 200} + + def handler(request: Request): + status = handle_params["status"] + log.info(f"Notify request[{status}]: {request}") + notifications.append(request.json) + return Response(status=status) + + httpserver.expect_request("/notify", method="PUT").respond_with_handler(handler) + + # Start running + env = neon_env_builder.init_start(initial_tenant_conf={"lsn_lease_length": "0s"}) + tenant_id = env.initial_tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + pageserver_a = env.get_tenant_pageserver(tenant_id) + pageserver_b = [p for p in env.pageservers if p.id != pageserver_a.id][0] + + def notified_ps(ps_id: int) -> None: + latest = notifications[-1] + log.info(f"Waiting for {ps_id}, have {latest}") + assert latest is not None + assert latest["shards"] is not None + assert latest["shards"][0]["node_id"] == ps_id + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) + env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) + + # Migrate A -> B, and make notifications fail while this is happening + handle_params["status"] = 423 + + with pytest.raises(StorageControllerApiException, match="Timeout waiting for shard"): + # We expect the controller to give us an error because its reconciliation timed out + # waiting for the compute hook. + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_b.id) + + # Although the migration API failed, the hook should still see pageserver B (it remembers what + # was posted even when returning an error code) + wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + + # Although the migration API failed, the tenant should still have moved to the right pageserver + assert len(pageserver_b.http_client().tenant_list()) == 1 + + # Before we clear the failure on the migration hook, we need the controller to give up + # trying to notify about B -- the bug case we're reproducing is when the controller + # _never_ successfully notified for B, then tries to notify for A. + # + # The controller will give up notifying if the origin of a migration becomes unavailable. + pageserver_a.stop() + + # Preempt heartbeats for a faster test + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Offline"}) + + def logged_giving_up(): + env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") + + wait_until(30, 1, logged_giving_up) + + pageserver_a.start() + + # Preempt heartbeats for determinism + env.storage_controller.node_configure(pageserver_a.id, {"availability": "Active"}) + # Starting node will prompt a reconcile to clean up old AttachedStale location, for a deterministic test + # we want that complete before we start our migration. Tolerate failure because our compute hook is + # still configured to fail + try: + env.storage_controller.reconcile_all() + except StorageControllerApiException as e: + # This exception _might_ be raised: it depends if our reconcile_all hit the on-node-activation + # Reconciler lifetime or ran after it already completed. 
+ log.info(f"Expected error from reconcile_all: {e}") + + # Migrate B -> A, with a working compute hook: the controller should notify the hook because the + # last update it made that was acked (423) by the compute was for node B. + handle_params["status"] = 200 + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) + + wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + + def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): """ Verify that occasional-use debug APIs work as expected. This is a lightweight test From 98fee7a97d68db55049583d403dcb21755bc4db5 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 18 Oct 2024 13:31:14 +0300 Subject: [PATCH 042/239] Increase shared_buffers in test_subscriber_synchronous_commit. (#9427) Might make the test less flaky. --- test_runner/regress/test_logical_replication.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 87991eadf1..c26bf058e2 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -558,10 +558,10 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication return publisher_flush_lsn -# Test that subscriber takes into account quorum committed flush_lsn in -# flush_lsn reporting to publisher. Without this, it may ack too far, losing -# data on restart because publisher advances START_REPLICATION position to the -# confirmed_flush_lsn of the slot. +# Test that neon subscriber takes into account quorum committed flush_lsn in +# flush_lsn reporting to publisher. Without this, subscriber may ack too far, +# losing data on restart because publisher implicitly advances positition given +# in START_REPLICATION to the confirmed_flush_lsn of the slot. def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down @@ -578,7 +578,10 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): vanilla_pg.safe_psql("create extension neon;") env.create_branch("subscriber") - sub = env.endpoints.create("subscriber") + # We want all data to fit into shared_buffers because later we stop + # safekeeper and insert more; this shouldn't cause page requests as they + # will be stuck. + sub = env.endpoints.create("subscriber", config_lines=["shared_buffers=128MB"]) sub.start() with vanilla_pg.cursor() as pcur: From 15fecffe6ba400693619c6a022ed6205769a61ae Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Fri, 18 Oct 2024 12:42:41 +0200 Subject: [PATCH 043/239] Update ruff to much newer version (#9433) Includes a multidict patch release to fix build with newer cpython. --- poetry.lock | 207 ++++++++++-------- pyproject.toml | 2 +- test_runner/fixtures/neon_cli.py | 4 +- test_runner/fixtures/neon_fixtures.py | 18 +- test_runner/fixtures/utils.py | 2 +- .../performance/test_logical_replication.py | 14 +- .../performance/test_physical_replication.py | 12 +- .../regress/test_download_extensions.py | 2 +- test_runner/regress/test_next_xid.py | 4 +- test_runner/regress/test_timeline_delete.py | 2 +- 10 files changed, 145 insertions(+), 122 deletions(-) diff --git a/poetry.lock b/poetry.lock index 00fe2505c9..e307b873f3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. 
+# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1758,85 +1758,101 @@ tests = ["pytest (>=4.6)"] [[package]] name = "multidict" -version = "6.0.4" +version = "6.0.5" description = "multidict implementation" optional = false python-versions = ">=3.7" files = [ - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:0b1a97283e0c85772d613878028fec909f003993e1007eafa715b24b377cb9b8"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:eeb6dcc05e911516ae3d1f207d4b0520d07f54484c49dfc294d6e7d63b734171"}, - {file = "multidict-6.0.4-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:d6d635d5209b82a3492508cf5b365f3446afb65ae7ebd755e70e18f287b0adf7"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:c048099e4c9e9d615545e2001d3d8a4380bd403e1a0578734e0d31703d1b0c0b"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ea20853c6dbbb53ed34cb4d080382169b6f4554d394015f1bef35e881bf83547"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:16d232d4e5396c2efbbf4f6d4df89bfa905eb0d4dc5b3549d872ab898451f569"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:36c63aaa167f6c6b04ef2c85704e93af16c11d20de1d133e39de6a0e84582a93"}, - {file = "multidict-6.0.4-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:64bdf1086b6043bf519869678f5f2757f473dee970d7abf6da91ec00acb9cb98"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:43644e38f42e3af682690876cff722d301ac585c5b9e1eacc013b7a3f7b696a0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7582a1d1030e15422262de9f58711774e02fa80df0d1578995c76214f6954988"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:ddff9c4e225a63a5afab9dd15590432c22e8057e1a9a13d28ed128ecf047bbdc"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ee2a1ece51b9b9e7752e742cfb661d2a29e7bcdba2d27e66e28a99f1890e4fa0"}, - {file = "multidict-6.0.4-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:a2e4369eb3d47d2034032a26c7a80fcb21a2cb22e1173d761a162f11e562caa5"}, - {file = "multidict-6.0.4-cp310-cp310-win32.whl", hash = "sha256:574b7eae1ab267e5f8285f0fe881f17efe4b98c39a40858247720935b893bba8"}, - {file = "multidict-6.0.4-cp310-cp310-win_amd64.whl", hash = "sha256:4dcbb0906e38440fa3e325df2359ac6cb043df8e58c965bb45f4e406ecb162cc"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:0dfad7a5a1e39c53ed00d2dd0c2e36aed4650936dc18fd9a1826a5ae1cad6f03"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:64da238a09d6039e3bd39bb3aee9c21a5e34f28bfa5aa22518581f910ff94af3"}, - {file = "multidict-6.0.4-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:ff959bee35038c4624250473988b24f846cbeb2c6639de3602c073f10410ceba"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:01a3a55bd90018c9c080fbb0b9f4891db37d148a0a18722b42f94694f8b6d4c9"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c5cb09abb18c1ea940fb99360ea0396f34d46566f157122c92dfa069d3e0e982"}, - {file = 
"multidict-6.0.4-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:666daae833559deb2d609afa4490b85830ab0dfca811a98b70a205621a6109fe"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:11bdf3f5e1518b24530b8241529d2050014c884cf18b6fc69c0c2b30ca248710"}, - {file = "multidict-6.0.4-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7d18748f2d30f94f498e852c67d61261c643b349b9d2a581131725595c45ec6c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:458f37be2d9e4c95e2d8866a851663cbc76e865b78395090786f6cd9b3bbf4f4"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:b1a2eeedcead3a41694130495593a559a668f382eee0727352b9a41e1c45759a"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:7d6ae9d593ef8641544d6263c7fa6408cc90370c8cb2bbb65f8d43e5b0351d9c"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:5979b5632c3e3534e42ca6ff856bb24b2e3071b37861c2c727ce220d80eee9ed"}, - {file = "multidict-6.0.4-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:dcfe792765fab89c365123c81046ad4103fcabbc4f56d1c1997e6715e8015461"}, - {file = "multidict-6.0.4-cp311-cp311-win32.whl", hash = "sha256:3601a3cece3819534b11d4efc1eb76047488fddd0c85a3948099d5da4d504636"}, - {file = "multidict-6.0.4-cp311-cp311-win_amd64.whl", hash = "sha256:81a4f0b34bd92df3da93315c6a59034df95866014ac08535fc819f043bfd51f0"}, - {file = "multidict-6.0.4-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:67040058f37a2a51ed8ea8f6b0e6ee5bd78ca67f169ce6122f3e2ec80dfe9b78"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:853888594621e6604c978ce2a0444a1e6e70c8d253ab65ba11657659dcc9100f"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:39ff62e7d0f26c248b15e364517a72932a611a9b75f35b45be078d81bdb86603"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:af048912e045a2dc732847d33821a9d84ba553f5c5f028adbd364dd4765092ac"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b1e8b901e607795ec06c9e42530788c45ac21ef3aaa11dbd0c69de543bfb79a9"}, - {file = "multidict-6.0.4-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:62501642008a8b9871ddfccbf83e4222cf8ac0d5aeedf73da36153ef2ec222d2"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:99b76c052e9f1bc0721f7541e5e8c05db3941eb9ebe7b8553c625ef88d6eefde"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:509eac6cf09c794aa27bcacfd4d62c885cce62bef7b2c3e8b2e49d365b5003fe"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:21a12c4eb6ddc9952c415f24eef97e3e55ba3af61f67c7bc388dcdec1404a067"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:5cad9430ab3e2e4fa4a2ef4450f548768400a2ac635841bc2a56a2052cdbeb87"}, - {file = "multidict-6.0.4-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:ab55edc2e84460694295f401215f4a58597f8f7c9466faec545093045476327d"}, - {file = "multidict-6.0.4-cp37-cp37m-win32.whl", hash = "sha256:5a4dcf02b908c3b8b17a45fb0f15b695bf117a67b76b7ad18b73cf8e92608775"}, - {file = "multidict-6.0.4-cp37-cp37m-win_amd64.whl", hash = 
"sha256:6ed5f161328b7df384d71b07317f4d8656434e34591f20552c7bcef27b0ab88e"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:5fc1b16f586f049820c5c5b17bb4ee7583092fa0d1c4e28b5239181ff9532e0c"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1502e24330eb681bdaa3eb70d6358e818e8e8f908a22a1851dfd4e15bc2f8161"}, - {file = "multidict-6.0.4-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:b692f419760c0e65d060959df05f2a531945af31fda0c8a3b3195d4efd06de11"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:45e1ecb0379bfaab5eef059f50115b54571acfbe422a14f668fc8c27ba410e7e"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:ddd3915998d93fbcd2566ddf9cf62cdb35c9e093075f862935573d265cf8f65d"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:59d43b61c59d82f2effb39a93c48b845efe23a3852d201ed2d24ba830d0b4cf2"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:cc8e1d0c705233c5dd0c5e6460fbad7827d5d36f310a0fadfd45cc3029762258"}, - {file = "multidict-6.0.4-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d6aa0418fcc838522256761b3415822626f866758ee0bc6632c9486b179d0b52"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:6748717bb10339c4760c1e63da040f5f29f5ed6e59d76daee30305894069a660"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:4d1a3d7ef5e96b1c9e92f973e43aa5e5b96c659c9bc3124acbbd81b0b9c8a951"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:4372381634485bec7e46718edc71528024fcdc6f835baefe517b34a33c731d60"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:fc35cb4676846ef752816d5be2193a1e8367b4c1397b74a565a9d0389c433a1d"}, - {file = "multidict-6.0.4-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:4b9d9e4e2b37daddb5c23ea33a3417901fa7c7b3dee2d855f63ee67a0b21e5b1"}, - {file = "multidict-6.0.4-cp38-cp38-win32.whl", hash = "sha256:e41b7e2b59679edfa309e8db64fdf22399eec4b0b24694e1b2104fb789207779"}, - {file = "multidict-6.0.4-cp38-cp38-win_amd64.whl", hash = "sha256:d6c254ba6e45d8e72739281ebc46ea5eb5f101234f3ce171f0e9f5cc86991480"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:16ab77bbeb596e14212e7bab8429f24c1579234a3a462105cda4a66904998664"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:bc779e9e6f7fda81b3f9aa58e3a6091d49ad528b11ed19f6621408806204ad35"}, - {file = "multidict-6.0.4-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:4ceef517eca3e03c1cceb22030a3e39cb399ac86bff4e426d4fc6ae49052cc60"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:281af09f488903fde97923c7744bb001a9b23b039a909460d0f14edc7bf59706"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:52f2dffc8acaba9a2f27174c41c9e57f60b907bb9f096b36b1a1f3be71c6284d"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b41156839806aecb3641f3208c0dafd3ac7775b9c4c422d82ee2a45c34ba81ca"}, - {file = "multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d5e3fc56f88cc98ef8139255cf8cd63eb2c586531e43310ff859d6bb3a6b51f1"}, - {file = 
"multidict-6.0.4-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8316a77808c501004802f9beebde51c9f857054a0c871bd6da8280e718444449"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:f70b98cd94886b49d91170ef23ec5c0e8ebb6f242d734ed7ed677b24d50c82cf"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:bf6774e60d67a9efe02b3616fee22441d86fab4c6d335f9d2051d19d90a40063"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:e69924bfcdda39b722ef4d9aa762b2dd38e4632b3641b1d9a57ca9cd18f2f83a"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:6b181d8c23da913d4ff585afd1155a0e1194c0b50c54fcfe286f70cdaf2b7176"}, - {file = "multidict-6.0.4-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:52509b5be062d9eafc8170e53026fbc54cf3b32759a23d07fd935fb04fc22d95"}, - {file = "multidict-6.0.4-cp39-cp39-win32.whl", hash = "sha256:27c523fbfbdfd19c6867af7346332b62b586eed663887392cff78d614f9ec313"}, - {file = "multidict-6.0.4-cp39-cp39-win_amd64.whl", hash = "sha256:33029f5734336aa0d4c0384525da0387ef89148dc7191aae00ca5fb23d7aafc2"}, - {file = "multidict-6.0.4.tar.gz", hash = "sha256:3666906492efb76453c0e7b97f2cf459b0682e7402c0489a95484965dbc1da49"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:228b644ae063c10e7f324ab1ab6b548bdf6f8b47f3ec234fef1093bc2735e5f9"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:896ebdcf62683551312c30e20614305f53125750803b614e9e6ce74a96232604"}, + {file = "multidict-6.0.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:411bf8515f3be9813d06004cac41ccf7d1cd46dfe233705933dd163b60e37600"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d147090048129ce3c453f0292e7697d333db95e52616b3793922945804a433c"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:215ed703caf15f578dca76ee6f6b21b7603791ae090fbf1ef9d865571039ade5"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c6390cf87ff6234643428991b7359b5f59cc15155695deb4eda5c777d2b880f"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:21fd81c4ebdb4f214161be351eb5bcf385426bf023041da2fd9e60681f3cebae"}, + {file = "multidict-6.0.5-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3cc2ad10255f903656017363cd59436f2111443a76f996584d1077e43ee51182"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:6939c95381e003f54cd4c5516740faba40cf5ad3eeff460c3ad1d3e0ea2549bf"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:220dd781e3f7af2c2c1053da9fa96d9cf3072ca58f057f4c5adaaa1cab8fc442"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:766c8f7511df26d9f11cd3a8be623e59cca73d44643abab3f8c8c07620524e4a"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:fe5d7785250541f7f5019ab9cba2c71169dc7d74d0f45253f8313f436458a4ef"}, + {file = "multidict-6.0.5-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:c1c1496e73051918fcd4f58ff2e0f2f3066d1c76a0c6aeffd9b45d53243702cc"}, + {file = "multidict-6.0.5-cp310-cp310-win32.whl", hash = "sha256:7afcdd1fc07befad18ec4523a782cde4e93e0a2bf71239894b8d61ee578c1319"}, + {file = 
"multidict-6.0.5-cp310-cp310-win_amd64.whl", hash = "sha256:99f60d34c048c5c2fabc766108c103612344c46e35d4ed9ae0673d33c8fb26e8"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f285e862d2f153a70586579c15c44656f888806ed0e5b56b64489afe4a2dbfba"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:53689bb4e102200a4fafa9de9c7c3c212ab40a7ab2c8e474491914d2305f187e"}, + {file = "multidict-6.0.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:612d1156111ae11d14afaf3a0669ebf6c170dbb735e510a7438ffe2369a847fd"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:7be7047bd08accdb7487737631d25735c9a04327911de89ff1b26b81745bd4e3"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:de170c7b4fe6859beb8926e84f7d7d6c693dfe8e27372ce3b76f01c46e489fcf"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:04bde7a7b3de05732a4eb39c94574db1ec99abb56162d6c520ad26f83267de29"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:85f67aed7bb647f93e7520633d8f51d3cbc6ab96957c71272b286b2f30dc70ed"}, + {file = "multidict-6.0.5-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:425bf820055005bfc8aa9a0b99ccb52cc2f4070153e34b701acc98d201693733"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:d3eb1ceec286eba8220c26f3b0096cf189aea7057b6e7b7a2e60ed36b373b77f"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:7901c05ead4b3fb75113fb1dd33eb1253c6d3ee37ce93305acd9d38e0b5f21a4"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:e0e79d91e71b9867c73323a3444724d496c037e578a0e1755ae159ba14f4f3d1"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:29bfeb0dff5cb5fdab2023a7a9947b3b4af63e9c47cae2a10ad58394b517fddc"}, + {file = "multidict-6.0.5-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e030047e85cbcedbfc073f71836d62dd5dadfbe7531cae27789ff66bc551bd5e"}, + {file = "multidict-6.0.5-cp311-cp311-win32.whl", hash = "sha256:2f4848aa3baa109e6ab81fe2006c77ed4d3cd1e0ac2c1fbddb7b1277c168788c"}, + {file = "multidict-6.0.5-cp311-cp311-win_amd64.whl", hash = "sha256:2faa5ae9376faba05f630d7e5e6be05be22913782b927b19d12b8145968a85ea"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:51d035609b86722963404f711db441cf7134f1889107fb171a970c9701f92e1e"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:cbebcd5bcaf1eaf302617c114aa67569dd3f090dd0ce8ba9e35e9985b41ac35b"}, + {file = "multidict-6.0.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:2ffc42c922dbfddb4a4c3b438eb056828719f07608af27d163191cb3e3aa6cc5"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ceb3b7e6a0135e092de86110c5a74e46bda4bd4fbfeeb3a3bcec79c0f861e450"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:79660376075cfd4b2c80f295528aa6beb2058fd289f4c9252f986751a4cd0496"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:e4428b29611e989719874670fd152b6625500ad6c686d464e99f5aaeeaca175a"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:d84a5c3a5f7ce6db1f999fb9438f686bc2e09d38143f2d93d8406ed2dd6b9226"}, + {file = "multidict-6.0.5-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:76c0de87358b192de7ea9649beb392f107dcad9ad27276324c24c91774ca5271"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:79a6d2ba910adb2cbafc95dad936f8b9386e77c84c35bc0add315b856d7c3abb"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:92d16a3e275e38293623ebf639c471d3e03bb20b8ebb845237e0d3664914caef"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:fb616be3538599e797a2017cccca78e354c767165e8858ab5116813146041a24"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:14c2976aa9038c2629efa2c148022ed5eb4cb939e15ec7aace7ca932f48f9ba6"}, + {file = "multidict-6.0.5-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:435a0984199d81ca178b9ae2c26ec3d49692d20ee29bc4c11a2a8d4514c67eda"}, + {file = "multidict-6.0.5-cp312-cp312-win32.whl", hash = "sha256:9fe7b0653ba3d9d65cbe7698cca585bf0f8c83dbbcc710db9c90f478e175f2d5"}, + {file = "multidict-6.0.5-cp312-cp312-win_amd64.whl", hash = "sha256:01265f5e40f5a17f8241d52656ed27192be03bfa8764d88e8220141d1e4b3556"}, + {file = "multidict-6.0.5-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:19fe01cea168585ba0f678cad6f58133db2aa14eccaf22f88e4a6dccadfad8b3"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6bf7a982604375a8d49b6cc1b781c1747f243d91b81035a9b43a2126c04766f5"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:107c0cdefe028703fb5dafe640a409cb146d44a6ae201e55b35a4af8e95457dd"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:403c0911cd5d5791605808b942c88a8155c2592e05332d2bf78f18697a5fa15e"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:aeaf541ddbad8311a87dd695ed9642401131ea39ad7bc8cf3ef3967fd093b626"}, + {file = "multidict-6.0.5-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e4972624066095e52b569e02b5ca97dbd7a7ddd4294bf4e7247d52635630dd83"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:d946b0a9eb8aaa590df1fe082cee553ceab173e6cb5b03239716338629c50c7a"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:b55358304d7a73d7bdf5de62494aaf70bd33015831ffd98bc498b433dfe5b10c"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:a3145cb08d8625b2d3fee1b2d596a8766352979c9bffe5d7833e0503d0f0b5e5"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:d65f25da8e248202bd47445cec78e0025c0fe7582b23ec69c3b27a640dd7a8e3"}, + {file = "multidict-6.0.5-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:c9bf56195c6bbd293340ea82eafd0071cb3d450c703d2c93afb89f93b8386ccc"}, + {file = "multidict-6.0.5-cp37-cp37m-win32.whl", hash = "sha256:69db76c09796b313331bb7048229e3bee7928eb62bab5e071e9f7fcc4879caee"}, + {file = "multidict-6.0.5-cp37-cp37m-win_amd64.whl", hash = "sha256:fce28b3c8a81b6b36dfac9feb1de115bab619b3c13905b419ec71d03a3fc1423"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76f067f5121dcecf0d63a67f29080b26c43c71a98b10c701b0677e4a065fbd54"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_10_9_x86_64.whl", hash = 
"sha256:b82cc8ace10ab5bd93235dfaab2021c70637005e1ac787031f4d1da63d493c1d"}, + {file = "multidict-6.0.5-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:5cb241881eefd96b46f89b1a056187ea8e9ba14ab88ba632e68d7a2ecb7aadf7"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e8e94e6912639a02ce173341ff62cc1201232ab86b8a8fcc05572741a5dc7d93"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:09a892e4a9fb47331da06948690ae38eaa2426de97b4ccbfafbdcbe5c8f37ff8"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:55205d03e8a598cfc688c71ca8ea5f66447164efff8869517f175ea632c7cb7b"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:37b15024f864916b4951adb95d3a80c9431299080341ab9544ed148091b53f50"}, + {file = "multidict-6.0.5-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f2a1dee728b52b33eebff5072817176c172050d44d67befd681609b4746e1c2e"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:edd08e6f2f1a390bf137080507e44ccc086353c8e98c657e666c017718561b89"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:60d698e8179a42ec85172d12f50b1668254628425a6bd611aba022257cac1386"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:3d25f19500588cbc47dc19081d78131c32637c25804df8414463ec908631e453"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:4cc0ef8b962ac7a5e62b9e826bd0cd5040e7d401bc45a6835910ed699037a461"}, + {file = "multidict-6.0.5-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:eca2e9d0cc5a889850e9bbd68e98314ada174ff6ccd1129500103df7a94a7a44"}, + {file = "multidict-6.0.5-cp38-cp38-win32.whl", hash = "sha256:4a6a4f196f08c58c59e0b8ef8ec441d12aee4125a7d4f4fef000ccb22f8d7241"}, + {file = "multidict-6.0.5-cp38-cp38-win_amd64.whl", hash = "sha256:0275e35209c27a3f7951e1ce7aaf93ce0d163b28948444bec61dd7badc6d3f8c"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:e7be68734bd8c9a513f2b0cfd508802d6609da068f40dc57d4e3494cefc92929"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:1d9ea7a7e779d7a3561aade7d596649fbecfa5c08a7674b11b423783217933f9"}, + {file = "multidict-6.0.5-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:ea1456df2a27c73ce51120fa2f519f1bea2f4a03a917f4a43c8707cf4cbbae1a"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cf590b134eb70629e350691ecca88eac3e3b8b3c86992042fb82e3cb1830d5e1"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5c0631926c4f58e9a5ccce555ad7747d9a9f8b10619621f22f9635f069f6233e"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dce1c6912ab9ff5f179eaf6efe7365c1f425ed690b03341911bf4939ef2f3046"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c0868d64af83169e4d4152ec612637a543f7a336e4a307b119e98042e852ad9c"}, + {file = "multidict-6.0.5-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:141b43360bfd3bdd75f15ed811850763555a251e38b2405967f8e25fb43f7d40"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:7df704ca8cf4a073334e0427ae2345323613e4df18cc224f647f251e5e75a527"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:6214c5a5571802c33f80e6c84713b2c79e024995b9c5897f794b43e714daeec9"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:cd6c8fca38178e12c00418de737aef1261576bd1b6e8c6134d3e729a4e858b38"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e02021f87a5b6932fa6ce916ca004c4d441509d33bbdbeca70d05dff5e9d2479"}, + {file = "multidict-6.0.5-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ebd8d160f91a764652d3e51ce0d2956b38efe37c9231cd82cfc0bed2e40b581c"}, + {file = "multidict-6.0.5-cp39-cp39-win32.whl", hash = "sha256:04da1bb8c8dbadf2a18a452639771951c662c5ad03aefe4884775454be322c9b"}, + {file = "multidict-6.0.5-cp39-cp39-win_amd64.whl", hash = "sha256:d6f6d4f185481c9669b9447bf9d9cf3b95a0e9df9d169bbc17e363b7d5487755"}, + {file = "multidict-6.0.5-py3-none-any.whl", hash = "sha256:0d63c74e3d7ab26de115c49bffc92cc77ed23395303d496eae515d4204a625e7"}, + {file = "multidict-6.0.5.tar.gz", hash = "sha256:f7e301075edaf50500f0b341543c41194d8df3ae5caf4702f2095f3ca73dd8da"}, ] [[package]] @@ -2766,28 +2782,29 @@ six = "*" [[package]] name = "ruff" -version = "0.2.2" +version = "0.7.0" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" files = [ - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.macosx_11_0_arm64.macosx_10_12_universal2.whl", hash = "sha256:0a9efb032855ffb3c21f6405751d5e147b0c6b631e3ca3f6b20f917572b97eb6"}, - {file = "ruff-0.2.2-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:d450b7fbff85913f866a5384d8912710936e2b96da74541c82c1b458472ddb39"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ecd46e3106850a5c26aee114e562c329f9a1fbe9e4821b008c4404f64ff9ce73"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:5e22676a5b875bd72acd3d11d5fa9075d3a5f53b877fe7b4793e4673499318ba"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:1695700d1e25a99d28f7a1636d85bafcc5030bba9d0578c0781ba1790dbcf51c"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:b0c232af3d0bd8f521806223723456ffebf8e323bd1e4e82b0befb20ba18388e"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f63d96494eeec2fc70d909393bcd76c69f35334cdbd9e20d089fb3f0640216ca"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:6a61ea0ff048e06de273b2e45bd72629f470f5da8f71daf09fe481278b175001"}, - {file = "ruff-0.2.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:5e1439c8f407e4f356470e54cdecdca1bd5439a0673792dbe34a2b0a551a2fe3"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:940de32dc8853eba0f67f7198b3e79bc6ba95c2edbfdfac2144c8235114d6726"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:0c126da55c38dd917621552ab430213bdb3273bb10ddb67bc4b761989210eb6e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_i686.whl", hash = "sha256:3b65494f7e4bed2e74110dac1f0d17dc8e1f42faaa784e7c58a98e335ec83d7e"}, - {file = "ruff-0.2.2-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:1ec49be4fe6ddac0503833f3ed8930528e26d1e60ad35c2446da372d16651ce9"}, - {file = "ruff-0.2.2-py3-none-win32.whl", hash = 
"sha256:d920499b576f6c68295bc04e7b17b6544d9d05f196bb3aac4358792ef6f34325"}, - {file = "ruff-0.2.2-py3-none-win_amd64.whl", hash = "sha256:cc9a91ae137d687f43a44c900e5d95e9617cb37d4c989e462980ba27039d239d"}, - {file = "ruff-0.2.2-py3-none-win_arm64.whl", hash = "sha256:c9d15fc41e6054bfc7200478720570078f0b41c9ae4f010bcc16bd6f4d1aacdd"}, - {file = "ruff-0.2.2.tar.gz", hash = "sha256:e62ed7f36b3068a30ba39193a14274cd706bc486fad521276458022f7bccb31d"}, + {file = "ruff-0.7.0-py3-none-linux_armv6l.whl", hash = "sha256:0cdf20c2b6ff98e37df47b2b0bd3a34aaa155f59a11182c1303cce79be715628"}, + {file = "ruff-0.7.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:496494d350c7fdeb36ca4ef1c9f21d80d182423718782222c29b3e72b3512737"}, + {file = "ruff-0.7.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:214b88498684e20b6b2b8852c01d50f0651f3cc6118dfa113b4def9f14faaf06"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:630fce3fefe9844e91ea5bbf7ceadab4f9981f42b704fae011bb8efcaf5d84be"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:211d877674e9373d4bb0f1c80f97a0201c61bcd1e9d045b6e9726adc42c156aa"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:194d6c46c98c73949a106425ed40a576f52291c12bc21399eb8f13a0f7073495"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:82c2579b82b9973a110fab281860403b397c08c403de92de19568f32f7178598"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9af971fe85dcd5eaed8f585ddbc6bdbe8c217fb8fcf510ea6bca5bdfff56040e"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b641c7f16939b7d24b7bfc0be4102c56562a18281f84f635604e8a6989948914"}, + {file = "ruff-0.7.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d71672336e46b34e0c90a790afeac8a31954fd42872c1f6adaea1dff76fd44f9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ab7d98c7eed355166f367597e513a6c82408df4181a937628dbec79abb2a1fe4"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:1eb54986f770f49edb14f71d33312d79e00e629a57387382200b1ef12d6a4ef9"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:dc452ba6f2bb9cf8726a84aa877061a2462afe9ae0ea1d411c53d226661c601d"}, + {file = "ruff-0.7.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:4b406c2dce5be9bad59f2de26139a86017a517e6bcd2688da515481c05a2cb11"}, + {file = "ruff-0.7.0-py3-none-win32.whl", hash = "sha256:f6c968509f767776f524a8430426539587d5ec5c662f6addb6aa25bc2e8195ec"}, + {file = "ruff-0.7.0-py3-none-win_amd64.whl", hash = "sha256:ff4aabfbaaba880e85d394603b9e75d32b0693152e16fa659a3064a85df7fce2"}, + {file = "ruff-0.7.0-py3-none-win_arm64.whl", hash = "sha256:10842f69c245e78d6adec7e1db0a7d9ddc2fff0621d730e61657b64fa36f207e"}, + {file = "ruff-0.7.0.tar.gz", hash = "sha256:47a86360cf62d9cd53ebfb0b5eb0e882193fc191c6d717e8bef4462bc3b9ea2b"}, ] [[package]] @@ -3389,4 +3406,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1" +content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" diff --git a/pyproject.toml b/pyproject.toml index 9cd315bb96..862ed49638 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,7 +45,7 @@ kafka-python = "^2.0.2" [tool.poetry.group.dev.dependencies] mypy = 
"==1.3.0" -ruff = "^0.2.2" +ruff = "^0.7.0" [build-system] requires = ["poetry-core>=1.0.0"] diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 0d3dcd1671..1b2767e296 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import abc import json import os import re @@ -30,7 +29,8 @@ if TYPE_CHECKING: T = TypeVar("T") -class AbstractNeonCli(abc.ABC): +# Used to be an ABC. abc.ABC removed due to linter without name change. +class AbstractNeonCli: """ A typed wrapper around an arbitrary Neon CLI tool. Supports a way to run arbitrary command directly via CLI. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a313ac2ed3..3cd8019e32 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -386,9 +386,9 @@ class NeonEnvBuilder: self.pageserver_virtual_file_io_engine: Optional[str] = pageserver_virtual_file_io_engine - self.pageserver_default_tenant_config_compaction_algorithm: Optional[ - dict[str, Any] - ] = pageserver_default_tenant_config_compaction_algorithm + self.pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = ( + pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_default_tenant_config_compaction_algorithm is not None: log.debug( f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" @@ -1062,9 +1062,9 @@ class NeonEnv: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: tenant_config = ps_cfg.setdefault("tenant_config", {}) - tenant_config[ - "compaction_algorithm" - ] = config.pageserver_default_tenant_config_compaction_algorithm + tenant_config["compaction_algorithm"] = ( + config.pageserver_default_tenant_config_compaction_algorithm + ) if self.pageserver_remote_storage is not None: ps_cfg["remote_storage"] = remote_storage_to_toml_dict( @@ -1108,9 +1108,9 @@ class NeonEnv: if config.auth_enabled: sk_cfg["auth_enabled"] = True if self.safekeepers_remote_storage is not None: - sk_cfg[ - "remote_storage" - ] = self.safekeepers_remote_storage.to_toml_inline_table().strip() + sk_cfg["remote_storage"] = ( + self.safekeepers_remote_storage.to_toml_inline_table().strip() + ) self.safekeepers.append( Safekeeper(env=self, id=id, port=port, extra_opts=config.safekeeper_extra_opts) ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 76575d330c..7ca6b3dd1c 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -417,7 +417,7 @@ def wait_until( time.sleep(interval) continue return res - raise Exception("timed out while waiting for %s" % func) from last_exception + raise Exception(f"timed out while waiting for {func}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index dbf94a2cf5..815d186ab9 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -144,9 +144,10 @@ def test_subscriber_lag( check_pgbench_still_running(pub_workload, "pub") check_pgbench_still_running(sub_workload, "sub") - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + 
psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) @@ -242,9 +243,10 @@ def test_publisher_restart( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env, ) - with psycopg2.connect(pub_connstr) as pub_conn, psycopg2.connect( - sub_connstr - ) as sub_conn: + with ( + psycopg2.connect(pub_connstr) as pub_conn, + psycopg2.connect(sub_connstr) as sub_conn, + ): with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: lag = measure_logical_replication_lag(sub_cur, pub_cur) diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 14b527acca..8b368977df 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -102,10 +102,14 @@ def test_ro_replica_lag( check_pgbench_still_running(master_workload) check_pgbench_still_running(replica_workload) time.sleep(sync_interval_min * 60) - with psycopg2.connect(master_connstr) as conn_master, psycopg2.connect( - replica_connstr - ) as conn_replica: - with conn_master.cursor() as cur_master, conn_replica.cursor() as cur_replica: + with ( + psycopg2.connect(master_connstr) as conn_master, + psycopg2.connect(replica_connstr) as conn_replica, + ): + with ( + conn_master.cursor() as cur_master, + conn_replica.cursor() as cur_replica, + ): lag = measure_replication_lag(cur_master, cur_replica) log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 04916a6b6f..0134f80769 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -74,7 +74,7 @@ def test_remote_extensions( mimetype="application/octet-stream", headers=[ ("Content-Length", str(file_size)), - ("Content-Disposition", 'attachment; filename="%s"' % file_name), + ("Content-Disposition", f'attachment; filename="{file_name}"'), ], direct_passthrough=True, ) diff --git a/test_runner/regress/test_next_xid.py b/test_runner/regress/test_next_xid.py index 980f6b5694..db8da51125 100644 --- a/test_runner/regress/test_next_xid.py +++ b/test_runner/regress/test_next_xid.py @@ -254,13 +254,13 @@ def advance_multixid_to( # missing. That's OK for our purposes. Autovacuum will print some warnings about the # missing segments, but will clean it up by truncating the SLRUs up to the new value, # closing the gap. 
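    # ("%04X" % n and f"{n:04X}" format identically, e.g. 0x1234 -> "1234", so the rewrite below is cosmetic.)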
- segname = "%04X" % MultiXactIdToOffsetSegment(next_multi_xid) + segname = f"{MultiXactIdToOffsetSegment(next_multi_xid):04X}" log.info(f"Creating dummy segment pg_multixact/offsets/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "offsets" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) of.flush() - segname = "%04X" % MXOffsetToMemberSegment(next_multi_offset) + segname = f"{MXOffsetToMemberSegment(next_multi_offset):04X}" log.info(f"Creating dummy segment pg_multixact/members/{segname}") with open(vanilla_pg.pgdatadir / "pg_multixact" / "members" / segname, "w") as of: of.write("\0" * SLRU_PAGES_PER_SEGMENT * BLCKSZ) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 306f22acf9..155709e106 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -649,7 +649,7 @@ def test_timeline_delete_works_for_remote_smoke( env = neon_env_builder.init_start() ps_http = env.pageserver.http_client() - pg = env.endpoints.create_start("main") + env.endpoints.create_start("main") tenant_id = env.initial_tenant timeline_id = env.initial_timeline From 3532ae76ef3a91131aee1f203a133c4d5e32b57a Mon Sep 17 00:00:00 2001 From: Jere Vaara Date: Fri, 18 Oct 2024 15:07:36 +0300 Subject: [PATCH 044/239] compute_ctl: Add endpoint that allows extensions to be installed (#9344) Adds endpoint to install extensions: **POST** `/extensions` ``` {"extension":"pg_sessions_jwt","database":"neondb","version":"1.0.0"} ``` Will be used by `local-proxy`. Example, for the JWT authentication to work the database needs to have the pg_session_jwt extension and also to enable JWT to work in RLS policies. --------- Co-authored-by: Conrad Ludgate --- compute_tools/src/compute.rs | 52 +++++++++++++++++- compute_tools/src/http/api.rs | 37 ++++++++++++- compute_tools/src/http/openapi_spec.yaml | 69 +++++++++++++++++++++++- libs/compute_api/src/requests.rs | 10 +++- libs/compute_api/src/responses.rs | 7 ++- libs/compute_api/src/spec.rs | 3 ++ test_runner/fixtures/endpoint/http.py | 10 ++++ test_runner/regress/test_extensions.py | 50 +++++++++++++++++ 8 files changed, 231 insertions(+), 7 deletions(-) create mode 100644 test_runner/regress/test_extensions.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 11fee73f03..c9dd4dcfc5 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -28,7 +28,7 @@ use utils::lsn::Lsn; use compute_api::privilege::Privilege; use compute_api::responses::{ComputeMetrics, ComputeStatus}; -use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec}; +use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec, ExtVersion}; use utils::measured_stream::MeasuredReader; use nix::sys::signal::{kill, Signal}; @@ -1416,6 +1416,56 @@ LIMIT 100", Ok(()) } + pub async fn install_extension( + &self, + ext_name: &PgIdent, + db_name: &PgIdent, + ext_version: ExtVersion, + ) -> Result { + use tokio_postgres::config::Config; + use tokio_postgres::NoTls; + + let mut conf = Config::from_str(self.connstr.as_str()).unwrap(); + conf.dbname(db_name); + + let (db_client, conn) = conf + .connect(NoTls) + .await + .context("Failed to connect to the database")?; + tokio::spawn(conn); + + let version_query = "SELECT extversion FROM pg_extension WHERE extname = $1"; + let version: Option = db_client + .query_opt(version_query, &[&ext_name]) + .await + .with_context(|| format!("Failed to execute query: {}", version_query))? 
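            // After the `?`, this is None when no pg_extension row matched, i.e. the extension
            // is not yet installed in this database.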
+ .map(|row| row.get(0)); + + // sanitize the inputs as postgres idents. + let ext_name: String = ext_name.pg_quote(); + let quoted_version: String = ext_version.pg_quote(); + + if let Some(installed_version) = version { + if installed_version == ext_version { + return Ok(installed_version); + } + let query = format!("ALTER EXTENSION {ext_name} UPDATE TO {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } else { + let query = + format!("CREATE EXTENSION IF NOT EXISTS {ext_name} WITH VERSION {quoted_version}"); + db_client + .simple_query(&query) + .await + .with_context(|| format!("Failed to execute query: {}", query))?; + } + + Ok(ext_version) + } + #[tokio::main] pub async fn prepare_preload_libraries( &self, diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index 133ab9f5af..af35f71bf2 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,9 +9,10 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; -use compute_api::requests::{ConfigurationRequest, SetRoleGrantsRequest}; +use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; use compute_api::responses::{ - ComputeStatus, ComputeStatusResponse, GenericAPIError, SetRoleGrantsResponse, + ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, GenericAPIError, + SetRoleGrantsResponse, }; use anyhow::Result; @@ -100,6 +101,38 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /extensions POST request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for extensions request: {:?}", + status + ); + error!(msg); + return render_json_error(&msg, StatusCode::PRECONDITION_FAILED); + } + + let request = hyper::body::to_bytes(req.into_body()).await.unwrap(); + let request = serde_json::from_slice::(&request).unwrap(); + let res = compute + .install_extension(&request.extension, &request.database, request.version) + .await; + match res { + Ok(version) => render_json(Body::from( + serde_json::to_string(&ExtensionInstallResult { + extension: request.extension, + version, + }) + .unwrap(), + )), + Err(e) => { + error!("install_extension failed: {}", e); + render_json_error(&e.to_string(), StatusCode::INTERNAL_SERVER_ERROR) + } + } + } + (&Method::GET, "/info") => { let num_cpus = num_cpus::get_physical(); info!("serving /info GET request. num_cpus: {}", num_cpus); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 73dbdc3ee9..11eee6ccfd 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -179,6 +179,41 @@ paths: description: Error text or 'true' if check passed. example: "true" + /extensions: + post: + tags: + - Extensions + summary: Install extension if possible. + description: "" + operationId: installExtension + requestBody: + description: Extension name and database to install it to. 
+ required: true + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallRequest" + responses: + 200: + description: Result from extension installation + content: + application/json: + schema: + $ref: "#/components/schemas/ExtensionInstallResult" + 412: + description: | + Compute is in the wrong state for processing the request. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Error during extension installation. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + /configure: post: tags: @@ -404,7 +439,7 @@ components: moment, when spec was received. example: "2022-10-12T07:20:50.52Z" status: - $ref: '#/components/schemas/ComputeStatus' + $ref: "#/components/schemas/ComputeStatus" last_active: type: string description: | @@ -444,6 +479,38 @@ components: - configuration example: running + ExtensionInstallRequest: + type: object + required: + - extension + - database + - version + properties: + extension: + type: string + description: Extension name. + example: "pg_session_jwt" + version: + type: string + description: Version of the extension. + example: "1.0.0" + database: + type: string + description: Database name. + example: "neondb" + + ExtensionInstallResult: + type: object + properties: + extension: + description: Name of the extension. + type: string + example: "pg_session_jwt" + version: + description: Version of the extension. + type: string + example: "1.0.0" + InstalledExtensions: type: object properties: diff --git a/libs/compute_api/src/requests.rs b/libs/compute_api/src/requests.rs index fbc7577dd9..fc3757d981 100644 --- a/libs/compute_api/src/requests.rs +++ b/libs/compute_api/src/requests.rs @@ -1,8 +1,7 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. - use crate::{ privilege::Privilege, - spec::{ComputeSpec, PgIdent}, + spec::{ComputeSpec, ExtVersion, PgIdent}, }; use serde::Deserialize; @@ -16,6 +15,13 @@ pub struct ConfigurationRequest { pub spec: ComputeSpec, } +#[derive(Deserialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: PgIdent, + pub database: PgIdent, + pub version: ExtVersion, +} + #[derive(Deserialize, Debug)] pub struct SetRoleGrantsRequest { pub database: PgIdent, diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index fadf524273..79234be720 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -8,7 +8,7 @@ use serde::{Deserialize, Serialize, Serializer}; use crate::{ privilege::Privilege, - spec::{ComputeSpec, Database, PgIdent, Role}, + spec::{ComputeSpec, Database, ExtVersion, PgIdent, Role}, }; #[derive(Serialize, Debug, Deserialize)] @@ -172,6 +172,11 @@ pub struct InstalledExtensions { pub extensions: Vec, } +#[derive(Clone, Debug, Default, Serialize)] +pub struct ExtensionInstallResult { + pub extension: PgIdent, + pub version: ExtVersion, +} #[derive(Clone, Debug, Default, Serialize)] pub struct SetRoleGrantsResponse { pub database: PgIdent, diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 5903db7055..8a447563dc 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -16,6 +16,9 @@ use remote_storage::RemotePath; /// intended to be used for DB / role names. 
pub type PgIdent = String; +/// String type alias representing Postgres extension version +pub type ExtVersion = String; + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[derive(Clone, Debug, Default, Deserialize, Serialize)] diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index e7b014b4a9..ea8291c1e0 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -29,6 +29,16 @@ class EndpointHttpClient(requests.Session): res.raise_for_status() return res.json() + def extensions(self, extension: str, version: str, database: str): + body = { + "extension": extension, + "version": version, + "database": database, + } + res = self.post(f"http://localhost:{self.port}/extensions", json=body) + res.raise_for_status() + return res.json() + def set_role_grants(self, database: str, role: str, schema: str, privileges: list[str]): res = self.post( f"http://localhost:{self.port}/grants", diff --git a/test_runner/regress/test_extensions.py b/test_runner/regress/test_extensions.py new file mode 100644 index 0000000000..100fd4b048 --- /dev/null +++ b/test_runner/regress/test_extensions.py @@ -0,0 +1,50 @@ +from logging import info + +from fixtures.neon_fixtures import NeonEnv + + +def test_extensions(neon_simple_env: NeonEnv): + """basic test for the extensions endpoint testing installing extensions""" + + env = neon_simple_env + + env.create_branch("test_extensions") + + endpoint = env.endpoints.create_start("test_extensions") + extension = "neon_test_utils" + database = "test_extensions" + + endpoint.safe_psql("CREATE DATABASE test_extensions") + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" + ) + res = cur.fetchone() + assert res is not None + version = res[0] + + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert not res, "The 'neon_test_utils' extension is installed" + + client = endpoint.http_client() + install_res = client.extensions(extension, version, database) + + info("Extension install result: %s", res) + assert install_res["extension"] == extension and install_res["version"] == version + + with endpoint.connect(dbname=database) as pg_conn: + with pg_conn.cursor() as cur: + cur.execute( + "SELECT extname, extversion FROM pg_extension WHERE extname = 'neon_test_utils'", + ) + res = cur.fetchone() + assert res is not None + (db_extension_name, db_extension_version) = res + + assert db_extension_name == extension and db_extension_version == version From fecff15f18f00a692ff234106b064d1693cc5441 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 18 Oct 2024 15:31:50 +0300 Subject: [PATCH 045/239] walproposer: immediately exit if sync-safekeepers collected 0/0. (#9442) Otherwise term history starting with 0/0 is streamed to safekeepers. 
ref https://github.com/neondatabase/neon/issues/9434 --- pgxn/neon/walproposer.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index a3f33cb261..d2a6104c74 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -841,6 +841,23 @@ HandleElectedProposer(WalProposer *wp) wp_log(FATAL, "failed to download WAL for logical replicaiton"); } + /* + * Zero propEpochStartLsn means majority of safekeepers doesn't have any + * WAL, timeline was just created. Compute bumps it to basebackup LSN, + * otherwise we must be sync-safekeepers and we have nothing to do then. + * + * Proceeding is not only pointless but harmful, because we'd give + * safekeepers term history starting with 0/0. These hacks will go away once + * we disable implicit timeline creation on safekeepers and create it with + * non zero LSN from the start. + */ + if (wp->propEpochStartLsn == InvalidXLogRecPtr) + { + Assert(wp->config->syncSafekeepers); + wp_log(LOG, "elected with zero propEpochStartLsn in sync-safekeepers, exiting"); + wp->api.finish_sync_safekeepers(wp, wp->propEpochStartLsn); + } + if (wp->truncateLsn == wp->propEpochStartLsn && wp->config->syncSafekeepers) { /* Sync is not needed: just exit */ From ec6d3422a5a7b6f537b029d7c3e70b7a60f99e0c Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 18 Oct 2024 13:38:59 +0100 Subject: [PATCH 046/239] pageserver: disconnect when asking client to reconnect (#9390) ## Problem Consider the following sequence of events: 1. Shard location gets downgraded to secondary while there's a libpq connection in pagestream mode from the compute 2. There's no active tenant, so we return `QueryError::Reconnect` from `PageServerHandler::handle_get_page_at_lsn_request`. 3. Error bubbles up to `PostgresBackendIO::process_message`, bailing us out of pagestream mode. 4. We instruct the client to reconnnect, but continue serving the libpq connection. The client isn't yet aware of the request to reconnect and believes it is still in pagestream mode. Pageserver fails to deserialize get page requests wrapped in `CopyData` since it's not in pagestream mode. ## Summary of Changes When we wish to instruct the client to reconnect, also disconnect from the server side after flushing the error. Closes https://github.com/neondatabase/cloud/issues/17336 --- libs/postgres_backend/src/lib.rs | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs index 9d274b25e6..7419798a60 100644 --- a/libs/postgres_backend/src/lib.rs +++ b/libs/postgres_backend/src/lib.rs @@ -738,6 +738,20 @@ impl PostgresBackend { QueryError::SimulatedConnectionError => { return Err(QueryError::SimulatedConnectionError) } + err @ QueryError::Reconnect => { + // Instruct the client to reconnect, stop processing messages + // from this libpq connection and, finally, disconnect from the + // server side (returning an Err achieves the later). + // + // Note the flushing is done by the caller. + let reconnect_error = short_error(&err); + self.write_message_noflush(&BeMessage::ErrorResponse( + &reconnect_error, + Some(err.pg_error_code()), + ))?; + + return Err(err); + } e => { log_query_error(query_string, &e); let short_error = short_error(&e); From 5cbdec9c794ef414e7511d644450b1a9a944d4ff Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 18 Oct 2024 14:41:21 +0100 Subject: [PATCH 047/239] [local_proxy]: install pg_session_jwt extension on demand (#9370) Follow up on #9344. 
We want to install the extension automatically. We didn't want to couple the extension into compute_ctl so instead local_proxy is the one to issue requests specific to the extension. depends on #9344 and #9395 --- compute/Dockerfile.compute-node | 4 +- proxy/src/auth/backend/local.rs | 13 ++- proxy/src/bin/local_proxy.rs | 8 +- proxy/src/compute_ctl/mod.rs | 101 ++++++++++++++++++++++++ proxy/src/http/mod.rs | 13 ++- proxy/src/lib.rs | 1 + proxy/src/serverless/backend.rs | 54 +++++++++++-- proxy/src/serverless/local_conn_pool.rs | 57 +++++++++---- 8 files changed, 222 insertions(+), 29 deletions(-) create mode 100644 proxy/src/compute_ctl/mod.rs diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index 45c1fd9f38..74970696b5 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -975,8 +975,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/5aee2625af38213650e1a07ae038fdc427250ee4.tar.gz -O pg_session_jwt.tar.gz && \ - echo "5d91b10bc1347d36cffc456cb87bec25047935d6503dc652ca046f04760828e7 pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \ + echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index e3995ac6c0..1e029ff609 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -1,23 +1,32 @@ use std::net::SocketAddr; use arc_swap::ArcSwapOption; +use tokio::sync::Semaphore; use super::jwt::{AuthRule, FetchAuthRules}; use crate::auth::backend::jwt::FetchAuthRulesError; use crate::compute::ConnCfg; +use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; -use crate::EndpointId; +use crate::url::ApiUrl; +use crate::{http, EndpointId}; pub struct LocalBackend { + pub(crate) initialize: Semaphore, + pub(crate) compute_ctl: ComputeCtlApi, pub(crate) node_info: NodeInfo, } impl LocalBackend { - pub fn new(postgres_addr: SocketAddr) -> Self { + pub fn new(postgres_addr: SocketAddr, compute_ctl: ApiUrl) -> Self { LocalBackend { + initialize: Semaphore::new(1), + compute_ctl: ComputeCtlApi { + api: http::Endpoint::new(compute_ctl, http::new_client()), + }, node_info: NodeInfo { config: { let mut cfg = ConnCfg::new(); diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index e6bc369d9a..a16c288e5d 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -25,6 +25,7 @@ use proxy::rate_limiter::{ use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::url::ApiUrl; use proxy::RoleName; project_git_version!(GIT_VERSION); @@ -80,7 +81,10 @@ struct LocalProxyCliArgs { 
connect_to_compute_retry: String, /// Address of the postgres server #[clap(long, default_value = "127.0.0.1:5432")] - compute: SocketAddr, + postgres: SocketAddr, + /// Address of the compute-ctl api service + #[clap(long, default_value = "http://127.0.0.1:3080/")] + compute_ctl: ApiUrl, /// Path of the local proxy config file #[clap(long, default_value = "./local_proxy.json")] config_path: Utf8PathBuf, @@ -295,7 +299,7 @@ fn build_auth_backend( args: &LocalProxyCliArgs, ) -> anyhow::Result<&'static auth::Backend<'static, ()>> { let auth_backend = proxy::auth::Backend::Local(proxy::auth::backend::MaybeOwned::Owned( - LocalBackend::new(args.compute), + LocalBackend::new(args.postgres, args.compute_ctl.clone()), )); Ok(Box::leak(Box::new(auth_backend))) diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs new file mode 100644 index 0000000000..2b57897223 --- /dev/null +++ b/proxy/src/compute_ctl/mod.rs @@ -0,0 +1,101 @@ +use compute_api::responses::GenericAPIError; +use hyper::{Method, StatusCode}; +use serde::de::DeserializeOwned; +use serde::{Deserialize, Serialize}; +use thiserror::Error; + +use crate::url::ApiUrl; +use crate::{http, DbName, RoleName}; + +pub struct ComputeCtlApi { + pub(crate) api: http::Endpoint, +} + +#[derive(Serialize, Debug)] +pub struct ExtensionInstallRequest { + pub extension: &'static str, + pub database: DbName, + pub version: &'static str, +} + +#[derive(Serialize, Debug)] +pub struct SetRoleGrantsRequest { + pub database: DbName, + pub schema: &'static str, + pub privileges: Vec, + pub role: RoleName, +} + +#[derive(Clone, Debug, Deserialize)] +pub struct ExtensionInstallResponse {} + +#[derive(Clone, Debug, Deserialize)] +pub struct SetRoleGrantsResponse {} + +#[derive(Debug, Serialize, Deserialize, Clone, Copy)] +#[serde(rename_all = "UPPERCASE")] +pub enum Privilege { + Usage, +} + +#[derive(Error, Debug)] +pub enum ComputeCtlError { + #[error("connection error: {0}")] + ConnectionError(#[source] reqwest_middleware::Error), + #[error("request error [{status}]: {body:?}")] + RequestError { + status: StatusCode, + body: Option, + }, + #[error("response parsing error: {0}")] + ResponseError(#[source] reqwest::Error), +} + +impl ComputeCtlApi { + pub async fn install_extension( + &self, + req: &ExtensionInstallRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("extensions"); + }) + .await + } + + pub async fn grant_role( + &self, + req: &SetRoleGrantsRequest, + ) -> Result { + self.generic_request(req, Method::POST, |url| { + url.path_segments_mut().push("grants"); + }) + .await + } + + async fn generic_request( + &self, + req: &Req, + method: Method, + url: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> Result + where + Req: Serialize, + Resp: DeserializeOwned, + { + let resp = self + .api + .request_with_url(method, url) + .json(req) + .send() + .await + .map_err(ComputeCtlError::ConnectionError)?; + + let status = resp.status(); + if status.is_client_error() || status.is_server_error() { + let body = resp.json().await.ok(); + return Err(ComputeCtlError::RequestError { status, body }); + } + + resp.json().await.map_err(ComputeCtlError::ResponseError) + } +} diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index fd587e8f01..f1b632e704 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -8,6 +8,7 @@ use std::time::Duration; use anyhow::bail; use bytes::Bytes; +use http::Method; use http_body_util::BodyExt; use hyper::body::Body; pub(crate) use reqwest::{Request, 
Response}; @@ -93,9 +94,19 @@ impl Endpoint { /// Return a [builder](RequestBuilder) for a `GET` request, /// accepting a closure to modify the url path segments for more complex paths queries. pub(crate) fn get_with_url(&self, f: impl for<'a> FnOnce(&'a mut ApiUrl)) -> RequestBuilder { + self.request_with_url(Method::GET, f) + } + + /// Return a [builder](RequestBuilder) for a request, + /// accepting a closure to modify the url path segments for more complex paths queries. + pub(crate) fn request_with_url( + &self, + method: Method, + f: impl for<'a> FnOnce(&'a mut ApiUrl), + ) -> RequestBuilder { let mut url = self.endpoint.clone(); f(&mut url); - self.client.get(url.into_inner()) + self.client.request(method, url.into_inner()) } /// Execute a [request](reqwest::Request). diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index a7b3d45c95..ea17a88067 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -90,6 +90,7 @@ pub mod auth; pub mod cache; pub mod cancellation; pub mod compute; +pub mod compute_ctl; pub mod config; pub mod console_redirect_proxy; pub mod context; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 82e81dbcfe..5d59b4d252 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -14,10 +14,13 @@ use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; use super::http_conn_pool::{self, poll_http2_client, Send}; -use super::local_conn_pool::{self, LocalClient, LocalConnPool}; +use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::compute_ctl::{ + ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, +}; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; @@ -35,6 +38,7 @@ pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>, pub(crate) local_pool: Arc>, pub(crate) pool: Arc>, + pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, pub(crate) endpoint_rate_limiter: Arc, @@ -250,16 +254,47 @@ impl PoolingBackend { return Ok(client); } + let local_backend = match &self.auth_backend { + auth::Backend::ControlPlane(_, ()) => { + unreachable!("only local_proxy can connect to local postgres") + } + auth::Backend::Local(local) => local, + }; + + if !self.local_pool.initialized(&conn_info) { + // only install and grant usage one at a time. 
+ let _permit = local_backend.initialize.acquire().await.unwrap(); + + // check again for race + if !self.local_pool.initialized(&conn_info) { + local_backend + .compute_ctl + .install_extension(&ExtensionInstallRequest { + extension: EXT_NAME, + database: conn_info.dbname.clone(), + version: EXT_VERSION, + }) + .await?; + + local_backend + .compute_ctl + .grant_role(&SetRoleGrantsRequest { + schema: EXT_SCHEMA, + privileges: vec![Privilege::Usage], + database: conn_info.dbname.clone(), + role: conn_info.user_info.user.clone(), + }) + .await?; + + self.local_pool.set_initialized(&conn_info); + } + } + let conn_id = uuid::Uuid::new_v4(); tracing::Span::current().record("conn_id", display(conn_id)); info!(%conn_id, "local_pool: opening a new connection '{conn_info}'"); - let mut node_info = match &self.auth_backend { - auth::Backend::ControlPlane(_, ()) => { - unreachable!("only local_proxy can connect to local postgres") - } - auth::Backend::Local(local) => local.node_info.clone(), - }; + let mut node_info = local_backend.node_info.clone(); let (key, jwk) = create_random_jwk(); @@ -324,6 +359,8 @@ pub(crate) enum HttpConnError { #[error("could not parse JWT payload")] JwtPayloadError(serde_json::Error), + #[error("could not install extension: {0}")] + ComputeCtl(#[from] ComputeCtlError), #[error("could not get auth info")] GetAuthInfo(#[from] GetAuthInfoError), #[error("user not authenticated")] @@ -348,6 +385,7 @@ impl ReportableError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => ErrorKind::Compute, HttpConnError::PostgresConnectionError(p) => p.get_error_kind(), HttpConnError::LocalProxyConnectionError(_) => ErrorKind::Compute, + HttpConnError::ComputeCtl(_) => ErrorKind::Service, HttpConnError::JwtPayloadError(_) => ErrorKind::User, HttpConnError::GetAuthInfo(a) => a.get_error_kind(), HttpConnError::AuthError(a) => a.get_error_kind(), @@ -363,6 +401,7 @@ impl UserFacingError for HttpConnError { HttpConnError::ConnectionClosedAbruptly(_) => self.to_string(), HttpConnError::PostgresConnectionError(p) => p.to_string(), HttpConnError::LocalProxyConnectionError(p) => p.to_string(), + HttpConnError::ComputeCtl(_) => "could not set up the JWT authorization database extension".to_string(), HttpConnError::JwtPayloadError(p) => p.to_string(), HttpConnError::GetAuthInfo(c) => c.to_string_client(), HttpConnError::AuthError(c) => c.to_string_client(), @@ -379,6 +418,7 @@ impl CouldRetry for HttpConnError { match self { HttpConnError::PostgresConnectionError(e) => e.could_retry(), HttpConnError::LocalProxyConnectionError(e) => e.could_retry(), + HttpConnError::ComputeCtl(_) => false, HttpConnError::ConnectionClosedAbruptly(_) => false, HttpConnError::JwtPayloadError(_) => false, HttpConnError::GetAuthInfo(_) => false, diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index a01afd2820..beb2ad4e8f 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -1,3 +1,14 @@ +//! Manages the pool of connections between local_proxy and postgres. +//! +//! The pool is keyed by database and role_name, and can contain multiple connections +//! shared between users. +//! +//! The pool manages the pg_session_jwt extension used for authorizing +//! requests in the db. +//! +//! The first time a db/role pair is seen, local_proxy attempts to install the extension +//! and grant usage to the role on the given schema. 
+ use std::collections::HashMap; use std::pin::pin; use std::sync::{Arc, Weak}; @@ -27,14 +38,15 @@ use crate::metrics::Metrics; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{DbName, RoleName}; +pub(crate) const EXT_NAME: &str = "pg_session_jwt"; +pub(crate) const EXT_VERSION: &str = "0.1.1"; +pub(crate) const EXT_SCHEMA: &str = "auth"; + struct ConnPoolEntry { conn: ClientInner, _last_access: std::time::Instant, } -// /// key id for the pg_session_jwt state -// static PG_SESSION_JWT_KID: AtomicU64 = AtomicU64::new(1); - // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool // Number of open connections is limited by the `max_conns_per_endpoint`. pub(crate) struct EndpointConnPool { @@ -140,11 +152,18 @@ impl Drop for EndpointConnPool { pub(crate) struct DbUserConnPool { conns: Vec>, + + // true if we have definitely installed the extension and + // granted the role access to the auth schema. + initialized: bool, } impl Default for DbUserConnPool { fn default() -> Self { - Self { conns: Vec::new() } + Self { + conns: Vec::new(), + initialized: false, + } } } @@ -199,25 +218,16 @@ impl LocalConnPool { self.config.pool_options.idle_timeout } - // pub(crate) fn shutdown(&self) { - // let mut pool = self.global_pool.write(); - // pool.pools.clear(); - // pool.total_conns = 0; - // } - pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, ) -> Result>, HttpConnError> { - let mut client: Option> = None; - if let Some(entry) = self + let client = self .global_pool .write() .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } + .map(|entry| entry.conn); // ok return cached connection if found and establish a new one otherwise if let Some(client) = client { @@ -245,6 +255,23 @@ impl LocalConnPool { } Ok(None) } + + pub(crate) fn initialized(self: &Arc, conn_info: &ConnInfo) -> bool { + self.global_pool + .read() + .pools + .get(&conn_info.db_and_user()) + .map_or(false, |pool| pool.initialized) + } + + pub(crate) fn set_initialized(self: &Arc, conn_info: &ConnInfo) { + self.global_pool + .write() + .pools + .entry(conn_info.db_and_user()) + .or_default() + .initialized = true; + } } #[allow(clippy::too_many_arguments)] From e162ab8b536e8b1d2277b4e2c00abd574c394d75 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 18 Oct 2024 15:33:04 +0100 Subject: [PATCH 048/239] storcon: handle ongoing deletions gracefully (#9449) ## Problem Pageserver returns 409 (Conflict) if any of the shards are already deleting the timeline. This resulted in an error being propagated out of the HTTP handler and to the client. It's an expected scenario so we should handle it nicely. This caused failures in `test_storage_controller_smoke` [here](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9435/11390431900/index.html#suites/8fc5d1648d2225380766afde7c428d81/86eee4b002d6572d). ## Summary of Changes Instead of returning an error on 409s, we now bubble the status code up and let the HTTP handler code retry until it gets a 404 or times out. 
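For readers who want the behaviour spelled out before reading the diff, here is a minimal stand-alone sketch of the polling this gives the HTTP handler. The helper name `wait_for_timeline_deletion` and the closure shape are illustrative only (not part of the diff); the constants and status handling mirror the `handle_tenant_timeline_delete` hunk below.

```rust
use std::time::{Duration, Instant};

use hyper::StatusCode;

/// Poll the per-shard deletion until every shard reports 404, tolerating
/// 202 (deletion just started) and 409 (already in progress) along the way.
async fn wait_for_timeline_deletion<F, Fut>(mut poll_delete: F) -> anyhow::Result<StatusCode>
where
    F: FnMut() -> Fut,
    Fut: std::future::Future<Output = anyhow::Result<StatusCode>>,
{
    const MAX_WAIT: Duration = Duration::from_secs(25);
    const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5);
    let started_at = Instant::now();
    let mut retry_period = Duration::from_secs(1);
    loop {
        match poll_delete().await? {
            StatusCode::ACCEPTED => {
                // Deletion accepted: wait, then back off to the longer retry period.
                tokio::time::sleep(retry_period).await;
                retry_period = MAX_RETRY_PERIOD;
            }
            StatusCode::CONFLICT => {
                // Deletion already in progress on some shard: keep polling.
                tokio::time::sleep(retry_period).await;
            }
            // 404 means every shard reports the timeline gone: deletion is complete.
            StatusCode::NOT_FOUND => return Ok(StatusCode::OK),
            // Any other status is handed straight back to the caller.
            other => return Ok(other),
        }
        if Instant::now() + retry_period > started_at + MAX_WAIT {
            // Give up so callers with a ~25 second request timeout still get a response.
            return Ok(StatusCode::CONFLICT);
        }
    }
}
```

Keeping 409 on the same slow-poll path as 202 (rather than failing fast) matches the view that a concurrent deletion is an expected scenario, not an error.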
--- storage_controller/src/http.rs | 18 ++++++++++++------ storage_controller/src/service.rs | 29 +++++++++++++++++++++-------- 2 files changed, 33 insertions(+), 14 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index 46b6f4f2bf..afefe8598c 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -381,14 +381,16 @@ async fn handle_tenant_timeline_delete( R: std::future::Future> + Send + 'static, F: Fn(Arc) -> R + Send + Sync + 'static, { + // On subsequent retries, wait longer. + // Enable callers with a 25 second request timeout to reliably get a response + const MAX_WAIT: Duration = Duration::from_secs(25); + const MAX_RETRY_PERIOD: Duration = Duration::from_secs(5); + let started_at = Instant::now(); + // To keep deletion reasonably snappy for small tenants, initially check after 1 second if deletion // completed. let mut retry_period = Duration::from_secs(1); - // On subsequent retries, wait longer. - let max_retry_period = Duration::from_secs(5); - // Enable callers with a 30 second request timeout to reliably get a response - let max_wait = Duration::from_secs(25); loop { let status = f(service.clone()).await?; @@ -396,7 +398,11 @@ async fn handle_tenant_timeline_delete( StatusCode::ACCEPTED => { tracing::info!("Deletion accepted, waiting to try again..."); tokio::time::sleep(retry_period).await; - retry_period = max_retry_period; + retry_period = MAX_RETRY_PERIOD; + } + StatusCode::CONFLICT => { + tracing::info!("Deletion already in progress, waiting to try again..."); + tokio::time::sleep(retry_period).await; } StatusCode::NOT_FOUND => { tracing::info!("Deletion complete"); @@ -409,7 +415,7 @@ async fn handle_tenant_timeline_delete( } let now = Instant::now(); - if now + retry_period > started_at + max_wait { + if now + retry_period > started_at + MAX_WAIT { tracing::info!("Deletion timed out waiting for 404"); // REQUEST_TIMEOUT would be more appropriate, but CONFLICT is already part of // the pageserver's swagger definition for this endpoint, and has the same desired diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index ab2c3b5e48..01aa8f1dab 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3630,14 +3630,21 @@ impl Service { ); let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client + let res = client .timeline_delete(tenant_shard_id, timeline_id) - .await - .map_err(|e| { - ApiError::InternalServerError(anyhow::anyhow!( - "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", - )) - }) + .await; + + match res { + Ok(ok) => Ok(ok), + Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(e) => { + Err( + ApiError::InternalServerError(anyhow::anyhow!( + "Error deleting timeline {timeline_id} on {tenant_shard_id} on node {node}: {e}", + )) + ) + } + } } let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); @@ -3652,7 +3659,13 @@ impl Service { }) .await?; - // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero + // If any shards >0 haven't finished deletion yet, don't start deletion on shard zero. + // We return 409 (Conflict) if deletion was already in progress on any of the shards + // and 202 (Accepted) if deletion was not already in progress on any of the shards. 
+ if statuses.iter().any(|s| s == &StatusCode::CONFLICT) { + return Ok(StatusCode::CONFLICT); + } + if statuses.iter().any(|s| s != &StatusCode::NOT_FOUND) { return Ok(StatusCode::ACCEPTED); } From 62a334871fef32b754ab98a772ebbbbed8d1aa1c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 18 Oct 2024 09:36:29 -0500 Subject: [PATCH 049/239] Take the collector name as argument when generating sql_exporter configs In neon_collector_autoscaling.jsonnet, the collector name is hardcoded to neon_collector_autoscaling. This issue manifests itself such that sql_exporter would not find the collector configuration. Signed-off-by: Tristan Partin --- compute/Makefile | 2 ++ compute/etc/sql_exporter.jsonnet | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/compute/Makefile b/compute/Makefile index e4f08a223c..e2896fe390 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -20,12 +20,14 @@ neon_collector_autoscaling.yml: $(jsonnet_files) sql_exporter.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ + --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) JSONNET_PATH=etc jsonnet \ --output-file etc/$@ \ + --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ --tla-str application_name=sql_exporter_autoscaling \ etc/sql_exporter.jsonnet diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 640e2ac38d..3c36fd4f68 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -1,4 +1,4 @@ -function(collector_file, application_name='sql_exporter') { +function(collector_name, collector_file, application_name='sql_exporter') { // Configuration for sql_exporter for autoscaling-agent // Global defaults. global: { @@ -28,7 +28,7 @@ function(collector_file, application_name='sql_exporter') { // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). collectors: [ - 'neon_collector', + collector_name, ], }, From 71d09c78d4ffd159cfcd83c4c1b919a4c7eef7c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 19 Oct 2024 00:23:49 +0200 Subject: [PATCH 050/239] Accept basebackup --gzip requests (#9456) In #9453, we want to remove the non-gzipped basebackup code in the computes, and always request gzipped basebackups. However, right now the pageserver's page service only accepts basebackup requests in the following formats: * `basebackup `, lsn is determined by the pageserver as the most recent one (`timeline.get_last_record_rlsn()`) * `basebackup ` * `basebackup --gzip` We add a fourth case, `basebackup --gzip` to allow gzipping the request for the latest lsn as well. 
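For reference, a stand-alone sketch of the argument handling after this change. `parse_basebackup_params` is a made-up helper for illustration and a plain `&str` stands in for the parsed `Lsn`; the real logic lives in the `page_service.rs` hunk below.

```rust
/// The four accepted shapes, where params[0..2] are tenant and timeline:
///   basebackup <tenant> <timeline>
///   basebackup <tenant> <timeline> <lsn>
///   basebackup <tenant> <timeline> <lsn> --gzip
///   basebackup <tenant> <timeline> --gzip          (new in this change)
fn parse_basebackup_params<'a>(params: &[&'a str]) -> Result<(Option<&'a str>, bool), String> {
    match (params.get(2), params.get(3)) {
        // No LSN, no flag: latest LSN, uncompressed.
        (None, _) => Ok((None, false)),
        // No LSN, gzip requested: the newly accepted case.
        (Some(&"--gzip"), _) => Ok((None, true)),
        // Explicit LSN, optionally followed by --gzip.
        (Some(lsn_str), gzip_opt) => {
            let gzip = match gzip_opt {
                Some(&"--gzip") => true,
                None => false,
                Some(other) => return Err(format!("Parameter in position 3 unknown {other}")),
            };
            Ok((Some(*lsn_str), gzip))
        }
    }
}
```

Previously a bare `--gzip` in position 2 would be fed to `Lsn::from_str` and rejected as an unparseable LSN.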
--- pageserver/src/page_service.rs | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index afb2f92ff8..62b14cb83e 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1326,22 +1326,22 @@ where .for_command(ComputeCommandKind::Basebackup) .inc(); - let lsn = if let Some(lsn_str) = params.get(2) { - Some( - Lsn::from_str(lsn_str) - .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, - ) - } else { - None - }; - - let gzip = match params.get(3) { - Some(&"--gzip") => true, - None => false, - Some(third_param) => { - return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {third_param}", - ))) + let (lsn, gzip) = match (params.get(2), params.get(3)) { + (None, _) => (None, false), + (Some(&"--gzip"), _) => (None, true), + (Some(lsn_str), gzip_str_opt) => { + let lsn = Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?; + let gzip = match gzip_str_opt { + Some(&"--gzip") => true, + None => false, + Some(third_param) => { + return Err(QueryError::Other(anyhow::anyhow!( + "Parameter in position 3 unknown {third_param}", + ))) + } + }; + (Some(lsn), gzip) } }; From cc25ef73423ea0108986436501481b0154443932 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Sun, 20 Oct 2024 13:42:50 +0100 Subject: [PATCH 051/239] bump pg-session-jwt version (#9455) forgot to bump this before --- proxy/src/serverless/local_conn_pool.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index beb2ad4e8f..e1ad46c751 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -39,7 +39,7 @@ use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::{DbName, RoleName}; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; -pub(crate) const EXT_VERSION: &str = "0.1.1"; +pub(crate) const EXT_VERSION: &str = "0.1.2"; pub(crate) const EXT_SCHEMA: &str = "auth"; struct ConnPoolEntry { From ed958da38a0edf7853ee999f43737ac2ff69f920 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 21 Oct 2024 10:29:23 +0200 Subject: [PATCH 052/239] proxy: Make tests fail fast when test proxy exited early (#9432) This currently happens when proxy is not compiled with feature `testing`. Also fix an adjacent function. --- test_runner/fixtures/neon_fixtures.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 3cd8019e32..747c2c0d63 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3175,10 +3175,13 @@ class NeonProxy(PgProtocol): # two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time. def wait_for_exit(self, timeout=2): if self._popen: - self._popen.wait(timeout=2) + self._popen.wait(timeout=timeout) @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) def _wait_until_ready(self): + assert ( + self._popen and self._popen.poll() is None + ), "Proxy exited unexpectedly. Check test log." 
requests.get(f"http://{self.host}:{self.http_port}/v1/status") def http_query(self, query, args, **kwargs): From 5b37485c99836abb060bed8eb1172870b31504b2 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 21 Oct 2024 09:51:12 +0100 Subject: [PATCH 053/239] Rename dockerfiles from `Dockerfile.` to `.Dockerfile` (#9446) ## Problem Our dockerfiles, for some historical reason, have unconventional names `Dockerfile.`, and some tools (like GitHub UI) fail to highlight the syntax in them. > Some projects may need distinct Dockerfiles for specific purposes. A common convention is to name these `.Dockerfile` From: https://docs.docker.com/build/concepts/dockerfile/#filename ## Summary of changes - Rename `Dockerfile.build-tools` -> `build-tools.Dockerfile` - Rename `compute/Dockerfile.compute-node` -> `compute/compute-node.Dockerfile` --- .github/workflows/_build-and-test-locally.yml | 8 ++++---- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 6 +++--- .github/workflows/check-build-tools-image.yml | 2 +- .github/workflows/trigger-e2e-tests.yml | 2 +- Dockerfile.build-tools => build-tools.Dockerfile | 2 +- compute/README.md | 6 +++--- .../{Dockerfile.compute-node => compute-node.Dockerfile} | 0 docs/docker.md | 6 +++--- 9 files changed, 17 insertions(+), 17 deletions(-) rename Dockerfile.build-tools => build-tools.Dockerfile (99%) rename compute/{Dockerfile.compute-node => compute-node.Dockerfile} (100%) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index 3aa671fab1..c0f59fbdd5 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -124,28 +124,28 @@ jobs: uses: actions/cache@v4 with: path: pg_install/v14 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v14_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v15 build id: cache_pg_15 uses: actions/cache@v4 with: path: pg_install/v15 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v15_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v16 build id: cache_pg_16 uses: actions/cache@v4 with: path: pg_install/v16 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v16_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Cache postgres v17 build id: cache_pg_17 uses: actions/cache@v4 with: path: pg_install/v17 - key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'Dockerfile.build-tools') }} + key: v1-${{ runner.os }}-${{ runner.arch }}-${{ inputs.build-type }}-pg-${{ steps.pg_v17_rev.outputs.pg_rev }}-bookworm-${{ hashFiles('Makefile', 'build-tools.Dockerfile') }} - name: Build postgres v14 if: 
steps.cache_pg_14.outputs.cache-hit != 'true' diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 0f05276579..10750089b2 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -82,7 +82,7 @@ jobs: - uses: docker/build-push-action@v6 with: - file: Dockerfile.build-tools + file: build-tools.Dockerfile context: . provenance: false push: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index b669eaeb11..1186b9927b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -683,7 +683,7 @@ jobs: provenance: false push: true pull: true - file: compute/Dockerfile.compute-node + file: compute/compute-node.Dockerfile cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-node-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | @@ -703,7 +703,7 @@ jobs: provenance: false push: true pull: true - file: compute/Dockerfile.compute-node + file: compute/compute-node.Dockerfile target: neon-pg-ext-test cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} @@ -728,7 +728,7 @@ jobs: provenance: false push: true pull: true - file: compute/Dockerfile.compute-node + file: compute/compute-node.Dockerfile cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | diff --git a/.github/workflows/check-build-tools-image.yml b/.github/workflows/check-build-tools-image.yml index 807a9ef3bd..a7a15ad58b 100644 --- a/.github/workflows/check-build-tools-image.yml +++ b/.github/workflows/check-build-tools-image.yml @@ -31,7 +31,7 @@ jobs: id: get-build-tools-tag env: IMAGE_TAG: | - ${{ hashFiles('Dockerfile.build-tools', + ${{ hashFiles('build-tools.Dockerfile', '.github/workflows/check-build-tools-image.yml', '.github/workflows/build-build-tools-image.yml') }} run: | diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 5c5423e252..1e7264c55a 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -112,7 +112,7 @@ jobs: # This isn't exhaustive, just the paths that are most directly compute-related. # For example, compute_ctl also depends on libs/utils, but we don't trigger # an e2e run on that. - vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) + vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/compute-node.Dockerfile) platforms=$(echo "${platforms}" | jq --compact-output '. 
+= ["k8s-neonvm"] | unique') ;; *) diff --git a/Dockerfile.build-tools b/build-tools.Dockerfile similarity index 99% rename from Dockerfile.build-tools rename to build-tools.Dockerfile index f05c60661c..818cc1b6db 100644 --- a/Dockerfile.build-tools +++ b/build-tools.Dockerfile @@ -142,7 +142,7 @@ RUN wget -O /tmp/openssl-${OPENSSL_VERSION}.tar.gz https://www.openssl.org/sourc # Use the same version of libicu as the compute nodes so that # clusters created using inidb on pageserver can be used by computes. # -# TODO: at this time, Dockerfile.compute-node uses the debian bullseye libicu +# TODO: at this time, compute-node.Dockerfile uses the debian bullseye libicu # package, which is 67.1. We're duplicating that knowledge here, and also, technically, # Debian has a few patches on top of 67.1 that we're not adding here. ENV ICU_VERSION=67.1 diff --git a/compute/README.md b/compute/README.md index bb1e42ab53..61e0eee4be 100644 --- a/compute/README.md +++ b/compute/README.md @@ -1,7 +1,7 @@ This directory contains files that are needed to build the compute images, or included in the compute images. -Dockerfile.compute-node +compute-node.Dockerfile To build the compute image vm-image-spec.yaml @@ -14,8 +14,8 @@ etc/ patches/ Some extensions need to be patched to work with Neon. This directory contains such patches. They are applied to the extension - sources in Dockerfile.compute-node + sources in compute-node.Dockerfile In addition to these, postgres itself, the neon postgres extension, and compute_ctl are built and copied into the compute image by -Dockerfile.compute-node. +compute-node.Dockerfile. diff --git a/compute/Dockerfile.compute-node b/compute/compute-node.Dockerfile similarity index 100% rename from compute/Dockerfile.compute-node rename to compute/compute-node.Dockerfile diff --git a/docs/docker.md b/docs/docker.md index d16311c27b..0914a00082 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -5,7 +5,7 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). -- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node). +- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/compute-node.Dockerfile). And additional intermediate image: @@ -56,7 +56,7 @@ CREATE TABLE postgres=# insert into t values(1, 1); INSERT 0 1 postgres=# select * from t; - key | value + key | value -----+------- 1 | 1 (1 row) @@ -84,4 +84,4 @@ Access http://localhost:9001 and sign in. - Username: `minio` - Password: `password` -You can see durable pages and WAL data in `neon` bucket. \ No newline at end of file +You can see durable pages and WAL data in `neon` bucket. 
From 163beaf9ad8521ec28d451d1ea884039efcb8897 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 21 Oct 2024 12:14:19 +0100 Subject: [PATCH 054/239] CI: use build-tools on Debian 12 whenever we use Neon artifact (#9463) ## Problem ``` + /tmp/neon/pg_install/v16/bin/psql '***' -c 'SELECT version()' /tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /tmp/neon/pg_install/v16/bin/psql) /tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /tmp/neon/pg_install/v16/bin/psql) /tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.32' not found (required by /tmp/neon/pg_install/v16/lib/libpq.so.5) /tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.33' not found (required by /tmp/neon/pg_install/v16/lib/libpq.so.5) /tmp/neon/pg_install/v16/bin/psql: /lib/x86_64-linux-gnu/libc.so.6: version `GLIBC_2.34' not found (required by /tmp/neon/pg_install/v16/lib/libpq.so.5) ``` ## Summary of changes - Use `build-tools:pinned-bookworm` whenever we download Neon artefact --- .../workflows/_benchmarking_preparation.yml | 2 +- .github/workflows/benchmarking.yml | 18 +++++++++--------- .github/workflows/cloud-regress.yml | 2 +- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/_benchmarking_preparation.yml b/.github/workflows/_benchmarking_preparation.yml index d60f97320b..5cdc16f248 100644 --- a/.github/workflows/_benchmarking_preparation.yml +++ b/.github/workflows/_benchmarking_preparation.yml @@ -27,7 +27,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 32806b89ab..5ccfe48684 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -83,7 +83,7 @@ jobs: runs-on: ${{ matrix.RUNNER }} container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -178,7 +178,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -280,7 +280,7 @@ jobs: region_id_default=${{ env.DEFAULT_REGION_ID }} runner_default='["self-hosted", "us-east-2", "x64"]' runner_azure='["self-hosted", "eastus2", "x64"]' - image_default="neondatabase/build-tools:pinned" + image_default="neondatabase/build-tools:pinned-bookworm" matrix='{ "pg_version" : [ 16 @@ -299,9 +299,9 @@ jobs: "include": [{ "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "10gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-new", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }, - { "pg_version": 16, 
"region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, - { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-freetier", "db_size": "3gb" ,"runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "10gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, + { "pg_version": 16, "region_id": "azure-eastus2", "platform": "neonvm-azure-captest-new", "db_size": "50gb","runner": '"$runner_azure"', "image": "neondatabase/build-tools:pinned-bookworm" }, { "pg_version": 16, "region_id": "'"$region_id_default"'", "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb","runner": '"$runner_default"', "image": "'"$image_default"'" }] }' @@ -665,7 +665,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -772,7 +772,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} @@ -877,7 +877,7 @@ jobs: runs-on: [ self-hosted, us-east-2, x64 ] container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm credentials: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} diff --git a/.github/workflows/cloud-regress.yml b/.github/workflows/cloud-regress.yml index ecafe183f8..19ebf457b8 100644 --- a/.github/workflows/cloud-regress.yml +++ b/.github/workflows/cloud-regress.yml @@ -31,7 +31,7 @@ jobs: runs-on: us-east-2 container: - image: neondatabase/build-tools:pinned + image: neondatabase/build-tools:pinned-bookworm options: --init steps: From ababa50cce5e05df4d3d9fcf617a1b2625ed3b4a Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 21 Oct 2024 16:20:39 +0300 Subject: [PATCH 055/239] Use '-f' for make clean in Makefile compute (#9464) Use '-f' instead of '--force' because it is impossible to clean the targets on MacOS --- compute/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/Makefile b/compute/Makefile index e2896fe390..08e3c7a68b 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -34,7 +34,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files) .PHONY: clean clean: - rm --force \ + rm -f \ etc/neon_collector.yml \ etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ From 2dcac94194bedb47e06bc6a98467125e3cfaf07b Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Mon, 21 Oct 2024 17:20:09 +0300 Subject: [PATCH 056/239] proxy: Use common error interface for error handling with cplane (#9454) - Remove obsolete error handles. - Use one source of truth for cplane errors. 
#18468 --- proxy/src/control_plane/messages.rs | 6 ++- proxy/src/control_plane/provider/mod.rs | 32 +------------ proxy/src/metrics.rs | 16 +------ proxy/src/proxy/wake_compute.rs | 62 ++----------------------- 4 files changed, 12 insertions(+), 104 deletions(-) diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index dae23f7c53..13a54145b1 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -161,6 +161,9 @@ pub(crate) enum Reason { /// LockAlreadyTaken indicates that the we attempted to take a lock that was already taken. #[serde(rename = "LOCK_ALREADY_TAKEN")] LockAlreadyTaken, + /// ActiveEndpointsLimitExceeded indicates that the limit of concurrently active endpoints was exceeded. + #[serde(rename = "ACTIVE_ENDPOINTS_LIMIT_EXCEEDED")] + ActiveEndpointsLimitExceeded, #[default] #[serde(other)] Unknown, @@ -194,7 +197,8 @@ impl Reason { | Reason::ComputeTimeQuotaExceeded | Reason::WrittenDataQuotaExceeded | Reason::DataTransferQuotaExceeded - | Reason::LogicalSizeQuotaExceeded => false, + | Reason::LogicalSizeQuotaExceeded + | Reason::ActiveEndpointsLimitExceeded => false, // transitive error. control plane is currently busy // but might be ready soon Reason::RunningOperations diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index a4a330cd5f..88399dffa8 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -87,36 +87,8 @@ pub(crate) mod errors { Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, Reason::LockAlreadyTaken => ErrorKind::ControlPlane, Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::Unknown => match &**e { - ControlPlaneError { - http_status_code: - http::StatusCode::NOT_FOUND | http::StatusCode::NOT_ACCEPTABLE, - .. - } => crate::error::ErrorKind::User, - ControlPlaneError { - http_status_code: http::StatusCode::UNPROCESSABLE_ENTITY, - error, - .. - } if error - .contains("compute time quota of non-primary branches is exceeded") => - { - crate::error::ErrorKind::Quota - } - ControlPlaneError { - http_status_code: http::StatusCode::LOCKED, - error, - .. - } if error.contains("quota exceeded") - || error.contains("the limit for current plan reached") => - { - crate::error::ErrorKind::Quota - } - ControlPlaneError { - http_status_code: http::StatusCode::TOO_MANY_REQUESTS, - .. - } => crate::error::ErrorKind::ServiceRateLimit, - ControlPlaneError { .. 
} => crate::error::ErrorKind::ControlPlane, - }, + Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane, + Reason::Unknown => ErrorKind::ControlPlane, }, ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, } diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index 542826e833..f91fcd4120 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -14,6 +14,7 @@ use metrics::{CounterPairAssoc, CounterPairVec, HyperLogLog, HyperLogLogVec}; use tokio::time::{self, Instant}; use crate::control_plane::messages::ColdStartInfo; +use crate::error::ErrorKind; #[derive(MetricGroup)] #[metric(new(thread_pool: Arc))] @@ -325,23 +326,10 @@ pub enum ConnectionFailureKind { ComputeUncached, } -#[derive(FixedCardinalityLabel, Copy, Clone)] -#[label(singleton = "kind")] -pub enum WakeupFailureKind { - BadComputeAddress, - ApiTransportError, - QuotaExceeded, - ApiConsoleLocked, - ApiConsoleBadRequest, - ApiConsoleOtherServerError, - ApiConsoleOtherError, - TimeoutError, -} - #[derive(LabelGroup)] #[label(set = ConnectionFailuresBreakdownSet)] pub struct ConnectionFailuresBreakdownGroup { - pub kind: WakeupFailureKind, + pub kind: ErrorKind, pub retry: Bool, } diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 9dfa485fa4..4e61094264 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -1,15 +1,13 @@ -use hyper::StatusCode; use tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use crate::context::RequestMonitoring; use crate::control_plane::errors::WakeComputeError; -use crate::control_plane::messages::{ControlPlaneError, Reason}; use crate::control_plane::provider::CachedNodeInfo; +use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, - WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; @@ -60,62 +58,8 @@ pub(crate) async fn wake_compute( } fn report_error(e: &WakeComputeError, retry: bool) { - use crate::control_plane::errors::ApiError; - let kind = match e { - WakeComputeError::BadComputeAddress(_) => WakeupFailureKind::BadComputeAddress, - WakeComputeError::ApiError(ApiError::Transport(_)) => WakeupFailureKind::ApiTransportError, - WakeComputeError::ApiError(ApiError::ControlPlane(e)) => match e.get_reason() { - Reason::RoleProtected => WakeupFailureKind::ApiConsoleBadRequest, - Reason::ResourceNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::ProjectNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::EndpointNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::BranchNotFound => WakeupFailureKind::ApiConsoleBadRequest, - Reason::RateLimitExceeded => WakeupFailureKind::ApiConsoleLocked, - Reason::NonDefaultBranchComputeTimeExceeded => WakeupFailureKind::QuotaExceeded, - Reason::ActiveTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::ComputeTimeQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::WrittenDataQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::DataTransferQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::LogicalSizeQuotaExceeded => WakeupFailureKind::QuotaExceeded, - Reason::ConcurrencyLimitReached => WakeupFailureKind::ApiConsoleLocked, - Reason::LockAlreadyTaken => WakeupFailureKind::ApiConsoleLocked, - Reason::RunningOperations => WakeupFailureKind::ApiConsoleLocked, - Reason::Unknown => match **e { - ControlPlaneError { - 
http_status_code: StatusCode::LOCKED, - ref error, - .. - } if error.contains("written data quota exceeded") - || error.contains("the limit for current plan reached") => - { - WakeupFailureKind::QuotaExceeded - } - ControlPlaneError { - http_status_code: StatusCode::UNPROCESSABLE_ENTITY, - ref error, - .. - } if error.contains("compute time quota of non-primary branches is exceeded") => { - WakeupFailureKind::QuotaExceeded - } - ControlPlaneError { - http_status_code: StatusCode::LOCKED, - .. - } => WakeupFailureKind::ApiConsoleLocked, - ControlPlaneError { - http_status_code: StatusCode::BAD_REQUEST, - .. - } => WakeupFailureKind::ApiConsoleBadRequest, - ControlPlaneError { - http_status_code, .. - } if http_status_code.is_server_error() => { - WakeupFailureKind::ApiConsoleOtherServerError - } - ControlPlaneError { .. } => WakeupFailureKind::ApiConsoleOtherError, - }, - }, - WakeComputeError::TooManyConnections => WakeupFailureKind::ApiConsoleLocked, - WakeComputeError::TooManyConnectionAttempts(_) => WakeupFailureKind::TimeoutError, - }; + let kind = e.get_error_kind(); + Metrics::get() .proxy .connection_failures_breakdown From aca81f5fa4f3e0f882a9b0d55eef1cdee8ffc168 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 21 Oct 2024 10:59:48 -0400 Subject: [PATCH 057/239] fix(pageserver): make image split layer writer finish atomic (#8841) Part of https://github.com/neondatabase/neon/issues/8836 ## Summary of changes This pull request makes the image layer split writer atomic when finishing the layers. All the produced layers either finish at the same time, or discard at the same time. Note that this does not prevent atomicity when crash, but anyways, it will be cleaned up on pageserver restart. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 19 ++ .../src/tenant/storage_layer/split_writer.rs | 231 ++++++++++-------- pageserver/src/tenant/timeline/compaction.rs | 7 +- 4 files changed, 149 insertions(+), 112 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index d1079876f8..6332d36dc3 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -515,8 +515,8 @@ impl DeltaLayerWriterInner { ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let temp_path = self.path.clone(); let result = self.finish0(key_end, ctx).await; - if result.is_err() { - tracing::info!(%temp_path, "cleaning up temporary file after error during writing"); + if let Err(ref e) = result { + tracing::info!(%temp_path, "cleaning up temporary file after error during writing: {e}"); if let Err(e) = std::fs::remove_file(&temp_path) { tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 6c1a943470..b1f2557038 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -827,6 +827,25 @@ impl ImageLayerWriterInner { self, ctx: &RequestContext, end_key: Option, + ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { + let temp_path = self.path.clone(); + let result = self.finish0(ctx, end_key).await; + if let Err(ref e) = result { + tracing::info!(%temp_path, "cleaning up 
temporary file after error during writing: {e}"); + if let Err(e) = std::fs::remove_file(&temp_path) { + tracing::warn!(error=%e, %temp_path, "error cleaning up temporary layer file after error during writing"); + } + } + result + } + + /// + /// Finish writing the image layer. + /// + async fn finish0( + self, + ctx: &RequestContext, + end_key: Option, ) -> anyhow::Result<(PersistentLayerDesc, Utf8PathBuf)> { let index_start_blk = self.blob_writer.size().div_ceil(PAGE_SZ as u64) as u32; diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index b499a0eef4..5bd9a47e2b 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -42,7 +42,7 @@ impl SplitWriterResult { pub struct SplitImageLayerWriter { inner: ImageLayerWriter, target_layer_size: u64, - generated_layers: Vec, + generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -71,7 +71,7 @@ impl SplitImageLayerWriter { ctx, ) .await?, - generated_layers: Vec::new(), + generated_layer_writers: Vec::new(), conf, timeline_id, tenant_shard_id, @@ -80,18 +80,12 @@ impl SplitImageLayerWriter { }) } - pub async fn put_image_with_discard_fn( + pub async fn put_image( &mut self, key: Key, img: Bytes, - tline: &Arc, ctx: &RequestContext, - discard: D, - ) -> anyhow::Result<()> - where - D: FnOnce(&PersistentLayerKey) -> F, - F: Future, - { + ) -> anyhow::Result<()> { // The current estimation is an upper bound of the space that the key/image could take // because we did not consider compression in this estimation. The resulting image layer // could be smaller than the target size. @@ -108,72 +102,83 @@ impl SplitImageLayerWriter { ctx, ) .await?; - let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); let layer_key = PersistentLayerKey { key_range: self.start_key..key, lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), is_delta: false, }; + let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); self.start_key = key; - if discard(&layer_key).await { - drop(prev_image_writer); - self.generated_layers - .push(SplitWriterResult::Discarded(layer_key)); - } else { - let (desc, path) = prev_image_writer.finish_with_end_key(key, ctx).await?; - - let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - self.generated_layers - .push(SplitWriterResult::Produced(layer)); - } + self.generated_layer_writers + .push((prev_image_writer, layer_key)); } self.inner.put_image(key, img, ctx).await } - #[cfg(test)] - pub async fn put_image( - &mut self, - key: Key, - img: Bytes, - tline: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.put_image_with_discard_fn(key, img, tline, ctx, |_| async { false }) - .await - } - pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, end_key: Key, - discard: D, + discard_fn: D, ) -> anyhow::Result> where - D: FnOnce(&PersistentLayerKey) -> F, + D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { - mut generated_layers, + mut generated_layer_writers, inner, .. 
} = self; - if inner.num_keys() == 0 { - return Ok(generated_layers); + if inner.num_keys() != 0 { + let layer_key = PersistentLayerKey { + key_range: self.start_key..end_key, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), + is_delta: false, + }; + generated_layer_writers.push((inner, layer_key)); } - let layer_key = PersistentLayerKey { - key_range: self.start_key..end_key, - lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), - is_delta: false, + let clean_up_layers = |generated_layers: Vec| { + for produced_layer in generated_layers { + if let SplitWriterResult::Produced(image_layer) = produced_layer { + let layer: Layer = image_layer.into(); + layer.delete_on_drop(); + } + } }; - if discard(&layer_key).await { - generated_layers.push(SplitWriterResult::Discarded(layer_key)); - } else { - let (desc, path) = inner.finish_with_end_key(end_key, ctx).await?; - let layer = Layer::finish_creating(self.conf, tline, desc, &path)?; - generated_layers.push(SplitWriterResult::Produced(layer)); + // BEGIN: catch every error and do the recovery in the below section + let mut generated_layers = Vec::new(); + for (inner, layer_key) in generated_layer_writers { + if discard_fn(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let layer = match inner + .finish_with_end_key(layer_key.key_range.end, ctx) + .await + { + Ok((desc, path)) => { + match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + clean_up_layers(generated_layers); + return Err(e); + } + } + } + Err(e) => { + // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong, + // so we don't need to remove it by ourselves. + clean_up_layers(generated_layers); + return Err(e); + } + }; + generated_layers.push(SplitWriterResult::Produced(layer)); + } } + // END: catch every error and do the recovery in the above section Ok(generated_layers) } @@ -187,11 +192,6 @@ impl SplitImageLayerWriter { self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) .await } - - /// This function will be deprecated with #8841. - pub(crate) fn take(self) -> anyhow::Result<(Vec, ImageLayerWriter)> { - Ok((self.generated_layers, self.inner)) - } } /// A delta writer that takes key-lsn-values and produces multiple delta layers. @@ -296,8 +296,16 @@ impl SplitDeltaLayerWriter { self.generated_layers .push(SplitWriterResult::Discarded(layer_key)); } else { + // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary + // files for `finish_creating`. let (desc, path) = prev_delta_writer.finish(key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + return Err(e); + } + }; self.generated_layers .push(SplitWriterResult::Produced(delta_layer)); } @@ -357,8 +365,16 @@ impl SplitDeltaLayerWriter { if discard(&layer_key).await { generated_layers.push(SplitWriterResult::Discarded(layer_key)); } else { + // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary + // files for `finish_creating`. 
let (desc, path) = inner.finish(end_key, ctx).await?; - let delta_layer = Layer::finish_creating(self.conf, tline, desc, &path)?; + let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + return Err(e); + } + }; generated_layers.push(SplitWriterResult::Produced(delta_layer)); } Ok(generated_layers) @@ -447,7 +463,7 @@ mod tests { .unwrap(); image_writer - .put_image(get_key(0), get_img(0), &tline, &ctx) + .put_image(get_key(0), get_img(0), &ctx) .await .unwrap(); let layers = image_writer @@ -486,14 +502,18 @@ mod tests { #[tokio::test] async fn write_split() { + // Test the split writer with retaining all the layers we have produced (discard=false) write_split_helper("split_writer_write_split", false).await; } #[tokio::test] async fn write_split_discard() { - write_split_helper("split_writer_write_split_discard", false).await; + // Test the split writer with discarding all the layers we have produced (discard=true) + write_split_helper("split_writer_write_split_discard", true).await; } + /// Test the image+delta writer by writing a large number of images and deltas. If discard is + /// set to true, all layers will be discarded. async fn write_split_helper(harness_name: &'static str, discard: bool) { let harness = TenantHarness::create(harness_name).await.unwrap(); let (tenant, ctx) = harness.load().await; @@ -527,9 +547,7 @@ mod tests { for i in 0..N { let i = i as u32; image_writer - .put_image_with_discard_fn(get_key(i), get_large_img(), &tline, &ctx, |_| async { - discard - }) + .put_image(get_key(i), get_large_img(), &ctx) .await .unwrap(); delta_writer @@ -545,51 +563,54 @@ mod tests { .unwrap(); } let image_layers = image_writer - .finish(&tline, &ctx, get_key(N as u32)) + .finish_with_discard_fn(&tline, &ctx, get_key(N as u32), |_| async { discard }) .await .unwrap(); - let delta_layers = delta_writer.finish(&tline, &ctx).await.unwrap(); - if discard { - for layer in image_layers { - layer.into_discarded_layer(); - } - for layer in delta_layers { - layer.into_discarded_layer(); - } - } else { - let image_layers = image_layers - .into_iter() - .map(|x| x.into_resident_layer()) - .collect_vec(); - let delta_layers = delta_layers - .into_iter() - .map(|x| x.into_resident_layer()) - .collect_vec(); - assert_eq!(image_layers.len(), N / 512 + 1); - assert_eq!(delta_layers.len(), N / 512 + 1); - assert_eq!( - delta_layers.first().unwrap().layer_desc().key_range.start, - get_key(0) - ); - assert_eq!( - delta_layers.last().unwrap().layer_desc().key_range.end, - get_key(N as u32) - ); - for idx in 0..image_layers.len() { - assert_ne!(image_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(image_layers[idx].layer_desc().key_range.end, Key::MAX); - assert_ne!(delta_layers[idx].layer_desc().key_range.start, Key::MIN); - assert_ne!(delta_layers[idx].layer_desc().key_range.end, Key::MAX); - if idx > 0 { - assert_eq!( - image_layers[idx - 1].layer_desc().key_range.end, - image_layers[idx].layer_desc().key_range.start - ); - assert_eq!( - delta_layers[idx - 1].layer_desc().key_range.end, - delta_layers[idx].layer_desc().key_range.start - ); + let delta_layers = delta_writer + .finish_with_discard_fn(&tline, &ctx, |_| async { discard }) + .await + .unwrap(); + let image_layers = image_layers + .into_iter() + .map(|x| { + if discard { + x.into_discarded_layer() + } else { + x.into_resident_layer().layer_desc().key() } + }) + .collect_vec(); + let delta_layers = delta_layers + 
.into_iter() + .map(|x| { + if discard { + x.into_discarded_layer() + } else { + x.into_resident_layer().layer_desc().key() + } + }) + .collect_vec(); + assert_eq!(image_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.len(), N / 512 + 1); + assert_eq!(delta_layers.first().unwrap().key_range.start, get_key(0)); + assert_eq!( + delta_layers.last().unwrap().key_range.end, + get_key(N as u32) + ); + for idx in 0..image_layers.len() { + assert_ne!(image_layers[idx].key_range.start, Key::MIN); + assert_ne!(image_layers[idx].key_range.end, Key::MAX); + assert_ne!(delta_layers[idx].key_range.start, Key::MIN); + assert_ne!(delta_layers[idx].key_range.end, Key::MAX); + if idx > 0 { + assert_eq!( + image_layers[idx - 1].key_range.end, + image_layers[idx].key_range.start + ); + assert_eq!( + delta_layers[idx - 1].key_range.end, + delta_layers[idx].key_range.start + ); } } } @@ -629,11 +650,11 @@ .unwrap(); image_writer - .put_image(get_key(0), get_img(0), &tline, &ctx) + .put_image(get_key(0), get_img(0), &ctx) .await .unwrap(); image_writer - .put_image(get_key(1), get_large_img(), &tline, &ctx) + .put_image(get_key(1), get_large_img(), &ctx) .await .unwrap(); let layers = image_writer diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5588363330..5cb1460b29 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -141,9 +141,7 @@ impl KeyHistoryRetention { }; stat.produce_image_key(img); if let Some(image_writer) = image_writer.as_mut() { - image_writer - .put_image_with_discard_fn(key, img.clone(), tline, ctx, discard) - .await?; + image_writer.put_image(key, img.clone(), ctx).await?; } else { delta_writer .put_value_with_discard_fn( @@ -2041,8 +2039,7 @@ impl Timeline { .finish_with_discard_fn(self, ctx, Key::MAX, discard) .await? } else { - let (layers, _) = writer.take()?; - assert!(layers.is_empty(), "image layers produced in dry run mode?"); + drop(writer); Vec::new() } } else { From 49d5e56c084f1fd694cde75d56c2d8ed9049c06e Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Mon, 21 Oct 2024 11:01:25 -0400 Subject: [PATCH 058/239] pageserver: use direct IO for delta and image layer reads (#9326) Part of #8130 ## Problem The pageserver previously went through the kernel page cache for all IOs. The kernel page cache makes a lightly loaded pageserver look deceptively fast. Using direct IO offers predictable latencies for our virtual file IO operations. For reads in particular, the data pages also have extremely low temporal locality, because the most frequently accessed pages are cached on the compute side. ## Summary of changes This PR enables the pageserver to use direct IO for delta layer and image layer reads. We can ship them separately because these layers are write-once, read-many, so we will not be mixing buffered IO with direct IO. - implement `IoBufferMut`, a buffer type with aligned allocation (currently set to 512). - use `IoBufferMut` at all places where we do reads on image + delta layers. - leverage the Rust type system and the `IoBufAlignedMut` marker trait to guarantee that the input buffers for the IO operations are aligned. - page cache allocation is also made aligned. 
_* in-memory layer reads and the write path will be shipped separately._ ## Testing Integration test suite run with O_DIRECT enabled: https://github.com/neondatabase/neon/pull/9350 ## Performance We evaluated performance based on the `get-page-at-latest-lsn` benchmark. The results demonstrate a decrease in the number of IOps, no significant change in the mean latency, and a slight improvement in the p99.9 and p99.99 latencies. [Benchmark](https://www.notion.so/neondatabase/Benchmark-O_DIRECT-for-image-and-delta-layers-2024-10-01-112f189e00478092a195ea5a0137e706?pvs=4) ## Rollout We will add `virtual_file_io_mode=direct` region by region to enable direct IO on image + delta layers. Signed-off-by: Yuchen Liang --- pageserver/benches/bench_ingest.rs | 6 +- pageserver/ctl/src/layer_map_analyzer.rs | 7 +- pageserver/ctl/src/layers.rs | 13 +- pageserver/ctl/src/main.rs | 8 +- pageserver/src/bin/pageserver.rs | 6 +- pageserver/src/page_cache.rs | 16 +- pageserver/src/tenant/block_io.rs | 6 +- pageserver/src/tenant/ephemeral_file.rs | 28 +- .../src/tenant/storage_layer/delta_layer.rs | 15 +- .../src/tenant/storage_layer/image_layer.rs | 19 +- .../tenant/storage_layer/inmemory_layer.rs | 8 +- .../inmemory_layer/vectored_dio_read.rs | 23 +- pageserver/src/tenant/vectored_blob_io.rs | 9 +- pageserver/src/virtual_file.rs | 40 +- .../owned_buffers_io/aligned_buffer.rs | 9 + .../aligned_buffer/alignment.rs | 26 ++ .../owned_buffers_io/aligned_buffer/buffer.rs | 124 +++++++ .../aligned_buffer/buffer_mut.rs | 347 ++++++++++++++++++ .../owned_buffers_io/aligned_buffer/raw.rs | 216 +++++++++++ .../owned_buffers_io/aligned_buffer/slice.rs | 40 ++ .../owned_buffers_io/io_buf_aligned.rs | 9 + .../owned_buffers_io/io_buf_ext.rs | 3 + 22 files changed, 899 insertions(+), 79 deletions(-) create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 821c8008a9..d98b23acce 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -164,7 +164,11 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init(16384, virtual_file::io_engine_for_bench()); + virtual_file::init( + 16384, + virtual_file::io_engine_for_bench(), + conf.virtual_file_io_mode, + ); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 151b94cf62..7dd2a5d05c 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -7,6 +7,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use 
std::collections::BinaryHeap; use std::ops::Range; @@ -152,7 +153,11 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index fd948bf2ef..c0b2b6ae89 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -11,6 +11,7 @@ use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; use pageserver::tenant::storage_layer::{delta_layer, image_layer}; use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; +use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver::{ repository::{Key, KEY_SIZE}, @@ -59,7 +60,11 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -190,7 +195,11 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + pageserver::virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index c96664d346..f506caec5b 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -24,7 +24,7 @@ use pageserver::{ page_cache, task_mgr::TaskKind, tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, - virtual_file, + virtual_file::{self, api::IoMode}, }; use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; @@ -205,7 +205,11 @@ fn read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), + ); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index f71a3d2653..c6659345f9 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -167,7 +167,11 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); + virtual_file::init( + 
conf.max_file_descriptors, + conf.virtual_file_io_engine, + conf.virtual_file_io_mode, + ); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/page_cache.rs b/pageserver/src/page_cache.rs index f386c825b8..45bf02362a 100644 --- a/pageserver/src/page_cache.rs +++ b/pageserver/src/page_cache.rs @@ -82,6 +82,7 @@ use once_cell::sync::OnceCell; use crate::{ context::RequestContext, metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics}, + virtual_file::{IoBufferMut, IoPageSlice}, }; static PAGE_CACHE: OnceCell = OnceCell::new(); @@ -144,7 +145,7 @@ struct SlotInner { key: Option, // for `coalesce_readers_permit` permit: std::sync::Mutex>, - buf: &'static mut [u8; PAGE_SZ], + buf: IoPageSlice<'static>, } impl Slot { @@ -234,13 +235,13 @@ impl std::ops::Deref for PageReadGuard<'_> { type Target = [u8; PAGE_SZ]; fn deref(&self) -> &Self::Target { - self.slot_guard.buf + self.slot_guard.buf.deref() } } impl AsRef<[u8; PAGE_SZ]> for PageReadGuard<'_> { fn as_ref(&self) -> &[u8; PAGE_SZ] { - self.slot_guard.buf + self.slot_guard.buf.as_ref() } } @@ -266,7 +267,7 @@ enum PageWriteGuardState<'i> { impl std::ops::DerefMut for PageWriteGuard<'_> { fn deref_mut(&mut self) -> &mut Self::Target { match &mut self.state { - PageWriteGuardState::Invalid { inner, _permit } => inner.buf, + PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref_mut(), PageWriteGuardState::Downgraded => unreachable!(), } } @@ -277,7 +278,7 @@ impl std::ops::Deref for PageWriteGuard<'_> { fn deref(&self) -> &Self::Target { match &self.state { - PageWriteGuardState::Invalid { inner, _permit } => inner.buf, + PageWriteGuardState::Invalid { inner, _permit } => inner.buf.deref(), PageWriteGuardState::Downgraded => unreachable!(), } } @@ -643,7 +644,7 @@ impl PageCache { // We could use Vec::leak here, but that potentially also leaks // uninitialized reserved capacity. With into_boxed_slice and Box::leak // this is avoided. - let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice()); + let page_buffer = IoBufferMut::with_capacity_zeroed(num_pages * PAGE_SZ).leak(); let size_metrics = &crate::metrics::PAGE_CACHE_SIZE; size_metrics.max_bytes.set_page_sz(num_pages); @@ -652,7 +653,8 @@ impl PageCache { let slots = page_buffer .chunks_exact_mut(PAGE_SZ) .map(|chunk| { - let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap(); + // SAFETY: Each chunk has `PAGE_SZ` (8192) bytes, greater than 512, still aligned. 
+ let buf = unsafe { IoPageSlice::new_unchecked(chunk.try_into().unwrap()) }; Slot { inner: tokio::sync::RwLock::new(SlotInner { diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 1c82e5454d..2bd7f2d619 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -5,6 +5,8 @@ use super::storage_layer::delta_layer::{Adapter, DeltaLayerInner}; use crate::context::RequestContext; use crate::page_cache::{self, FileId, PageReadGuard, PageWriteGuard, ReadBufResult, PAGE_SZ}; +#[cfg(test)] +use crate::virtual_file::IoBufferMut; use crate::virtual_file::VirtualFile; use bytes::Bytes; use std::ops::Deref; @@ -40,7 +42,7 @@ pub enum BlockLease<'a> { #[cfg(test)] Arc(std::sync::Arc<[u8; PAGE_SZ]>), #[cfg(test)] - Vec(Vec), + IoBufferMut(IoBufferMut), } impl From> for BlockLease<'static> { @@ -67,7 +69,7 @@ impl Deref for BlockLease<'_> { #[cfg(test)] BlockLease::Arc(v) => v.deref(), #[cfg(test)] - BlockLease::Vec(v) => { + BlockLease::IoBufferMut(v) => { TryFrom::try_from(&v[..]).expect("caller must ensure that v has PAGE_SZ") } } diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index a62a47f9a7..de0abab4c0 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -6,10 +6,11 @@ use crate::config::PageServerConf; use crate::context::RequestContext; use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; +use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; -use crate::virtual_file::{self, owned_buffers_io, VirtualFile}; +use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; @@ -107,15 +108,18 @@ impl EphemeralFile { self.page_cache_file_id } - pub(crate) async fn load_to_vec(&self, ctx: &RequestContext) -> Result, io::Error> { + pub(crate) async fn load_to_io_buf( + &self, + ctx: &RequestContext, + ) -> Result { let size = self.len().into_usize(); - let vec = Vec::with_capacity(size); - let (slice, nread) = self.read_exact_at_eof_ok(0, vec.slice_full(), ctx).await?; + let buf = IoBufferMut::with_capacity(size); + let (slice, nread) = self.read_exact_at_eof_ok(0, buf.slice_full(), ctx).await?; assert_eq!(nread, size); - let vec = slice.into_inner(); - assert_eq!(vec.len(), nread); - assert_eq!(vec.capacity(), size, "we shouldn't be reallocating"); - Ok(vec) + let buf = slice.into_inner(); + assert_eq!(buf.len(), nread); + assert_eq!(buf.capacity(), size, "we shouldn't be reallocating"); + Ok(buf) } /// Returns the offset at which the first byte of the input was written, for use @@ -158,7 +162,7 @@ impl EphemeralFile { } impl super::storage_layer::inmemory_layer::vectored_dio_read::File for EphemeralFile { - async fn read_exact_at_eof_ok<'a, 'b, B: tokio_epoll_uring::IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: tokio_epoll_uring::Slice, @@ -345,7 +349,7 @@ mod tests { assert!(file.len() as usize == write_nbytes); for i in 0..write_nbytes { assert_eq!(value_offsets[i], i.into_u64()); - let buf = Vec::with_capacity(1); + let buf = IoBufferMut::with_capacity(1); let (buf_slice, nread) = file .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), 
&ctx) .await @@ -385,7 +389,7 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( - &file.load_to_vec(&ctx).await.unwrap(), + &file.load_to_io_buf(&ctx).await.unwrap(), &content[0..cap + cap / 2] ); let md = file @@ -440,7 +444,7 @@ mod tests { let (buf, nread) = file .read_exact_at_eof_ok( start.into_u64(), - Vec::with_capacity(len).slice_full(), + IoBufferMut::with_capacity(len).slice_full(), ctx, ) .await diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6332d36dc3..ceae1d4b1a 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -44,11 +44,11 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::BytesMut; use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; @@ -1002,7 +1002,7 @@ impl DeltaLayerInner { .0 .into(); let buf_size = Self::get_min_read_buffer_size(&reads, max_vectored_read_bytes); - let mut buf = Some(BytesMut::with_capacity(buf_size)); + let mut buf = Some(IoBufferMut::with_capacity(buf_size)); // Note that reads are processed in reverse order (from highest key+lsn). // This is the order that `ReconstructState` requires such that it can @@ -1029,7 +1029,7 @@ impl DeltaLayerInner { // We have "lost" the buffer since the lower level IO api // doesn't return the buffer on error. Allocate a new one. 
- buf = Some(BytesMut::with_capacity(buf_size)); + buf = Some(IoBufferMut::with_capacity(buf_size)); continue; } @@ -1203,7 +1203,7 @@ impl DeltaLayerInner { .map(|x| x.0.get()) .unwrap_or(8192); - let mut buffer = Some(BytesMut::with_capacity(max_read_size)); + let mut buffer = Some(IoBufferMut::with_capacity(max_read_size)); // FIXME: buffering of DeltaLayerWriter let mut per_blob_copy = Vec::new(); @@ -1561,12 +1561,11 @@ impl<'a> DeltaLayerIterator<'a> { let vectored_blob_reader = VectoredBlobReader::new(&self.delta_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let blob_read = meta.read(&view).await?; let value = Value::des(&blob_read)?; @@ -1941,7 +1940,7 @@ pub(crate) mod test { &vectored_reads, constants::MAX_VECTORED_READ_BYTES, ); - let mut buf = Some(BytesMut::with_capacity(buf_size)); + let mut buf = Some(IoBufferMut::with_capacity(buf_size)); for read in vectored_reads { let blobs_buf = vectored_blob_reader diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index b1f2557038..fa058833d4 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -41,10 +41,11 @@ use crate::tenant::vectored_blob_io::{ }; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{anyhow, bail, ensure, Context, Result}; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; @@ -547,10 +548,10 @@ impl ImageLayerInner { for read in plan.into_iter() { let buf_size = read.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader.read_blobs(&read, buf, ctx).await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await?; @@ -609,13 +610,12 @@ impl ImageLayerInner { } } - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let res = vectored_blob_reader.read_blobs(&read, buf, ctx).await; match res { Ok(blobs_buf) => { - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view = BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await; @@ -1069,12 +1069,11 @@ impl<'a> ImageLayerIterator<'a> { let vectored_blob_reader = VectoredBlobReader::new(&self.image_layer.file); let mut next_batch = std::collections::VecDeque::new(); let buf_size = plan.size(); - let buf = BytesMut::with_capacity(buf_size); + let buf = IoBufferMut::with_capacity(buf_size); let blobs_buf = vectored_blob_reader .read_blobs(&plan, buf, self.ctx) .await?; - let frozen_buf = blobs_buf.buf.freeze(); - let view = BufView::new_bytes(frozen_buf); + let view 
= BufView::new_slice(&blobs_buf.buf); for meta in blobs_buf.blobs.iter() { let img_buf = meta.read(&view).await?; next_batch.push_back(( diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index e487bee1f2..7573ddb5cc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -14,7 +14,6 @@ use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Context, Result}; -use bytes::Bytes; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::keyspace::KeySpace; @@ -809,9 +808,8 @@ impl InMemoryLayer { match l0_flush_global_state { l0_flush::Inner::Direct { .. } => { - let file_contents: Vec = inner.file.load_to_vec(ctx).await?; - - let file_contents = Bytes::from(file_contents); + let file_contents = inner.file.load_to_io_buf(ctx).await?; + let file_contents = file_contents.freeze(); for (key, vec_map) in inner.index.iter() { // Write all page versions @@ -825,7 +823,7 @@ impl InMemoryLayer { len, will_init, } = entry; - let buf = Bytes::slice(&file_contents, pos as usize..(pos + len) as usize); + let buf = file_contents.slice(pos as usize..(pos + len) as usize); let (_buf, res) = delta_layer_writer .put_value_bytes( Key::from_compact(*key), diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs index 0683e15659..a4bb3a6bfc 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer/vectored_dio_read.rs @@ -9,6 +9,7 @@ use tokio_epoll_uring::{BoundedBuf, IoBufMut, Slice}; use crate::{ assert_u64_eq_usize::{U64IsUsize, UsizeIsU64}, context::RequestContext, + virtual_file::{owned_buffers_io::io_buf_aligned::IoBufAlignedMut, IoBufferMut}, }; /// The file interface we require. At runtime, this is a [`crate::tenant::ephemeral_file::EphemeralFile`]. @@ -24,7 +25,7 @@ pub trait File: Send { /// [`std::io::ErrorKind::UnexpectedEof`] error if the file is shorter than `start+dst.len()`. /// /// No guarantees are made about the remaining bytes in `dst` in case of a short read. 
- async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: Slice, @@ -227,7 +228,7 @@ where // Execute physical reads and fill the logical read buffers // TODO: pipelined reads; prefetch; - let get_io_buffer = |nchunks| Vec::with_capacity(nchunks * DIO_CHUNK_SIZE); + let get_io_buffer = |nchunks| IoBufferMut::with_capacity(nchunks * DIO_CHUNK_SIZE); for PhysicalRead { start_chunk_no, nchunks, @@ -459,7 +460,7 @@ mod tests { let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error); let file = InMemoryFile::new_random(10); let test_read = |pos, len| { - let buf = vec![0; len]; + let buf = IoBufferMut::with_capacity_zeroed(len); let fut = file.read_exact_at_eof_ok(pos, buf.slice_full(), &ctx); use futures::FutureExt; let (slice, nread) = fut @@ -470,9 +471,9 @@ mod tests { buf.truncate(nread); buf }; - assert_eq!(test_read(0, 1), &file.content[0..1]); - assert_eq!(test_read(1, 2), &file.content[1..3]); - assert_eq!(test_read(9, 2), &file.content[9..]); + assert_eq!(&test_read(0, 1), &file.content[0..1]); + assert_eq!(&test_read(1, 2), &file.content[1..3]); + assert_eq!(&test_read(9, 2), &file.content[9..]); assert!(test_read(10, 2).is_empty()); assert!(test_read(11, 2).is_empty()); } @@ -609,7 +610,7 @@ mod tests { } impl<'x> File for RecorderFile<'x> { - async fn read_exact_at_eof_ok<'a, 'b, B: IoBufMut + Send>( + async fn read_exact_at_eof_ok<'a, 'b, B: IoBufAlignedMut + Send>( &'b self, start: u64, dst: Slice, @@ -782,7 +783,7 @@ mod tests { 2048, 1024 => Err("foo".to_owned()), }; - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(0, buf.slice_full(), &ctx) .await @@ -790,7 +791,7 @@ mod tests { assert_eq!(nread, 512); assert_eq!(&buf.into_inner()[..nread], &[0; 512]); - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(512, buf.slice_full(), &ctx) .await @@ -798,7 +799,7 @@ mod tests { assert_eq!(nread, 512); assert_eq!(&buf.into_inner()[..nread], &[1; 512]); - let buf = Vec::with_capacity(512); + let buf = IoBufferMut::with_capacity(512); let (buf, nread) = mock_file .read_exact_at_eof_ok(1024, buf.slice_full(), &ctx) .await @@ -806,7 +807,7 @@ mod tests { assert_eq!(nread, 10); assert_eq!(&buf.into_inner()[..nread], &[2; 10]); - let buf = Vec::with_capacity(1024); + let buf = IoBufferMut::with_capacity(1024); let err = mock_file .read_exact_at_eof_ok(2048, buf.slice_full(), &ctx) .await diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 0c03791034..dfe2352310 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -18,7 +18,7 @@ use std::collections::BTreeMap; use std::ops::Deref; -use bytes::{Bytes, BytesMut}; +use bytes::Bytes; use pageserver_api::key::Key; use tokio::io::AsyncWriteExt; use tokio_epoll_uring::BoundedBuf; @@ -27,6 +27,7 @@ use utils::vec_map::VecMap; use crate::context::RequestContext; use crate::tenant::blob_io::{BYTE_UNCOMPRESSED, BYTE_ZSTD, LEN_COMPRESSION_BIT_MASK}; +use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, VirtualFile}; /// Metadata bundled with the start and end offset of a blob. 
@@ -158,7 +159,7 @@ impl std::fmt::Display for VectoredBlob { /// Return type of [`VectoredBlobReader::read_blobs`] pub struct VectoredBlobsBuf { /// Buffer for all blobs in this read - pub buf: BytesMut, + pub buf: IoBufferMut, /// Offsets into the buffer and metadata for all blobs in this read pub blobs: Vec, } @@ -441,7 +442,7 @@ impl<'a> VectoredBlobReader<'a> { pub async fn read_blobs( &self, read: &VectoredRead, - buf: BytesMut, + buf: IoBufferMut, ctx: &RequestContext, ) -> Result { assert!(read.size() > 0); @@ -916,7 +917,7 @@ mod tests { // Multiply by two (compressed data might need more space), and add a few bytes for the header let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; - let mut buf = BytesMut::with_capacity(reserved_bytes); + let mut buf = IoBufferMut::with_capacity(reserved_bytes); let vectored_blob_reader = VectoredBlobReader::new(&file); let meta = BlobMeta { diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 5a364b7aaf..daa8b99ab0 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -18,6 +18,9 @@ use crate::page_cache::{PageWriteGuard, PAGE_SZ}; use crate::tenant::TENANTS_SEGMENT_NAME; use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; +use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; +use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; +use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -55,6 +58,8 @@ pub(crate) mod owned_buffers_io { //! but for the time being we're proving out the primitives in the neon.git repo //! for faster iteration. + pub(crate) mod aligned_buffer; + pub(crate) mod io_buf_aligned; pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; @@ -196,7 +201,7 @@ impl VirtualFile { ctx: &RequestContext, ) -> Result, Error> where - Buf: IoBufMut + Send, + Buf: IoBufAlignedMut + Send, { self.inner.read_exact_at(slice, offset, ctx).await } @@ -771,7 +776,7 @@ impl VirtualFileInner { ctx: &RequestContext, ) -> Result, Error> where - Buf: IoBufMut + Send, + Buf: IoBufAlignedMut + Send, { let assert_we_return_original_bounds = if cfg!(debug_assertions) { Some((slice.stable_ptr() as usize, slice.bytes_total())) @@ -1222,12 +1227,14 @@ impl VirtualFileInner { ctx: &RequestContext, ) -> Result, std::io::Error> { use crate::page_cache::PAGE_SZ; - let slice = Vec::with_capacity(PAGE_SZ).slice_full(); + let slice = IoBufferMut::with_capacity(PAGE_SZ).slice_full(); assert_eq!(slice.bytes_total(), PAGE_SZ); let slice = self .read_exact_at(slice, blknum as u64 * (PAGE_SZ as u64), ctx) .await?; - Ok(crate::tenant::block_io::BlockLease::Vec(slice.into_inner())) + Ok(crate::tenant::block_io::BlockLease::IoBufferMut( + slice.into_inner(), + )) } async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { @@ -1325,10 +1332,11 @@ impl OpenFiles { /// server startup. 
/// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind) { +pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } + set_io_mode(mode); io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1357,6 +1365,11 @@ pub(crate) const fn get_io_buffer_alignment() -> usize { DEFAULT_IO_BUFFER_ALIGNMENT } +pub(crate) type IoBufferMut = AlignedBufferMut>; +pub(crate) type IoBuffer = AlignedBuffer>; +pub(crate) type IoPageSlice<'a> = + AlignedSlice<'a, PAGE_SZ, ConstAlign<{ get_io_buffer_alignment() }>>; + static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); pub(crate) fn set_io_mode(mode: IoMode) { @@ -1395,10 +1408,10 @@ mod tests { impl MaybeVirtualFile { async fn read_exact_at( &self, - mut slice: tokio_epoll_uring::Slice>, + mut slice: tokio_epoll_uring::Slice, offset: u64, ctx: &RequestContext, - ) -> Result>, Error> { + ) -> Result, Error> { match self { MaybeVirtualFile::VirtualFile(file) => file.read_exact_at(slice, offset, ctx).await, MaybeVirtualFile::File(file) => { @@ -1466,12 +1479,13 @@ mod tests { len: usize, ctx: &RequestContext, ) -> Result { - let slice = Vec::with_capacity(len).slice_full(); + let slice = IoBufferMut::with_capacity(len).slice_full(); assert_eq!(slice.bytes_total(), len); let slice = self.read_exact_at(slice, pos, ctx).await?; - let vec = slice.into_inner(); - assert_eq!(vec.len(), len); - Ok(String::from_utf8(vec).unwrap()) + let buf = slice.into_inner(); + assert_eq!(buf.len(), len); + + Ok(String::from_utf8(buf.to_vec()).unwrap()) } } @@ -1695,7 +1709,7 @@ mod tests { let files = files.clone(); let ctx = ctx.detached_child(TaskKind::UnitTest, DownloadBehavior::Error); let hdl = rt.spawn(async move { - let mut buf = vec![0u8; SIZE]; + let mut buf = IoBufferMut::with_capacity_zeroed(SIZE); let mut rng = rand::rngs::OsRng; for _ in 1..1000 { let f = &files[rng.gen_range(0..files.len())]; @@ -1704,7 +1718,7 @@ mod tests { .await .unwrap() .into_inner(); - assert!(buf == SAMPLE); + assert!(buf[..] == SAMPLE); } }); hdls.push(hdl); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs new file mode 100644 index 0000000000..8ffc29b93d --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer.rs @@ -0,0 +1,9 @@ +pub mod alignment; +pub mod buffer; +pub mod buffer_mut; +pub mod raw; +pub mod slice; + +pub use alignment::*; +pub use buffer_mut::AlignedBufferMut; +pub use slice::AlignedSlice; diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs new file mode 100644 index 0000000000..933b78a13b --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -0,0 +1,26 @@ +pub trait Alignment: std::marker::Unpin + 'static { + /// Returns the required alignments. + fn align(&self) -> usize; +} + +/// Alignment at compile time. +#[derive(Debug)] +pub struct ConstAlign; + +impl Alignment for ConstAlign { + fn align(&self) -> usize { + A + } +} + +/// Alignment at run time. 
+#[derive(Debug)] +pub struct RuntimeAlign { + align: usize, +} + +impl Alignment for RuntimeAlign { + fn align(&self) -> usize { + self.align + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs new file mode 100644 index 0000000000..2fba6d699b --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -0,0 +1,124 @@ +use std::{ + ops::{Deref, Range, RangeBounds}, + sync::Arc, +}; + +use super::{alignment::Alignment, raw::RawAlignedBuffer}; + +/// An shared, immutable aligned buffer type. +pub struct AlignedBuffer { + /// Shared raw buffer. + raw: Arc>, + /// Range that specifies the current slice. + range: Range, +} + +impl AlignedBuffer { + /// Creates an immutable `IoBuffer` from the raw buffer + pub(super) fn from_raw(raw: RawAlignedBuffer, range: Range) -> Self { + AlignedBuffer { + raw: Arc::new(raw), + range, + } + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.range.len() + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.raw.align() + } + + #[inline] + fn as_ptr(&self) -> *const u8 { + // SAFETY: `self.range.start` is guaranteed to be within [0, self.len()). + unsafe { self.raw.as_ptr().add(self.range.start) } + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + fn as_slice(&self) -> &[u8] { + &self.raw.as_slice()[self.range.start..self.range.end] + } + + /// Returns a slice of self for the index range `[begin..end)`. + pub fn slice(&self, range: impl RangeBounds) -> Self { + use core::ops::Bound; + let len = self.len(); + + let begin = match range.start_bound() { + Bound::Included(&n) => n, + Bound::Excluded(&n) => n.checked_add(1).expect("out of range"), + Bound::Unbounded => 0, + }; + + let end = match range.end_bound() { + Bound::Included(&n) => n.checked_add(1).expect("out of range"), + Bound::Excluded(&n) => n, + Bound::Unbounded => len, + }; + + assert!( + begin <= end, + "range start must not be greater than end: {:?} <= {:?}", + begin, + end, + ); + assert!( + end <= len, + "range end out of bounds: {:?} <= {:?}", + end, + len, + ); + + let begin = self.range.start + begin; + let end = self.range.start + end; + + AlignedBuffer { + raw: Arc::clone(&self.raw), + range: begin..end, + } + } +} + +impl Deref for AlignedBuffer { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl AsRef<[u8]> for AlignedBuffer { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl PartialEq<[u8]> for AlignedBuffer { + fn eq(&self, other: &[u8]) -> bool { + self.as_slice().eq(other) + } +} + +/// SAFETY: the underlying buffer references a stable memory region. 
+unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { + fn stable_ptr(&self) -> *const u8 { + self.as_ptr() + } + + fn bytes_init(&self) -> usize { + self.len() + } + + fn bytes_total(&self) -> usize { + self.len() + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs new file mode 100644 index 0000000000..b3675d1aea --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -0,0 +1,347 @@ +use std::ops::{Deref, DerefMut}; + +use super::{ + alignment::{Alignment, ConstAlign}, + buffer::AlignedBuffer, + raw::RawAlignedBuffer, +}; + +/// A mutable aligned buffer type. +#[derive(Debug)] +pub struct AlignedBufferMut { + raw: RawAlignedBuffer, +} + +impl AlignedBufferMut> { + /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. + /// + /// The buffer will be able to hold at most `capacity` elements and will never resize. + /// + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: + /// * `align` must not be zero, + /// + /// * `align` must be a power of two, + /// + /// * `capacity`, when rounded up to the nearest multiple of `align`, + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). + pub fn with_capacity(capacity: usize) -> Self { + AlignedBufferMut { + raw: RawAlignedBuffer::with_capacity(capacity), + } + } + + /// Constructs a new `IoBufferMut` with at least the specified capacity and alignment, filled with zeros. + pub fn with_capacity_zeroed(capacity: usize) -> Self { + use bytes::BufMut; + let mut buf = Self::with_capacity(capacity); + buf.put_bytes(0, capacity); + // SAFETY: `put_bytes` filled the entire buffer. + unsafe { buf.set_len(capacity) }; + buf + } +} + +impl AlignedBufferMut { + /// Returns the total number of bytes the buffer can hold. + #[inline] + pub fn capacity(&self) -> usize { + self.raw.capacity() + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.raw.align() + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.raw.len() + } + + /// Force the length of the buffer to `new_len`. + #[inline] + unsafe fn set_len(&mut self, new_len: usize) { + self.raw.set_len(new_len) + } + + #[inline] + fn as_ptr(&self) -> *const u8 { + self.raw.as_ptr() + } + + #[inline] + fn as_mut_ptr(&mut self) -> *mut u8 { + self.raw.as_mut_ptr() + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + fn as_slice(&self) -> &[u8] { + self.raw.as_slice() + } + + /// Extracts a mutable slice of the entire buffer. + /// + /// Equivalent to `&mut s[..]`. + fn as_mut_slice(&mut self) -> &mut [u8] { + self.raw.as_mut_slice() + } + + /// Drops the all the contents of the buffer, setting its length to `0`. + #[inline] + pub fn clear(&mut self) { + self.raw.clear() + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `IoBufferMut`. The collection may reserve more space to + /// speculatively avoid frequent reallocations. After calling `reserve`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if capacity is already sufficient. 
+ /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_. + pub fn reserve(&mut self, additional: usize) { + self.raw.reserve(additional); + } + + /// Shortens the buffer, keeping the first len bytes. + pub fn truncate(&mut self, len: usize) { + self.raw.truncate(len); + } + + /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. + pub fn leak<'a>(self) -> &'a mut [u8] { + self.raw.leak() + } + + pub fn freeze(self) -> AlignedBuffer { + let len = self.len(); + AlignedBuffer::from_raw(self.raw, 0..len) + } +} + +impl Deref for AlignedBufferMut { + type Target = [u8]; + + fn deref(&self) -> &Self::Target { + self.as_slice() + } +} + +impl DerefMut for AlignedBufferMut { + fn deref_mut(&mut self) -> &mut Self::Target { + self.as_mut_slice() + } +} + +impl AsRef<[u8]> for AlignedBufferMut { + fn as_ref(&self) -> &[u8] { + self.as_slice() + } +} + +impl AsMut<[u8]> for AlignedBufferMut { + fn as_mut(&mut self) -> &mut [u8] { + self.as_mut_slice() + } +} + +impl PartialEq<[u8]> for AlignedBufferMut { + fn eq(&self, other: &[u8]) -> bool { + self.as_slice().eq(other) + } +} + +/// SAFETY: When advancing the internal cursor, the caller needs to make sure the bytes advcanced past have been initialized. +unsafe impl bytes::BufMut for AlignedBufferMut { + #[inline] + fn remaining_mut(&self) -> usize { + // Although a `Vec` can have at most isize::MAX bytes, we never want to grow `IoBufferMut`. + // Thus, it can have at most `self.capacity` bytes. + self.capacity() - self.len() + } + + // SAFETY: Caller needs to make sure the bytes being advanced past have been initialized. + #[inline] + unsafe fn advance_mut(&mut self, cnt: usize) { + let len = self.len(); + let remaining = self.remaining_mut(); + + if remaining < cnt { + panic_advance(cnt, remaining); + } + + // Addition will not overflow since the sum is at most the capacity. + self.set_len(len + cnt); + } + + #[inline] + fn chunk_mut(&mut self) -> &mut bytes::buf::UninitSlice { + let cap = self.capacity(); + let len = self.len(); + + // SAFETY: Since `self.ptr` is valid for `cap` bytes, `self.ptr.add(len)` must be + // valid for `cap - len` bytes. The subtraction will not underflow since + // `len <= cap`. + unsafe { + bytes::buf::UninitSlice::from_raw_parts_mut(self.as_mut_ptr().add(len), cap - len) + } + } +} + +/// Panic with a nice error message. +#[cold] +fn panic_advance(idx: usize, len: usize) -> ! { + panic!( + "advance out of bounds: the len is {} but advancing by {}", + len, idx + ); +} + +/// Safety: [`AlignedBufferMut`] has exclusive ownership of the io buffer, +/// and the underlying pointer remains stable while io-uring is owning the buffer. +/// The tokio-epoll-uring crate itself will not resize the buffer and will respect +/// [`tokio_epoll_uring::IoBuf::bytes_total`]. +unsafe impl tokio_epoll_uring::IoBuf for AlignedBufferMut { + fn stable_ptr(&self) -> *const u8 { + self.as_ptr() + } + + fn bytes_init(&self) -> usize { + self.len() + } + + fn bytes_total(&self) -> usize { + self.capacity() + } +} + +// SAFETY: See above. 
+unsafe impl tokio_epoll_uring::IoBufMut for AlignedBufferMut { + fn stable_mut_ptr(&mut self) -> *mut u8 { + self.as_mut_ptr() + } + + unsafe fn set_init(&mut self, init_len: usize) { + if self.len() < init_len { + self.set_len(init_len); + } + } +} + +#[cfg(test)] +mod tests { + + use super::*; + + const ALIGN: usize = 4 * 1024; + type TestIoBufferMut = AlignedBufferMut>; + + #[test] + fn test_with_capacity() { + let v = TestIoBufferMut::with_capacity(ALIGN * 4); + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + + let v = TestIoBufferMut::with_capacity(ALIGN / 2); + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN / 2); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } + + #[test] + fn test_with_capacity_zeroed() { + let v = TestIoBufferMut::with_capacity_zeroed(ALIGN); + assert_eq!(v.len(), ALIGN); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + assert_eq!(&v[..], &[0; ALIGN]) + } + + #[test] + fn test_reserve() { + use bytes::BufMut; + let mut v = TestIoBufferMut::with_capacity(ALIGN); + let capacity = v.capacity(); + v.reserve(capacity); + assert_eq!(v.capacity(), capacity); + let data = [b'a'; ALIGN]; + v.put(&data[..]); + v.reserve(capacity); + assert!(v.capacity() >= capacity * 2); + assert_eq!(&v[..], &data[..]); + let capacity = v.capacity(); + v.clear(); + v.reserve(capacity); + assert_eq!(capacity, v.capacity()); + } + + #[test] + fn test_bytes_put() { + use bytes::BufMut; + let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); + let x = [b'a'; ALIGN]; + + for _ in 0..2 { + for _ in 0..4 { + v.put(&x[..]); + } + assert_eq!(v.len(), ALIGN * 4); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + v.clear() + } + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN * 4); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } + + #[test] + #[should_panic] + fn test_bytes_put_panic() { + use bytes::BufMut; + const ALIGN: usize = 4 * 1024; + let mut v = TestIoBufferMut::with_capacity(ALIGN * 4); + let x = [b'a'; ALIGN]; + for _ in 0..5 { + v.put_slice(&x[..]); + } + } + + #[test] + fn test_io_buf_put_slice() { + use tokio_epoll_uring::BoundedBufMut; + const ALIGN: usize = 4 * 1024; + let mut v = TestIoBufferMut::with_capacity(ALIGN); + let x = [b'a'; ALIGN]; + + for _ in 0..2 { + v.put_slice(&x[..]); + assert_eq!(v.len(), ALIGN); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + v.clear() + } + assert_eq!(v.len(), 0); + assert_eq!(v.capacity(), ALIGN); + assert_eq!(v.align(), ALIGN); + assert_eq!(v.as_ptr().align_offset(ALIGN), 0); + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs new file mode 100644 index 0000000000..6c26dec0db --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/raw.rs @@ -0,0 +1,216 @@ +use core::slice; +use std::{ + alloc::{self, Layout}, + cmp, + mem::ManuallyDrop, +}; + +use super::alignment::{Alignment, ConstAlign}; + +#[derive(Debug)] +struct AlignedBufferPtr(*mut u8); + +// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. 
+unsafe impl Send for AlignedBufferPtr {} + +// SAFETY: We gurantees no one besides `IoBufferPtr` itself has the raw pointer. +unsafe impl Sync for AlignedBufferPtr {} + +/// An aligned buffer type. +#[derive(Debug)] +pub struct RawAlignedBuffer { + ptr: AlignedBufferPtr, + capacity: usize, + len: usize, + align: A, +} + +impl RawAlignedBuffer> { + /// Constructs a new, empty `IoBufferMut` with at least the specified capacity and alignment. + /// + /// The buffer will be able to hold at most `capacity` elements and will never resize. + /// + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_, or if the following alignment requirement is not met: + /// * `align` must not be zero, + /// + /// * `align` must be a power of two, + /// + /// * `capacity`, when rounded up to the nearest multiple of `align`, + /// must not overflow isize (i.e., the rounded value must be + /// less than or equal to `isize::MAX`). + pub fn with_capacity(capacity: usize) -> Self { + let align = ConstAlign::; + let layout = Layout::from_size_align(capacity, align.align()).expect("Invalid layout"); + + // SAFETY: Making an allocation with a sized and aligned layout. The memory is manually freed with the same layout. + let ptr = unsafe { + let ptr = alloc::alloc(layout); + if ptr.is_null() { + alloc::handle_alloc_error(layout); + } + AlignedBufferPtr(ptr) + }; + + RawAlignedBuffer { + ptr, + capacity, + len: 0, + align, + } + } +} + +impl RawAlignedBuffer { + /// Returns the total number of bytes the buffer can hold. + #[inline] + pub fn capacity(&self) -> usize { + self.capacity + } + + /// Returns the alignment of the buffer. + #[inline] + pub fn align(&self) -> usize { + self.align.align() + } + + /// Returns the number of bytes in the buffer, also referred to as its 'length'. + #[inline] + pub fn len(&self) -> usize { + self.len + } + + /// Force the length of the buffer to `new_len`. + #[inline] + pub unsafe fn set_len(&mut self, new_len: usize) { + debug_assert!(new_len <= self.capacity()); + self.len = new_len; + } + + #[inline] + pub fn as_ptr(&self) -> *const u8 { + self.ptr.0 + } + + #[inline] + pub fn as_mut_ptr(&mut self) -> *mut u8 { + self.ptr.0 + } + + /// Extracts a slice containing the entire buffer. + /// + /// Equivalent to `&s[..]`. + #[inline] + pub fn as_slice(&self) -> &[u8] { + // SAFETY: The pointer is valid and `len` bytes are initialized. + unsafe { slice::from_raw_parts(self.as_ptr(), self.len) } + } + + /// Extracts a mutable slice of the entire buffer. + /// + /// Equivalent to `&mut s[..]`. + pub fn as_mut_slice(&mut self) -> &mut [u8] { + // SAFETY: The pointer is valid and `len` bytes are initialized. + unsafe { slice::from_raw_parts_mut(self.as_mut_ptr(), self.len) } + } + + /// Drops the all the contents of the buffer, setting its length to `0`. + #[inline] + pub fn clear(&mut self) { + self.len = 0; + } + + /// Reserves capacity for at least `additional` more bytes to be inserted + /// in the given `IoBufferMut`. The collection may reserve more space to + /// speculatively avoid frequent reallocations. After calling `reserve`, + /// capacity will be greater than or equal to `self.len() + additional`. + /// Does nothing if capacity is already sufficient. + /// + /// # Panics + /// + /// Panics if the new capacity exceeds `isize::MAX` _bytes_. 
+ pub fn reserve(&mut self, additional: usize) { + if additional > self.capacity() - self.len() { + self.reserve_inner(additional); + } + } + + fn reserve_inner(&mut self, additional: usize) { + let Some(required_cap) = self.len().checked_add(additional) else { + capacity_overflow() + }; + + let old_capacity = self.capacity(); + let align = self.align(); + // This guarantees exponential growth. The doubling cannot overflow + // because `cap <= isize::MAX` and the type of `cap` is `usize`. + let cap = cmp::max(old_capacity * 2, required_cap); + + if !is_valid_alloc(cap) { + capacity_overflow() + } + let new_layout = Layout::from_size_align(cap, self.align()).expect("Invalid layout"); + + let old_ptr = self.as_mut_ptr(); + + // SAFETY: old allocation was allocated with std::alloc::alloc with the same layout, + // and we panics on null pointer. + let (ptr, cap) = unsafe { + let old_layout = Layout::from_size_align_unchecked(old_capacity, align); + let ptr = alloc::realloc(old_ptr, old_layout, new_layout.size()); + if ptr.is_null() { + alloc::handle_alloc_error(new_layout); + } + (AlignedBufferPtr(ptr), cap) + }; + + self.ptr = ptr; + self.capacity = cap; + } + + /// Shortens the buffer, keeping the first len bytes. + pub fn truncate(&mut self, len: usize) { + if len > self.len { + return; + } + self.len = len; + } + + /// Consumes and leaks the `IoBufferMut`, returning a mutable reference to the contents, &'a mut [u8]. + pub fn leak<'a>(self) -> &'a mut [u8] { + let mut buf = ManuallyDrop::new(self); + // SAFETY: leaking the buffer as intended. + unsafe { slice::from_raw_parts_mut(buf.as_mut_ptr(), buf.len) } + } +} + +fn capacity_overflow() -> ! { + panic!("capacity overflow") +} + +// We need to guarantee the following: +// * We don't ever allocate `> isize::MAX` byte-size objects. +// * We don't overflow `usize::MAX` and actually allocate too little. +// +// On 64-bit we just need to check for overflow since trying to allocate +// `> isize::MAX` bytes will surely fail. On 32-bit and 16-bit we need to add +// an extra guard for this in case we're running on a platform which can use +// all 4GB in user-space, e.g., PAE or x32. +#[inline] +fn is_valid_alloc(alloc_size: usize) -> bool { + !(usize::BITS < 64 && alloc_size > isize::MAX as usize) +} + +impl Drop for RawAlignedBuffer { + fn drop(&mut self) { + // SAFETY: memory was allocated with std::alloc::alloc with the same layout. + unsafe { + alloc::dealloc( + self.as_mut_ptr(), + Layout::from_size_align_unchecked(self.capacity, self.align.align()), + ) + } + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs new file mode 100644 index 0000000000..6cecf34c1c --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/slice.rs @@ -0,0 +1,40 @@ +use std::ops::{Deref, DerefMut}; + +use super::alignment::{Alignment, ConstAlign}; + +/// Newtype for an aligned slice. +pub struct AlignedSlice<'a, const N: usize, A: Alignment> { + /// underlying byte slice + buf: &'a mut [u8; N], + /// alignment marker + _align: A, +} + +impl<'a, const N: usize, const A: usize> AlignedSlice<'a, N, ConstAlign> { + /// Create a new aligned slice from a mutable byte slice. The input must already satisify the alignment. 
+ pub unsafe fn new_unchecked(buf: &'a mut [u8; N]) -> Self { + let _align = ConstAlign::; + assert_eq!(buf.as_ptr().align_offset(_align.align()), 0); + AlignedSlice { buf, _align } + } +} + +impl<'a, const N: usize, A: Alignment> Deref for AlignedSlice<'a, N, A> { + type Target = [u8; N]; + + fn deref(&self) -> &Self::Target { + self.buf + } +} + +impl<'a, const N: usize, A: Alignment> DerefMut for AlignedSlice<'a, N, A> { + fn deref_mut(&mut self) -> &mut Self::Target { + self.buf + } +} + +impl<'a, const N: usize, A: Alignment> AsRef<[u8; N]> for AlignedSlice<'a, N, A> { + fn as_ref(&self) -> &[u8; N] { + self.buf + } +} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs new file mode 100644 index 0000000000..dba695196e --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -0,0 +1,9 @@ +use tokio_epoll_uring::IoBufMut; + +use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; + +pub trait IoBufAlignedMut: IoBufMut {} + +impl IoBufAlignedMut for IoBufferMut {} + +impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index 7c773b6b21..c3940cf6ce 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -1,5 +1,6 @@ //! See [`FullSlice`]. +use crate::virtual_file::{IoBuffer, IoBufferMut}; use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; @@ -76,3 +77,5 @@ macro_rules! impl_io_buf_ext { impl_io_buf_ext!(Bytes); impl_io_buf_ext!(BytesMut); impl_io_buf_ext!(Vec); +impl_io_buf_ext!(IoBufferMut); +impl_io_buf_ext!(IoBuffer); From 34b6bd416a8df8cf0d51f707beaca30dbdbe2adc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 21 Oct 2024 17:33:05 +0200 Subject: [PATCH 059/239] offloaded timeline list API (#9461) Add a way to list the offloaded timelines. Before, one had to look at logs to figure out if a timeline has been offloaded or not, or use the non-presence of a certain timeline in the list of normal timelines. Now, one can list them directly. Part of #8088 --- libs/pageserver_api/src/models.rs | 17 +++++++ pageserver/src/http/routes.rs | 80 ++++++++++++++++++++++++++++++- pageserver/src/tenant.rs | 25 +++++++++- 3 files changed, 120 insertions(+), 2 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 5b0b6bebe3..e08bf40801 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -684,6 +684,23 @@ pub struct TimelineArchivalConfigRequest { pub state: TimelineArchivalState, } +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct TimelinesInfoAndOffloaded { + pub timelines: Vec, + pub offloaded: Vec, +} + +/// Analog of [`TimelineInfo`] for offloaded timelines. +#[derive(Debug, Serialize, Deserialize, Clone)] +pub struct OffloadedTimelineInfo { + pub tenant_id: TenantShardId, + pub timeline_id: TimelineId, + /// Whether the timeline has a parent it has been branched off from or not + pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, +} + /// This represents the output of the "timeline_detail" and "timeline_list" API calls. 
#[derive(Debug, Serialize, Deserialize, Clone)] pub struct TimelineInfo { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 8f928fd81b..a254f1683d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -26,6 +26,7 @@ use pageserver_api::models::LocationConfigListResponse; use pageserver_api::models::LocationConfigMode; use pageserver_api::models::LsnLease; use pageserver_api::models::LsnLeaseRequest; +use pageserver_api::models::OffloadedTimelineInfo; use pageserver_api::models::ShardParameters; use pageserver_api::models::TenantDetails; use pageserver_api::models::TenantLocationConfigRequest; @@ -37,6 +38,7 @@ use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; use pageserver_api::models::TimelineArchivalConfigRequest; +use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; use pageserver_api::models::TopTenantShardsResponse; @@ -81,6 +83,7 @@ use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; +use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ @@ -477,6 +480,22 @@ async fn build_timeline_info_common( Ok(info) } +fn build_timeline_offloaded_info(offloaded: &Arc) -> OffloadedTimelineInfo { + let &OffloadedTimeline { + tenant_shard_id, + timeline_id, + ancestor_retain_lsn, + ancestor_timeline_id, + .. + } = offloaded.as_ref(); + OffloadedTimelineInfo { + tenant_id: tenant_shard_id, + timeline_id, + ancestor_retain_lsn, + ancestor_timeline_id, + } +} + // healthcheck handler async fn status_handler( request: Request, @@ -643,7 +662,7 @@ async fn timeline_list_handler( ) .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) .await - .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}") + .context("Failed to build timeline info") .map_err(ApiError::InternalServerError)?; response_data.push(timeline_info); @@ -658,6 +677,62 @@ async fn timeline_list_handler( json_response(StatusCode::OK, response_data) } +async fn timeline_and_offloaded_list_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let include_non_incremental_logical_size: Option = + parse_query_param(&request, "include-non-incremental-logical-size")?; + let force_await_initial_logical_size: Option = + parse_query_param(&request, "force-await-initial-logical-size")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + let state = get_state(&request); + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let response_data = async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + + tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; + + let (timelines, offloadeds) = tenant.list_timelines_and_offloaded(); + + let mut timeline_infos = Vec::with_capacity(timelines.len()); + for timeline in timelines { + let timeline_info = build_timeline_info( + &timeline, + include_non_incremental_logical_size.unwrap_or(false), + 
force_await_initial_logical_size.unwrap_or(false), + &ctx, + ) + .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id)) + .await + .context("Failed to build timeline info") + .map_err(ApiError::InternalServerError)?; + + timeline_infos.push(timeline_info); + } + let offloaded_infos = offloadeds + .into_iter() + .map(|offloaded| build_timeline_offloaded_info(&offloaded)) + .collect::>(); + let res = TimelinesInfoAndOffloaded { + timelines: timeline_infos, + offloaded: offloaded_infos, + }; + Ok::(res) + } + .instrument(info_span!("timeline_and_offloaded_list", + tenant_id = %tenant_shard_id.tenant_id, + shard_id = %tenant_shard_id.shard_slug())) + .await?; + + json_response(StatusCode::OK, response_data) +} + async fn timeline_preserve_initdb_handler( request: Request, _cancel: CancellationToken, @@ -2993,6 +3068,9 @@ pub fn make_router( .get("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_list_handler) }) + .get("/v1/tenant/:tenant_shard_id/timeline_and_offloaded", |r| { + api_handler(r, timeline_and_offloaded_list_handler) + }) .post("/v1/tenant/:tenant_shard_id/timeline", |r| { api_handler(r, timeline_create_handler) }) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1066d165cd..41d21ef041 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1755,7 +1755,7 @@ impl Tenant { } /// Lists timelines the tenant contains. - /// Up to tenant's implementation to omit certain timelines that ar not considered ready for use. + /// It's up to callers to omit certain timelines that are not considered ready for use. pub fn list_timelines(&self) -> Vec> { self.timelines .lock() @@ -1765,6 +1765,29 @@ impl Tenant { .collect() } + /// Lists timelines the tenant manages, including offloaded ones. + /// + /// It's up to callers to omit certain timelines that are not considered ready for use. 
+ pub fn list_timelines_and_offloaded( + &self, + ) -> (Vec>, Vec>) { + let timelines = self + .timelines + .lock() + .unwrap() + .values() + .map(Arc::clone) + .collect(); + let offloaded = self + .timelines_offloaded + .lock() + .unwrap() + .values() + .map(Arc::clone) + .collect(); + (timelines, offloaded) + } + pub fn list_timeline_ids(&self) -> Vec { self.timelines.lock().unwrap().keys().cloned().collect() } From 94369af7825ca26fb5aea805f2c85bdb877ceb74 Mon Sep 17 00:00:00 2001 From: David Gomes Date: Mon, 21 Oct 2024 18:39:30 -0500 Subject: [PATCH 060/239] chore(compute): bumps pg_session_jwt to latest version (#9474) --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 74970696b5..6451e309f0 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -975,8 +975,8 @@ ARG PG_VERSION RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/e642528f429dd3f5403845a50191b78d434b84a6.tar.gz -O pg_session_jwt.tar.gz && \ - echo "1a69210703cc91224785e59a0a67562dd9eed9a0914ac84b11447582ca0d5b93 pg_session_jwt.tar.gz" | sha256sum --check && \ + wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \ + echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release From 1e8e04bb2c9b2cdb17f680c8d0df697289114e17 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Tue, 22 Oct 2024 09:11:36 +0300 Subject: [PATCH 061/239] safekeeper: refactor timeline initialization (#9362) Always do timeline init through atomic rename of temp directory. Add GlobalTimelines::load_temp_timeline which does this, and use it from both pull_timeline and basic timeline creation. Fixes a collection of issues: - previously timeline creation didn't really flushed cfile to disk due to 'nothing to do if state didn't change' check; - even if it did, without tmp dir it is possible to lose the cfile but leave timeline dir in place, making it look corrupted; - tenant directory creation fsync was missing in timeline creation; - pull_timeline is now protected from concurrent both itself and timeline creation; - now global timelines map entry got special CreationInProgress entry type which prevents from anyone getting access to timeline while it is being created (previously one could get access to it, but it was locked during creation, which is valid but confusing if creation failed). 
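For reviewers, the creation flow now follows the usual "reserve in map, build in temp dir, rename, publish or roll back" shape. Below is a minimal, self-contained sketch of that pattern; it is not the safekeeper code: `Registry`, `Entry` and the paths are made-up names, and the fsyncs of the temp dir and its parents that the real code performs before the rename are omitted.

```rust
// Illustrative sketch only: atomic on-disk creation via temp dir + rename,
// with a map entry marking the creation as in progress so concurrent callers back off.
use std::collections::HashMap;
use std::fs;
use std::io;
use std::path::{Path, PathBuf};
use std::sync::Mutex;

enum Entry {
    CreationInProgress,
    Ready(PathBuf),
}

struct Registry {
    entries: Mutex<HashMap<String, Entry>>,
}

impl Registry {
    fn create(&self, id: &str, tmp: &Path, dst: &Path) -> io::Result<()> {
        // Step 1: reserve the slot under the lock; fail if it exists or is being created.
        {
            let mut entries = self.entries.lock().unwrap();
            if entries.contains_key(id) {
                return Err(io::Error::new(io::ErrorKind::AlreadyExists, "already exists"));
            }
            entries.insert(id.to_string(), Entry::CreationInProgress);
        }

        // Step 2: do all I/O outside the lock, in a temp dir, then rename atomically.
        let result = (|| {
            fs::create_dir_all(tmp)?;
            fs::write(tmp.join("control"), b"initial state")?;
            fs::rename(tmp, dst) // atomic on the same filesystem
        })();

        // Step 3: reflect the outcome in the map: either publish or drop the marker.
        let mut entries = self.entries.lock().unwrap();
        match result {
            Ok(()) => {
                entries.insert(id.to_string(), Entry::Ready(dst.to_path_buf()));
                Ok(())
            }
            Err(e) => {
                entries.remove(id);
                Err(e)
            }
        }
    }
}

fn main() -> io::Result<()> {
    let registry = Registry { entries: Mutex::new(HashMap::new()) };
    let base = std::env::temp_dir().join("atomic-create-demo");
    fs::create_dir_all(&base)?;
    let (tmp, dst) = (base.join("tli.tmp"), base.join("tli"));
    let _ = fs::remove_dir_all(&dst); // keep the demo re-runnable
    registry.create("tli", &tmp, &dst)?;
    println!("created {}", dst.display());
    Ok(())
}
```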
fixes #8927 --- safekeeper/src/control_file.rs | 21 +- safekeeper/src/copy_timeline.rs | 11 +- safekeeper/src/pull_timeline.rs | 106 +----- safekeeper/src/receive_wal.rs | 3 +- safekeeper/src/state.rs | 39 +- safekeeper/src/timeline.rs | 91 +---- safekeeper/src/timelines_global_map.rs | 339 +++++++++++------- safekeeper/src/wal_storage.rs | 10 +- .../tests/walproposer_sim/safekeeper.rs | 20 +- 9 files changed, 290 insertions(+), 350 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index 8b252b4ab4..cd82e43780 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -66,22 +66,25 @@ impl FileStorage { }) } - /// Create file storage for a new timeline, but don't persist it yet. - pub fn create_new( - timeline_dir: Utf8PathBuf, + /// Create and reliably persist new control file at given location. + /// + /// Note: we normally call this in temp directory for atomic init, so + /// interested in FileStorage as a result only in tests. + pub async fn create_new( + dir: Utf8PathBuf, conf: &SafeKeeperConf, state: TimelinePersistentState, ) -> Result { // we don't support creating new timelines in offloaded state assert!(matches!(state.eviction_state, EvictionState::Present)); - let store = FileStorage { - timeline_dir, + let mut store = FileStorage { + timeline_dir: dir, no_sync: conf.no_sync, - state, + state: state.clone(), last_persist_at: Instant::now(), }; - + store.persist(&state).await?; Ok(store) } @@ -190,8 +193,6 @@ impl TimelinePersistentState { impl Storage for FileStorage { /// Persists state durably to the underlying storage. - /// - /// For a description, see . async fn persist(&mut self, s: &TimelinePersistentState) -> Result<()> { let _timer = PERSIST_CONTROL_FILE_SECONDS.start_timer(); @@ -269,7 +270,7 @@ mod test { .await .expect("failed to create timeline dir"); let state = TimelinePersistentState::empty(); - let storage = FileStorage::create_new(timeline_dir, conf, state.clone())?; + let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?; Ok((storage, state)) } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 220988c3ce..52b13dc5e3 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -12,10 +12,10 @@ use tracing::{info, warn}; use utils::{id::TenantTimelineId, lsn::Lsn}; use crate::{ - control_file::{FileStorage, Storage}, - pull_timeline::{create_temp_timeline_dir, load_temp_timeline, validate_temp_timeline}, + control_file::FileStorage, state::TimelinePersistentState, timeline::{Timeline, TimelineError, WalResidentTimeline}, + timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup::copy_s3_segments, wal_storage::{wal_file_paths, WalReader}, GlobalTimelines, @@ -149,17 +149,16 @@ pub async fn handle_request(request: Request) -> Result<()> { vec![], request.until_lsn, start_lsn, - ); + )?; new_state.timeline_start_lsn = start_lsn; new_state.peer_horizon_lsn = request.until_lsn; new_state.backup_lsn = new_backup_lsn; - let mut file_storage = FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone())?; - file_storage.persist(&new_state).await?; + FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?; // now we have a ready timeline in a temp directory validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; - load_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; + 
GlobalTimelines::load_temp_timeline(request.destination_ttid, &tli_dir_path, true).await?; Ok(()) } diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c772ae6de7..c7f5165f90 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -1,7 +1,6 @@ use anyhow::{anyhow, bail, Context, Result}; use bytes::Bytes; use camino::Utf8PathBuf; -use camino_tempfile::Utf8TempDir; use chrono::{DateTime, Utc}; use futures::{SinkExt, StreamExt, TryStreamExt}; use postgres_ffi::{XLogFileName, XLogSegNo, PG_TLI}; @@ -9,7 +8,6 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::min, io::{self, ErrorKind}, - sync::Arc, }; use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; use tokio_tar::{Archive, Builder, Header}; @@ -20,7 +18,7 @@ use tokio_util::{ use tracing::{error, info, instrument}; use crate::{ - control_file::{self, CONTROL_FILE_NAME}, + control_file::CONTROL_FILE_NAME, debug_dump, http::{ client::{self, Client}, @@ -28,13 +26,14 @@ use crate::{ }, safekeeper::Term, state::TimelinePersistentState, - timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError, WalResidentTimeline}, + timeline::WalResidentTimeline, + timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup, - wal_storage::{self, open_wal_file, Storage}, - GlobalTimelines, SafeKeeperConf, + wal_storage::open_wal_file, + GlobalTimelines, }; use utils::{ - crashsafe::{durable_rename, fsync_async_opt}, + crashsafe::fsync_async_opt, id::{NodeId, TenantId, TenantTimelineId, TimelineId}, logging::SecretString, lsn::Lsn, @@ -428,100 +427,9 @@ async fn pull_timeline( assert!(status.commit_lsn <= status.flush_lsn); // Finally, load the timeline. - let _tli = load_temp_timeline(conf, ttid, &tli_dir_path).await?; + let _tli = GlobalTimelines::load_temp_timeline(ttid, &tli_dir_path, false).await?; Ok(Response { safekeeper_host: host, }) } - -/// Create temp directory for a new timeline. It needs to be located on the same -/// filesystem as the rest of the timelines. It will be automatically deleted when -/// Utf8TempDir goes out of scope. -pub async fn create_temp_timeline_dir( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, -) -> Result<(Utf8TempDir, Utf8PathBuf)> { - // conf.workdir is usually /storage/safekeeper/data - // will try to transform it into /storage/safekeeper/tmp - let temp_base = conf - .workdir - .parent() - .ok_or(anyhow::anyhow!("workdir has no parent"))? - .join("tmp"); - - tokio::fs::create_dir_all(&temp_base).await?; - - let tli_dir = camino_tempfile::Builder::new() - .suffix("_temptli") - .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) - .tempdir_in(temp_base)?; - - let tli_dir_path = tli_dir.path().to_path_buf(); - - Ok((tli_dir, tli_dir_path)) -} - -/// Do basic validation of a temp timeline, before moving it to the global map. -pub async fn validate_temp_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - path: &Utf8PathBuf, -) -> Result<(Lsn, Lsn)> { - let control_path = path.join("safekeeper.control"); - - let control_store = control_file::FileStorage::load_control_file(control_path)?; - if control_store.server.wal_seg_size == 0 { - bail!("wal_seg_size is not set"); - } - - let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; - - let commit_lsn = control_store.commit_lsn; - let flush_lsn = wal_store.flush_lsn(); - - Ok((commit_lsn, flush_lsn)) -} - -/// Move timeline from a temp directory to the main storage, and load it to the global map. 
-/// -/// This operation is done under a lock to prevent bugs if several concurrent requests are -/// trying to load the same timeline. Note that it doesn't guard against creating the -/// timeline with the same ttid, but no one should be doing this anyway. -pub async fn load_temp_timeline( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - tmp_path: &Utf8PathBuf, -) -> Result> { - // Take a lock to prevent concurrent loadings - let load_lock = GlobalTimelines::loading_lock().await; - let guard = load_lock.lock().await; - - if !matches!(GlobalTimelines::get(ttid), Err(TimelineError::NotFound(_))) { - bail!("timeline already exists, cannot overwrite it") - } - - // Move timeline dir to the correct location - let timeline_path = get_timeline_dir(conf, &ttid); - - info!( - "moving timeline {} from {} to {}", - ttid, tmp_path, timeline_path - ); - tokio::fs::create_dir_all(get_tenant_dir(conf, &ttid.tenant_id)).await?; - // fsync tenant dir creation - fsync_async_opt(&conf.workdir, !conf.no_sync).await?; - durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; - - let tli = GlobalTimelines::load_timeline(&guard, ttid) - .await - .context("Failed to load timeline after copy")?; - - info!( - "loaded timeline {}, flush_lsn={}", - ttid, - tli.get_flush_lsn().await - ); - - Ok(tli) -} diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2a9ca85299..3dbf72298f 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -339,7 +339,8 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> NetworkReader<'a, IO> { }; let tli = GlobalTimelines::create(self.ttid, server_info, Lsn::INVALID, Lsn::INVALID) - .await?; + .await + .context("create timeline")?; tli.wal_residence_guard().await? } _ => { diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 8ae749ded5..8dd873ee77 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -3,7 +3,7 @@ use std::{cmp::max, ops::Deref}; -use anyhow::Result; +use anyhow::{bail, Result}; use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ @@ -13,7 +13,11 @@ use utils::{ use crate::{ control_file, - safekeeper::{AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory}, + safekeeper::{ + AcceptorState, PersistedPeerInfo, PgUuid, ServerInfo, Term, TermHistory, + UNKNOWN_SERVER_VERSION, + }, + timeline::TimelineError, wal_backup_partial::{self}, }; @@ -91,8 +95,24 @@ impl TimelinePersistentState { peers: Vec, commit_lsn: Lsn, local_start_lsn: Lsn, - ) -> TimelinePersistentState { - TimelinePersistentState { + ) -> anyhow::Result { + if server_info.wal_seg_size == 0 { + bail!(TimelineError::UninitializedWalSegSize(*ttid)); + } + + if server_info.pg_version == UNKNOWN_SERVER_VERSION { + bail!(TimelineError::UninitialinzedPgVersion(*ttid)); + } + + if commit_lsn < local_start_lsn { + bail!( + "commit_lsn {} is smaller than local_start_lsn {}", + commit_lsn, + local_start_lsn + ); + } + + Ok(TimelinePersistentState { tenant_id: ttid.tenant_id, timeline_id: ttid.timeline_id, acceptor_state: AcceptorState { @@ -115,24 +135,23 @@ impl TimelinePersistentState { ), partial_backup: wal_backup_partial::State::default(), eviction_state: EvictionState::Present, - } + }) } #[cfg(test)] pub fn empty() -> Self { - use crate::safekeeper::UNKNOWN_SERVER_VERSION; - TimelinePersistentState::new( &TenantTimelineId::empty(), ServerInfo { - pg_version: UNKNOWN_SERVER_VERSION, /* Postgres server version */ - system_id: 0, /* Postgres system identifier 
*/ - wal_seg_size: 0, + pg_version: 17, /* Postgres server version */ + system_id: 0, /* Postgres system identifier */ + wal_seg_size: 16 * 1024 * 1024, }, vec![], Lsn::INVALID, Lsn::INVALID, ) + .unwrap() } } diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index 41b9490088..dd4d161226 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -27,11 +27,11 @@ use utils::{ use storage_broker::proto::SafekeeperTimelineInfo; use storage_broker::proto::TenantTimelineId as ProtoTenantTimelineId; +use crate::control_file; use crate::rate_limit::RateLimiter; use crate::receive_wal::WalReceivers; use crate::safekeeper::{ - AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, ServerInfo, Term, TermLsn, - INVALID_TERM, + AcceptorProposerMessage, ProposerAcceptorMessage, SafeKeeper, Term, TermLsn, }; use crate::send_wal::WalSenders; use crate::state::{EvictionState, TimelineMemState, TimelinePersistentState, TimelineState}; @@ -40,7 +40,6 @@ use crate::timeline_manager::{AtomicStatus, ManagerCtl}; use crate::timelines_set::TimelinesSet; use crate::wal_backup::{self, remote_timeline_path}; use crate::wal_backup_partial::PartialRemoteSegment; -use crate::{control_file, safekeeper::UNKNOWN_SERVER_VERSION}; use crate::metrics::{FullTimelineInfo, WalStorageMetrics, MISC_OPERATION_SECONDS}; use crate::wal_storage::{Storage as wal_storage_iface, WalReader}; @@ -326,44 +325,6 @@ pub struct SharedState { } impl SharedState { - /// Initialize fresh timeline state without persisting anything to disk. - fn create_new( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - state: TimelinePersistentState, - ) -> Result { - if state.server.wal_seg_size == 0 { - bail!(TimelineError::UninitializedWalSegSize(*ttid)); - } - - if state.server.pg_version == UNKNOWN_SERVER_VERSION { - bail!(TimelineError::UninitialinzedPgVersion(*ttid)); - } - - if state.commit_lsn < state.local_start_lsn { - bail!( - "commit_lsn {} is higher than local_start_lsn {}", - state.commit_lsn, - state.local_start_lsn - ); - } - - // We don't want to write anything to disk, because we may have existing timeline there. - // These functions should not change anything on disk. - let timeline_dir = get_timeline_dir(conf, ttid); - let control_store = - control_file::FileStorage::create_new(timeline_dir.clone(), conf, state)?; - let wal_store = - wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; - let sk = SafeKeeper::new(TimelineState::new(control_store), wal_store, conf.my_id)?; - - Ok(Self { - sk: StateSK::Loaded(sk), - peers_info: PeersInfo(vec![]), - wal_removal_on_hold: false, - }) - } - /// Restore SharedState from control file. If file doesn't exist, bails out. fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { let timeline_dir = get_timeline_dir(conf, ttid); @@ -450,6 +411,8 @@ pub enum TimelineError { Cancelled(TenantTimelineId), #[error("Timeline {0} was not found in global map")] NotFound(TenantTimelineId), + #[error("Timeline {0} creation is in progress")] + CreationInProgress(TenantTimelineId), #[error("Timeline {0} exists on disk, but wasn't loaded on startup")] Invalid(TenantTimelineId), #[error("Timeline {0} is already exists")] @@ -514,7 +477,7 @@ pub struct Timeline { impl Timeline { /// Load existing timeline from disk. 
- pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result { + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result> { let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); let shared_state = SharedState::restore(conf, &ttid)?; @@ -528,7 +491,7 @@ impl Timeline { let walreceivers = WalReceivers::new(); let remote_path = remote_timeline_path(&ttid)?; - Ok(Timeline { + Ok(Arc::new(Timeline { ttid, remote_path, commit_lsn_watch_tx, @@ -547,47 +510,7 @@ impl Timeline { wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), - }) - } - - /// Create a new timeline, which is not yet persisted to disk. - pub fn create_empty( - conf: &SafeKeeperConf, - ttid: TenantTimelineId, - server_info: ServerInfo, - commit_lsn: Lsn, - local_start_lsn: Lsn, - ) -> Result { - let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(Lsn::INVALID); - let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = - watch::channel(TermLsn::from((INVALID_TERM, Lsn::INVALID))); - let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); - - let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); - - let walreceivers = WalReceivers::new(); - let remote_path = remote_timeline_path(&ttid)?; - Ok(Timeline { - ttid, - remote_path, - commit_lsn_watch_tx, - commit_lsn_watch_rx, - term_flush_lsn_watch_tx, - term_flush_lsn_watch_rx, - shared_state_version_tx, - shared_state_version_rx, - mutex: RwLock::new(SharedState::create_new(conf, &ttid, state)?), - walsenders: WalSenders::new(walreceivers.clone()), - walreceivers, - cancel: CancellationToken::default(), - timeline_dir: get_timeline_dir(conf, &ttid), - manager_ctl: ManagerCtl::new(), - broker_active: AtomicBool::new(false), - wal_backup_active: AtomicBool::new(false), - last_removed_segno: AtomicU64::new(0), - mgr_status: AtomicStatus::new(), - }) + })) } /// Initialize fresh timeline on disk and start background tasks. If init diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 866cde3339..538bb6e5d2 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -5,11 +5,14 @@ use crate::defaults::DEFAULT_EVICTION_CONCURRENCY; use crate::rate_limit::RateLimiter; use crate::safekeeper::ServerInfo; +use crate::state::TimelinePersistentState; use crate::timeline::{get_tenant_dir, get_timeline_dir, Timeline, TimelineError}; use crate::timelines_set::TimelinesSet; -use crate::SafeKeeperConf; +use crate::wal_storage::Storage; +use crate::{control_file, wal_storage, SafeKeeperConf}; use anyhow::{bail, Context, Result}; use camino::Utf8PathBuf; +use camino_tempfile::Utf8TempDir; use once_cell::sync::Lazy; use serde::Serialize; use std::collections::HashMap; @@ -17,12 +20,22 @@ use std::str::FromStr; use std::sync::atomic::Ordering; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use tokio::fs; use tracing::*; +use utils::crashsafe::{durable_rename, fsync_async_opt}; use utils::id::{TenantId, TenantTimelineId, TimelineId}; use utils::lsn::Lsn; +// Timeline entry in the global map: either a ready timeline, or mark that it is +// being created. +#[derive(Clone)] +enum GlobalMapTimeline { + CreationInProgress, + Timeline(Arc), +} + struct GlobalTimelinesState { - timelines: HashMap>, + timelines: HashMap, // A tombstone indicates this timeline used to exist has been deleted. 
These are used to prevent // on-demand timeline creation from recreating deleted timelines. This is only soft-enforced, as @@ -31,13 +44,9 @@ struct GlobalTimelinesState { conf: Option, broker_active_set: Arc, - load_lock: Arc>, global_rate_limiter: RateLimiter, } -// Used to prevent concurrent timeline loading. -pub struct TimelineLoadLock; - impl GlobalTimelinesState { /// Get configuration, which must be set once during init. fn get_conf(&self) -> &SafeKeeperConf { @@ -55,22 +64,16 @@ impl GlobalTimelinesState { ) } - /// Insert timeline into the map. Returns error if timeline with the same id already exists. - fn try_insert(&mut self, timeline: Arc) -> Result<()> { - let ttid = timeline.ttid; - if self.timelines.contains_key(&ttid) { - bail!(TimelineError::AlreadyExists(ttid)); - } - self.timelines.insert(ttid, timeline); - Ok(()) - } - - /// Get timeline from the map. Returns error if timeline doesn't exist. + /// Get timeline from the map. Returns error if timeline doesn't exist or + /// creation is in progress. fn get(&self, ttid: &TenantTimelineId) -> Result, TimelineError> { - self.timelines - .get(ttid) - .cloned() - .ok_or(TimelineError::NotFound(*ttid)) + match self.timelines.get(ttid).cloned() { + Some(GlobalMapTimeline::Timeline(tli)) => Ok(tli), + Some(GlobalMapTimeline::CreationInProgress) => { + Err(TimelineError::CreationInProgress(*ttid)) + } + None => Err(TimelineError::NotFound(*ttid)), + } } fn delete(&mut self, ttid: TenantTimelineId) { @@ -85,7 +88,6 @@ static TIMELINES_STATE: Lazy> = Lazy::new(|| { tombstones: HashMap::new(), conf: None, broker_active_set: Arc::new(TimelinesSet::default()), - load_lock: Arc::new(tokio::sync::Mutex::new(TimelineLoadLock)), global_rate_limiter: RateLimiter::new(1, 1), }) }); @@ -141,11 +143,10 @@ impl GlobalTimelines { /// Loads all timelines for the given tenant to memory. Returns fs::read_dir /// errors if any. /// - /// It is async for update_status_notify sake. Since TIMELINES_STATE lock is - /// sync and there is no important reason to make it async (it is always - /// held for a short while) we just lock and unlock it for each timeline -- - /// this function is called during init when nothing else is running, so - /// this is fine. + /// It is async, but TIMELINES_STATE lock is sync and there is no important + /// reason to make it async (it is always held for a short while), so we + /// just lock and unlock it for each timeline -- this function is called + /// during init when nothing else is running, so this is fine. async fn load_tenant_timelines(tenant_id: TenantId) -> Result<()> { let (conf, broker_active_set, partial_backup_rate_limiter) = { let state = TIMELINES_STATE.lock().unwrap(); @@ -163,14 +164,13 @@ impl GlobalTimelines { { let ttid = TenantTimelineId::new(tenant_id, timeline_id); match Timeline::load_timeline(&conf, ttid) { - Ok(timeline) => { - let tli = Arc::new(timeline); + Ok(tli) => { let mut shared_state = tli.write_shared_state().await; TIMELINES_STATE .lock() .unwrap() .timelines - .insert(ttid, tli.clone()); + .insert(ttid, GlobalMapTimeline::Timeline(tli.clone())); tli.bootstrap( &mut shared_state, &conf, @@ -199,51 +199,6 @@ impl GlobalTimelines { Ok(()) } - /// Take a lock for timeline loading. - pub async fn loading_lock() -> Arc> { - TIMELINES_STATE.lock().unwrap().load_lock.clone() - } - - /// Load timeline from disk to the memory. 
- pub async fn load_timeline<'a>( - _guard: &tokio::sync::MutexGuard<'a, TimelineLoadLock>, - ttid: TenantTimelineId, - ) -> Result> { - let (conf, broker_active_set, partial_backup_rate_limiter) = - TIMELINES_STATE.lock().unwrap().get_dependencies(); - - match Timeline::load_timeline(&conf, ttid) { - Ok(timeline) => { - let tli = Arc::new(timeline); - let mut shared_state = tli.write_shared_state().await; - - // TODO: prevent concurrent timeline creation/loading - { - let mut state = TIMELINES_STATE.lock().unwrap(); - - // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust - // that the human doing this manual intervention knows what they are doing, and remove its tombstone. - if state.tombstones.remove(&ttid).is_some() { - warn!("Un-deleted timeline {ttid}"); - } - - state.timelines.insert(ttid, tli.clone()); - } - - tli.bootstrap( - &mut shared_state, - &conf, - broker_active_set, - partial_backup_rate_limiter, - ); - drop(shared_state); - Ok(tli) - } - // If we can't load a timeline, it's bad. Caller will figure it out. - Err(e) => bail!("failed to load timeline {}, reason: {:?}", ttid, e), - } - } - /// Get the number of timelines in the map. pub fn timelines_count() -> usize { TIMELINES_STATE.lock().unwrap().timelines.len() @@ -266,7 +221,7 @@ impl GlobalTimelines { commit_lsn: Lsn, local_start_lsn: Lsn, ) -> Result> { - let (conf, broker_active_set, partial_backup_rate_limiter) = { + let (conf, _, _) = { let state = TIMELINES_STATE.lock().unwrap(); if let Ok(timeline) = state.get(&ttid) { // Timeline already exists, return it. @@ -282,55 +237,146 @@ impl GlobalTimelines { info!("creating new timeline {}", ttid); - let timeline = Arc::new(Timeline::create_empty( - &conf, - ttid, - server_info, - commit_lsn, - local_start_lsn, - )?); + // Do on disk initialization in tmp dir. + let (_tmp_dir, tmp_dir_path) = create_temp_timeline_dir(&conf, ttid).await?; - // Take a lock and finish the initialization holding this mutex. No other threads - // can interfere with creation after we will insert timeline into the map. - { - let mut shared_state = timeline.write_shared_state().await; + // TODO: currently we create only cfile. It would be reasonable to + // immediately initialize first WAL segment as well. + let state = + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; + control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?; + let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?; + Ok(timeline) + } - // We can get a race condition here in case of concurrent create calls, but only - // in theory. create() will return valid timeline on the next try. - TIMELINES_STATE - .lock() - .unwrap() - .try_insert(timeline.clone())?; + /// Move timeline from a temp directory to the main storage, and load it to + /// the global map. Creating timeline in this way ensures atomicity: rename + /// is atomic, so either move of the whole datadir succeeds or it doesn't, + /// but corrupted data dir shouldn't be possible. + /// + /// We'd like to avoid holding map lock while doing IO, so it's a 3 step + /// process: + /// 1) check the global map that timeline doesn't exist and mark that we're + /// creating it; + /// 2) move the directory and load the timeline + /// 3) take lock again and insert the timeline into the global map. 
+ pub async fn load_temp_timeline( + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, + check_tombstone: bool, + ) -> Result> { + // Check for existence and mark that we're creating it. + let (conf, broker_active_set, partial_backup_rate_limiter) = { + let mut state = TIMELINES_STATE.lock().unwrap(); + match state.timelines.get(&ttid) { + Some(GlobalMapTimeline::CreationInProgress) => { + bail!(TimelineError::CreationInProgress(ttid)); + } + Some(GlobalMapTimeline::Timeline(_)) => { + bail!(TimelineError::AlreadyExists(ttid)); + } + _ => {} + } + if check_tombstone { + if state.tombstones.contains_key(&ttid) { + anyhow::bail!("timeline {ttid} is deleted, refusing to recreate"); + } + } else { + // We may be have been asked to load a timeline that was previously deleted (e.g. from `pull_timeline.rs`). We trust + // that the human doing this manual intervention knows what they are doing, and remove its tombstone. + if state.tombstones.remove(&ttid).is_some() { + warn!("un-deleted timeline {ttid}"); + } + } + state + .timelines + .insert(ttid, GlobalMapTimeline::CreationInProgress); + state.get_dependencies() + }; - // Write the new timeline to the disk and start background workers. - // Bootstrap is transactional, so if it fails, the timeline will be deleted, - // and the state on disk should remain unchanged. - if let Err(e) = timeline - .init_new( - &mut shared_state, + // Do the actual move and reflect the result in the map. + match GlobalTimelines::install_temp_timeline(ttid, tmp_path, &conf).await { + Ok(timeline) => { + let mut timeline_shared_state = timeline.write_shared_state().await; + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(matches!( + state.timelines.get(&ttid), + Some(GlobalMapTimeline::CreationInProgress) + )); + + state + .timelines + .insert(ttid, GlobalMapTimeline::Timeline(timeline.clone())); + drop(state); + timeline.bootstrap( + &mut timeline_shared_state, &conf, broker_active_set, partial_backup_rate_limiter, - ) - .await - { - // Note: the most likely reason for init failure is that the timeline - // directory already exists on disk. This happens when timeline is corrupted - // and wasn't loaded from disk on startup because of that. We want to preserve - // the timeline directory in this case, for further inspection. - - // TODO: this is an unusual error, perhaps we should send it to sentry - // TODO: compute will try to create timeline every second, we should add backoff - error!("failed to init new timeline {}: {}", ttid, e); - - // Timeline failed to init, it cannot be used. Remove it from the map. - TIMELINES_STATE.lock().unwrap().timelines.remove(&ttid); - return Err(e); + ); + drop(timeline_shared_state); + Ok(timeline) + } + Err(e) => { + // Init failed, remove the marker from the map + let mut state = TIMELINES_STATE.lock().unwrap(); + assert!(matches!( + state.timelines.get(&ttid), + Some(GlobalMapTimeline::CreationInProgress) + )); + state.timelines.remove(&ttid); + Err(e) } - // We are done with bootstrap, release the lock, return the timeline. - // {} block forces release before .await } - Ok(timeline) + } + + /// Main part of load_temp_timeline: do the move and load. 
+ async fn install_temp_timeline( + ttid: TenantTimelineId, + tmp_path: &Utf8PathBuf, + conf: &SafeKeeperConf, + ) -> Result> { + let tenant_path = get_tenant_dir(conf, &ttid.tenant_id); + let timeline_path = get_timeline_dir(conf, &ttid); + + // We must have already checked that timeline doesn't exist in the map, + // but there might be existing datadir: if timeline is corrupted it is + // not loaded. We don't want to overwrite such a dir, so check for its + // existence. + match fs::metadata(&timeline_path).await { + Ok(_) => { + // Timeline directory exists on disk, we should leave state unchanged + // and return error. + bail!(TimelineError::Invalid(ttid)); + } + Err(e) if e.kind() == std::io::ErrorKind::NotFound => {} + Err(e) => { + return Err(e.into()); + } + } + + info!( + "moving timeline {} from {} to {}", + ttid, tmp_path, timeline_path + ); + + // Now it is safe to move the timeline directory to the correct + // location. First, create tenant directory. Ignore error if it already + // exists. + if let Err(e) = tokio::fs::create_dir(&tenant_path).await { + if e.kind() != std::io::ErrorKind::AlreadyExists { + return Err(e.into()); + } + } + // fsync it + fsync_async_opt(&tenant_path, !conf.no_sync).await?; + // and its creation + fsync_async_opt(&conf.workdir, !conf.no_sync).await?; + + // Do the move. + durable_rename(tmp_path, &timeline_path, !conf.no_sync).await?; + + Timeline::load_timeline(conf, ttid) } /// Get a timeline from the global map. If it's not present, it doesn't exist on disk, @@ -358,8 +404,16 @@ impl GlobalTimelines { global_lock .timelines .values() - .filter(|t| !t.is_cancelled()) - .cloned() + .filter_map(|t| match t { + GlobalMapTimeline::Timeline(t) => { + if t.is_cancelled() { + None + } else { + Some(t.clone()) + } + } + _ => None, + }) .collect() } @@ -370,8 +424,11 @@ impl GlobalTimelines { global_lock .timelines .values() + .filter_map(|t| match t { + GlobalMapTimeline::Timeline(t) => Some(t.clone()), + _ => None, + }) .filter(|t| t.ttid.tenant_id == tenant_id) - .cloned() .collect() } @@ -504,3 +561,45 @@ fn delete_dir(path: Utf8PathBuf) -> Result { Err(e) => Err(e.into()), } } + +/// Create temp directory for a new timeline. It needs to be located on the same +/// filesystem as the rest of the timelines. It will be automatically deleted when +/// Utf8TempDir goes out of scope. +pub async fn create_temp_timeline_dir( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, +) -> Result<(Utf8TempDir, Utf8PathBuf)> { + let temp_base = conf.workdir.join("tmp"); + + tokio::fs::create_dir_all(&temp_base).await?; + + let tli_dir = camino_tempfile::Builder::new() + .suffix("_temptli") + .prefix(&format!("{}_{}_", ttid.tenant_id, ttid.timeline_id)) + .tempdir_in(temp_base)?; + + let tli_dir_path = tli_dir.path().to_path_buf(); + + Ok((tli_dir, tli_dir_path)) +} + +/// Do basic validation of a temp timeline, before moving it to the global map. 
+pub async fn validate_temp_timeline( + conf: &SafeKeeperConf, + ttid: TenantTimelineId, + path: &Utf8PathBuf, +) -> Result<(Lsn, Lsn)> { + let control_path = path.join("safekeeper.control"); + + let control_store = control_file::FileStorage::load_control_file(control_path)?; + if control_store.server.wal_seg_size == 0 { + bail!("wal_seg_size is not set"); + } + + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; + + let commit_lsn = control_store.commit_lsn; + let flush_lsn = wal_store.flush_lsn(); + + Ok((commit_lsn, flush_lsn)) +} diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 6e7da94973..61d7825ae6 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -186,8 +186,14 @@ impl PhysicalStorage { "initialized storage for timeline {}, flush_lsn={}, commit_lsn={}, peer_horizon_lsn={}", ttid.timeline_id, flush_lsn, state.commit_lsn, state.peer_horizon_lsn, ); - if flush_lsn < state.commit_lsn || flush_lsn < state.peer_horizon_lsn { - warn!("timeline {} potential data loss: flush_lsn by find_end_of_wal is less than either commit_lsn or peer_horizon_lsn from control file", ttid.timeline_id); + if flush_lsn < state.commit_lsn { + bail!("timeline {} potential data loss: flush_lsn {} by find_end_of_wal is less than commit_lsn {} from control file", ttid.timeline_id, flush_lsn, state.commit_lsn); + } + if flush_lsn < state.peer_horizon_lsn { + warn!( + "timeline {}: flush_lsn {} is less than cfile peer_horizon_lsn {}", + ttid.timeline_id, flush_lsn, state.peer_horizon_lsn + ); } Ok(PhysicalStorage { diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 047b4be8fa..12aa025771 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -59,7 +59,7 @@ impl GlobalMap { if state.commit_lsn < state.local_start_lsn { bail!( - "commit_lsn {} is higher than local_start_lsn {}", + "commit_lsn {} is smaller than local_start_lsn {}", state.commit_lsn, state.local_start_lsn ); @@ -96,23 +96,7 @@ impl GlobalMap { let local_start_lsn = Lsn::INVALID; let state = - TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn); - - if state.server.wal_seg_size == 0 { - bail!(TimelineError::UninitializedWalSegSize(ttid)); - } - - if state.server.pg_version == UNKNOWN_SERVER_VERSION { - bail!(TimelineError::UninitialinzedPgVersion(ttid)); - } - - if state.commit_lsn < state.local_start_lsn { - bail!( - "commit_lsn {} is higher than local_start_lsn {}", - state.commit_lsn, - state.local_start_lsn - ); - } + TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; let disk_timeline = self.disk.put_state(&ttid, state); let control_store = DiskStateStorage::new(disk_timeline.clone()); From b7fa93f6b7ab5d562e0985eb06bdac8ba12ad892 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 22 Oct 2024 09:14:29 -0600 Subject: [PATCH 062/239] Use make's builtin RM variable At least as far as removing individual files goes, this is the best pattern for removing. I can't say the same for removing directories, but I went ahead and changed those to `$(RM) -r` anyway. 
Signed-off-by: Tristan Partin --- Makefile | 4 ++-- compute/Makefile | 2 +- pgxn/neon/Makefile | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/Makefile b/Makefile index 33cfda2661..8e3b755112 100644 --- a/Makefile +++ b/Makefile @@ -297,7 +297,7 @@ clean: postgres-clean neon-pg-clean-ext # This removes everything .PHONY: distclean distclean: - rm -rf $(POSTGRES_INSTALL_DIR) + $(RM) -r $(POSTGRES_INSTALL_DIR) $(CARGO_CMD_PREFIX) cargo clean .PHONY: fmt @@ -329,7 +329,7 @@ postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \ $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \ --excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns - rm -f pg*.BAK + $(RM) pg*.BAK # Indent pxgn/neon. .PHONY: neon-pgindent diff --git a/compute/Makefile b/compute/Makefile index 08e3c7a68b..645880ce70 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -34,7 +34,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files) .PHONY: clean clean: - rm -f \ + $(RM) \ etc/neon_collector.yml \ etc/neon_collector_autoscaling.yml \ etc/sql_exporter.yml \ diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index f1229b2d73..1503b856f7 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -54,7 +54,7 @@ walproposer-lib: libwalproposer.a; .PHONY: libwalproposer.a libwalproposer.a: $(WALPROP_OBJS) - rm -f $@ + $(RM) $@ $(AR) $(AROPT) $@ $^ # needs vars: From 8dca188974530b3c0c2160b22930615141e0236b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 22 Oct 2024 19:43:02 +0100 Subject: [PATCH 063/239] storage controller: add metrics for tenant shard, node count (#9475) ## Problem Previously, figuring out how many tenant shards were managed by a storage controller was typically done by peeking at the database or calling into the API. A metric makes it easier to monitor, as unexpectedly increasing shard counts can be indicative of problems elsewhere in the system. 
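As the summary below notes, the tenant shard gauge is maintained RAII-style: incremented when a shard is constructed and decremented in `Drop`, so the in-memory map and the metric cannot drift apart. A minimal sketch of that pattern using a plain atomic counter follows; it is illustrative only, the controller itself uses its `measured` gauge types, and `TrackedShard` is a made-up name.

```rust
use std::sync::atomic::{AtomicI64, Ordering};

// Process-wide gauge standing in for storage_controller_tenant_shards.
static SHARD_GAUGE: AtomicI64 = AtomicI64::new(0);

// Each live TrackedShard contributes exactly 1 to the gauge.
struct TrackedShard;

impl TrackedShard {
    fn new() -> Self {
        SHARD_GAUGE.fetch_add(1, Ordering::Relaxed);
        TrackedShard
    }
}

impl Drop for TrackedShard {
    fn drop(&mut self) {
        // Drop always undoes what new() did, so the gauge cannot leak
        // unless the value itself is leaked.
        SHARD_GAUGE.fetch_sub(1, Ordering::Relaxed);
    }
}

fn main() {
    let shards: Vec<TrackedShard> = (0..4).map(|_| TrackedShard::new()).collect();
    assert_eq!(SHARD_GAUGE.load(Ordering::Relaxed), 4);
    drop(shards);
    assert_eq!(SHARD_GAUGE.load(Ordering::Relaxed), 0);
    println!("gauge returned to zero after shards were dropped");
}
```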
## Summary of changes - Add metrics `storage_controller_pageserver_nodes` (updated on node CRUD operations from Service) and `storage_controller_tenant_shards` (updated RAII-style from TenantShard) --- storage_controller/src/metrics.rs | 6 +++++ storage_controller/src/service.rs | 22 ++++++++++++++++--- storage_controller/src/tenant_shard.rs | 19 ++++++++++++++++ .../regress/test_storage_controller.py | 9 ++++++++ 4 files changed, 53 insertions(+), 3 deletions(-) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 5989aeba91..a1f7bc2457 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -37,6 +37,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we spawn a reconcile task pub(crate) storage_controller_reconcile_spawn: measured::Counter, + /// Size of the in-memory map of tenant shards + pub(crate) storage_controller_tenant_shards: measured::Gauge, + + /// Size of the in-memory map of pageserver_nodes + pub(crate) storage_controller_pageserver_nodes: measured::Gauge, + /// Reconciler tasks completed, broken down by success/failure/cancelled pub(crate) storage_controller_reconcile_complete: measured::CounterVec, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 01aa8f1dab..2cde1d6a3d 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -934,7 +934,6 @@ impl Service { self.startup_complete.clone().wait().await; const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20); - let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD); while !self.reconcilers_cancel.is_cancelled() { tokio::select! { @@ -1272,6 +1271,10 @@ impl Service { .collect::>(); let nodes: HashMap = nodes.into_iter().map(|n| (n.get_id(), n)).collect(); tracing::info!("Loaded {} nodes from database.", nodes.len()); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(nodes.len() as i64); tracing::info!("Loading shards from database..."); let mut tenant_shard_persistence = persistence.list_tenant_shards().await?; @@ -4110,9 +4113,9 @@ impl Service { ( old_attached, generation, - old_state.policy, + old_state.policy.clone(), old_state.shard, - old_state.config, + old_state.config.clone(), ) }; @@ -5075,6 +5078,10 @@ impl Service { let mut nodes = (*locked.nodes).clone(); nodes.remove(&node_id); locked.nodes = Arc::new(nodes); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(locked.nodes.len() as i64); locked.scheduler.node_remove(node_id); @@ -5158,6 +5165,10 @@ impl Service { removed_node.set_availability(NodeAvailability::Offline); } *nodes = Arc::new(nodes_mut); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(nodes.len() as i64); } } @@ -5346,6 +5357,11 @@ impl Service { locked.nodes = Arc::new(new_nodes); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pageserver_nodes + .set(locked.nodes.len() as i64); + tracing::info!( "Registered pageserver {}, now have {} pageservers", register_req.node_id, diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 8a7ff866e6..e696c72ba7 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -473,6 +473,11 @@ impl TenantShard { shard: ShardIdentity, policy: PlacementPolicy, ) -> Self { + metrics::METRICS_REGISTRY + .metrics_group + 
.storage_controller_tenant_shards + .inc(); + Self { tenant_shard_id, policy, @@ -1384,6 +1389,11 @@ impl TenantShard { let tenant_shard_id = tsp.get_tenant_shard_id()?; let shard_identity = tsp.get_shard_identity()?; + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_tenant_shards + .inc(); + Ok(Self { tenant_shard_id, shard: shard_identity, @@ -1512,6 +1522,15 @@ impl TenantShard { } } +impl Drop for TenantShard { + fn drop(&mut self) { + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_tenant_shards + .dec(); + } +} + #[cfg(test)] pub(crate) mod tests { use std::{cell::RefCell, rc::Rc}; diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index a4e293da9e..d4bc4b1a4f 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -107,6 +107,15 @@ def test_storage_controller_smoke(neon_env_builder: NeonEnvBuilder, combination) for tid in tenant_ids: env.create_tenant(tid, shard_count=shards_per_tenant) + # Validate high level metrics + assert ( + env.storage_controller.get_metric_value("storage_controller_tenant_shards") + == len(tenant_ids) * shards_per_tenant + ) + assert env.storage_controller.get_metric_value("storage_controller_pageserver_nodes") == len( + env.storage_controller.node_list() + ) + # Repeating a creation should be idempotent (we are just testing it doesn't return an error) env.storage_controller.tenant_create( tenant_id=next(iter(tenant_ids)), shard_count=shards_per_tenant From f36cf3f885e9559434f378b45b2e944440e56058 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 22 Oct 2024 21:58:55 +0200 Subject: [PATCH 064/239] Fix local errors for the tests with the versions mix (#9477) ## Problem If the environment variables `COMPATIBILITY_NEON_BIN` or `COMPATIBILITY_POSTGRES_DISTRIB_DIR` are not set (this is usual during a local run), the tests with the versions mix cannot run. ## Summary of changes If these variables are not set turn off the version mix. --------- Co-authored-by: Alexander Bayandin --- test_runner/fixtures/utils.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 7ca6b3dd1c..d12fa59abc 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -16,6 +16,7 @@ from typing import TYPE_CHECKING, Any, Callable, TypeVar from urllib.parse import urlencode import allure +import pytest import zstandard from psycopg2.extensions import cursor from typing_extensions import override @@ -634,9 +635,27 @@ def allpairs_versions(): the different versions. """ ids = [] + argvalues = [] + compat_not_defined = ( + os.getenv("COMPATIBILITY_POSTGRES_DISTRIB_DIR") is None + or os.getenv("COMPATIBILITY_NEON_BIN") is None + ) for pair in VERSIONS_COMBINATIONS: cur_id = [] + all_new = all(v == "new" for v in pair.values()) for component in sorted(pair.keys()): cur_id.append(pair[component][0]) + # Adding None if all versions are new, sof no need to mix at all + # If COMPATIBILITY_NEON_BIN or COMPATIBILITY_POSTGRES_DISTRIB_DIR are not defined, + # we will skip all the tests which include the versions mix. 
+ argvalues.append( + pytest.param( + None if all_new else pair, + marks=pytest.mark.skipif( + compat_not_defined and not all_new, + reason="COMPATIBILITY_NEON_BIN or COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set", + ), + ) + ) ids.append(f"combination_{''.join(cur_id)}") - return {"argnames": "combination", "argvalues": VERSIONS_COMBINATIONS, "ids": ids} + return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} From fcb55a2aa2a742346e875126a5e0d1cec6663645 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 22 Oct 2024 14:34:26 -0600 Subject: [PATCH 065/239] Fix copy-paste error in checkpoints_timed metric Importing the wrong metric. Sigh... Signed-off-by: Tristan Partin --- compute/etc/sql_exporter/checkpoints_timed.libsonnet | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/etc/sql_exporter/checkpoints_timed.libsonnet b/compute/etc/sql_exporter/checkpoints_timed.libsonnet index 0ba0080188..ebe2ddc9f2 100644 --- a/compute/etc/sql_exporter/checkpoints_timed.libsonnet +++ b/compute/etc/sql_exporter/checkpoints_timed.libsonnet @@ -1,7 +1,7 @@ local neon = import 'neon.libsonnet'; -local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_req.sql'; -local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_req.17.sql'; +local pg_stat_bgwriter = importstr 'sql_exporter/checkpoints_timed.sql'; +local pg_stat_checkpointer = importstr 'sql_exporter/checkpoints_timed.17.sql'; { metric_name: 'checkpoints_timed', From 6f8fcdf9ea71599735192d0f60cce80b0cd42405 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 22 Oct 2024 22:52:30 +0200 Subject: [PATCH 066/239] Timeline offloading persistence (#9444) Persist timeline offloaded state to S3. Right now, as of #8907, at each restart of the pageserver, all offloaded state is lost, so we load the full timeline again. As it starts with an empty local directory, we might potentially download some files again, leading to downloads that are ultimately wasteful. This patch adds support for persisting the offloaded state, allowing us to never load offloaded timelines in the first place. The persistence feature is facilitated via a new file in S3 that is tenant-global, which contains a list of all offloaded timelines. It is updated each time we offload or unoffload a timeline, and otherwise never touched. This choice means that tenants where no offloading is happening will not immediately get a manifest, keeping the change very minimal at the start. We leave generation support for future work. It is important to support generations, as in the worst case, the manifest might be overwritten by an older generation after a timeline has been unoffloaded (and unarchived), so the next pageserver process instantiation might wrongly believe that some timeline is still offloaded even though it should be active. 
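To make the new on-disk format concrete, here is a minimal sketch (not an actual call site from this patch) of building and uploading a manifest with the types and helper introduced below; `timeline_id`, `ancestor`, `retain_lsn`, `archived_at`, `storage`, `tenant_shard_id`, `generation` and `cancel` are placeholder values:

```rust
// Sketch only: assumes the TenantManifest / OffloadedTimelineManifest types,
// LATEST_TENANT_MANIFEST_VERSION and upload_tenant_manifest added in this patch.
let manifest = TenantManifest {
    version: LATEST_TENANT_MANIFEST_VERSION,
    offloaded_timelines: vec![OffloadedTimelineManifest {
        timeline_id,                           // the offloaded (archived) timeline
        ancestor_timeline_id: Some(ancestor),  // parent branch, if any
        ancestor_retain_lsn: Some(retain_lsn),
        archived_at,                           // taken from the index part's archived_at
    }],
};
// Serialized to JSON and stored under
// tenants/<tenant_shard_id>/tenant-manifest<generation suffix>.json
upload_tenant_manifest(&storage, &tenant_shard_id, generation, &manifest, &cancel).await?;
```
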
Part of #9386, #8088 --- libs/pageserver_api/src/models.rs | 2 + pageserver/src/http/routes.rs | 2 + pageserver/src/tenant.rs | 280 +++++++++++++++--- .../src/tenant/remote_timeline_client.rs | 41 ++- .../tenant/remote_timeline_client/download.rs | 53 +++- .../tenant/remote_timeline_client/manifest.rs | 53 ++++ .../tenant/remote_timeline_client/upload.rs | 33 +++ pageserver/src/tenant/timeline.rs | 6 +- pageserver/src/tenant/timeline/delete.rs | 69 +++-- pageserver/src/tenant/timeline/offload.rs | 53 +++- test_runner/regress/test_timeline_archive.py | 125 +++++++- 11 files changed, 637 insertions(+), 80 deletions(-) create mode 100644 pageserver/src/tenant/remote_timeline_client/manifest.rs diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index e08bf40801..d0ee4b64d1 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -699,6 +699,8 @@ pub struct OffloadedTimelineInfo { pub ancestor_timeline_id: Option, /// Whether to retain the branch lsn at the ancestor or not pub ancestor_retain_lsn: Option, + /// The time point when the timeline was archived + pub archived_at: chrono::DateTime, } /// This represents the output of the "timeline_detail" and "timeline_list" API calls. diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index a254f1683d..2490bf5f20 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -486,6 +486,7 @@ fn build_timeline_offloaded_info(offloaded: &Arc) -> Offloade timeline_id, ancestor_retain_lsn, ancestor_timeline_id, + archived_at, .. } = offloaded.as_ref(); OffloadedTimelineInfo { @@ -493,6 +494,7 @@ fn build_timeline_offloaded_info(offloaded: &Arc) -> Offloade timeline_id, ancestor_retain_lsn, ancestor_timeline_id, + archived_at: archived_at.and_utc(), } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 41d21ef041..7a3305797c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -16,6 +16,7 @@ use anyhow::{bail, Context}; use arc_swap::ArcSwap; use camino::Utf8Path; use camino::Utf8PathBuf; +use chrono::NaiveDateTime; use enumset::EnumSet; use futures::stream::FuturesUnordered; use futures::StreamExt; @@ -31,6 +32,10 @@ use pageserver_api::shard::TenantShardId; use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use remote_storage::TimeoutOrCancel; +use remote_timeline_client::manifest::{ + OffloadedTimelineManifest, TenantManifest, LATEST_TENANT_MANIFEST_VERSION, +}; +use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; use std::fmt; use std::future::Future; @@ -65,13 +70,14 @@ use self::config::TenantConf; use self::metadata::TimelineMetadata; use self::mgr::GetActiveTenantError; use self::mgr::GetTenantError; -use self::remote_timeline_client::upload::upload_index_part; +use self::remote_timeline_client::upload::{upload_index_part, upload_tenant_manifest}; use self::remote_timeline_client::{RemoteTimelineClient, WaitCompletionError}; use self::timeline::uninit::TimelineCreateGuard; use self::timeline::uninit::TimelineExclusionError; use self::timeline::uninit::UninitializedTimeline; use self::timeline::EvictionTaskTenantState; use self::timeline::GcCutoffs; +use self::timeline::TimelineDeleteProgress; use self::timeline::TimelineResources; use self::timeline::WaitLsnError; use crate::config::PageServerConf; @@ -240,6 +246,7 @@ struct TimelinePreload { } pub(crate) struct TenantPreload { + tenant_manifest: TenantManifest, timelines: HashMap, } @@ 
-488,6 +495,12 @@ impl WalRedoManager { } } +/// A very lightweight memory representation of an offloaded timeline. +/// +/// We need to store the list of offloaded timelines so that we can perform operations on them, +/// like unoffloading them, or (at a later date), decide to perform flattening. +/// This type has a much smaller memory impact than [`Timeline`], and thus we can store many +/// more offloaded timelines than we can manage ones that aren't. pub struct OffloadedTimeline { pub tenant_shard_id: TenantShardId, pub timeline_id: TimelineId, @@ -495,27 +508,78 @@ pub struct OffloadedTimeline { /// Whether to retain the branch lsn at the ancestor or not pub ancestor_retain_lsn: Option, - // TODO: once we persist offloaded state, make this lazily constructed - pub remote_client: Arc, + /// When the timeline was archived. + /// + /// Present for future flattening deliberations. + pub archived_at: NaiveDateTime, + + /// Lazily constructed remote client for the timeline + /// + /// If we offload a timeline, we keep around the remote client + /// for the duration of the process. If we find it through the + /// manifest, we don't construct it up until it's needed (deletion). + pub remote_client: Option>, /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. - pub delete_progress: Arc>, + pub delete_progress: TimelineDeleteProgress, } impl OffloadedTimeline { - fn from_timeline(timeline: &Timeline) -> Self { + /// Obtains an offloaded timeline from a given timeline object. + /// + /// Returns `None` if the `archived_at` flag couldn't be obtained, i.e. + /// the timeline is not in a stopped state. + /// Panics if the timeline is not archived. + fn from_timeline(timeline: &Timeline) -> Result { let ancestor_retain_lsn = timeline .get_ancestor_timeline_id() .map(|_timeline_id| timeline.get_ancestor_lsn()); - Self { + let archived_at = timeline + .remote_client + .archived_at_stopped_queue()? + .expect("must be called on an archived timeline"); + Ok(Self { tenant_shard_id: timeline.tenant_shard_id, timeline_id: timeline.timeline_id, ancestor_timeline_id: timeline.get_ancestor_timeline_id(), ancestor_retain_lsn, + archived_at, - remote_client: timeline.remote_client.clone(), + remote_client: Some(timeline.remote_client.clone()), delete_progress: timeline.delete_progress.clone(), + }) + } + fn from_manifest(tenant_shard_id: TenantShardId, manifest: &OffloadedTimelineManifest) -> Self { + let OffloadedTimelineManifest { + timeline_id, + ancestor_timeline_id, + ancestor_retain_lsn, + archived_at, + } = *manifest; + Self { + tenant_shard_id, + timeline_id, + ancestor_timeline_id, + ancestor_retain_lsn, + archived_at, + remote_client: None, + delete_progress: TimelineDeleteProgress::default(), + } + } + fn manifest(&self) -> OffloadedTimelineManifest { + let Self { + timeline_id, + ancestor_timeline_id, + ancestor_retain_lsn, + archived_at, + .. 
+ } = self; + OffloadedTimelineManifest { + timeline_id: *timeline_id, + ancestor_timeline_id: *ancestor_timeline_id, + ancestor_retain_lsn: *ancestor_retain_lsn, + archived_at: *archived_at, } } } @@ -551,10 +615,19 @@ impl TimelineOrOffloaded { TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, } } - pub fn remote_client(&self) -> &Arc { + pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { match self { - TimelineOrOffloaded::Timeline(timeline) => &timeline.remote_client, - TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.remote_client, + TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), + TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { + Some(remote_client) => remote_client, + None => { + let remote_client = tenant.build_timeline_client( + offloaded.timeline_id, + tenant.remote_storage.clone(), + ); + Arc::new(remote_client) + } + }, } } } @@ -1131,14 +1204,35 @@ impl Tenant { cancel.clone(), ) .await?; + let (offloaded_add, tenant_manifest) = + match remote_timeline_client::do_download_tenant_manifest( + remote_storage, + &self.tenant_shard_id, + &cancel, + ) + .await + { + Ok((tenant_manifest, _generation)) => ( + format!("{} offloaded", tenant_manifest.offloaded_timelines.len()), + tenant_manifest, + ), + Err(DownloadError::NotFound) => { + ("no manifest".to_string(), TenantManifest::empty()) + } + Err(e) => Err(e)?, + }; - info!("found {} timelines", remote_timeline_ids.len(),); + info!( + "found {} timelines, and {offloaded_add}", + remote_timeline_ids.len() + ); for k in other_keys { warn!("Unexpected non timeline key {k}"); } Ok(TenantPreload { + tenant_manifest, timelines: self .load_timelines_metadata(remote_timeline_ids, remote_storage, cancel) .await?, @@ -1163,12 +1257,26 @@ impl Tenant { anyhow::bail!("local-only deployment is no longer supported, https://github.com/neondatabase/neon/issues/5624"); }; + let mut offloaded_timeline_ids = HashSet::new(); + let mut offloaded_timelines_list = Vec::new(); + for timeline_manifest in preload.tenant_manifest.offloaded_timelines.iter() { + let timeline_id = timeline_manifest.timeline_id; + let offloaded_timeline = + OffloadedTimeline::from_manifest(self.tenant_shard_id, timeline_manifest); + offloaded_timelines_list.push((timeline_id, Arc::new(offloaded_timeline))); + offloaded_timeline_ids.insert(timeline_id); + } + let mut timelines_to_resume_deletions = vec![]; let mut remote_index_and_client = HashMap::new(); let mut timeline_ancestors = HashMap::new(); let mut existent_timelines = HashSet::new(); for (timeline_id, preload) in preload.timelines { + if offloaded_timeline_ids.remove(&timeline_id) { + // The timeline is offloaded, skip loading it. + continue; + } let index_part = match preload.index_part { Ok(i) => { debug!("remote index part exists for timeline {timeline_id}"); @@ -1272,6 +1380,43 @@ impl Tenant { .context("resume_deletion") .map_err(LoadLocalTimelineError::ResumeDeletion)?; } + // Complete deletions for offloaded timeline id's. + offloaded_timelines_list + .retain(|(offloaded_id, _offloaded)| { + // At this point, offloaded_timeline_ids has the list of all offloaded timelines + // without a prefix in S3, so they are inexistent. + // In the end, existence of a timeline is finally determined by the existence of an index-part.json in remote storage. + // If there is a dangling reference in another location, they need to be cleaned up. 
+ let delete = offloaded_timeline_ids.contains(offloaded_id); + if delete { + tracing::info!("Removing offloaded timeline {offloaded_id} from manifest as no remote prefix was found"); + } + !delete + }); + if !offloaded_timelines_list.is_empty() { + tracing::info!( + "Tenant has {} offloaded timelines", + offloaded_timelines_list.len() + ); + } + { + let mut offloaded_timelines_accessor = self.timelines_offloaded.lock().unwrap(); + offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); + } + if !offloaded_timeline_ids.is_empty() { + let manifest = self.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + generation, + &manifest, + &self.cancel, + ) + .await + .map_err(TimelineArchivalError::Other)?; + } // The local filesystem contents are a cache of what's in the remote IndexPart; // IndexPart is the source of truth. @@ -1443,20 +1588,28 @@ impl Tenant { Ok(timeline_preloads) } - fn load_timeline_metadata( - self: &Arc, + fn build_timeline_client( + &self, timeline_id: TimelineId, remote_storage: GenericRemoteStorage, - cancel: CancellationToken, - ) -> impl Future { - let client = RemoteTimelineClient::new( + ) -> RemoteTimelineClient { + RemoteTimelineClient::new( remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, self.tenant_shard_id, timeline_id, self.generation, - ); + ) + } + + fn load_timeline_metadata( + self: &Arc, + timeline_id: TimelineId, + remote_storage: GenericRemoteStorage, + cancel: CancellationToken, + ) -> impl Future { + let client = self.build_timeline_client(timeline_id, remote_storage); async move { debug_assert_current_span_has_tenant_and_timeline_id(); debug!("starting index part download"); @@ -1547,7 +1700,7 @@ impl Tenant { info!("unoffloading timeline"); let cancel = self.cancel.clone(); let timeline_preload = self - .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) + .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) .await; let index_part = match timeline_preload.index_part { @@ -1592,17 +1745,37 @@ impl Tenant { ) }) .map_err(TimelineArchivalError::Other)?; - let timelines = self.timelines.lock().unwrap(); - let Some(timeline) = timelines.get(&timeline_id) else { - warn!("timeline not available directly after attach"); - return Err(TimelineArchivalError::Other(anyhow::anyhow!( - "timeline not available directly after attach" - ))); + + let timeline = { + let timelines = self.timelines.lock().unwrap(); + let Some(timeline) = timelines.get(&timeline_id) else { + warn!("timeline not available directly after attach"); + // This is not a panic because no locks are held between `load_remote_timeline` + // which puts the timeline into timelines, and our look into the timeline map. 
+ return Err(TimelineArchivalError::Other(anyhow::anyhow!( + "timeline not available directly after attach" + ))); + }; + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); + } + Arc::clone(timeline) }; - let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); - if offloaded_timelines.remove(&timeline_id).is_none() { - warn!("timeline already removed from offloaded timelines"); - } + + // Upload new list of offloaded timelines to S3 + let manifest = self.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + generation, + &manifest, + &cancel, + ) + .await + .map_err(TimelineArchivalError::Other)?; // Activate the timeline (if it makes sense) if !(timeline.is_broken() || timeline.is_stopping()) { @@ -1616,7 +1789,7 @@ impl Tenant { } info!("timeline unoffloading complete"); - Ok(Arc::clone(timeline)) + Ok(timeline) } pub(crate) async fn apply_timeline_archival_config( @@ -2793,6 +2966,26 @@ impl Tenant { } } + // TODO: also copy index files of offloaded timelines + + let tenant_manifest = self.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + for child_shard in child_shards { + tracing::info!( + "Uploading tenant manifest for child {}", + child_shard.to_index() + ); + upload_tenant_manifest( + &self.remote_storage, + child_shard, + generation, + &tenant_manifest, + &self.cancel, + ) + .await?; + } + Ok(()) } @@ -2970,6 +3163,22 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } + pub(crate) fn tenant_manifest(&self) -> TenantManifest { + let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); + + let mut timeline_manifests = timelines_offloaded + .iter() + .map(|(_timeline_id, offloaded)| offloaded.manifest()) + .collect::>(); + // Sort the manifests so that our output is deterministic + timeline_manifests.sort_by_key(|timeline_manifest| timeline_manifest.timeline_id); + + TenantManifest { + version: LATEST_TENANT_MANIFEST_VERSION, + offloaded_timelines: timeline_manifests, + } + } + pub fn set_new_tenant_config(&self, new_tenant_conf: TenantConfOpt) { // Use read-copy-update in order to avoid overwriting the location config // state if this races with [`Tenant::set_new_location_config`]. 
Note that @@ -3962,18 +4171,21 @@ impl Tenant { Ok(timeline) } - /// Call this before constructing a timeline, to build its required structures - fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { - let remote_client = RemoteTimelineClient::new( + fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient { + RemoteTimelineClient::new( self.remote_storage.clone(), self.deletion_queue_client.clone(), self.conf, self.tenant_shard_id, timeline_id, self.generation, - ); + ) + } + + /// Call this before constructing a timeline, to build its required structures + fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { TimelineResources { - remote_client, + remote_client: self.build_timeline_remote_client(timeline_id), timeline_get_throttle: self.timeline_get_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 14b894d17c..066fd12a9a 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -180,6 +180,7 @@ pub(crate) mod download; pub mod index; +pub mod manifest; pub(crate) mod upload; use anyhow::Context; @@ -191,7 +192,6 @@ use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; -pub(crate) use upload::upload_initdb_dir; use utils::backoff::{ self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS, }; @@ -245,9 +245,11 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ - download_index_part, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, + do_download_tenant_manifest, download_index_part, is_temp_download_file, + list_remote_tenant_shards, list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; +pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest}; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. @@ -272,6 +274,12 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024; /// which we warn and skip. const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); +/// Hardcode a generation for the tenant manifest for now so that we don't +/// need to deal with generation-less manifests in the future. +/// +/// TODO: add proper generation support to all the places that use this. +pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1); + pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -295,6 +303,10 @@ pub enum WaitCompletionError { UploadQueueShutDownOrStopped, } +#[derive(Debug, thiserror::Error)] +#[error("Upload queue either in unexpected state or hasn't downloaded manifest yet")] +pub struct UploadQueueNotReadyError; + /// A client for accessing a timeline's data in remote storage. /// /// This takes care of managing the number of connections, and balancing them @@ -468,6 +480,20 @@ impl RemoteTimelineClient { .ok() } + /// Returns `Ok(Some(timestamp))` if the timeline has been archived, `Ok(None)` if the timeline hasn't been archived. + /// + /// Return Err(_) if the remote index_part hasn't been downloaded yet, or the timeline hasn't been stopped yet. 
+ pub(crate) fn archived_at_stopped_queue( + &self, + ) -> Result, UploadQueueNotReadyError> { + self.upload_queue + .lock() + .unwrap() + .stopped_mut() + .map(|q| q.upload_queue_for_deletion.clean.0.archived_at) + .map_err(|_| UploadQueueNotReadyError) + } + fn update_remote_physical_size_gauge(&self, current_remote_index_part: Option<&IndexPart>) { let size: u64 = if let Some(current_remote_index_part) = current_remote_index_part { current_remote_index_part @@ -2198,6 +2224,17 @@ pub fn remote_tenant_path(tenant_shard_id: &TenantShardId) -> RemotePath { RemotePath::from_string(&path).expect("Failed to construct path") } +pub fn remote_tenant_manifest_path( + tenant_shard_id: &TenantShardId, + generation: Generation, +) -> RemotePath { + let path = format!( + "tenants/{tenant_shard_id}/tenant-manifest{}.json", + generation.get_suffix() + ); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index b5d4b0f0bb..95f8f026d4 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -34,10 +34,11 @@ use utils::id::{TenantId, TimelineId}; use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; +use super::manifest::TenantManifest; use super::{ parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, - FAILED_REMOTE_OP_RETRIES, INITDB_PATH, + remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path, + FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -338,19 +339,15 @@ pub async fn list_remote_timelines( list_identifiers::(storage, remote_path, cancel).await } -async fn do_download_index_part( +async fn do_download_remote_path_retry_forever( storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, - index_generation: Generation, + remote_path: &RemotePath, cancel: &CancellationToken, -) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { - let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); - - let (index_part_bytes, index_part_mtime) = download_retry_forever( +) -> Result<(Vec, SystemTime), DownloadError> { + download_retry_forever( || async { let download = storage - .download(&remote_path, &DownloadOpts::default(), cancel) + .download(remote_path, &DownloadOpts::default(), cancel) .await?; let mut bytes = Vec::new(); @@ -365,7 +362,39 @@ async fn do_download_index_part( &format!("download {remote_path:?}"), cancel, ) - .await?; + .await +} + +pub async fn do_download_tenant_manifest( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + cancel: &CancellationToken, +) -> Result<(TenantManifest, Generation), DownloadError> { + // TODO: generation support + let generation = super::TENANT_MANIFEST_GENERATION; + let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + + let (manifest_bytes, _manifest_bytes_mtime) = + do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; + + let tenant_manifest = 
TenantManifest::from_json_bytes(&manifest_bytes) + .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) + .map_err(DownloadError::Other)?; + + Ok((tenant_manifest, generation)) +} + +async fn do_download_index_part( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + index_generation: Generation, + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); + + let (index_part_bytes, index_part_mtime) = + do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; let index_part: IndexPart = serde_json::from_slice(&index_part_bytes) .with_context(|| format!("deserialize index part file at {remote_path:?}")) diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs new file mode 100644 index 0000000000..7d92d45146 --- /dev/null +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -0,0 +1,53 @@ +use chrono::NaiveDateTime; +use serde::{Deserialize, Serialize}; +use utils::{id::TimelineId, lsn::Lsn}; + +/// Tenant-shard scoped manifest +#[derive(Clone, Serialize, Deserialize)] +pub struct TenantManifest { + /// Debugging aid describing the version of this manifest. + /// Can also be used for distinguishing breaking changes later on. + pub version: usize, + + /// The list of offloaded timelines together with enough information + /// to not have to actually load them. + /// + /// Note: the timelines mentioned in this list might be deleted, i.e. + /// we don't hold an invariant that the references aren't dangling. + /// Existence of index-part.json is the actual indicator of timeline existence. + pub offloaded_timelines: Vec, +} + +/// The remote level representation of an offloaded timeline. +/// +/// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`], +/// but the two datastructures serve different needs, this is for a persistent disk format +/// that must be backwards compatible, while the other is only for informative purposes. 
+#[derive(Clone, Serialize, Deserialize, Copy)] +pub struct OffloadedTimelineManifest { + pub timeline_id: TimelineId, + /// Whether the timeline has a parent it has been branched off from or not + pub ancestor_timeline_id: Option, + /// Whether to retain the branch lsn at the ancestor or not + pub ancestor_retain_lsn: Option, + /// The time point when the timeline was archived + pub archived_at: NaiveDateTime, +} + +pub const LATEST_TENANT_MANIFEST_VERSION: usize = 1; + +impl TenantManifest { + pub(crate) fn empty() -> Self { + Self { + version: LATEST_TENANT_MANIFEST_VERSION, + offloaded_timelines: vec![], + } + } + pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result { + serde_json::from_slice::(bytes) + } + + pub(crate) fn to_json_bytes(&self) -> serde_json::Result> { + serde_json::to_vec(self) + } +} diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index c4dd184610..5a2b7bd08f 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -13,9 +13,11 @@ use tokio_util::sync::CancellationToken; use utils::{backoff, pausable_failpoint}; use super::index::IndexPart; +use super::manifest::TenantManifest; use super::Generation; use crate::tenant::remote_timeline_client::{ remote_index_path, remote_initdb_archive_path, remote_initdb_preserved_archive_path, + remote_tenant_manifest_path, }; use remote_storage::{GenericRemoteStorage, RemotePath, TimeTravelError}; use utils::id::{TenantId, TimelineId}; @@ -55,6 +57,37 @@ pub(crate) async fn upload_index_part<'a>( .await .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'")) } +/// Serializes and uploads the given tenant manifest data to the remote storage. +pub(crate) async fn upload_tenant_manifest( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + generation: Generation, + tenant_manifest: &TenantManifest, + cancel: &CancellationToken, +) -> anyhow::Result<()> { + tracing::trace!("uploading new tenant manifest"); + + fail_point!("before-upload-manifest", |_| { + bail!("failpoint before-upload-manifest") + }); + pausable_failpoint!("before-upload-manifest-pausable"); + + let serialized = tenant_manifest.to_json_bytes()?; + let serialized = Bytes::from(serialized); + + let tenant_manifest_site = serialized.len(); + + let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); + storage + .upload_storage_object( + futures::stream::once(futures::future::ready(Ok(serialized))), + tenant_manifest_site, + &remote_path, + cancel, + ) + .await + .with_context(|| format!("upload tenant manifest for '{tenant_shard_id}'")) +} /// Attempts to upload given layer files. /// No extra checks for overlapping files is made and any files that are already present remotely will be overwritten, if submitted during the upload. diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d67a139dfa..d5ceec663b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -371,7 +371,7 @@ pub struct Timeline { /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. 
- pub delete_progress: Arc>, + pub delete_progress: TimelineDeleteProgress, eviction_task_timeline_state: tokio::sync::Mutex, @@ -426,6 +426,8 @@ pub struct Timeline { pub(crate) attach_wal_lag_cooldown: Arc>, } +pub type TimelineDeleteProgress = Arc>; + pub struct WalReceiverInfo { pub wal_source_connconf: PgConnectionConfig, pub last_received_msg_lsn: Lsn, @@ -2250,7 +2252,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), - delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), + delete_progress: TimelineDeleteProgress::default(), cancel, gate: Gate::default(), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 71b9e4e288..4799aab436 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,7 +14,9 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + remote_timeline_client::{ + self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, + }, CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, }, }; @@ -25,12 +27,9 @@ use super::{Timeline, TimelineResources}; /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. async fn set_deleted_in_remote_index( - timeline: &TimelineOrOffloaded, + remote_client: &Arc, ) -> Result<(), DeleteTimelineError> { - let res = timeline - .remote_client() - .persist_index_part_with_deleted_flag() - .await; + let res = remote_client.persist_index_part_with_deleted_flag().await; match res { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), @@ -129,12 +128,10 @@ pub(super) async fn delete_local_timeline_directory( } /// Removes remote layers and an index file after them. -async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> { - timeline - .remote_client() - .delete_all() - .await - .context("delete_all") +async fn delete_remote_layers_and_index( + remote_client: &Arc, +) -> anyhow::Result<()> { + remote_client.delete_all().await.context("delete_all") } /// It is important that this gets called when DeletionGuard is being held. @@ -179,6 +176,32 @@ async fn remove_maybe_offloaded_timeline_from_tenant( Ok(()) } +/// It is important that this gets called when DeletionGuard is being held. +/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn upload_new_tenant_manifest( + tenant: &Tenant, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash + // between the deletion of the index-part.json and reaching of this code. + // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. + // However, we handle this case in tenant loading code so the next time we attach, the issue is + // resolved. 
+ let manifest = tenant.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + remote_timeline_client::upload_tenant_manifest( + &tenant.remote_storage, + &tenant.tenant_shard_id, + generation, + &manifest, + &tenant.cancel, + ) + .await?; + + Ok(()) +} + /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: @@ -235,7 +258,8 @@ impl DeleteTimelineFlow { ))? }); - set_deleted_in_remote_index(&timeline).await?; + let remote_client = timeline.remote_client_maybe_construct(tenant); + set_deleted_in_remote_index(&remote_client).await?; fail::fail_point!("timeline-delete-before-schedule", |_| { Err(anyhow::anyhow!( @@ -243,7 +267,13 @@ impl DeleteTimelineFlow { ))? }); - Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + Self::schedule_background( + guard, + tenant.conf, + Arc::clone(tenant), + timeline, + remote_client, + ); Ok(()) } @@ -301,8 +331,9 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; + let remote_client = timeline.remote_client.clone(); let timeline = TimelineOrOffloaded::Timeline(timeline); - Self::schedule_background(guard, tenant.conf, tenant, timeline); + Self::schedule_background(guard, tenant.conf, tenant, timeline, remote_client); Ok(()) } @@ -380,6 +411,7 @@ impl DeleteTimelineFlow { conf: &'static PageServerConf, tenant: Arc, timeline: TimelineOrOffloaded, + remote_client: Arc, ) { let tenant_shard_id = timeline.tenant_shard_id(); let timeline_id = timeline.timeline_id(); @@ -391,7 +423,7 @@ impl DeleteTimelineFlow { Some(timeline_id), "timeline_delete", async move { - if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { + if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { error!("Error: {err:#}"); if let TimelineOrOffloaded::Timeline(timeline) = timeline { timeline.set_broken(format!("{err:#}")) @@ -408,6 +440,7 @@ impl DeleteTimelineFlow { conf: &PageServerConf, tenant: &Tenant, timeline: &TimelineOrOffloaded, + remote_client: Arc, ) -> Result<(), DeleteTimelineError> { // Offloaded timelines have no local state // TODO: once we persist offloaded information, delete the timeline from there, too @@ -415,12 +448,14 @@ impl DeleteTimelineFlow { delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; } - delete_remote_layers_and_index(timeline).await?; + delete_remote_layers_and_index(&remote_client).await?; pausable_failpoint!("in_progress_delete"); remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; + upload_new_tenant_manifest(tenant, &guard).await?; + *guard = Self::Finished; Ok(()) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 7e6084baaf..8e6eceb084 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -1,17 +1,17 @@ use std::sync::Arc; -use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; - -use super::{ - delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}, - Timeline, -}; +use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; +use super::Timeline; +use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded}; pub(crate) async fn offload_timeline( tenant: 
&Tenant, timeline: &Arc, ) -> anyhow::Result<()> { + debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); + let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { @@ -19,14 +19,28 @@ pub(crate) async fn offload_timeline( return Ok(()); }; + let is_archived = timeline.is_archived(); + match is_archived { + Some(true) => (), + Some(false) => { + tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); + anyhow::bail!("timeline isn't archived"); + } + None => { + tracing::warn!( + ?is_archived, + "tried offloading a timeline where manifest is not yet available" + ); + anyhow::bail!("timeline manifest hasn't been loaded yet"); + } + } + // Now that the Timeline is in Stopping state, request all the related tasks to shut down. timeline.shutdown(super::ShutdownMode::Hard).await; // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress - // TODO mark timeline as offloaded in S3 - let conf = &tenant.conf; delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; @@ -36,10 +50,31 @@ pub(crate) async fn offload_timeline( let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); offloaded_timelines.insert( timeline.timeline_id, - Arc::new(OffloadedTimeline::from_timeline(&timeline)), + Arc::new( + OffloadedTimeline::from_timeline(&timeline) + .expect("we checked above that timeline was ready"), + ), ); } + // Last step: mark timeline as offloaded in S3 + // TODO: maybe move this step above, right above deletion of the local timeline directory, + // then there is no potential race condition where we partially offload a timeline, and + // at the next restart attach it again. + // For that to happen, we'd need to make the manifest reflect our *intended* state, + // not our actual state of offloaded timelines. 
+ let manifest = tenant.tenant_manifest(); + // TODO: generation support + let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; + remote_timeline_client::upload_tenant_manifest( + &tenant.remote_storage, + &tenant.tenant_shard_id, + generation, + &manifest, + &tenant.cancel, + ) + .await?; + Ok(()) } diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 85e1077fd5..cb8724dd1c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -4,8 +4,11 @@ import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId from fixtures.neon_fixtures import ( NeonEnvBuilder, + last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty +from fixtures.remote_storage import s3_storage from fixtures.utils import wait_until @@ -168,7 +171,7 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b state=TimelineArchivalState.ARCHIVED, ) - def timeline_offloaded(timeline_id: TimelineId) -> bool: + def timeline_offloaded_logged(timeline_id: TimelineId) -> bool: return ( env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") is not None @@ -186,12 +189,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b def parent_offloaded(): if manual_offload: ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=parent_timeline_id) - assert timeline_offloaded(parent_timeline_id) + assert timeline_offloaded_logged(parent_timeline_id) def leaf_offloaded(): if manual_offload: ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) - assert timeline_offloaded(leaf_timeline_id) + assert timeline_offloaded_logged(leaf_timeline_id) wait_until(30, 1, leaf_offloaded) wait_until(30, 1, parent_offloaded) @@ -218,4 +221,118 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") assert sum == sum_again - assert not timeline_offloaded(initial_timeline_id) + assert not timeline_offloaded_logged(initial_timeline_id) + + +def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): + """ + Test for persistence of timeline offload state + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + } + ) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,2048)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_empty( + 
neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert leaf_detail["is_archived"] is True + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def child_offloaded(): + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(30, 1, child_offloaded) + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + # Test persistence, is the timeline still offloaded? + env.pageserver.stop() + env.pageserver.start() + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + child_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert child_detail["is_archived"] is False + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + assert sum == sum_again + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", + ) + + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.tenant_delete(tenant_id) + + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) From 64949a37a91124957f30177d10a57a061d8fea02 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 22 Oct 2024 18:06:21 -0400 Subject: [PATCH 067/239] fix(pageserver): make delta split layer writer finish atomic (#9048) similar to https://github.com/neondatabase/neon/pull/8841, we make the delta layer writer atomic when finishing the layers. ## Summary of changes * `put_value` not taking discard fn anymore * `finish` decides what layers to keep --------- Signed-off-by: Alex Chi Z --- .../src/tenant/storage_layer/split_writer.rs | 162 +++++++----------- pageserver/src/tenant/timeline/compaction.rs | 34 +--- 2 files changed, 63 insertions(+), 133 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/split_writer.rs index 5bd9a47e2b..45ac0c6668 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/split_writer.rs @@ -170,7 +170,7 @@ impl SplitImageLayerWriter { } Err(e) => { // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong, - // so we don't need to remove it by ourselves. + // so we don't need to remove the layer we just failed to create by ourselves. 
clean_up_layers(generated_layers); return Err(e); } @@ -206,7 +206,7 @@ impl SplitImageLayerWriter { pub struct SplitDeltaLayerWriter { inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, - generated_layers: Vec, + generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, @@ -225,7 +225,7 @@ impl SplitDeltaLayerWriter { Ok(Self { target_layer_size, inner: None, - generated_layers: Vec::new(), + generated_layer_writers: Vec::new(), conf, timeline_id, tenant_shard_id, @@ -234,20 +234,13 @@ impl SplitDeltaLayerWriter { }) } - /// Put value into the layer writer. In the case the writer decides to produce a layer, and the discard fn returns true, no layer will be written in the end. - pub async fn put_value_with_discard_fn( + pub async fn put_value( &mut self, key: Key, lsn: Lsn, val: Value, - tline: &Arc, ctx: &RequestContext, - discard: D, - ) -> anyhow::Result<()> - where - D: FnOnce(&PersistentLayerKey) -> F, - F: Future, - { + ) -> anyhow::Result<()> { // The current estimation is key size plus LSN size plus value size estimation. This is not an accurate // number, and therefore the final layer size could be a little bit larger or smaller than the target. // @@ -291,24 +284,8 @@ impl SplitDeltaLayerWriter { lsn_range: self.lsn_range.clone(), is_delta: true, }; - if discard(&layer_key).await { - drop(prev_delta_writer); - self.generated_layers - .push(SplitWriterResult::Discarded(layer_key)); - } else { - // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary - // files for `finish_creating`. - let (desc, path) = prev_delta_writer.finish(key, ctx).await?; - let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) { - Ok(layer) => layer, - Err(e) => { - tokio::fs::remove_file(&path).await.ok(); - return Err(e); - } - }; - self.generated_layers - .push(SplitWriterResult::Produced(delta_layer)); - } + self.generated_layer_writers + .push((prev_delta_writer, layer_key)); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. anyhow::bail!( @@ -323,60 +300,68 @@ impl SplitDeltaLayerWriter { inner.put_value(key, lsn, val, ctx).await } - pub async fn put_value( - &mut self, - key: Key, - lsn: Lsn, - val: Value, - tline: &Arc, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.put_value_with_discard_fn(key, lsn, val, tline, ctx, |_| async { false }) - .await - } - pub(crate) async fn finish_with_discard_fn( self, tline: &Arc, ctx: &RequestContext, - discard: D, + discard_fn: D, ) -> anyhow::Result> where - D: FnOnce(&PersistentLayerKey) -> F, + D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { - mut generated_layers, + mut generated_layer_writers, inner, .. 
} = self; - let Some((start_key, inner)) = inner else { - return Ok(generated_layers); - }; - if inner.num_keys() == 0 { - return Ok(generated_layers); + if let Some((start_key, writer)) = inner { + if writer.num_keys() != 0 { + let end_key = self.last_key_written.next(); + let layer_key = PersistentLayerKey { + key_range: start_key..end_key, + lsn_range: self.lsn_range.clone(), + is_delta: true, + }; + generated_layer_writers.push((writer, layer_key)); + } } - let end_key = self.last_key_written.next(); - let layer_key = PersistentLayerKey { - key_range: start_key..end_key, - lsn_range: self.lsn_range.clone(), - is_delta: true, - }; - if discard(&layer_key).await { - generated_layers.push(SplitWriterResult::Discarded(layer_key)); - } else { - // `finish` will remove the file if anything goes wrong, while we need to handle deleting temporary - // files for `finish_creating`. - let (desc, path) = inner.finish(end_key, ctx).await?; - let delta_layer = match Layer::finish_creating(self.conf, tline, desc, &path) { - Ok(layer) => layer, - Err(e) => { - tokio::fs::remove_file(&path).await.ok(); - return Err(e); + let clean_up_layers = |generated_layers: Vec| { + for produced_layer in generated_layers { + if let SplitWriterResult::Produced(delta_layer) = produced_layer { + let layer: Layer = delta_layer.into(); + layer.delete_on_drop(); } - }; - generated_layers.push(SplitWriterResult::Produced(delta_layer)); + } + }; + // BEGIN: catch every error and do the recovery in the below section + let mut generated_layers = Vec::new(); + for (inner, layer_key) in generated_layer_writers { + if discard_fn(&layer_key).await { + generated_layers.push(SplitWriterResult::Discarded(layer_key)); + } else { + let layer = match inner.finish(layer_key.key_range.end, ctx).await { + Ok((desc, path)) => { + match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + clean_up_layers(generated_layers); + return Err(e); + } + } + } + Err(e) => { + // DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong, + // so we don't need to remove the layer we just failed to create by ourselves. + clean_up_layers(generated_layers); + return Err(e); + } + }; + generated_layers.push(SplitWriterResult::Produced(layer)); + } } + // END: catch every error and do the recovery in the above section Ok(generated_layers) } @@ -389,11 +374,6 @@ impl SplitDeltaLayerWriter { self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } - - /// This function will be deprecated with #8841. 
- pub(crate) fn take(self) -> anyhow::Result<(Vec, Option)> { - Ok((self.generated_layers, self.inner.map(|x| x.1))) - } } #[cfg(test)] @@ -473,13 +453,7 @@ mod tests { assert_eq!(layers.len(), 1); delta_writer - .put_value( - get_key(0), - Lsn(0x18), - Value::Image(get_img(0)), - &tline, - &ctx, - ) + .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx) .await .unwrap(); let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); @@ -551,14 +525,7 @@ mod tests { .await .unwrap(); delta_writer - .put_value_with_discard_fn( - get_key(i), - Lsn(0x20), - Value::Image(get_large_img()), - &tline, - &ctx, - |_| async { discard }, - ) + .put_value(get_key(i), Lsn(0x20), Value::Image(get_large_img()), &ctx) .await .unwrap(); } @@ -664,23 +631,11 @@ mod tests { assert_eq!(layers.len(), 2); delta_writer - .put_value( - get_key(0), - Lsn(0x18), - Value::Image(get_img(0)), - &tline, - &ctx, - ) + .put_value(get_key(0), Lsn(0x18), Value::Image(get_img(0)), &ctx) .await .unwrap(); delta_writer - .put_value( - get_key(1), - Lsn(0x1A), - Value::Image(get_large_img()), - &tline, - &ctx, - ) + .put_value(get_key(1), Lsn(0x1A), Value::Image(get_large_img()), &ctx) .await .unwrap(); let layers = delta_writer.finish(&tline, &ctx).await.unwrap(); @@ -744,7 +699,6 @@ mod tests { get_key(0), Lsn(i as u64 * 16 + 0x10), Value::Image(get_large_img()), - &tline, &ctx, ) .await diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 5cb1460b29..37d907ddcb 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -121,18 +121,12 @@ impl KeyHistoryRetention { async fn pipe_to( self, key: Key, - tline: &Arc, delta_writer: &mut SplitDeltaLayerWriter, mut image_writer: Option<&mut SplitImageLayerWriter>, stat: &mut CompactionStatistics, - dry_run: bool, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut first_batch = true; - let discard = |key: &PersistentLayerKey| { - let key = key.clone(); - async move { Self::discard_key(&key, tline, dry_run).await } - }; for (cutoff_lsn, KeyLogAtLsn(logs)) in self.below_horizon { if first_batch { if logs.len() == 1 && logs[0].1.is_image() { @@ -144,40 +138,27 @@ impl KeyHistoryRetention { image_writer.put_image(key, img.clone(), ctx).await?; } else { delta_writer - .put_value_with_discard_fn( - key, - cutoff_lsn, - Value::Image(img.clone()), - tline, - ctx, - discard, - ) + .put_value(key, cutoff_lsn, Value::Image(img.clone()), ctx) .await?; } } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer - .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) - .await?; + delta_writer.put_value(key, lsn, val, ctx).await?; } } first_batch = false; } else { for (lsn, val) in logs { stat.produce_key(&val); - delta_writer - .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) - .await?; + delta_writer.put_value(key, lsn, val, ctx).await?; } } } let KeyLogAtLsn(above_horizon_logs) = self.above_horizon; for (lsn, val) in above_horizon_logs { stat.produce_key(&val); - delta_writer - .put_value_with_discard_fn(key, lsn, val, tline, ctx, discard) - .await?; + delta_writer.put_value(key, lsn, val, ctx).await?; } Ok(()) } @@ -1988,11 +1969,9 @@ impl Timeline { retention .pipe_to( *last_key, - self, &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, - dry_run, ctx, ) .await?; @@ -2019,11 +1998,9 @@ impl Timeline { retention .pipe_to( last_key, - self, &mut delta_layer_writer, image_layer_writer.as_mut(), &mut stat, - dry_run, 
ctx, ) .await?; @@ -2051,8 +2028,7 @@ impl Timeline { .finish_with_discard_fn(self, ctx, discard) .await? } else { - let (layers, _) = delta_layer_writer.take()?; - assert!(layers.is_empty(), "delta layers produced in dry run mode?"); + drop(delta_layer_writer); Vec::new() }; From 3a3bd34a28e0137513e7e31a6b808cf9566a14c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 23 Oct 2024 00:34:24 +0200 Subject: [PATCH 068/239] Rename IndexPart::{from_s3_bytes,to_s3_bytes} (#9481) We support multiple storage backends now, so remove the `_s3_` from the name. Analogous to the names adopted for tenant manifests added in #9444. --- pageserver/ctl/src/index_part.rs | 2 +- .../tenant/remote_timeline_client/index.rs | 24 +++++++++---------- .../tenant/remote_timeline_client/upload.rs | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/pageserver/ctl/src/index_part.rs b/pageserver/ctl/src/index_part.rs index 20018846f8..6cce2844c7 100644 --- a/pageserver/ctl/src/index_part.rs +++ b/pageserver/ctl/src/index_part.rs @@ -11,7 +11,7 @@ pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> { match cmd { IndexPartCmd::Dump { path } => { let bytes = tokio::fs::read(path).await.context("read file")?; - let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?; + let des: IndexPart = IndexPart::from_json_bytes(&bytes).context("deserialize")?; let output = serde_json::to_string_pretty(&des).context("serialize output")?; println!("{output}"); Ok(()) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index 3a74a4ed11..d8a881a2c4 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -121,11 +121,11 @@ impl IndexPart { self.disk_consistent_lsn } - pub fn from_s3_bytes(bytes: &[u8]) -> Result { + pub fn from_json_bytes(bytes: &[u8]) -> Result { serde_json::from_slice::(bytes) } - pub fn to_s3_bytes(&self) -> serde_json::Result> { + pub fn to_json_bytes(&self) -> serde_json::Result> { serde_json::to_vec(self) } @@ -383,7 +383,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -427,7 +427,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -472,7 +472,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -520,7 +520,7 @@ mod tests { last_aux_file_policy: None, }; - let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap(); + let empty_layers_parsed = IndexPart::from_json_bytes(empty_layers_json.as_bytes()).unwrap(); assert_eq!(empty_layers_parsed, expected); } @@ -563,7 +563,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -609,7 +609,7 @@ mod tests { last_aux_file_policy: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = 
IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -660,7 +660,7 @@ mod tests { last_aux_file_policy: Some(AuxFilePolicy::V2), }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -716,7 +716,7 @@ mod tests { last_aux_file_policy: Default::default(), }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -773,7 +773,7 @@ mod tests { last_aux_file_policy: Default::default(), }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } @@ -835,7 +835,7 @@ mod tests { archived_at: None, }; - let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap(); + let part = IndexPart::from_json_bytes(example.as_bytes()).unwrap(); assert_eq!(part, expected); } diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs index 5a2b7bd08f..0cd5d05aa2 100644 --- a/pageserver/src/tenant/remote_timeline_client/upload.rs +++ b/pageserver/src/tenant/remote_timeline_client/upload.rs @@ -41,7 +41,7 @@ pub(crate) async fn upload_index_part<'a>( pausable_failpoint!("before-upload-index-pausable"); // FIXME: this error comes too late - let serialized = index_part.to_s3_bytes()?; + let serialized = index_part.to_json_bytes()?; let serialized = Bytes::from(serialized); let index_part_size = serialized.len(); From 92d5e0e87a8d397f86cb7c8dc0fddb318b2da46b Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 23 Oct 2024 08:21:28 +0200 Subject: [PATCH 069/239] proxy: clear lib.rs of code items (#9479) We keep lib.rs for crate configs, lint configs and re-exports for the binaries. 
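For orientation, a minimal sketch (not part of this patch) of how a binary uses the relocated items; the module paths `proxy::signals`, `proxy::error::flatten_err` and `proxy::types` are taken from the hunks below, while the wrapper function itself is invented for illustration. String newtypes such as `EndpointId` and `RoleName` now live under `proxy::types`.

```rust
use proxy::{error, signals};
use tokio_util::sync::CancellationToken;

// Hypothetical helper, only to show the relocated call sites in use.
async fn run_until_signalled() -> anyhow::Result<()> {
    let shutdown = CancellationToken::new();
    // SIGHUP reload plus SIGINT/SIGTERM shutdown handling now lives in proxy::signals.
    let signals_task = tokio::spawn(signals::handle(shutdown.clone(), || {}));
    // Join-error flattening moved from the crate root to proxy::error; `handle`
    // never returns Ok, so this either propagates an error or is unreachable.
    match error::flatten_err(signals_task.await)? {}
}
```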
--- proxy/src/auth/backend/jwt.rs | 4 +- proxy/src/auth/backend/local.rs | 3 +- proxy/src/auth/backend/mod.rs | 5 +- proxy/src/auth/credentials.rs | 2 +- proxy/src/auth/password_hack.rs | 2 +- proxy/src/bin/local_proxy.rs | 6 +- proxy/src/bin/pg_sni_router.rs | 6 +- proxy/src/bin/proxy.rs | 6 +- proxy/src/cache/endpoints.rs | 2 +- proxy/src/cache/project_info.rs | 4 +- proxy/src/compute.rs | 2 +- proxy/src/compute_ctl/mod.rs | 3 +- proxy/src/config.rs | 2 +- proxy/src/context/mod.rs | 2 +- proxy/src/control_plane/provider/mock.rs | 3 +- proxy/src/control_plane/provider/mod.rs | 3 +- proxy/src/control_plane/provider/neon.rs | 3 +- proxy/src/error.rs | 7 + proxy/src/intern.rs | 2 +- proxy/src/lib.rs | 168 +---------------------- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/mod.rs | 3 +- proxy/src/proxy/tests/mod.rs | 3 +- proxy/src/rate_limiter/limiter.rs | 2 +- proxy/src/redis/notifications.rs | 2 +- proxy/src/scram/mod.rs | 2 +- proxy/src/scram/threadpool.rs | 2 +- proxy/src/serverless/backend.rs | 3 +- proxy/src/serverless/conn_pool.rs | 2 +- proxy/src/serverless/conn_pool_lib.rs | 2 +- proxy/src/serverless/http_conn_pool.rs | 2 +- proxy/src/serverless/local_conn_pool.rs | 2 +- proxy/src/serverless/sql_over_http.rs | 2 +- proxy/src/signals.rs | 39 ++++++ proxy/src/types.rs | 122 ++++++++++++++++ proxy/src/usage_metrics.rs | 3 +- 36 files changed, 221 insertions(+), 207 deletions(-) create mode 100644 proxy/src/signals.rs create mode 100644 proxy/src/types.rs diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 3f53ee24c3..2185677159 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -16,7 +16,7 @@ use crate::context::RequestMonitoring; use crate::control_plane::errors::GetEndpointJwksError; use crate::http::parse_json_body_with_limit; use crate::intern::RoleNameInt; -use crate::{EndpointId, RoleName}; +use crate::types::{EndpointId, RoleName}; // TODO(conrad): make these configurable. 
const CLOCK_SKEW_LEEWAY: Duration = Duration::from_secs(30); @@ -669,7 +669,7 @@ mod tests { use tokio::net::TcpListener; use super::*; - use crate::RoleName; + use crate::types::RoleName; fn new_ec_jwk(kid: String) -> (p256::SecretKey, jose_jwk::Jwk) { let sk = p256::SecretKey::random(&mut OsRng); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 1e029ff609..f9cb085daf 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -10,9 +10,10 @@ use crate::compute_ctl::ComputeCtlApi; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, MetricsAuxInfo}; use crate::control_plane::NodeInfo; +use crate::http; use crate::intern::{BranchIdTag, EndpointIdTag, InternId, ProjectIdTag}; +use crate::types::EndpointId; use crate::url::ApiUrl; -use crate::{http, EndpointId}; pub struct LocalBackend { pub(crate) initialize: Semaphore, diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index a4db130b61..17334b9cbb 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -32,7 +32,8 @@ use crate::proxy::connect_compute::ComputeConnectBackend; use crate::proxy::NeonOptions; use crate::rate_limiter::{BucketRateLimiter, EndpointRateLimiter, RateBucketInfo}; use crate::stream::Stream; -use crate::{scram, stream, EndpointCacheKey, EndpointId, RoleName}; +use crate::types::{EndpointCacheKey, EndpointId, RoleName}; +use crate::{scram, stream}; /// Alternative to [`std::borrow::Cow`] but doesn't need `T: ToOwned` as we don't need that functionality pub enum MaybeOwned<'a, T> { @@ -551,7 +552,7 @@ mod tests { async fn get_endpoint_jwks( &self, _ctx: &RequestMonitoring, - _endpoint: crate::EndpointId, + _endpoint: crate::types::EndpointId, ) -> Result, control_plane::errors::GetEndpointJwksError> { unimplemented!() diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs index 465e427f7c..ddecae6af5 100644 --- a/proxy/src/auth/credentials.rs +++ b/proxy/src/auth/credentials.rs @@ -15,7 +15,7 @@ use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, SniKind}; use crate::proxy::NeonOptions; use crate::serverless::SERVERLESS_DRIVER_SNI; -use crate::{EndpointId, RoleName}; +use crate::types::{EndpointId, RoleName}; #[derive(Debug, Error, PartialEq, Eq, Clone)] pub(crate) enum ComputeUserInfoParseError { diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs index 8585b8ff48..b934c28a78 100644 --- a/proxy/src/auth/password_hack.rs +++ b/proxy/src/auth/password_hack.rs @@ -5,7 +5,7 @@ use bstr::ByteSlice; -use crate::EndpointId; +use crate::types::EndpointId; pub(crate) struct PasswordHackPayload { pub(crate) endpoint: EndpointId, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index a16c288e5d..df3628465f 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -25,8 +25,8 @@ use proxy::rate_limiter::{ use proxy::scram::threadpool::ThreadPool; use proxy::serverless::cancel_set::CancelSet; use proxy::serverless::{self, GlobalConnPoolOptions}; +use proxy::types::RoleName; use proxy::url::ApiUrl; -use proxy::RoleName; project_git_version!(GIT_VERSION); project_build_tag!(BUILD_TAG); @@ -177,7 +177,7 @@ async fn main() -> anyhow::Result<()> { let mut maintenance_tasks = JoinSet::new(); let refresh_config_notify = Arc::new(Notify::new()); - maintenance_tasks.spawn(proxy::handle_signals(shutdown.clone(), { + 
maintenance_tasks.spawn(proxy::signals::handle(shutdown.clone(), { let refresh_config_notify = Arc::clone(&refresh_config_notify); move || { refresh_config_notify.notify_one(); @@ -216,7 +216,7 @@ async fn main() -> anyhow::Result<()> { match futures::future::select(pin!(maintenance_tasks.join_next()), pin!(task)).await { // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => match proxy::flatten_err(res)? {}, + Either::Left((Some(res), _)) => match proxy::error::flatten_err(res)? {}, // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) Either::Left((None, _)) => bail!("no maintenance tasks running. invalid state"), // exit immediately on client task error diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 13b7fdd40a..025053d3cb 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -133,14 +133,14 @@ async fn main() -> anyhow::Result<()> { proxy_listener, cancellation_token.clone(), )); - let signals_task = tokio::spawn(proxy::handle_signals(cancellation_token, || {})); + let signals_task = tokio::spawn(proxy::signals::handle(cancellation_token, || {})); // the signal task cant ever succeed. // the main task can error, or can succeed on cancellation. // we want to immediately exit on either of these cases let signal = match futures::future::select(signals_task, main).await { - Either::Left((res, _)) => proxy::flatten_err(res)?, - Either::Right((res, _)) => return proxy::flatten_err(res), + Either::Left((res, _)) => proxy::error::flatten_err(res)?, + Either::Right((res, _)) => return proxy::error::flatten_err(res), }; // maintenance tasks return `Infallible` success values, this is an impossible value diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 96a71e69c6..6e190029aa 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -495,7 +495,7 @@ async fn main() -> anyhow::Result<()> { // maintenance tasks. these never return unless there's an error let mut maintenance_tasks = JoinSet::new(); - maintenance_tasks.spawn(proxy::handle_signals(cancellation_token.clone(), || {})); + maintenance_tasks.spawn(proxy::signals::handle(cancellation_token.clone(), || {})); maintenance_tasks.spawn(http::health_server::task_main( http_listener, AppMetrics { @@ -561,11 +561,11 @@ async fn main() -> anyhow::Result<()> { .await { // exit immediately on maintenance task completion - Either::Left((Some(res), _)) => break proxy::flatten_err(res)?, + Either::Left((Some(res), _)) => break proxy::error::flatten_err(res)?, // exit with error immediately if all maintenance tasks have ceased (should be caught by branch above) Either::Left((None, _)) => bail!("no maintenance tasks running. 
invalid state"), // exit immediately on client task error - Either::Right((Some(res), _)) => proxy::flatten_err(res)?, + Either::Right((Some(res), _)) => proxy::error::flatten_err(res)?, // exit if all our client tasks have shutdown gracefully Either::Right((None, _)) => return Ok(()), } diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 82f3247fa7..12c33169bf 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -17,7 +17,7 @@ use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt}; use crate::metrics::{Metrics, RedisErrors, RedisEventsCount}; use crate::rate_limiter::GlobalRateLimiter; use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; -use crate::EndpointId; +use crate::types::EndpointId; #[derive(Deserialize, Debug, Clone)] pub(crate) struct ControlPlaneEventKey { diff --git a/proxy/src/cache/project_info.rs b/proxy/src/cache/project_info.rs index 31d1dc96e7..84430dc812 100644 --- a/proxy/src/cache/project_info.rs +++ b/proxy/src/cache/project_info.rs @@ -17,7 +17,7 @@ use crate::auth::IpPattern; use crate::config::ProjectInfoCacheOptions; use crate::control_plane::AuthSecret; use crate::intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}; -use crate::{EndpointId, RoleName}; +use crate::types::{EndpointId, RoleName}; #[async_trait] pub(crate) trait ProjectInfoCache { @@ -368,7 +368,7 @@ impl Cache for ProjectInfoCacheImpl { mod tests { use super::*; use crate::scram::ServerSecret; - use crate::ProjectId; + use crate::types::ProjectId; #[tokio::test] async fn test_project_info_cache_settings() { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index a7c2cab4a1..b97942ee5d 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -25,7 +25,7 @@ use crate::control_plane::provider::ApiLockError; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::proxy::neon_option; -use crate::Host; +use crate::types::Host; pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; diff --git a/proxy/src/compute_ctl/mod.rs b/proxy/src/compute_ctl/mod.rs index 2b57897223..60fdf107d4 100644 --- a/proxy/src/compute_ctl/mod.rs +++ b/proxy/src/compute_ctl/mod.rs @@ -4,8 +4,9 @@ use serde::de::DeserializeOwned; use serde::{Deserialize, Serialize}; use thiserror::Error; +use crate::http; +use crate::types::{DbName, RoleName}; use crate::url::ApiUrl; -use crate::{http, DbName, RoleName}; pub struct ComputeCtlApi { pub(crate) api: http::Endpoint, diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 3baa7ec751..5183f22fa3 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -20,7 +20,7 @@ use crate::rate_limiter::{RateBucketInfo, RateLimitAlgorithm, RateLimiterConfig} use crate::scram::threadpool::ThreadPool; use crate::serverless::cancel_set::CancelSet; use crate::serverless::GlobalConnPoolOptions; -use crate::Host; +use crate::types::Host; pub struct ProxyConfig { pub tls_config: Option, diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index e2d2c1b766..ca3b808a1b 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,7 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; -use crate::{DbName, EndpointId, RoleName}; +use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; diff --git a/proxy/src/control_plane/provider/mock.rs 
b/proxy/src/control_plane/provider/mock.rs index fb061376e7..75a242d8d3 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/provider/mock.rs @@ -21,8 +21,9 @@ use crate::control_plane::messages::MetricsAuxInfo; use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; use crate::error::io_error; use crate::intern::RoleNameInt; +use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; use crate::url::ApiUrl; -use crate::{compute, scram, BranchId, EndpointId, ProjectId, RoleName}; +use crate::{compute, scram}; #[derive(Debug, Error)] enum MockApiError { diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs index 88399dffa8..49e57b6b7e 100644 --- a/proxy/src/control_plane/provider/mod.rs +++ b/proxy/src/control_plane/provider/mod.rs @@ -23,7 +23,8 @@ use crate::error::ReportableError; use crate::intern::ProjectIdInt; use crate::metrics::ApiLockMetrics; use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; -use crate::{compute, scram, EndpointCacheKey, EndpointId}; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, scram}; pub(crate) mod errors { use thiserror::Error; diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/provider/neon.rs index 5d0692c7ca..8ea91d7875 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/provider/neon.rs @@ -24,7 +24,8 @@ use crate::control_plane::errors::GetEndpointJwksError; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; -use crate::{compute, http, scram, EndpointCacheKey, EndpointId}; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); diff --git a/proxy/src/error.rs b/proxy/src/error.rs index e71ed0c048..7b693a7418 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -1,7 +1,9 @@ use std::error::Error as StdError; use std::{fmt, io}; +use anyhow::Context; use measured::FixedCardinalityLabel; +use tokio::task::JoinError; /// Upcast (almost) any error into an opaque [`io::Error`]. pub(crate) fn io_error(e: impl Into>) -> io::Error { @@ -97,3 +99,8 @@ impl ReportableError for tokio_postgres::error::Error { } } } + +/// Flattens `Result>` into `Result`. +pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { + r.context("join error").and_then(|x| x) +} diff --git a/proxy/src/intern.rs b/proxy/src/intern.rs index 49aab917e4..f56d92a6b3 100644 --- a/proxy/src/intern.rs +++ b/proxy/src/intern.rs @@ -7,7 +7,7 @@ use std::sync::OnceLock; use lasso::{Capacity, MemoryLimits, Spur, ThreadedRodeo}; use rustc_hash::FxHasher; -use crate::{BranchId, EndpointId, ProjectId, RoleName}; +use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; pub trait InternId: Sized + 'static { fn get_interner() -> &'static StringInterner; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ea17a88067..f95d645c23 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -78,14 +78,6 @@ // List of temporarily allowed lints to unblock beta/nightly. 
#![allow(unknown_lints)] -use std::convert::Infallible; - -use anyhow::{bail, Context}; -use intern::{EndpointIdInt, EndpointIdTag, InternId}; -use tokio::task::JoinError; -use tokio_util::sync::CancellationToken; -use tracing::warn; - pub mod auth; pub mod cache; pub mod cancellation; @@ -109,165 +101,9 @@ pub mod redis; pub mod sasl; pub mod scram; pub mod serverless; +pub mod signals; pub mod stream; +pub mod types; pub mod url; pub mod usage_metrics; pub mod waiters; - -/// Handle unix signals appropriately. -pub async fn handle_signals( - token: CancellationToken, - mut refresh_config: F, -) -> anyhow::Result -where - F: FnMut(), -{ - use tokio::signal::unix::{signal, SignalKind}; - - let mut hangup = signal(SignalKind::hangup())?; - let mut interrupt = signal(SignalKind::interrupt())?; - let mut terminate = signal(SignalKind::terminate())?; - - loop { - tokio::select! { - // Hangup is commonly used for config reload. - _ = hangup.recv() => { - warn!("received SIGHUP"); - refresh_config(); - } - // Shut down the whole application. - _ = interrupt.recv() => { - warn!("received SIGINT, exiting immediately"); - bail!("interrupted"); - } - _ = terminate.recv() => { - warn!("received SIGTERM, shutting down once all existing connections have closed"); - token.cancel(); - } - } - } -} - -/// Flattens `Result>` into `Result`. -pub fn flatten_err(r: Result, JoinError>) -> anyhow::Result { - r.context("join error").and_then(|x| x) -} - -macro_rules! smol_str_wrapper { - ($name:ident) => { - #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] - pub struct $name(smol_str::SmolStr); - - impl $name { - #[allow(unused)] - pub(crate) fn as_str(&self) -> &str { - self.0.as_str() - } - } - - impl std::fmt::Display for $name { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - self.0.fmt(f) - } - } - - impl std::cmp::PartialEq for $name - where - smol_str::SmolStr: std::cmp::PartialEq, - { - fn eq(&self, other: &T) -> bool { - self.0.eq(other) - } - } - - impl From for $name - where - smol_str::SmolStr: From, - { - fn from(x: T) -> Self { - Self(x.into()) - } - } - - impl AsRef for $name { - fn as_ref(&self) -> &str { - self.0.as_ref() - } - } - - impl std::ops::Deref for $name { - type Target = str; - fn deref(&self) -> &str { - &*self.0 - } - } - - impl<'de> serde::de::Deserialize<'de> for $name { - fn deserialize>(d: D) -> Result { - >::deserialize(d).map(Self) - } - } - - impl serde::Serialize for $name { - fn serialize(&self, s: S) -> Result { - self.0.serialize(s) - } - } - }; -} - -const POOLER_SUFFIX: &str = "-pooler"; - -impl EndpointId { - fn normalize(&self) -> Self { - if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { - stripped.into() - } else { - self.clone() - } - } - - fn normalize_intern(&self) -> EndpointIdInt { - if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { - EndpointIdTag::get_interner().get_or_intern(stripped) - } else { - self.into() - } - } -} - -// 90% of role name strings are 20 characters or less. -smol_str_wrapper!(RoleName); -// 50% of endpoint strings are 23 characters or less. -smol_str_wrapper!(EndpointId); -// 50% of branch strings are 23 characters or less. -smol_str_wrapper!(BranchId); -// 90% of project strings are 23 characters or less. -smol_str_wrapper!(ProjectId); - -// will usually equal endpoint ID -smol_str_wrapper!(EndpointCacheKey); - -smol_str_wrapper!(DbName); - -// postgres hostname, will likely be a port:ip addr -smol_str_wrapper!(Host); - -// Endpoints are a bit tricky. 
Rare they might be branches or projects. -impl EndpointId { - pub(crate) fn is_endpoint(&self) -> bool { - self.0.starts_with("ep-") - } - pub(crate) fn is_branch(&self) -> bool { - self.0.starts_with("br-") - } - // pub(crate) fn is_project(&self) -> bool { - // !self.is_endpoint() && !self.is_branch() - // } - pub(crate) fn as_branch(&self) -> BranchId { - BranchId(self.0.clone()) - } - pub(crate) fn as_project(&self) -> ProjectId { - ProjectId(self.0.clone()) - } -} diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 8e9663626a..659b7afa68 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -17,7 +17,7 @@ use crate::metrics::{ }; use crate::proxy::retry::{retry_after, should_retry, CouldRetry}; use crate::proxy::wake_compute::wake_compute; -use crate::Host; +use crate::types::Host; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index f646862caa..2970d93393 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -32,7 +32,8 @@ use crate::protocol2::read_proxy_protocol; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; -use crate::{auth, compute, EndpointCacheKey}; +use crate::types::EndpointCacheKey; +use crate::{auth, compute}; const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 3f54b0661b..fe62fee204 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -28,7 +28,8 @@ use crate::control_plane::provider::{ }; use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; -use crate::{sasl, scram, BranchId, EndpointId, ProjectId}; +use crate::types::{BranchId, EndpointId, ProjectId}; +use crate::{sasl, scram}; /// Generate a set of TLS certificates: CA + server. 
fn generate_certs( diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs index 5de64c2254..4259fd04f4 100644 --- a/proxy/src/rate_limiter/limiter.rs +++ b/proxy/src/rate_limiter/limiter.rs @@ -250,7 +250,7 @@ mod tests { use super::{BucketRateLimiter, WakeComputeRateLimiter}; use crate::intern::EndpointIdInt; use crate::rate_limiter::RateBucketInfo; - use crate::EndpointId; + use crate::types::EndpointId; #[test] fn rate_bucket_rpi() { diff --git a/proxy/src/redis/notifications.rs b/proxy/src/redis/notifications.rs index e56c5a3414..62e7b1b565 100644 --- a/proxy/src/redis/notifications.rs +++ b/proxy/src/redis/notifications.rs @@ -271,7 +271,7 @@ mod tests { use serde_json::json; use super::*; - use crate::{ProjectId, RoleName}; + use crate::types::{ProjectId, RoleName}; #[test] fn parse_allowed_ips() -> anyhow::Result<()> { diff --git a/proxy/src/scram/mod.rs b/proxy/src/scram/mod.rs index 97644b6282..718445f61d 100644 --- a/proxy/src/scram/mod.rs +++ b/proxy/src/scram/mod.rs @@ -62,7 +62,7 @@ mod tests { use super::{Exchange, ServerSecret}; use crate::intern::EndpointIdInt; use crate::sasl::{Mechanism, Step}; - use crate::EndpointId; + use crate::types::EndpointId; #[test] fn snapshot() { diff --git a/proxy/src/scram/threadpool.rs b/proxy/src/scram/threadpool.rs index cc1b69fcf9..ebc6dd2a3c 100644 --- a/proxy/src/scram/threadpool.rs +++ b/proxy/src/scram/threadpool.rs @@ -189,7 +189,7 @@ impl Drop for JobHandle { #[cfg(test)] mod tests { use super::*; - use crate::EndpointId; + use crate::types::EndpointId; #[tokio::test] async fn hash_is_correct() { diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 5d59b4d252..07e0e30148 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -18,6 +18,7 @@ use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCH use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; +use crate::compute; use crate::compute_ctl::{ ComputeCtlError, ExtensionInstallRequest, Privilege, SetRoleGrantsRequest, }; @@ -32,7 +33,7 @@ use crate::intern::EndpointIdInt; use crate::proxy::connect_compute::ConnectMechanism; use crate::proxy::retry::{CouldRetry, ShouldRetryWakeCompute}; use crate::rate_limiter::EndpointRateLimiter; -use crate::{compute, EndpointId, Host}; +use crate::types::{EndpointId, Host}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>, diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 8401e3a1c9..7fa3357b5b 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -211,7 +211,7 @@ mod tests { use super::*; use crate::proxy::NeonOptions; use crate::serverless::cancel_set::CancelSet; - use crate::{BranchId, EndpointId, ProjectId}; + use crate::types::{BranchId, EndpointId, ProjectId}; struct MockClient(Arc); impl MockClient { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 844730194d..8830cddf0c 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -16,8 +16,8 @@ use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; use crate::control_plane::messages::ColdStartInfo; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::types::{DbName, EndpointCacheKey, RoleName}; use crate::usage_metrics::{Ids, 
MetricCounter, USAGE_METRICS}; -use crate::{DbName, EndpointCacheKey, RoleName}; #[derive(Debug, Clone)] pub(crate) struct ConnInfo { diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 363e397976..934a50c14f 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -14,8 +14,8 @@ use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; +use crate::types::EndpointCacheKey; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::EndpointCacheKey; pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index e1ad46c751..064e7db7b3 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -35,8 +35,8 @@ use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; +use crate::types::{DbName, RoleName}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; -use crate::{DbName, RoleName}; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; pub(crate) const EXT_VERSION: &str = "0.1.2"; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 6fbb044669..8e2d4c126a 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -38,8 +38,8 @@ use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::metrics::{HttpDirection, Metrics}; use crate::proxy::{run_until_cancelled, NeonOptions}; use crate::serverless::backend::HttpConnError; +use crate::types::{DbName, RoleName}; use crate::usage_metrics::{MetricCounter, MetricCounterRecorder}; -use crate::{DbName, RoleName}; #[derive(serde::Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/proxy/src/signals.rs b/proxy/src/signals.rs new file mode 100644 index 0000000000..514a83d5eb --- /dev/null +++ b/proxy/src/signals.rs @@ -0,0 +1,39 @@ +use std::convert::Infallible; + +use anyhow::bail; +use tokio_util::sync::CancellationToken; +use tracing::warn; + +/// Handle unix signals appropriately. +pub async fn handle( + token: CancellationToken, + mut refresh_config: F, +) -> anyhow::Result +where + F: FnMut(), +{ + use tokio::signal::unix::{signal, SignalKind}; + + let mut hangup = signal(SignalKind::hangup())?; + let mut interrupt = signal(SignalKind::interrupt())?; + let mut terminate = signal(SignalKind::terminate())?; + + loop { + tokio::select! { + // Hangup is commonly used for config reload. + _ = hangup.recv() => { + warn!("received SIGHUP"); + refresh_config(); + } + // Shut down the whole application. + _ = interrupt.recv() => { + warn!("received SIGINT, exiting immediately"); + bail!("interrupted"); + } + _ = terminate.recv() => { + warn!("received SIGTERM, shutting down once all existing connections have closed"); + token.cancel(); + } + } + } +} diff --git a/proxy/src/types.rs b/proxy/src/types.rs new file mode 100644 index 0000000000..b0408a51d1 --- /dev/null +++ b/proxy/src/types.rs @@ -0,0 +1,122 @@ +use crate::intern::{EndpointIdInt, EndpointIdTag, InternId}; + +macro_rules! 
smol_str_wrapper { + ($name:ident) => { + #[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Default)] + pub struct $name(smol_str::SmolStr); + + impl $name { + #[allow(unused)] + pub(crate) fn as_str(&self) -> &str { + self.0.as_str() + } + } + + impl std::fmt::Display for $name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } + } + + impl std::cmp::PartialEq for $name + where + smol_str::SmolStr: std::cmp::PartialEq, + { + fn eq(&self, other: &T) -> bool { + self.0.eq(other) + } + } + + impl From for $name + where + smol_str::SmolStr: From, + { + fn from(x: T) -> Self { + Self(x.into()) + } + } + + impl AsRef for $name { + fn as_ref(&self) -> &str { + self.0.as_ref() + } + } + + impl std::ops::Deref for $name { + type Target = str; + fn deref(&self) -> &str { + &*self.0 + } + } + + impl<'de> serde::de::Deserialize<'de> for $name { + fn deserialize>(d: D) -> Result { + >::deserialize(d).map(Self) + } + } + + impl serde::Serialize for $name { + fn serialize(&self, s: S) -> Result { + self.0.serialize(s) + } + } + }; +} + +const POOLER_SUFFIX: &str = "-pooler"; + +impl EndpointId { + #[must_use] + pub fn normalize(&self) -> Self { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + stripped.into() + } else { + self.clone() + } + } + + #[must_use] + pub fn normalize_intern(&self) -> EndpointIdInt { + if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { + EndpointIdTag::get_interner().get_or_intern(stripped) + } else { + self.into() + } + } +} + +// 90% of role name strings are 20 characters or less. +smol_str_wrapper!(RoleName); +// 50% of endpoint strings are 23 characters or less. +smol_str_wrapper!(EndpointId); +// 50% of branch strings are 23 characters or less. +smol_str_wrapper!(BranchId); +// 90% of project strings are 23 characters or less. +smol_str_wrapper!(ProjectId); + +// will usually equal endpoint ID +smol_str_wrapper!(EndpointCacheKey); + +smol_str_wrapper!(DbName); + +// postgres hostname, will likely be a port:ip addr +smol_str_wrapper!(Host); + +// Endpoints are a bit tricky. Rare they might be branches or projects. +impl EndpointId { + pub(crate) fn is_endpoint(&self) -> bool { + self.0.starts_with("ep-") + } + pub(crate) fn is_branch(&self) -> bool { + self.0.starts_with("br-") + } + // pub(crate) fn is_project(&self) -> bool { + // !self.is_endpoint() && !self.is_branch() + // } + pub(crate) fn as_branch(&self) -> BranchId { + BranchId(self.0.clone()) + } + pub(crate) fn as_project(&self) -> ProjectId { + ProjectId(self.0.clone()) + } +} diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index f944d5aec3..c5e8588623 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -497,7 +497,8 @@ mod tests { use url::Url; use super::*; - use crate::{http, BranchId, EndpointId}; + use crate::http; + use crate::types::{BranchId, EndpointId}; #[tokio::test] async fn metrics() { From 0595320c87c4bcab9e346cf7904f7e4b00454388 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 23 Oct 2024 09:55:00 -0600 Subject: [PATCH 070/239] Protect call to pg_current_wal_lsn() in retained_wal query We can't call pg_current_wal_lsn() if we are a standby instance (read replica). 
Any attempt to call this function while in recovery results in: ERROR: recovery is in progress Signed-off-by: Tristan Partin --- compute/etc/sql_exporter/retained_wal.sql | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/compute/etc/sql_exporter/retained_wal.sql b/compute/etc/sql_exporter/retained_wal.sql index 6c58359461..3e2aadfc28 100644 --- a/compute/etc/sql_exporter/retained_wal.sql +++ b/compute/etc/sql_exporter/retained_wal.sql @@ -1,5 +1,10 @@ SELECT slot_name, - pg_wal_lsn_diff(pg_current_wal_lsn(), restart_lsn)::FLOAT8 AS retained_wal + pg_wal_lsn_diff( + CASE + WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() + ELSE pg_current_wal_lsn() + END, + restart_lsn)::FLOAT8 AS retained_wal FROM pg_replication_slots WHERE active = false; From e3ff87ce3bfa99662a9b2d299b78be1bfa35b8cd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 23 Oct 2024 17:29:55 +0100 Subject: [PATCH 071/239] tests: avoid using background_process when invoking pg_ctl (#9469) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem Occasionally, we get failures to start the storage controller's db with errors like: ``` aborting due to panic at /__w/neon/neon/control_plane/src/background_process.rs:349:67: claim pid file: lock file Caused by: file is already locked ``` e.g. https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9428/11380574562/index.html#/testresult/1c68d413ea9ecd4a This is happening in a stop,start cycle during a test. Presumably the pidfile from the startup background process is still held at the point we stop, because we let pg_ctl keep running in the background. ## Summary of changes - Refactor pg_ctl invocations into a helper - In the controller's `start` function, use pg_ctl & a wait loop for pg_isready, instead of using background_process --------- Co-authored-by: Arpad Müller --- control_plane/src/storage_controller.rs | 102 +++++++++++++++--------- 1 file changed, 65 insertions(+), 37 deletions(-) diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 43c63e7ef4..b70bd2e1b5 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -20,7 +20,16 @@ use pageserver_client::mgmt_api::ResponseErrorMessageExt; use postgres_backend::AuthType; use reqwest::Method; use serde::{de::DeserializeOwned, Deserialize, Serialize}; -use std::{fs, net::SocketAddr, path::PathBuf, str::FromStr, sync::OnceLock}; +use std::{ + ffi::OsStr, + fs, + net::SocketAddr, + path::PathBuf, + process::ExitStatus, + str::FromStr, + sync::OnceLock, + time::{Duration, Instant}, +}; use tokio::process::Command; use tracing::instrument; use url::Url; @@ -168,16 +177,6 @@ impl StorageController { .expect("non-Unicode path") } - /// PIDFile for the postgres instance used to store storage controller state - fn postgres_pid_file(&self) -> Utf8PathBuf { - Utf8PathBuf::from_path_buf( - self.env - .base_data_dir - .join("storage_controller_postgres.pid"), - ) - .expect("non-Unicode path") - } - /// Find the directory containing postgres subdirectories, such `bin` and `lib` /// /// This usually uses STORAGE_CONTROLLER_POSTGRES_VERSION of postgres, but will fall back @@ -296,6 +295,31 @@ impl StorageController { .map_err(anyhow::Error::new) } + /// Wrapper for the pg_ctl binary, which we spawn as a short-lived subprocess when starting and stopping postgres + async fn pg_ctl(&self, args: I) -> ExitStatus + where + I: IntoIterator, + S: AsRef, + { + let pg_bin_dir = 
self.get_pg_bin_dir().await.unwrap(); + let bin_path = pg_bin_dir.join("pg_ctl"); + + let pg_lib_dir = self.get_pg_lib_dir().await.unwrap(); + let envs = [ + ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), + ]; + + Command::new(bin_path) + .args(args) + .envs(envs) + .spawn() + .expect("Failed to spawn pg_ctl, binary_missing?") + .wait() + .await + .expect("Failed to wait for pg_ctl termination") + } + pub async fn start(&self, start_args: NeonStorageControllerStartArgs) -> anyhow::Result<()> { let instance_dir = self.storage_controller_instance_dir(start_args.instance_id); if let Err(err) = tokio::fs::create_dir(&instance_dir).await { @@ -404,20 +428,34 @@ impl StorageController { db_start_args ); - background_process::start_process( - "storage_controller_db", - &self.env.base_data_dir, - pg_bin_dir.join("pg_ctl").as_std_path(), - db_start_args, - vec![ - ("LD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ("DYLD_LIBRARY_PATH".to_owned(), pg_lib_dir.to_string()), - ], - background_process::InitialPidFile::Create(self.postgres_pid_file()), - &start_args.start_timeout, - || self.pg_isready(&pg_bin_dir, postgres_port), - ) - .await?; + let db_start_status = self.pg_ctl(db_start_args).await; + let start_timeout: Duration = start_args.start_timeout.into(); + let db_start_deadline = Instant::now() + start_timeout; + if !db_start_status.success() { + return Err(anyhow::anyhow!( + "Failed to start postgres {}", + db_start_status.code().unwrap() + )); + } + + loop { + if Instant::now() > db_start_deadline { + return Err(anyhow::anyhow!("Timed out waiting for postgres to start")); + } + + match self.pg_isready(&pg_bin_dir, postgres_port).await { + Ok(true) => { + tracing::info!("storage controller postgres is now ready"); + break; + } + Ok(false) => { + tokio::time::sleep(Duration::from_millis(100)).await; + } + Err(e) => { + tracing::warn!("Failed to check postgres status: {e}") + } + } + } self.setup_database(postgres_port).await?; } @@ -583,15 +621,10 @@ impl StorageController { } let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; println!("Stopping storage controller database..."); let pg_stop_args = ["-D", &pg_data_path.to_string_lossy(), "stop"]; - let stop_status = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_stop_args) - .spawn()? - .wait() - .await?; + let stop_status = self.pg_ctl(pg_stop_args).await; if !stop_status.success() { match self.is_postgres_running().await { Ok(false) => { @@ -612,14 +645,9 @@ impl StorageController { async fn is_postgres_running(&self) -> anyhow::Result { let pg_data_path = self.env.base_data_dir.join("storage_controller_db"); - let pg_bin_dir = self.get_pg_bin_dir().await?; let pg_status_args = ["-D", &pg_data_path.to_string_lossy(), "status"]; - let status_exitcode = Command::new(pg_bin_dir.join("pg_ctl")) - .args(pg_status_args) - .spawn()? - .wait() - .await?; + let status_exitcode = self.pg_ctl(pg_status_args).await; // pg_ctl status returns this exit code if postgres is not running: in this case it is // fine that stop failed. Otherwise it is an error that stop failed. From ac1205c14c3fd655ad6b90b0f4fcd7d3a47938c3 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 23 Oct 2024 19:58:28 +0100 Subject: [PATCH 072/239] pageserver: add metric for number of zeroed pages on rel extend (#9492) ## Problem Filling the gap in with zeroes is annoying for sharded ingest. We are not sure it even happens in reality. 
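For concreteness, a hedged sketch of the situation the new counter measures (the numbers are made up; the real increment is in the walingest.rs hunk below): a shard that believes a relation is `old_nblocks` long ingests a WAL record for a block beyond that, and every page in between is written as an explicit zero page.

```rust
// Illustrative arithmetic only, not code from this patch.
let old_nblocks: u32 = 3; // relation size the shard knew about
let blknum: u32 = 7;      // block touched by the incoming WAL record
let gap_blocks_zeroed = (old_nblocks..blknum).count() as u64; // blocks 3, 4, 5, 6
assert_eq!(gap_blocks_zeroed, 4);
// pageserver_gap_blocks_zeroed_on_rel_extend is incremented by this amount.
```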
## Summary of Changes Add one global counter which tracks how many such gap blocks we filled on relation extends. We can add more metrics once we understand the scope. --- pageserver/src/metrics.rs | 6 ++++++ pageserver/src/walingest.rs | 7 +++++++ 2 files changed, 13 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 3e824b59fb..8f697558d6 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2092,6 +2092,7 @@ pub(crate) struct WalIngestMetrics { pub(crate) records_received: IntCounter, pub(crate) records_committed: IntCounter, pub(crate) records_filtered: IntCounter, + pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter, } pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMetrics { @@ -2115,6 +2116,11 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet "Number of WAL records filtered out due to sharding" ) .expect("failed to define a metric"), + gap_blocks_zeroed_on_rel_extend: register_int_counter!( + "pageserver_gap_blocks_zeroed_on_rel_extend", + "Total number of zero gap blocks written on relation extends" + ) + .expect("failed to define a metric"), }); pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 95d1f76920..d3e8bf59f2 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1915,7 +1915,9 @@ impl WalIngest { modification.put_rel_extend(rel, new_nblocks, ctx).await?; let mut key = rel_block_to_key(rel, blknum); + // fill the gap with zeros + let mut gap_blocks_filled: u64 = 0; for gap_blknum in old_nblocks..blknum { key.field6 = gap_blknum; @@ -1924,7 +1926,12 @@ impl WalIngest { } modification.put_rel_page_image_zero(rel, gap_blknum)?; + gap_blocks_filled += 1; } + + WAL_INGEST + .gap_blocks_zeroed_on_rel_extend + .inc_by(gap_blocks_filled); } Ok(()) } From b86432c29e63e61bbdeb110135101cdec7cfdb86 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 23 Oct 2024 21:52:22 -0600 Subject: [PATCH 073/239] Fix buggy sizeof A sizeof on a pointer on a 64 bit machine is 8 bytes whereas Entry::old_name is a 64 byte array of characters. There was most likely no fallout since the string would start with NUL bytes, but best to fix nonetheless. Signed-off-by: Tristan Partin --- pgxn/neon/control_plane_connector.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 0730c305cb..4713103909 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -767,7 +767,7 @@ HandleDropRole(DropRoleStmt *stmt) entry->type = Op_Delete; entry->password = NULL; if (!found) - memset(entry->old_name, 0, sizeof(entry)); + memset(entry->old_name, 0, sizeof(entry->old_name)); } } From 6f34f97573cf9a649535bee766e951f19d7fe94a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Thu, 24 Oct 2024 10:00:22 +0200 Subject: [PATCH 074/239] refactor(pageserver(load_remote_timeline)) remove dead code handling absence of IndexPart (#9408) The code is dead at runtime since we're nowadays always running with remote storage and treat it as the source of truth during attach. Clean it up as a preliminary to https://github.com/neondatabase/neon/pull/9218. 
Related: https://github.com/neondatabase/neon/pull/9366 --- pageserver/src/tenant.rs | 23 +++-------------------- pageserver/src/tenant/timeline.rs | 5 ++--- pageserver/src/tenant/timeline/init.rs | 12 +----------- 3 files changed, 6 insertions(+), 34 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7a3305797c..d503b299c1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -869,7 +869,7 @@ impl Tenant { &self, timeline_id: TimelineId, resources: TimelineResources, - index_part: Option, + index_part: IndexPart, metadata: TimelineMetadata, ancestor: Option>, _ctx: &RequestContext, @@ -894,24 +894,7 @@ impl Tenant { "these are used interchangeably" ); - if let Some(index_part) = index_part.as_ref() { - timeline.remote_client.init_upload_queue(index_part)?; - } else { - // No data on the remote storage, but we have local metadata file. We can end up - // here with timeline_create being interrupted before finishing index part upload. - // By doing what we do here, the index part upload is retried. - // If control plane retries timeline creation in the meantime, the mgmt API handler - // for timeline creation will coalesce on the upload we queue here. - - // FIXME: this branch should be dead code as we no longer write local metadata. - - timeline - .remote_client - .init_upload_queue_for_empty_remote(&metadata)?; - timeline - .remote_client - .schedule_index_upload_for_full_metadata_update(&metadata)?; - } + timeline.remote_client.init_upload_queue(&index_part)?; timeline .load_layer_map(disk_consistent_lsn, index_part) @@ -1541,7 +1524,7 @@ impl Tenant { self.timeline_init_and_sync( timeline_id, resources, - Some(index_part), + index_part, remote_metadata, ancestor, ctx, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d5ceec663b..7b40a24c54 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2404,7 +2404,7 @@ impl Timeline { pub(super) async fn load_layer_map( &self, disk_consistent_lsn: Lsn, - index_part: Option, + index_part: IndexPart, ) -> anyhow::Result<()> { use init::{Decision::*, Discovered, DismissedLayer}; use LayerName::*; @@ -2468,8 +2468,7 @@ impl Timeline { ); } - let decided = - init::reconcile(discovered_layers, index_part.as_ref(), disk_consistent_lsn); + let decided = init::reconcile(discovered_layers, &index_part, disk_consistent_lsn); let mut loaded_layers = Vec::new(); let mut needs_cleanup = Vec::new(); diff --git a/pageserver/src/tenant/timeline/init.rs b/pageserver/src/tenant/timeline/init.rs index 5bc67c7133..6634d07a0d 100644 --- a/pageserver/src/tenant/timeline/init.rs +++ b/pageserver/src/tenant/timeline/init.rs @@ -125,19 +125,9 @@ pub(super) enum DismissedLayer { /// Merges local discoveries and remote [`IndexPart`] to a collection of decisions. 
pub(super) fn reconcile( local_layers: Vec<(LayerName, LocalLayerFileMetadata)>, - index_part: Option<&IndexPart>, + index_part: &IndexPart, disk_consistent_lsn: Lsn, ) -> Vec<(LayerName, Result)> { - let Some(index_part) = index_part else { - // If we have no remote metadata, no local layer files are considered valid to load - return local_layers - .into_iter() - .map(|(layer_name, local_metadata)| { - (layer_name, Err(DismissedLayer::LocalOnly(local_metadata))) - }) - .collect(); - }; - let mut result = Vec::new(); let mut remote_layers = HashMap::new(); From d589498c6f556d6c1b246b00800fe51ece416485 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 24 Oct 2024 15:23:09 +0100 Subject: [PATCH 075/239] storcon: respect Reconciler::cancel during await_lsn (#9486) ## Problem When a pageserver is misbehaving (e.g. we hit an ingest bug or something is pathologically slow), the storage controller could get stuck in the part of live migration that waits for LSNs to catch up. This is a problem, because it can prevent us migrating the troublesome tenant to another pageserver. Closes: https://github.com/neondatabase/cloud/issues/19169 ## Summary of changes - Respect Reconciler::cancel during await_lsn. --- storage_controller/src/reconciler.rs | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 9d2182d44c..3ad386a95b 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -450,6 +450,9 @@ impl Reconciler { } } + /// This function does _not_ mutate any state, so it is cancellation safe. + /// + /// This function does not respect [`Self::cancel`], callers should handle that. async fn await_lsn( &self, tenant_shard_id: TenantShardId, @@ -570,8 +573,10 @@ impl Reconciler { if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); - self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) - .await?; + tokio::select! 
{ + r = self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) => {r?;} + _ = self.cancel.cancelled() => {return Err(ReconcileError::Cancel)} + }; } tracing::info!("🔁 Notifying compute to use pageserver {dest_ps}"); From b8a311131eae01c8d4e4dca09e3892728ace804f Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Thu, 24 Oct 2024 15:49:26 +0100 Subject: [PATCH 076/239] CI: remove `git config --add safe.directory` hack (#9391) ## Problem We have `git config --global --add safe.directory ...` leftovers from the past, but `actions/checkout` does it by default (since v3.0.2, we use v4) ## Summary of changes - Remove `git config --global --add safe.directory ...` hack --- .github/workflows/_build-and-test-locally.yml | 14 -------------- .github/workflows/build_and_test.yml | 14 -------------- 2 files changed, 28 deletions(-) diff --git a/.github/workflows/_build-and-test-locally.yml b/.github/workflows/_build-and-test-locally.yml index c0f59fbdd5..8e28049888 100644 --- a/.github/workflows/_build-and-test-locally.yml +++ b/.github/workflows/_build-and-test-locally.yml @@ -53,20 +53,6 @@ jobs: BUILD_TAG: ${{ inputs.build-tag }} steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16 17; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 1186b9927b..0d3ea7db28 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1078,20 +1078,6 @@ jobs: runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest steps: - - name: Fix git ownership - run: | - # Workaround for `fatal: detected dubious ownership in repository at ...` - # - # Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers - # Ref https://github.com/actions/checkout/issues/785 - # - git config --global --add safe.directory ${{ github.workspace }} - git config --global --add safe.directory ${GITHUB_WORKSPACE} - for r in 14 15 16 17; do - git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r" - git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r" - done - - uses: actions/checkout@v4 - name: Trigger deploy workflow From fb0406e9d25515ef9bc35331e0858f93c8b24798 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 24 Oct 2024 10:49:54 -0400 Subject: [PATCH 077/239] refactor(pageserver): refactor split writers using batch layer writer (#9493) part of https://github.com/neondatabase/neon/issues/9114, https://github.com/neondatabase/neon/issues/8836, https://github.com/neondatabase/neon/issues/8362 The split layer writer code can be used in a more general way: the caller puts unfinished writers into the batch layer writer and let batch layer writer to ensure the atomicity of the layer produces. 
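In other words, callers hand their unfinished writers to the batch writer, which then finishes (or discards) them as one atomic step. A usage sketch of the new interface, with placeholder identifiers (`image_writer`, `delta_writer`, `key_start`, `timeline`, ...) standing in for the compaction code's real values:

```rust
// Illustrative only; the method names match the BatchLayerWriter added below.
let mut batch = BatchLayerWriter::new(conf).await?;

// Hand over writers without finishing them; nothing is published yet.
batch.add_unfinished_image_writer(image_writer, key_start..key_end, lsn);
batch.add_unfinished_delta_writer(delta_writer, key_start..key_end, lsn_range.clone());

// Everything is finished together; if any layer fails, the ones already
// produced are scheduled for deletion, which is what gives atomicity.
let results = batch
    .finish_with_discard_fn(&timeline, ctx, |_key| async { false })
    .await?;
```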
## Summary of changes * Add batch layer writer, which atomically finishes the layers. `BatchLayerWriter::finish` is simply a copy-paste from previous split layer writers. * Refactor split writers to use the batch layer writer. * The current split writer tests cover all code path of batch layer writer. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer.rs | 2 +- ...{split_writer.rs => batch_split_writer.rs} | 282 ++++++++++-------- .../src/tenant/storage_layer/image_layer.rs | 2 +- pageserver/src/tenant/timeline/compaction.rs | 14 +- 4 files changed, 160 insertions(+), 140 deletions(-) rename pageserver/src/tenant/storage_layer/{split_writer.rs => batch_split_writer.rs} (80%) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index a229b59560..4a63491e90 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -1,5 +1,6 @@ //! Common traits and structs for layers +pub mod batch_split_writer; pub mod delta_layer; pub mod filter_iterator; pub mod image_layer; @@ -8,7 +9,6 @@ pub(crate) mod layer; mod layer_desc; mod layer_name; pub mod merge_iterator; -pub mod split_writer; use crate::context::{AccessStatsBehavior, RequestContext}; use crate::repository::Value; diff --git a/pageserver/src/tenant/storage_layer/split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs similarity index 80% rename from pageserver/src/tenant/storage_layer/split_writer.rs rename to pageserver/src/tenant/storage_layer/batch_split_writer.rs index 45ac0c6668..272e422c90 100644 --- a/pageserver/src/tenant/storage_layer/split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -12,41 +12,154 @@ use super::{ DeltaLayerWriter, ImageLayerWriter, PersistentLayerDesc, PersistentLayerKey, ResidentLayer, }; -pub(crate) enum SplitWriterResult { +pub(crate) enum BatchWriterResult { Produced(ResidentLayer), Discarded(PersistentLayerKey), } #[cfg(test)] -impl SplitWriterResult { +impl BatchWriterResult { fn into_resident_layer(self) -> ResidentLayer { match self { - SplitWriterResult::Produced(layer) => layer, - SplitWriterResult::Discarded(_) => panic!("unexpected discarded layer"), + BatchWriterResult::Produced(layer) => layer, + BatchWriterResult::Discarded(_) => panic!("unexpected discarded layer"), } } fn into_discarded_layer(self) -> PersistentLayerKey { match self { - SplitWriterResult::Produced(_) => panic!("unexpected produced layer"), - SplitWriterResult::Discarded(layer) => layer, + BatchWriterResult::Produced(_) => panic!("unexpected produced layer"), + BatchWriterResult::Discarded(layer) => layer, } } } +enum LayerWriterWrapper { + Image(ImageLayerWriter), + Delta(DeltaLayerWriter), +} + +/// An layer writer that takes unfinished layers and finish them atomically. 
+#[must_use] +pub struct BatchLayerWriter { + generated_layer_writers: Vec<(LayerWriterWrapper, PersistentLayerKey)>, + conf: &'static PageServerConf, +} + +impl BatchLayerWriter { + pub async fn new(conf: &'static PageServerConf) -> anyhow::Result { + Ok(Self { + generated_layer_writers: Vec::new(), + conf, + }) + } + + pub fn add_unfinished_image_writer( + &mut self, + writer: ImageLayerWriter, + key_range: Range, + lsn: Lsn, + ) { + self.generated_layer_writers.push(( + LayerWriterWrapper::Image(writer), + PersistentLayerKey { + key_range, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(lsn), + is_delta: false, + }, + )); + } + + pub fn add_unfinished_delta_writer( + &mut self, + writer: DeltaLayerWriter, + key_range: Range, + lsn_range: Range, + ) { + self.generated_layer_writers.push(( + LayerWriterWrapper::Delta(writer), + PersistentLayerKey { + key_range, + lsn_range, + is_delta: true, + }, + )); + } + + pub(crate) async fn finish_with_discard_fn( + self, + tline: &Arc, + ctx: &RequestContext, + discard_fn: D, + ) -> anyhow::Result> + where + D: Fn(&PersistentLayerKey) -> F, + F: Future, + { + let Self { + generated_layer_writers, + .. + } = self; + let clean_up_layers = |generated_layers: Vec| { + for produced_layer in generated_layers { + if let BatchWriterResult::Produced(resident_layer) = produced_layer { + let layer: Layer = resident_layer.into(); + layer.delete_on_drop(); + } + } + }; + // BEGIN: catch every error and do the recovery in the below section + let mut generated_layers: Vec = Vec::new(); + for (inner, layer_key) in generated_layer_writers { + if discard_fn(&layer_key).await { + generated_layers.push(BatchWriterResult::Discarded(layer_key)); + } else { + let res = match inner { + LayerWriterWrapper::Delta(writer) => { + writer.finish(layer_key.key_range.end, ctx).await + } + LayerWriterWrapper::Image(writer) => { + writer + .finish_with_end_key(layer_key.key_range.end, ctx) + .await + } + }; + let layer = match res { + Ok((desc, path)) => { + match Layer::finish_creating(self.conf, tline, desc, &path) { + Ok(layer) => layer, + Err(e) => { + tokio::fs::remove_file(&path).await.ok(); + clean_up_layers(generated_layers); + return Err(e); + } + } + } + Err(e) => { + // Image/DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong, + // so we don't need to remove the layer we just failed to create by ourselves. + clean_up_layers(generated_layers); + return Err(e); + } + }; + generated_layers.push(BatchWriterResult::Produced(layer)); + } + } + // END: catch every error and do the recovery in the above section + Ok(generated_layers) + } +} + /// An image writer that takes images and produces multiple image layers. 
-/// -/// The interface does not guarantee atomicity (i.e., if the image layer generation -/// fails, there might be leftover files to be cleaned up) #[must_use] pub struct SplitImageLayerWriter { inner: ImageLayerWriter, target_layer_size: u64, - generated_layer_writers: Vec<(ImageLayerWriter, PersistentLayerKey)>, + lsn: Lsn, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - lsn: Lsn, + batches: BatchLayerWriter, start_key: Key, } @@ -71,10 +184,10 @@ impl SplitImageLayerWriter { ctx, ) .await?, - generated_layer_writers: Vec::new(), conf, timeline_id, tenant_shard_id, + batches: BatchLayerWriter::new(conf).await?, lsn, start_key, }) @@ -102,16 +215,13 @@ impl SplitImageLayerWriter { ctx, ) .await?; - let layer_key = PersistentLayerKey { - key_range: self.start_key..key, - lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), - is_delta: false, - }; let prev_image_writer = std::mem::replace(&mut self.inner, next_image_writer); + self.batches.add_unfinished_image_writer( + prev_image_writer, + self.start_key..key, + self.lsn, + ); self.start_key = key; - - self.generated_layer_writers - .push((prev_image_writer, layer_key)); } self.inner.put_image(key, img, ctx).await } @@ -122,64 +232,18 @@ impl SplitImageLayerWriter { ctx: &RequestContext, end_key: Key, discard_fn: D, - ) -> anyhow::Result> + ) -> anyhow::Result> where D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { - mut generated_layer_writers, - inner, - .. + mut batches, inner, .. } = self; if inner.num_keys() != 0 { - let layer_key = PersistentLayerKey { - key_range: self.start_key..end_key, - lsn_range: PersistentLayerDesc::image_layer_lsn_range(self.lsn), - is_delta: false, - }; - generated_layer_writers.push((inner, layer_key)); + batches.add_unfinished_image_writer(inner, self.start_key..end_key, self.lsn); } - let clean_up_layers = |generated_layers: Vec| { - for produced_layer in generated_layers { - if let SplitWriterResult::Produced(image_layer) = produced_layer { - let layer: Layer = image_layer.into(); - layer.delete_on_drop(); - } - } - }; - // BEGIN: catch every error and do the recovery in the below section - let mut generated_layers = Vec::new(); - for (inner, layer_key) in generated_layer_writers { - if discard_fn(&layer_key).await { - generated_layers.push(SplitWriterResult::Discarded(layer_key)); - } else { - let layer = match inner - .finish_with_end_key(layer_key.key_range.end, ctx) - .await - { - Ok((desc, path)) => { - match Layer::finish_creating(self.conf, tline, desc, &path) { - Ok(layer) => layer, - Err(e) => { - tokio::fs::remove_file(&path).await.ok(); - clean_up_layers(generated_layers); - return Err(e); - } - } - } - Err(e) => { - // ImageLayerWriter::finish will clean up the temporary layer if anything goes wrong, - // so we don't need to remove the layer we just failed to create by ourselves. - clean_up_layers(generated_layers); - return Err(e); - } - }; - generated_layers.push(SplitWriterResult::Produced(layer)); - } - } - // END: catch every error and do the recovery in the above section - Ok(generated_layers) + batches.finish_with_discard_fn(tline, ctx, discard_fn).await } #[cfg(test)] @@ -188,7 +252,7 @@ impl SplitImageLayerWriter { tline: &Arc, ctx: &RequestContext, end_key: Key, - ) -> anyhow::Result> { + ) -> anyhow::Result> { self.finish_with_discard_fn(tline, ctx, end_key, |_| async { false }) .await } @@ -196,9 +260,6 @@ impl SplitImageLayerWriter { /// A delta writer that takes key-lsn-values and produces multiple delta layers. 
/// -/// The interface does not guarantee atomicity (i.e., if the delta layer generation fails, -/// there might be leftover files to be cleaned up). -/// /// Note that if updates of a single key exceed the target size limit, all of the updates will be batched /// into a single file. This behavior might change in the future. For reference, the legacy compaction algorithm /// will split them into multiple files based on size. @@ -206,12 +267,12 @@ impl SplitImageLayerWriter { pub struct SplitDeltaLayerWriter { inner: Option<(Key, DeltaLayerWriter)>, target_layer_size: u64, - generated_layer_writers: Vec<(DeltaLayerWriter, PersistentLayerKey)>, conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, lsn_range: Range, last_key_written: Key, + batches: BatchLayerWriter, } impl SplitDeltaLayerWriter { @@ -225,12 +286,12 @@ impl SplitDeltaLayerWriter { Ok(Self { target_layer_size, inner: None, - generated_layer_writers: Vec::new(), conf, timeline_id, tenant_shard_id, lsn_range, last_key_written: Key::MIN, + batches: BatchLayerWriter::new(conf).await?, }) } @@ -279,13 +340,11 @@ impl SplitDeltaLayerWriter { .await?; let (start_key, prev_delta_writer) = std::mem::replace(&mut self.inner, Some((key, next_delta_writer))).unwrap(); - let layer_key = PersistentLayerKey { - key_range: start_key..key, - lsn_range: self.lsn_range.clone(), - is_delta: true, - }; - self.generated_layer_writers - .push((prev_delta_writer, layer_key)); + self.batches.add_unfinished_delta_writer( + prev_delta_writer, + start_key..key, + self.lsn_range.clone(), + ); } else if inner.estimated_size() >= S3_UPLOAD_LIMIT { // We have to produce a very large file b/c a key is updated too often. anyhow::bail!( @@ -305,64 +364,25 @@ impl SplitDeltaLayerWriter { tline: &Arc, ctx: &RequestContext, discard_fn: D, - ) -> anyhow::Result> + ) -> anyhow::Result> where D: Fn(&PersistentLayerKey) -> F, F: Future, { let Self { - mut generated_layer_writers, - inner, - .. + mut batches, inner, .. } = self; if let Some((start_key, writer)) = inner { if writer.num_keys() != 0 { let end_key = self.last_key_written.next(); - let layer_key = PersistentLayerKey { - key_range: start_key..end_key, - lsn_range: self.lsn_range.clone(), - is_delta: true, - }; - generated_layer_writers.push((writer, layer_key)); + batches.add_unfinished_delta_writer( + writer, + start_key..end_key, + self.lsn_range.clone(), + ); } } - let clean_up_layers = |generated_layers: Vec| { - for produced_layer in generated_layers { - if let SplitWriterResult::Produced(delta_layer) = produced_layer { - let layer: Layer = delta_layer.into(); - layer.delete_on_drop(); - } - } - }; - // BEGIN: catch every error and do the recovery in the below section - let mut generated_layers = Vec::new(); - for (inner, layer_key) in generated_layer_writers { - if discard_fn(&layer_key).await { - generated_layers.push(SplitWriterResult::Discarded(layer_key)); - } else { - let layer = match inner.finish(layer_key.key_range.end, ctx).await { - Ok((desc, path)) => { - match Layer::finish_creating(self.conf, tline, desc, &path) { - Ok(layer) => layer, - Err(e) => { - tokio::fs::remove_file(&path).await.ok(); - clean_up_layers(generated_layers); - return Err(e); - } - } - } - Err(e) => { - // DeltaLayerWriter::finish will clean up the temporary layer if anything goes wrong, - // so we don't need to remove the layer we just failed to create by ourselves. 
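Per the doc comment above, a single key's updates are never split across delta files: the writer only rotates to a new file once the size threshold is exceeded *and* the incoming key differs from the last one written. A toy sketch of that rule, with invented types and an arbitrary threshold (not the pageserver's real writer):

```rust
/// Illustrative only: group (key, value_size) pairs into "files", starting a
/// new file when the current one exceeds `target_size` and the next key differs,
/// so all updates of one key land in the same file.
fn split_into_files(updates: &[(u64, u64)], target_size: u64) -> Vec<Vec<u64>> {
    let mut files: Vec<Vec<u64>> = Vec::new();
    let mut current_keys: Vec<u64> = Vec::new();
    let mut current_size = 0u64;
    let mut last_key: Option<u64> = None;

    for &(key, value_size) in updates {
        let key_changed = last_key.map_or(false, |k| k != key);
        if current_size >= target_size && key_changed {
            files.push(std::mem::take(&mut current_keys));
            current_size = 0;
        }
        if current_keys.last() != Some(&key) {
            current_keys.push(key);
        }
        current_size += value_size;
        last_key = Some(key);
    }
    if !current_keys.is_empty() {
        files.push(current_keys);
    }
    files
}

fn main() {
    // Key 2 is updated many times; even though the 100-byte target is exceeded,
    // all of its updates stay in one file. Only key 3 starts a new file.
    let updates = [(1, 60), (2, 60), (2, 60), (2, 60), (3, 60)];
    let files = split_into_files(&updates, 100);
    assert_eq!(files, vec![vec![1, 2], vec![3]]);
    println!("{files:?}");
}
```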
- clean_up_layers(generated_layers); - return Err(e); - } - }; - generated_layers.push(SplitWriterResult::Produced(layer)); - } - } - // END: catch every error and do the recovery in the above section - Ok(generated_layers) + batches.finish_with_discard_fn(tline, ctx, discard_fn).await } #[cfg(test)] @@ -370,7 +390,7 @@ impl SplitDeltaLayerWriter { self, tline: &Arc, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> anyhow::Result> { self.finish_with_discard_fn(tline, ctx, |_| async { false }) .await } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index fa058833d4..ff2be1780e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -1009,7 +1009,7 @@ impl ImageLayerWriter { self.inner.take().unwrap().finish(ctx, None).await } - /// Finish writing the image layer with an end key, used in [`super::split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. + /// Finish writing the image layer with an end key, used in [`super::batch_split_writer::SplitImageLayerWriter`]. The end key determines the end of the image layer's covered range and is exclusive. pub(super) async fn finish_with_end_key( mut self, end_key: Key, diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 37d907ddcb..6aa5b30f07 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -32,11 +32,11 @@ use crate::page_cache; use crate::statvfs::Statvfs; use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; +use crate::tenant::storage_layer::batch_split_writer::{ + BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, +}; use crate::tenant::storage_layer::filter_iterator::FilterIterator; use crate::tenant::storage_layer::merge_iterator::MergeIterator; -use crate::tenant::storage_layer::split_writer::{ - SplitDeltaLayerWriter, SplitImageLayerWriter, SplitWriterResult, -}; use crate::tenant::storage_layer::{ AsLayerDesc, PersistentLayerDesc, PersistentLayerKey, ValueReconstructState, }; @@ -2038,11 +2038,11 @@ impl Timeline { let produced_image_layers_len = produced_image_layers.len(); for action in produced_delta_layers { match action { - SplitWriterResult::Produced(layer) => { + BatchWriterResult::Produced(layer) => { stat.produce_delta_layer(layer.layer_desc().file_size()); compact_to.push(layer); } - SplitWriterResult::Discarded(l) => { + BatchWriterResult::Discarded(l) => { keep_layers.insert(l); stat.discard_delta_layer(); } @@ -2050,11 +2050,11 @@ impl Timeline { } for action in produced_image_layers { match action { - SplitWriterResult::Produced(layer) => { + BatchWriterResult::Produced(layer) => { stat.produce_image_layer(layer.layer_desc().file_size()); compact_to.push(layer); } - SplitWriterResult::Discarded(l) => { + BatchWriterResult::Discarded(l) => { keep_layers.insert(l); stat.discard_image_layer(); } From 5069123b6d5c5cdcac1a011cb3141cd915298161 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 24 Oct 2024 17:12:47 +0100 Subject: [PATCH 078/239] pageserver: refactor ingest inplace to decouple decoding and handling (#9472) ## Problem WAL ingest couples decoding of special records with their handling (updates to the storage engine mostly). 
This is a roadblock for our plan to move WAL filtering (and implicitly decoding) to safekeepers since they cannot do writes to the storage engine. ## Summary of changes This PR decouples the decoding of the special WAL records from their application. The changes are done in place and I've done my best to refrain from refactorings and attempted to preserve the original code as much as possible. Related: https://github.com/neondatabase/neon/issues/9335 Epic: https://github.com/neondatabase/neon/issues/9329 --- pageserver/src/walingest.rs | 1547 +++++++++++------ test_runner/regress/test_tenant_relocation.py | 20 +- 2 files changed, 1031 insertions(+), 536 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d3e8bf59f2..8a4c0554f8 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -107,6 +107,143 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } +// These structs are an intermediary representation of the PostgreSQL WAL records. +// The ones prefixed with `Xl` are lower level, while the ones that are not have +// all the required context to be acted upon by the pageserver. + +enum HeapamRecord { + ClearVmBits(ClearVmBits), +} + +struct ClearVmBits { + new_heap_blkno: Option, + old_heap_blkno: Option, + vm_rel: RelTag, + flags: u8, +} + +enum NeonrmgrRecord { + ClearVmBits(ClearVmBits), +} + +enum SmgrRecord { + Create(SmgrCreate), + Truncate(XlSmgrTruncate), +} + +struct SmgrCreate { + rel: RelTag, +} + +enum DbaseRecord { + Create(DbaseCreate), + Drop(DbaseDrop), +} + +struct DbaseCreate { + db_id: u32, + tablespace_id: u32, + src_db_id: u32, + src_tablespace_id: u32, +} + +struct DbaseDrop { + db_id: u32, + tablespace_ids: Vec, +} + +enum ClogRecord { + ZeroPage(ClogZeroPage), + Truncate(ClogTruncate), +} + +struct ClogZeroPage { + segno: u32, + rpageno: u32, +} + +struct ClogTruncate { + pageno: u32, + oldest_xid: u32, + oldest_xid_db: u32, +} + +enum XactRecord { + Commit(XactCommon), + Abort(XactCommon), + CommitPrepared(XactCommon), + AbortPrepared(XactCommon), + Prepare(XactPrepare), +} + +struct XactCommon { + parsed: XlXactParsedRecord, + origin_id: u16, + // Fields below are only used for logging + xl_xid: u32, + lsn: Lsn, +} + +struct XactPrepare { + xl_xid: u32, + data: Bytes, +} + +enum MultiXactRecord { + ZeroPage(MultiXactZeroPage), + Create(XlMultiXactCreate), + Truncate(XlMultiXactTruncate), +} + +struct MultiXactZeroPage { + slru_kind: SlruKind, + segno: u32, + rpageno: u32, +} + +enum RelmapRecord { + Update(RelmapUpdate), +} + +struct RelmapUpdate { + update: XlRelmapUpdate, + buf: Bytes, +} + +enum XlogRecord { + Raw(RawXlogRecord), +} + +struct RawXlogRecord { + info: u8, + lsn: Lsn, + buf: Bytes, +} + +enum LogicalMessageRecord { + Put(PutLogicalMessage), + #[cfg(feature = "testing")] + Failpoint, +} + +struct PutLogicalMessage { + path: String, + buf: Bytes, +} + +enum StandbyRecord { + RunningXacts(StandbyRunningXacts), +} + +struct StandbyRunningXacts { + oldest_running_xid: u32, +} + +enum ReploriginRecord { + Set(XlReploriginSet), + Drop(XlReploriginDrop), +} + impl WalIngest { pub async fn new( timeline: &Timeline, @@ -182,105 +319,58 @@ impl WalIngest { pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { // Heap AM records need some special handling, because they modify VM pages // without registering them with the standard mechanism. 
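The shape of that split, reduced to a standalone sketch: a pure decoder turns raw record bytes into an intermediate representation (mirroring the `Result<Option<..>>` convention of the new `decode_*` methods), and a separate ingest step is the only code that touches storage, which is what would let decoding eventually run on safekeepers. The byte layout, the `0x10`/`0x20` info flags, and the `Storage` trait below are invented for illustration; only the control flow mirrors the patch.

```rust
// Illustrative record kinds, loosely modeled on the SmgrRecord enum in this patch.
#[derive(Debug, PartialEq)]
enum Record {
    Create { relnode: u32 },
    Truncate { relnode: u32, blkno: u32 },
}

/// Pure decoding: reads bytes, performs no writes. Returning `Result<Option<Record>>`
/// lets every decoder share one signature, even decoders that cannot fail or that
/// ignore some record subtypes.
fn decode(info: u8, payload: &[u8]) -> Result<Option<Record>, String> {
    let word = |at: usize| -> Result<u32, String> {
        payload
            .get(at..at + 4)
            .and_then(|b| b.try_into().ok())
            .map(u32::from_le_bytes)
            .ok_or_else(|| "record payload too short".to_string())
    };
    match info {
        0x10 => Ok(Some(Record::Create { relnode: word(0)? })),
        0x20 => Ok(Some(Record::Truncate { relnode: word(0)?, blkno: word(4)? })),
        _ => Ok(None), // not something this decoder handles
    }
}

/// The apply side: the only place with mutable access to storage.
trait Storage {
    fn create_rel(&mut self, relnode: u32);
    fn truncate_rel(&mut self, relnode: u32, blkno: u32);
}

fn ingest(record: Record, storage: &mut impl Storage) {
    match record {
        Record::Create { relnode } => storage.create_rel(relnode),
        Record::Truncate { relnode, blkno } => storage.truncate_rel(relnode, blkno),
    }
}

fn main() {
    struct PrintStorage;
    impl Storage for PrintStorage {
        fn create_rel(&mut self, relnode: u32) {
            println!("create rel {relnode}");
        }
        fn truncate_rel(&mut self, relnode: u32, blkno: u32) {
            println!("truncate rel {relnode} at block {blkno}");
        }
    }

    let payload = 42u32.to_le_bytes();
    if let Some(record) = decode(0x10, &payload).expect("decode failed") {
        ingest(record, &mut PrintStorage);
    }
}
```

In the diff that follows, each resource-manager arm of the ingest loop is rewritten into exactly this two-step form.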
- self.ingest_heapam_record(&mut buf, modification, &decoded, ctx) - .await?; + let maybe_heapam_record = + Self::decode_heapam_record(&mut buf, &decoded, pg_version)?; + if let Some(heapam_record) = maybe_heapam_record { + match heapam_record { + HeapamRecord::ClearVmBits(clear_vm_bits) => { + self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) + .await?; + } + } + } } pg_constants::RM_NEON_ID => { - self.ingest_neonrmgr_record(&mut buf, modification, &decoded, ctx) - .await?; + let maybe_nenonrmgr_record = + Self::decode_neonmgr_record(&mut buf, &decoded, pg_version)?; + if let Some(neonrmgr_record) = maybe_nenonrmgr_record { + match neonrmgr_record { + NeonrmgrRecord::ClearVmBits(clear_vm_bits) => { + self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) + .await?; + } + } + } } // Handle other special record types pg_constants::RM_SMGR_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_SMGR_CREATE { - let create = XlSmgrCreate::decode(&mut buf); - self.ingest_xlog_smgr_create(modification, &create, ctx) - .await?; - } else if info == pg_constants::XLOG_SMGR_TRUNCATE { - let truncate = XlSmgrTruncate::decode(&mut buf); - self.ingest_xlog_smgr_truncate(modification, &truncate, ctx) - .await?; + let maybe_smgr_record = + Self::decode_smgr_record(&mut buf, &decoded, pg_version).unwrap(); + if let Some(smgr_record) = maybe_smgr_record { + match smgr_record { + SmgrRecord::Create(create) => { + self.ingest_xlog_smgr_create(create, modification, ctx) + .await?; + } + SmgrRecord::Truncate(truncate) => { + self.ingest_xlog_smgr_truncate(truncate, modification, ctx) + .await?; + } + } } } pg_constants::RM_DBASE_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, %pg_version, "handle RM_DBASE_ID"); + let maybe_dbase_record = + Self::decode_dbase_record(&mut buf, &decoded, pg_version).unwrap(); - if pg_version == 14 { - if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { - let createdb = XlCreateDatabase::decode(&mut buf); - debug!("XLOG_DBASE_CREATE v14"); - - self.ingest_xlog_dbase_create(modification, &createdb, ctx) - .await?; - } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) + if let Some(dbase_record) = maybe_dbase_record { + match dbase_record { + DbaseRecord::Create(create) => { + self.ingest_xlog_dbase_create(create, modification, ctx) .await?; } - } - } else if pg_version == 15 { - if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. 
- debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb, ctx) - .await?; - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) - .await?; - } - } - } else if pg_version == 16 { - if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb, ctx) - .await?; - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) - .await?; - } - } - } else if pg_version == 17 { - if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - let createdb = XlCreateDatabase::decode(&mut buf); - self.ingest_xlog_dbase_create(modification, &createdb, ctx) - .await?; - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(&mut buf); - for tablespace_id in dropdb.tablespace_ids { - trace!("Drop db {}, {}", tablespace_id, dropdb.db_id); - modification - .drop_dbdir(tablespace_id, dropdb.db_id, ctx) - .await?; + DbaseRecord::Drop(drop) => { + self.ingest_xlog_dbase_drop(drop, modification, ctx).await?; } } } @@ -289,266 +379,113 @@ impl WalIngest { trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); } pg_constants::RM_CLOG_ID => { - let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; + // [`Self::decode_clog_record`] may never fail and always returns. + // It has this interface to match all the other decoding methods. 
+ let clog_record = Self::decode_clog_record(&mut buf, &decoded, pg_version) + .unwrap() + .unwrap(); - if info == pg_constants::CLOG_ZEROPAGE { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( - modification, - SlruKind::Clog, - segno, - rpageno, - ZERO_PAGE.clone(), - ctx, - ) - .await?; - } else { - assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(&mut buf, pg_version); - self.ingest_clog_truncate_record(modification, &xlrec, ctx) - .await?; + match clog_record { + ClogRecord::ZeroPage(zero_page) => { + self.ingest_clog_zero_page(zero_page, modification, ctx) + .await?; + } + ClogRecord::Truncate(truncate) => { + self.ingest_clog_truncate(truncate, modification, ctx) + .await?; + } } } pg_constants::RM_XACT_ID => { - let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; - - if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT { - let parsed_xact = - XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( - modification, - &parsed_xact, - info == pg_constants::XLOG_XACT_COMMIT, - decoded.origin_id, - ctx, - ) - .await?; - } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED - || info == pg_constants::XLOG_XACT_ABORT_PREPARED - { - let parsed_xact = - XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info); - self.ingest_xact_record( - modification, - &parsed_xact, - info == pg_constants::XLOG_XACT_COMMIT_PREPARED, - decoded.origin_id, - ctx, - ) - .await?; - // Remove twophase file. see RemoveTwoPhaseFile() in postgres code - trace!( - "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", - decoded.xl_xid, - parsed_xact.xid, - lsn, - ); - - let xid: u64 = if pg_version >= 17 { - self.adjust_to_full_transaction_id(parsed_xact.xid)? - } else { - parsed_xact.xid as u64 - }; - modification.drop_twophase_file(xid, ctx).await?; - } else if info == pg_constants::XLOG_XACT_PREPARE { - let xid: u64 = if pg_version >= 17 { - self.adjust_to_full_transaction_id(decoded.xl_xid)? 
- } else { - decoded.xl_xid as u64 - }; - modification - .put_twophase_file(xid, Bytes::copy_from_slice(&buf[..]), ctx) + let maybe_xact_record = + Self::decode_xact_record(&mut buf, &decoded, lsn, pg_version).unwrap(); + if let Some(xact_record) = maybe_xact_record { + self.ingest_xact_record(xact_record, modification, ctx) .await?; } } pg_constants::RM_MULTIXACT_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( - modification, - SlruKind::MultiXactOffsets, - segno, - rpageno, - ZERO_PAGE.clone(), - ctx, - ) - .await?; - } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - self.put_slru_page_image( - modification, - SlruKind::MultiXactMembers, - segno, - rpageno, - ZERO_PAGE.clone(), - ctx, - ) - .await?; - } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { - let xlrec = XlMultiXactCreate::decode(&mut buf); - self.ingest_multixact_create_record(modification, &xlrec)?; - } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { - let xlrec = XlMultiXactTruncate::decode(&mut buf); - self.ingest_multixact_truncate_record(modification, &xlrec, ctx) - .await?; + let maybe_multixact_record = + Self::decode_multixact_record(&mut buf, &decoded, pg_version).unwrap(); + if let Some(multixact_record) = maybe_multixact_record { + match multixact_record { + MultiXactRecord::ZeroPage(zero_page) => { + self.ingest_multixact_zero_page(zero_page, modification, ctx) + .await?; + } + MultiXactRecord::Create(create) => { + self.ingest_multixact_create(modification, &create)?; + } + MultiXactRecord::Truncate(truncate) => { + self.ingest_multixact_truncate(modification, &truncate, ctx) + .await?; + } + } } } pg_constants::RM_RELMAP_ID => { - let xlrec = XlRelmapUpdate::decode(&mut buf); - self.ingest_relmap_page(modification, &xlrec, &decoded, ctx) - .await?; - } - pg_constants::RM_XLOG_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_PARAMETER_CHANGE { - if let CheckPoint::V17(cp) = &mut self.checkpoint { - let rec = v17::XlParameterChange::decode(&mut buf); - cp.wal_level = rec.wal_level; - self.checkpoint_modified = true; - } - } else if info == pg_constants::XLOG_END_OF_RECOVERY { - if let CheckPoint::V17(cp) = &mut self.checkpoint { - let rec = v17::XlEndOfRecovery::decode(&mut buf); - cp.wal_level = rec.wal_level; - self.checkpoint_modified = true; + let relmap_record = Self::decode_relmap_record(&mut buf, &decoded, pg_version) + .unwrap() + .unwrap(); + match relmap_record { + RelmapRecord::Update(update) => { + self.ingest_relmap_update(update, modification, ctx).await?; } } + } + // This is an odd duck. It needs to go to all shards. + // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY + // in WalIngest::new), we have to send the whole DecodedWalRecord::record to + // the pageserver and decode it there. + // + // Alternatively, one can make the checkpoint part of the subscription protocol + // to the pageserver. 
This should work fine, but can be done at a later point. + pg_constants::RM_XLOG_ID => { + let xlog_record = Self::decode_xlog_record(&mut buf, &decoded, lsn, pg_version) + .unwrap() + .unwrap(); - enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { - if info == pg_constants::XLOG_NEXTOID { - let next_oid = buf.get_u32_le(); - if cp.nextOid != next_oid { - cp.nextOid = next_oid; - self.checkpoint_modified = true; - } - } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE - || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT]; - buf.copy_to_slice(&mut checkpoint_bytes); - let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; - trace!( - "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", - xlog_checkpoint.oldestXid, - cp.oldestXid - ); - if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { - cp.oldestXid = xlog_checkpoint.oldestXid; - } - trace!( - "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", - xlog_checkpoint.oldestActiveXid, - cp.oldestActiveXid - ); - - // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, - // because at shutdown, all in-progress transactions will implicitly - // end. Postgres startup code knows that, and allows hot standby to start - // immediately from a shutdown checkpoint. - // - // In Neon, Postgres hot standby startup always behaves as if starting from - // an online checkpoint. It needs a valid `oldestActiveXid` value, so - // instead of overwriting self.checkpoint.oldestActiveXid with - // InvalidTransactionid from the checkpoint WAL record, update it to a - // proper value, knowing that there are no in-progress transactions at this - // point, except for prepared transactions. - // - // See also the neon code changes in the InitWalRecovery() function. - if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID - && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN - { - let oldest_active_xid = if pg_version >= 17 { - let mut oldest_active_full_xid = cp.nextXid.value; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - if xid < oldest_active_full_xid { - oldest_active_full_xid = xid; - } - } - oldest_active_full_xid as u32 - } else { - let mut oldest_active_xid = cp.nextXid.value as u32; - for xid in modification.tline.list_twophase_files(lsn, ctx).await? { - let narrow_xid = xid as u32; - if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { - oldest_active_xid = narrow_xid; - } - } - oldest_active_xid - }; - cp.oldestActiveXid = oldest_active_xid; - } else { - cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; - } - - // Write a new checkpoint key-value pair on every checkpoint record, even - // if nothing really changed. Not strictly required, but it seems nice to - // have some trace of the checkpoint records in the layer files at the same - // LSNs. 
- self.checkpoint_modified = true; + match xlog_record { + XlogRecord::Raw(raw) => { + self.ingest_raw_xlog_record(raw, modification, ctx).await?; } - }); + } } pg_constants::RM_LOGICALMSG_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = crate::walrecord::XlLogicalMessage::decode(&mut buf); - let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; - let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size]; - if prefix == "neon-test" { - // This is a convenient way to make the WAL ingestion pause at - // particular point in the WAL. For more fine-grained control, - // we could peek into the message and only pause if it contains - // a particular string, for example, but this is enough for now. - failpoint_support::sleep_millis_async!("wal-ingest-logical-message-sleep"); - } else if let Some(path) = prefix.strip_prefix("neon-file:") { - modification.put_file(path, message, ctx).await?; + let maybe_logical_message_record = + Self::decode_logical_message_record(&mut buf, &decoded, pg_version).unwrap(); + if let Some(logical_message_record) = maybe_logical_message_record { + match logical_message_record { + LogicalMessageRecord::Put(put) => { + self.ingest_logical_message_put(put, modification, ctx) + .await?; + } + #[cfg(feature = "testing")] + LogicalMessageRecord::Failpoint => { + // This is a convenient way to make the WAL ingestion pause at + // particular point in the WAL. For more fine-grained control, + // we could peek into the message and only pause if it contains + // a particular string, for example, but this is enough for now. + failpoint_support::sleep_millis_async!( + "pageserver-wal-ingest-logical-message-sleep" + ); + } } } } pg_constants::RM_STANDBY_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_RUNNING_XACTS { - let xlrec = crate::walrecord::XlRunningXacts::decode(&mut buf); - - enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { - cp.oldestActiveXid = xlrec.oldest_running_xid; - }); - - self.checkpoint_modified = true; + let maybe_standby_record = + Self::decode_standby_record(&mut buf, &decoded, pg_version).unwrap(); + if let Some(standby_record) = maybe_standby_record { + self.ingest_standby_record(standby_record).unwrap(); } } pg_constants::RM_REPLORIGIN_ID => { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_REPLORIGIN_SET { - let xlrec = crate::walrecord::XlReploriginSet::decode(&mut buf); - modification - .set_replorigin(xlrec.node_id, xlrec.remote_lsn) - .await? - } else if info == pg_constants::XLOG_REPLORIGIN_DROP { - let xlrec = crate::walrecord::XlReploriginDrop::decode(&mut buf); - modification.drop_replorigin(xlrec.node_id).await? + let maybe_replorigin_record = + Self::decode_replorigin_record(&mut buf, &decoded, pg_version).unwrap(); + if let Some(replorigin_record) = maybe_replorigin_record { + self.ingest_replorigin_record(replorigin_record, modification) + .await?; } } _x => { @@ -709,13 +646,99 @@ impl WalIngest { Ok(()) } - async fn ingest_heapam_record( + async fn ingest_clear_vm_bits( &mut self, - buf: &mut Bytes, + clear_vm_bits: ClearVmBits, modification: &mut DatadirModification<'_>, - decoded: &DecodedWALRecord, ctx: &RequestContext, ) -> anyhow::Result<()> { + let ClearVmBits { + new_heap_blkno, + old_heap_blkno, + flags, + vm_rel, + } = clear_vm_bits; + // Clear the VM bits if required. 
+ let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); + + // Sometimes, Postgres seems to create heap WAL records with the + // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is + // not set. In fact, it's possible that the VM page does not exist at all. + // In that case, we don't want to store a record to clear the VM bit; + // replaying it would fail to find the previous image of the page, because + // it doesn't exist. So check if the VM page(s) exist, and skip the WAL + // record if it doesn't. + let vm_size = get_relsize(modification, vm_rel, ctx).await?; + if let Some(blknum) = new_vm_blk { + if blknum >= vm_size { + new_vm_blk = None; + } + } + if let Some(blknum) = old_vm_blk { + if blknum >= vm_size { + old_vm_blk = None; + } + } + + if new_vm_blk.is_some() || old_vm_blk.is_some() { + if new_vm_blk == old_vm_blk { + // An UPDATE record that needs to clear the bits for both old and the + // new page, both of which reside on the same VM page. + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk.unwrap(), + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } else { + // Clear VM bits for one heap page, or for two pages that reside on + // different VM pages. + if let Some(new_vm_blk) = new_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + new_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno, + old_heap_blkno: None, + flags, + }, + ctx, + ) + .await?; + } + if let Some(old_vm_blk) = old_vm_blk { + self.put_rel_wal_record( + modification, + vm_rel, + old_vm_blk, + NeonWalRecord::ClearVisibilityMapFlags { + new_heap_blkno: None, + old_heap_blkno, + flags, + }, + ctx, + ) + .await?; + } + } + } + + Ok(()) + } + + fn decode_heapam_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. // First, look at the record to determine which VM bits need @@ -725,7 +748,7 @@ impl WalIngest { let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - match modification.tline.pg_version { + match pg_version { 14 => { if decoded.xl_rmid == pg_constants::RM_HEAP_ID { let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; @@ -997,7 +1020,6 @@ impl WalIngest { _ => {} } - // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { forknum: VISIBILITYMAP_FORKNUM, @@ -1006,89 +1028,22 @@ impl WalIngest { relnode: decoded.blocks[0].rnode_relnode, }; - let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. - // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. 
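The existence check described in the comment above boils down to a small piece of block arithmetic: map the heap block to its visibility-map block, then drop the update if that VM block lies beyond the current size of the VM fork. A self-contained sketch follows; the heap-blocks-per-VM-page constant is an assumption based on stock 8 KiB Postgres pages with 2 VM bits per heap block, and the helper names are invented rather than quoted from `pg_constants`.

```rust
/// Assumed value for 8 KiB pages; treat as illustrative, not authoritative.
const HEAPBLOCKS_PER_VM_PAGE: u32 = 32672;

/// Each VM page tracks a fixed number of heap blocks, so clearing a heap
/// block's VM bits means touching VM block `heap_blkno / HEAPBLOCKS_PER_VM_PAGE`.
fn heapblk_to_mapblock(heap_blkno: u32) -> u32 {
    heap_blkno / HEAPBLOCKS_PER_VM_PAGE
}

/// Mirror of the guard above: only return a VM block to clear if it actually
/// exists in the VM fork, whose current size is `vm_size_blocks`.
fn vm_block_to_clear(heap_blkno: Option<u32>, vm_size_blocks: u32) -> Option<u32> {
    heap_blkno
        .map(heapblk_to_mapblock)
        .filter(|&vm_blk| vm_blk < vm_size_blocks)
}

fn main() {
    // Heap block 70_000 maps to VM block 2; if the VM fork only has blocks 0 and 1,
    // the clear-bits record is skipped instead of failing at replay time.
    assert_eq!(heapblk_to_mapblock(70_000), 2);
    assert_eq!(vm_block_to_clear(Some(70_000), 2), None);
    assert_eq!(vm_block_to_clear(Some(70_000), 3), Some(2));
    assert_eq!(vm_block_to_clear(None, 3), None);
}
```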
- let vm_size = get_relsize(modification, vm_rel, ctx).await?; - if let Some(blknum) = new_vm_blk { - if blknum >= vm_size { - new_vm_blk = None; - } - } - if let Some(blknum) = old_vm_blk { - if blknum >= vm_size { - old_vm_blk = None; - } - } - - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk.unwrap(), - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. - if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } - } - } + Ok(Some(HeapamRecord::ClearVmBits(ClearVmBits { + new_heap_blkno, + old_heap_blkno, + vm_rel, + flags, + }))) + } else { + Ok(None) } - - Ok(()) } - async fn ingest_neonrmgr_record( - &mut self, + fn decode_neonmgr_record( buf: &mut Bytes, - modification: &mut DatadirModification<'_>, decoded: &DecodedWALRecord, - ctx: &RequestContext, - ) -> anyhow::Result<()> { + pg_version: u32, + ) -> anyhow::Result> { // Handle VM bit updates that are implicitly part of heap records. // First, look at the record to determine which VM bits need @@ -1097,7 +1052,6 @@ impl WalIngest { let mut new_heap_blkno: Option = None; let mut old_heap_blkno: Option = None; let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - let pg_version = modification.tline.pg_version; assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); @@ -1168,7 +1122,6 @@ impl WalIngest { ), } - // Clear the VM bits if required. if new_heap_blkno.is_some() || old_heap_blkno.is_some() { let vm_rel = RelTag { forknum: VISIBILITYMAP_FORKNUM, @@ -1177,93 +1130,30 @@ impl WalIngest { relnode: decoded.blocks[0].rnode_relnode, }; - let mut new_vm_blk = new_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - let mut old_vm_blk = old_heap_blkno.map(pg_constants::HEAPBLK_TO_MAPBLOCK); - - // Sometimes, Postgres seems to create heap WAL records with the - // ALL_VISIBLE_CLEARED flag set, even though the bit in the VM page is - // not set. In fact, it's possible that the VM page does not exist at all. - // In that case, we don't want to store a record to clear the VM bit; - // replaying it would fail to find the previous image of the page, because - // it doesn't exist. So check if the VM page(s) exist, and skip the WAL - // record if it doesn't. - let vm_size = get_relsize(modification, vm_rel, ctx).await?; - if let Some(blknum) = new_vm_blk { - if blknum >= vm_size { - new_vm_blk = None; - } - } - if let Some(blknum) = old_vm_blk { - if blknum >= vm_size { - old_vm_blk = None; - } - } - - if new_vm_blk.is_some() || old_vm_blk.is_some() { - if new_vm_blk == old_vm_blk { - // An UPDATE record that needs to clear the bits for both old and the - // new page, both of which reside on the same VM page. 
- self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk.unwrap(), - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } else { - // Clear VM bits for one heap page, or for two pages that reside on - // different VM pages. - if let Some(new_vm_blk) = new_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - new_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno, - old_heap_blkno: None, - flags, - }, - ctx, - ) - .await?; - } - if let Some(old_vm_blk) = old_vm_blk { - self.put_rel_wal_record( - modification, - vm_rel, - old_vm_blk, - NeonWalRecord::ClearVisibilityMapFlags { - new_heap_blkno: None, - old_heap_blkno, - flags, - }, - ctx, - ) - .await?; - } - } - } + Ok(Some(NeonrmgrRecord::ClearVmBits(ClearVmBits { + new_heap_blkno, + old_heap_blkno, + vm_rel, + flags, + }))) + } else { + Ok(None) } - - Ok(()) } /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. async fn ingest_xlog_dbase_create( &mut self, + create: DbaseCreate, modification: &mut DatadirModification<'_>, - rec: &XlCreateDatabase, ctx: &RequestContext, ) -> anyhow::Result<()> { - let db_id = rec.db_id; - let tablespace_id = rec.tablespace_id; - let src_db_id = rec.src_db_id; - let src_tablespace_id = rec.src_tablespace_id; + let DbaseCreate { + db_id, + tablespace_id, + src_db_id, + src_tablespace_id, + } = create; let rels = modification .tline @@ -1349,46 +1239,209 @@ impl WalIngest { Ok(()) } - async fn ingest_xlog_smgr_create( + async fn ingest_xlog_dbase_drop( &mut self, + dbase_drop: DbaseDrop, modification: &mut DatadirModification<'_>, - rec: &XlSmgrCreate, ctx: &RequestContext, ) -> anyhow::Result<()> { - let rel = RelTag { - spcnode: rec.rnode.spcnode, - dbnode: rec.rnode.dbnode, - relnode: rec.rnode.relnode, - forknum: rec.forknum, - }; + let DbaseDrop { + db_id, + tablespace_ids, + } = dbase_drop; + for tablespace_id in tablespace_ids { + trace!("Drop db {}, {}", tablespace_id, db_id); + modification.drop_dbdir(tablespace_id, db_id, ctx).await?; + } + + Ok(()) + } + + fn decode_dbase_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // TODO: Refactor this to avoid the duplication between postgres versions. + + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + debug!(%info, %pg_version, "handle RM_DBASE_ID"); + + if pg_version == 14 { + if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { + let createdb = XlCreateDatabase::decode(buf); + debug!("XLOG_DBASE_CREATE v14"); + + let record = DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + }); + + return Ok(Some(record)); + } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + + let record = DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + }); + + return Ok(Some(record)); + } + } else if pg_version == 15 { + if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + }); + + return Ok(Some(record)); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + }); + + return Ok(Some(record)); + } + } else if pg_version == 16 { + if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + }); + + return Ok(Some(record)); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + }); + + return Ok(Some(record)); + } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + }); + + return Ok(Some(record)); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + }); + + return Ok(Some(record)); + } + } + + Ok(None) + } + + async fn ingest_xlog_smgr_create( + &mut self, + create: SmgrCreate, + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let SmgrCreate { rel } = create; self.put_rel_creation(modification, rel, ctx).await?; Ok(()) } + fn decode_smgr_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + _pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_SMGR_CREATE { + let create = XlSmgrCreate::decode(buf); + let rel = RelTag { + spcnode: create.rnode.spcnode, + dbnode: create.rnode.dbnode, + relnode: create.rnode.relnode, + forknum: create.forknum, + }; + + return Ok(Some(SmgrRecord::Create(SmgrCreate { rel }))); + } else if info == pg_constants::XLOG_SMGR_TRUNCATE { + let truncate = XlSmgrTruncate::decode(buf); + return Ok(Some(SmgrRecord::Truncate(truncate))); + } + + Ok(None) + } + /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. 
/// /// This is the same logic as in PostgreSQL's smgr_redo() function. async fn ingest_xlog_smgr_truncate( &mut self, + truncate: XlSmgrTruncate, modification: &mut DatadirModification<'_>, - rec: &XlSmgrTruncate, ctx: &RequestContext, ) -> anyhow::Result<()> { - let spcnode = rec.rnode.spcnode; - let dbnode = rec.rnode.dbnode; - let relnode = rec.rnode.relnode; + let XlSmgrTruncate { + blkno, + rnode, + flags, + } = truncate; - if (rec.flags & pg_constants::SMGR_TRUNCATE_HEAP) != 0 { + let spcnode = rnode.spcnode; + let dbnode = rnode.dbnode; + let relnode = rnode.relnode; + + if flags & pg_constants::SMGR_TRUNCATE_HEAP != 0 { let rel = RelTag { spcnode, dbnode, relnode, forknum: MAIN_FORKNUM, }; - self.put_rel_truncation(modification, rel, rec.blkno, ctx) + + self.put_rel_truncation(modification, rel, blkno, ctx) .await?; } - if (rec.flags & pg_constants::SMGR_TRUNCATE_FSM) != 0 { + if flags & pg_constants::SMGR_TRUNCATE_FSM != 0 { let rel = RelTag { spcnode, dbnode, @@ -1396,9 +1449,9 @@ impl WalIngest { forknum: FSM_FORKNUM, }; - let fsm_logical_page_no = rec.blkno / pg_constants::SLOTS_PER_FSM_PAGE; + let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE; let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no); - if rec.blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { + if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { // Tail of last remaining FSM page has to be zeroed. // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; @@ -1411,7 +1464,7 @@ impl WalIngest { .await?; } } - if (rec.flags & pg_constants::SMGR_TRUNCATE_VM) != 0 { + if flags & pg_constants::SMGR_TRUNCATE_VM != 0 { let rel = RelTag { spcnode, dbnode, @@ -1419,8 +1472,8 @@ impl WalIngest { forknum: VISIBILITYMAP_FORKNUM, }; - let mut vm_page_no = rec.blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE; - if rec.blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { + let mut vm_page_no = blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE; + if blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { // Tail of last remaining vm page has to be zeroed. // We are not precise here and instead of digging in VM bitmap format just clear the whole page. modification.put_rel_page_image_zero(rel, vm_page_no)?; @@ -1493,12 +1546,32 @@ impl WalIngest { /// async fn ingest_xact_record( &mut self, + record: XactRecord, modification: &mut DatadirModification<'_>, - parsed: &XlXactParsedRecord, - is_commit: bool, - origin_id: u16, ctx: &RequestContext, ) -> anyhow::Result<()> { + let (xact_common, is_commit, is_prepared) = match record { + XactRecord::Prepare(XactPrepare { xl_xid, data }) => { + let xid: u64 = if modification.tline.pg_version >= 17 { + self.adjust_to_full_transaction_id(xl_xid)? + } else { + xl_xid as u64 + }; + return modification.put_twophase_file(xid, data, ctx).await; + } + XactRecord::Commit(common) => (common, true, false), + XactRecord::Abort(common) => (common, false, false), + XactRecord::CommitPrepared(common) => (common, true, true), + XactRecord::AbortPrepared(common) => (common, false, true), + }; + + let XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + } = xact_common; + // Record update of CLOG pages let mut pageno = parsed.xid / pg_constants::CLOG_XACTS_PER_PAGE; let mut segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; @@ -1569,18 +1642,95 @@ impl WalIngest { .set_replorigin(origin_id, parsed.origin_lsn) .await?; } + + if is_prepared { + // Remove twophase file. 
see RemoveTwoPhaseFile() in postgres code + trace!( + "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}", + xl_xid, + parsed.xid, + lsn, + ); + + let xid: u64 = if modification.tline.pg_version >= 17 { + self.adjust_to_full_transaction_id(parsed.xid)? + } else { + parsed.xid as u64 + }; + modification.drop_twophase_file(xid, ctx).await?; + } + Ok(()) } - async fn ingest_clog_truncate_record( + // TODO(vlad): Standardise interface for `decode_...` + fn decode_xact_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + lsn: Lsn, + _pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; + let origin_id = decoded.origin_id; + let xl_xid = decoded.xl_xid; + + if info == pg_constants::XLOG_XACT_COMMIT { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(XactRecord::Commit(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }))); + } else if info == pg_constants::XLOG_XACT_ABORT { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(XactRecord::Abort(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }))); + } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(XactRecord::CommitPrepared(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }))); + } else if info == pg_constants::XLOG_XACT_ABORT_PREPARED { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(XactRecord::AbortPrepared(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }))); + } else if info == pg_constants::XLOG_XACT_PREPARE { + return Ok(Some(XactRecord::Prepare(XactPrepare { + xl_xid: decoded.xl_xid, + data: Bytes::copy_from_slice(&buf[..]), + }))); + } + + Ok(None) + } + + async fn ingest_clog_truncate( &mut self, + truncate: ClogTruncate, modification: &mut DatadirModification<'_>, - xlrec: &XlClogTruncate, ctx: &RequestContext, ) -> anyhow::Result<()> { + let ClogTruncate { + pageno, + oldest_xid, + oldest_xid_db, + } = truncate; + info!( "RM_CLOG_ID truncate pageno {} oldestXid {} oldestXidDB {}", - xlrec.pageno, xlrec.oldest_xid, xlrec.oldest_xid_db + pageno, oldest_xid, oldest_xid_db ); // In Postgres, oldestXid and oldestXidDB are updated in memory when the CLOG is @@ -1588,8 +1738,8 @@ impl WalIngest { // later. In Neon, a server can start at any LSN, not just on a checkpoint record, // so we keep the oldestXid and oldestXidDB up-to-date. enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { - cp.oldestXid = xlrec.oldest_xid; - cp.oldestXidDB = xlrec.oldest_xid_db; + cp.oldestXid = oldest_xid; + cp.oldestXidDB = oldest_xid_db; }); self.checkpoint_modified = true; @@ -1606,7 +1756,7 @@ impl WalIngest { // the current endpoint page must not be eligible for removal. 
// See SimpleLruTruncate() in slru.c if dispatch_pgversion!(modification.tline.pg_version, { - pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, xlrec.pageno) + pgv::nonrelfile_utils::clogpage_precedes(latest_page_number, pageno) }) { info!("could not truncate directory pg_xact apparent wraparound"); return Ok(()); @@ -1626,7 +1776,7 @@ impl WalIngest { let segpage = segno * pg_constants::SLRU_PAGES_PER_SEGMENT; let may_delete = dispatch_pgversion!(modification.tline.pg_version, { - pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, xlrec.pageno) + pgv::nonrelfile_utils::slru_may_delete_clogsegment(segpage, pageno) }); if may_delete { @@ -1640,7 +1790,55 @@ impl WalIngest { Ok(()) } - fn ingest_multixact_create_record( + async fn ingest_clog_zero_page( + &mut self, + zero_page: ClogZeroPage, + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let ClogZeroPage { segno, rpageno } = zero_page; + + self.put_slru_page_image( + modification, + SlruKind::Clog, + segno, + rpageno, + ZERO_PAGE.clone(), + ctx, + ) + .await + } + + fn decode_clog_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; + + if info == pg_constants::CLOG_ZEROPAGE { + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + Ok(Some(ClogRecord::ZeroPage(ClogZeroPage { segno, rpageno }))) + } else { + assert!(info == pg_constants::CLOG_TRUNCATE); + let xlrec = XlClogTruncate::decode(buf, pg_version); + + Ok(Some(ClogRecord::Truncate(ClogTruncate { + pageno: xlrec.pageno, + oldest_xid: xlrec.oldest_xid, + oldest_xid_db: xlrec.oldest_xid_db, + }))) + } + } + + fn ingest_multixact_create( &mut self, modification: &mut DatadirModification, xlrec: &XlMultiXactCreate, @@ -1742,7 +1940,7 @@ impl WalIngest { Ok(()) } - async fn ingest_multixact_truncate_record( + async fn ingest_multixact_truncate( &mut self, modification: &mut DatadirModification<'_>, xlrec: &XlMultiXactTruncate, @@ -1788,26 +1986,315 @@ impl WalIngest { Ok(()) } - async fn ingest_relmap_page( + async fn ingest_multixact_zero_page( &mut self, + zero_page: MultiXactZeroPage, modification: &mut DatadirModification<'_>, - xlrec: &XlRelmapUpdate, - decoded: &DecodedWALRecord, ctx: &RequestContext, ) -> Result<()> { + let MultiXactZeroPage { + slru_kind, + segno, + rpageno, + } = zero_page; + self.put_slru_page_image( + modification, + slru_kind, + segno, + rpageno, + ZERO_PAGE.clone(), + ctx, + ) + .await + } + + fn decode_multixact_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + + if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE + || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE + { + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + let slru_kind = match info { + pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets, + pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers, + _ => unreachable!(), + }; + + return Ok(Some(MultiXactRecord::ZeroPage(MultiXactZeroPage { + slru_kind, + segno, + rpageno, 
+ }))); + } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { + let xlrec = XlMultiXactCreate::decode(buf); + return Ok(Some(MultiXactRecord::Create(xlrec))); + } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { + let xlrec = XlMultiXactTruncate::decode(buf); + return Ok(Some(MultiXactRecord::Truncate(xlrec))); + } + + Ok(None) + } + + async fn ingest_relmap_update( + &mut self, + update: RelmapUpdate, + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + ) -> Result<()> { + let RelmapUpdate { update, buf } = update; + + modification + .put_relmap_file(update.tsid, update.dbid, buf, ctx) + .await + } + + fn decode_relmap_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + _pg_version: u32, + ) -> anyhow::Result> { + let update = XlRelmapUpdate::decode(buf); + let mut buf = decoded.record.clone(); buf.advance(decoded.main_data_offset); // skip xl_relmap_update buf.advance(12); - modification - .put_relmap_file( - xlrec.tsid, - xlrec.dbid, - Bytes::copy_from_slice(&buf[..]), - ctx, - ) - .await + Ok(Some(RelmapRecord::Update(RelmapUpdate { + update, + buf: Bytes::copy_from_slice(&buf[..]), + }))) + } + + async fn ingest_raw_xlog_record( + &mut self, + raw_record: RawXlogRecord, + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + ) -> Result<()> { + let RawXlogRecord { info, lsn, mut buf } = raw_record; + let pg_version = modification.tline.pg_version; + + if info == pg_constants::XLOG_PARAMETER_CHANGE { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlParameterChange::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_END_OF_RECOVERY { + if let CheckPoint::V17(cp) = &mut self.checkpoint { + let rec = v17::XlEndOfRecovery::decode(&mut buf); + cp.wal_level = rec.wal_level; + self.checkpoint_modified = true; + } + } + + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + if info == pg_constants::XLOG_NEXTOID { + let next_oid = buf.get_u32_le(); + if cp.nextOid != next_oid { + cp.nextOid = next_oid; + self.checkpoint_modified = true; + } + } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE + || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let mut checkpoint_bytes = [0u8; pgv::xlog_utils::SIZEOF_CHECKPOINT]; + buf.copy_to_slice(&mut checkpoint_bytes); + let xlog_checkpoint = pgv::CheckPoint::decode(&checkpoint_bytes)?; + trace!( + "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}", + xlog_checkpoint.oldestXid, + cp.oldestXid + ); + if (cp.oldestXid.wrapping_sub(xlog_checkpoint.oldestXid) as i32) < 0 { + cp.oldestXid = xlog_checkpoint.oldestXid; + } + trace!( + "xlog_checkpoint.oldestActiveXid={}, checkpoint.oldestActiveXid={}", + xlog_checkpoint.oldestActiveXid, + cp.oldestActiveXid + ); + + // A shutdown checkpoint has `oldestActiveXid == InvalidTransactionid`, + // because at shutdown, all in-progress transactions will implicitly + // end. Postgres startup code knows that, and allows hot standby to start + // immediately from a shutdown checkpoint. + // + // In Neon, Postgres hot standby startup always behaves as if starting from + // an online checkpoint. It needs a valid `oldestActiveXid` value, so + // instead of overwriting self.checkpoint.oldestActiveXid with + // InvalidTransactionid from the checkpoint WAL record, update it to a + // proper value, knowing that there are no in-progress transactions at this + // point, except for prepared transactions. 
+ // + // See also the neon code changes in the InitWalRecovery() function. + if xlog_checkpoint.oldestActiveXid == pg_constants::INVALID_TRANSACTION_ID + && info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN + { + let oldest_active_xid = if pg_version >= 17 { + let mut oldest_active_full_xid = cp.nextXid.value; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + if xid < oldest_active_full_xid { + oldest_active_full_xid = xid; + } + } + oldest_active_full_xid as u32 + } else { + let mut oldest_active_xid = cp.nextXid.value as u32; + for xid in modification.tline.list_twophase_files(lsn, ctx).await? { + let narrow_xid = xid as u32; + if (narrow_xid.wrapping_sub(oldest_active_xid) as i32) < 0 { + oldest_active_xid = narrow_xid; + } + } + oldest_active_xid + }; + cp.oldestActiveXid = oldest_active_xid; + } else { + cp.oldestActiveXid = xlog_checkpoint.oldestActiveXid; + } + + // Write a new checkpoint key-value pair on every checkpoint record, even + // if nothing really changed. Not strictly required, but it seems nice to + // have some trace of the checkpoint records in the layer files at the same + // LSNs. + self.checkpoint_modified = true; + } + }); + + Ok(()) + } + + fn decode_xlog_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + lsn: Lsn, + _pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + Ok(Some(XlogRecord::Raw(RawXlogRecord { + info, + lsn, + buf: buf.clone(), + }))) + } + + async fn ingest_logical_message_put( + &mut self, + put: PutLogicalMessage, + modification: &mut DatadirModification<'_>, + ctx: &RequestContext, + ) -> Result<()> { + let PutLogicalMessage { path, buf } = put; + modification.put_file(path.as_str(), &buf, ctx).await + } + + fn decode_logical_message_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + _pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_LOGICAL_MESSAGE { + let xlrec = crate::walrecord::XlLogicalMessage::decode(buf); + let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; + + #[cfg(feature = "testing")] + if prefix == "neon-test" { + return Ok(Some(LogicalMessageRecord::Failpoint)); + } + + if let Some(path) = prefix.strip_prefix("neon-file:") { + let buf_size = xlrec.prefix_size + xlrec.message_size; + let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]); + return Ok(Some(LogicalMessageRecord::Put(PutLogicalMessage { + path: path.to_string(), + buf, + }))); + } + } + + Ok(None) + } + + fn decode_standby_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + _pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = crate::walrecord::XlRunningXacts::decode(buf); + return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts { + oldest_running_xid: xlrec.oldest_running_xid, + }))); + } + + Ok(None) + } + + fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { + match record { + StandbyRecord::RunningXacts(running_xacts) => { + enum_pgversion_dispatch!(&mut self.checkpoint, CheckPoint, cp, { + cp.oldestActiveXid = running_xacts.oldest_running_xid; + }); + + self.checkpoint_modified = true; + } + } + + Ok(()) + } + + fn decode_replorigin_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + _pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if 
info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = crate::walrecord::XlReploriginSet::decode(buf); + return Ok(Some(ReploriginRecord::Set(xlrec))); + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = crate::walrecord::XlReploriginDrop::decode(buf); + return Ok(Some(ReploriginRecord::Drop(xlrec))); + } + + Ok(None) + } + + async fn ingest_replorigin_record( + &mut self, + record: ReploriginRecord, + modification: &mut DatadirModification<'_>, + ) -> Result<()> { + match record { + ReploriginRecord::Set(set) => { + modification + .set_replorigin(set.node_id, set.remote_lsn) + .await?; + } + ReploriginRecord::Drop(drop) => { + modification.drop_replorigin(drop.node_id).await?; + } + } + + Ok(()) } async fn put_rel_creation( diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 5561a128b7..fc9adb14c9 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -435,7 +435,9 @@ def test_emergency_relocate_with_branches_slow_replay( # This fail point will pause the WAL ingestion on the main branch, after the # the first insert - pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")]) + pageserver_http.configure_failpoints( + [("pageserver-wal-ingest-logical-message-sleep", "return(5000)")] + ) # Attach and wait a few seconds to give it time to load the tenants, attach to the # safekeepers, and to stream and ingest the WAL up to the pause-point. @@ -453,11 +455,13 @@ def test_emergency_relocate_with_branches_slow_replay( assert cur.fetchall() == [("before pause",), ("after pause",)] # Sanity check that the failpoint was reached - env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains( + 'failpoint "pageserver-wal-ingest-logical-message-sleep": sleep done' + ) assert time.time() - before_attach_time > 5 # Clean up - pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off")) + pageserver_http.configure_failpoints(("pageserver-wal-ingest-logical-message-sleep", "off")) # Simulate hard crash of pageserver and re-attach a tenant with a branch @@ -581,7 +585,9 @@ def test_emergency_relocate_with_branches_createdb( # bug reproduced easily even without this, as there is always some delay between # loading the timeline and establishing the connection to the safekeeper to stream and # ingest the WAL, but let's make this less dependent on accidental timing. 
- pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")]) + pageserver_http.configure_failpoints( + [("pageserver-wal-ingest-logical-message-sleep", "return(5000)")] + ) before_attach_time = time.time() env.pageserver.tenant_attach(tenant_id) @@ -590,8 +596,10 @@ def test_emergency_relocate_with_branches_createdb( assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200 # Sanity check that the failpoint was reached - env.pageserver.assert_log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done') + env.pageserver.assert_log_contains( + 'failpoint "pageserver-wal-ingest-logical-message-sleep": sleep done' + ) assert time.time() - before_attach_time > 5 # Clean up - pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off")) + pageserver_http.configure_failpoints(("pageserver-wal-ingest-logical-message-sleep", "off")) From b782b11b33b505ae16667fc0139b621c1885ff3a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 25 Oct 2024 12:04:27 +0200 Subject: [PATCH 079/239] refactor(timeline creation): represent bootstrap vs branch using enum (#9366) # Problem Timeline creation can either be bootstrap or branch. The distinction is made based on whether the `ancestor_*` fields are present or not. In the PGDATA import code (https://github.com/neondatabase/neon/pull/9218), I add a third variant to timeline creation. # Solution The above pushed me to refactor the code in Pageserver to distinguish the different creation requests through enum variants. There is no externally observable effect from this change. On the implementation level, a notable change is that the acquisition of the `TimelineCreationGuard` happens later than before. This is necessary so that we have everything in place to construct the `CreateTimelineIdempotency`. Notably, this moves the acquisition of the creation guard _after_ the acquisition of the `gc_cs` lock in the case of branching. This might appear as if we're at risk of holding `gc_cs` longer than before this PR, but, even before this PR, we were holding `gc_cs` until after the `wait_completion()` that makes the timeline creation durable in S3 returns. I don't see any deadlock risk with reversing the lock acquisition order. As a drive-by change, I found that the `create_timeline()` function in `neon_local` is unused, so I removed it. 
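To make the flattened `#[serde(untagged)]` request mode concrete, here is a small standalone sketch. It is not the pageserver code: `TimelineId` and `Lsn` are replaced by plain strings, and only the fields that drive variant selection are kept.

```rust
use serde::Deserialize;

// Simplified stand-in for TimelineCreateRequestMode (illustration only).
#[derive(Debug, Deserialize)]
#[serde(untagged)]
enum CreateMode {
    // Chosen when `ancestor_timeline_id` is present in the request body.
    Branch {
        ancestor_timeline_id: String,
        ancestor_start_lsn: Option<String>,
    },
    // All-optional fallback: serde tries variants in declaration order,
    // so Bootstrap only matches bodies that Branch rejected.
    Bootstrap {
        existing_initdb_timeline_id: Option<String>,
        pg_version: Option<u32>,
    },
}

fn main() {
    let branch: CreateMode =
        serde_json::from_str(r#"{"ancestor_timeline_id": "de200bd42b49cc1814412c7e592dd6e9"}"#)
            .unwrap();
    let bootstrap: CreateMode = serde_json::from_str(r#"{"pg_version": 16}"#).unwrap();
    println!("{branch:?}");    // prints the Branch variant
    println!("{bootstrap:?}"); // prints the Bootstrap variant
}
```

This is also why declaration order matters: an all-optional `Bootstrap` declared first would match every body, so it has to come after `Branch`.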
# Refs * platform context: https://github.com/neondatabase/neon/pull/9218 * product context: https://github.com/neondatabase/cloud/issues/17507 * next PR stacked atop this one: https://github.com/neondatabase/neon/pull/9501 --- control_plane/src/bin/neon_local.rs | 25 +- control_plane/src/pageserver.rs | 22 -- libs/pageserver_api/src/models.rs | 31 +- pageserver/src/http/routes.rs | 42 ++- pageserver/src/tenant.rs | 371 +++++++++++++++++------- storage_controller/src/service.rs | 8 +- test_runner/fixtures/pageserver/http.py | 11 +- 7 files changed, 334 insertions(+), 176 deletions(-) diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 624936620d..48438adf43 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -1073,10 +1073,10 @@ async fn handle_tenant(subcmd: &TenantCmd, env: &mut local_env::LocalEnv) -> any tenant_id, TimelineCreateRequest { new_timeline_id, - ancestor_timeline_id: None, - ancestor_start_lsn: None, - existing_initdb_timeline_id: None, - pg_version: Some(args.pg_version), + mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap { + existing_initdb_timeline_id: None, + pg_version: Some(args.pg_version), + }, }, ) .await?; @@ -1133,10 +1133,10 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, - ancestor_timeline_id: None, - existing_initdb_timeline_id: None, - ancestor_start_lsn: None, - pg_version: Some(args.pg_version), + mode: pageserver_api::models::TimelineCreateRequestMode::Bootstrap { + existing_initdb_timeline_id: None, + pg_version: Some(args.pg_version), + }, }; let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) @@ -1189,10 +1189,11 @@ async fn handle_timeline(cmd: &TimelineCmd, env: &mut local_env::LocalEnv) -> Re let storage_controller = StorageController::from_env(env); let create_req = TimelineCreateRequest { new_timeline_id, - ancestor_timeline_id: Some(ancestor_timeline_id), - existing_initdb_timeline_id: None, - ancestor_start_lsn: start_lsn, - pg_version: None, + mode: pageserver_api::models::TimelineCreateRequestMode::Branch { + ancestor_timeline_id, + ancestor_start_lsn: start_lsn, + pg_version: None, + }, }; let timeline_info = storage_controller .tenant_timeline_create(tenant_id, create_req) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index cae9416af6..5b5828c6ed 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -529,28 +529,6 @@ impl PageServerNode { Ok(self.http_client.list_timelines(*tenant_shard_id).await?) } - pub async fn timeline_create( - &self, - tenant_shard_id: TenantShardId, - new_timeline_id: TimelineId, - ancestor_start_lsn: Option, - ancestor_timeline_id: Option, - pg_version: Option, - existing_initdb_timeline_id: Option, - ) -> anyhow::Result { - let req = models::TimelineCreateRequest { - new_timeline_id, - ancestor_start_lsn, - ancestor_timeline_id, - pg_version, - existing_initdb_timeline_id, - }; - Ok(self - .http_client - .timeline_create(tenant_shard_id, &req) - .await?) 
- } - /// Import a basebackup prepared using either: /// a) `pg_basebackup -F tar`, or /// b) The `fullbackup` pageserver endpoint diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d0ee4b64d1..8684927554 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -211,13 +211,30 @@ pub enum TimelineState { #[derive(Serialize, Deserialize, Clone)] pub struct TimelineCreateRequest { pub new_timeline_id: TimelineId, - #[serde(default)] - pub ancestor_timeline_id: Option, - #[serde(default)] - pub existing_initdb_timeline_id: Option, - #[serde(default)] - pub ancestor_start_lsn: Option, - pub pg_version: Option, + #[serde(flatten)] + pub mode: TimelineCreateRequestMode, +} + +#[derive(Serialize, Deserialize, Clone)] +#[serde(untagged)] +pub enum TimelineCreateRequestMode { + Branch { + ancestor_timeline_id: TimelineId, + #[serde(default)] + ancestor_start_lsn: Option, + // TODO: cplane sets this, but, the branching code always + // inherits the ancestor's pg_version. Earlier code wasn't + // using a flattened enum, so, it was an accepted field, and + // we continue to accept it by having it here. + pg_version: Option, + }, + // NB: Bootstrap is all-optional, and thus the serde(untagged) will cause serde to stop at Bootstrap. + // (serde picks the first matching enum variant, in declaration order). + Bootstrap { + #[serde(default)] + existing_initdb_timeline_id: Option, + pg_version: Option, + }, } #[derive(Serialize, Deserialize, Clone)] diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2490bf5f20..bc03df9ad2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -38,6 +38,7 @@ use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; use pageserver_api::models::TimelineArchivalConfigRequest; +use pageserver_api::models::TimelineCreateRequestMode; use pageserver_api::models::TimelinesInfoAndOffloaded; use pageserver_api::models::TopTenantShardItem; use pageserver_api::models::TopTenantShardsRequest; @@ -85,6 +86,7 @@ use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; use crate::tenant::OffloadedTimeline; use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError}; +use crate::DEFAULT_PG_VERSION; use crate::{disk_usage_eviction_task, tenant}; use pageserver_api::models::{ StatusResponse, TenantConfigRequest, TenantInfo, TimelineCreateRequest, TimelineGcRequest, @@ -547,6 +549,26 @@ async fn timeline_create_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let new_timeline_id = request_data.new_timeline_id; + // fill in the default pg_version if not provided & convert request into domain model + let params: tenant::CreateTimelineParams = match request_data.mode { + TimelineCreateRequestMode::Bootstrap { + existing_initdb_timeline_id, + pg_version, + } => tenant::CreateTimelineParams::Bootstrap(tenant::CreateTimelineParamsBootstrap { + new_timeline_id, + existing_initdb_timeline_id, + pg_version: pg_version.unwrap_or(DEFAULT_PG_VERSION), + }), + TimelineCreateRequestMode::Branch { + ancestor_timeline_id, + ancestor_start_lsn, + pg_version: _, + } => tenant::CreateTimelineParams::Branch(tenant::CreateTimelineParamsBranch { + new_timeline_id, + ancestor_timeline_id, + ancestor_start_lsn, + }), + }; let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error); @@ -559,22 +581,12 @@ async fn 
timeline_create_handler( tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?; - if let Some(ancestor_id) = request_data.ancestor_timeline_id.as_ref() { - tracing::info!(%ancestor_id, "starting to branch"); - } else { - tracing::info!("bootstrapping"); - } + // earlier versions of the code had pg_version and ancestor_lsn in the span + // => continue to provide that information, but, through a log message that doesn't require us to destructure + tracing::info!(?params, "creating timeline"); match tenant - .create_timeline( - new_timeline_id, - request_data.ancestor_timeline_id, - request_data.ancestor_start_lsn, - request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION), - request_data.existing_initdb_timeline_id, - state.broker_client.clone(), - &ctx, - ) + .create_timeline(params, state.broker_client.clone(), &ctx) .await { Ok(new_timeline) => { @@ -625,8 +637,6 @@ async fn timeline_create_handler( tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %new_timeline_id, - lsn=?request_data.ancestor_start_lsn, - pg_version=?request_data.pg_version )) .await } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d503b299c1..d8ce916bcb 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -737,6 +737,83 @@ impl Debug for SetStoppingError { } } +/// Arguments to [`Tenant::create_timeline`]. +/// +/// Not usable as an idempotency key for timeline creation because if [`CreateTimelineParamsBranch::ancestor_start_lsn`] +/// is `None`, the result of the timeline create call is not deterministic. +/// +/// See [`CreateTimelineIdempotency`] for an idempotency key. +#[derive(Debug)] +pub(crate) enum CreateTimelineParams { + Bootstrap(CreateTimelineParamsBootstrap), + Branch(CreateTimelineParamsBranch), +} + +#[derive(Debug)] +pub(crate) struct CreateTimelineParamsBootstrap { + pub(crate) new_timeline_id: TimelineId, + pub(crate) existing_initdb_timeline_id: Option, + pub(crate) pg_version: u32, +} + +/// NB: See comment on [`CreateTimelineIdempotency::Branch`] for why there's no `pg_version` here. +#[derive(Debug)] +pub(crate) struct CreateTimelineParamsBranch { + pub(crate) new_timeline_id: TimelineId, + pub(crate) ancestor_timeline_id: TimelineId, + pub(crate) ancestor_start_lsn: Option, +} + +/// What is used to determine idempotency of a [`Tenant::create_timeline`] call. +/// +/// Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN. +/// +/// We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`] +/// is not considered for idempotency. +/// +/// We can improve on this over time. +pub(crate) enum CreateTimelineIdempotency { + Bootstrap { + pg_version: u32, + }, + /// NB: branches always have the same `pg_version` as their ancestor. + /// While [`pageserver_api::models::TimelineCreateRequestMode::Branch::pg_version`] + /// exists as a field, and is set by cplane, it has always been ignored by pageserver when + /// determining the child branch pg_version. + Branch { + ancestor_timeline_id: TimelineId, + ancestor_start_lsn: Lsn, + }, +} + +/// What is returned by [`Tenant::start_creating_timeline`]. +#[must_use] +enum StartCreatingTimelineResult<'t> { + CreateGuard(TimelineCreateGuard<'t>), + Idempotent(Arc), +} + +/// What is returned by [`Tenant::create_timeline`]. 
+enum CreateTimelineResult { + Created(Arc), + Idempotent(Arc), +} + +impl CreateTimelineResult { + fn timeline(&self) -> &Arc { + match self { + Self::Created(t) | Self::Idempotent(t) => t, + } + } + /// Unit test timelines aren't activated, test has to do it if it needs to. + #[cfg(test)] + fn into_timeline_for_test(self) -> Arc { + match self { + Self::Created(t) | Self::Idempotent(t) => t, + } + } +} + #[derive(thiserror::Error, Debug)] pub enum CreateTimelineError { #[error("creation of timeline with the given ID is in progress")] @@ -2090,11 +2167,7 @@ impl Tenant { #[allow(clippy::too_many_arguments)] pub(crate) async fn create_timeline( self: &Arc, - new_timeline_id: TimelineId, - ancestor_timeline_id: Option, - mut ancestor_start_lsn: Option, - pg_version: u32, - load_existing_initdb: Option, + params: CreateTimelineParams, broker_client: storage_broker::BrokerClientChannel, ctx: &RequestContext, ) -> Result, CreateTimelineError> { @@ -2113,54 +2186,25 @@ impl Tenant { .enter() .map_err(|_| CreateTimelineError::ShuttingDown)?; - // Get exclusive access to the timeline ID: this ensures that it does not already exist, - // and that no other creation attempts will be allowed in while we are working. - let create_guard = match self.create_timeline_create_guard(new_timeline_id) { - Ok(m) => m, - Err(TimelineExclusionError::AlreadyCreating) => { - // Creation is in progress, we cannot create it again, and we cannot - // check if this request matches the existing one, so caller must try - // again later. - return Err(CreateTimelineError::AlreadyCreating); + let result: CreateTimelineResult = match params { + CreateTimelineParams::Bootstrap(CreateTimelineParamsBootstrap { + new_timeline_id, + existing_initdb_timeline_id, + pg_version, + }) => { + self.bootstrap_timeline( + new_timeline_id, + pg_version, + existing_initdb_timeline_id, + ctx, + ) + .await? } - Err(TimelineExclusionError::Other(e)) => { - return Err(CreateTimelineError::Other(e)); - } - Err(TimelineExclusionError::AlreadyExists(existing)) => { - debug!("timeline {new_timeline_id} already exists"); - - // Idempotency: creating the same timeline twice is not an error, unless - // the second creation has different parameters. - if existing.get_ancestor_timeline_id() != ancestor_timeline_id - || existing.pg_version != pg_version - || (ancestor_start_lsn.is_some() - && ancestor_start_lsn != Some(existing.get_ancestor_lsn())) - { - return Err(CreateTimelineError::Conflict); - } - - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. 
- existing - .remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - - return Ok(existing); - } - }; - - pausable_failpoint!("timeline-creation-after-uninit"); - - let loaded_timeline = match ancestor_timeline_id { - Some(ancestor_timeline_id) => { + CreateTimelineParams::Branch(CreateTimelineParamsBranch { + new_timeline_id, + ancestor_timeline_id, + mut ancestor_start_lsn, + }) => { let ancestor_timeline = self .get_timeline(ancestor_timeline_id, false) .context("Cannot branch off the timeline that's not present in pageserver")?; @@ -2207,43 +2251,39 @@ impl Tenant { })?; } - self.branch_timeline( - &ancestor_timeline, - new_timeline_id, - ancestor_start_lsn, - create_guard, - ctx, - ) - .await? - } - None => { - self.bootstrap_timeline( - new_timeline_id, - pg_version, - load_existing_initdb, - create_guard, - ctx, - ) - .await? + self.branch_timeline(&ancestor_timeline, new_timeline_id, ancestor_start_lsn, ctx) + .await? } }; // At this point we have dropped our guard on [`Self::timelines_creating`], and // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must // not send a success to the caller until it is. The same applies to handling retries, - // see the handling of [`TimelineExclusionError::AlreadyExists`] above. - let kind = ancestor_timeline_id - .map(|_| "branched") - .unwrap_or("bootstrapped"); - loaded_timeline + // that is done in [`Self::start_creating_timeline`]. + result + .timeline() .remote_client .wait_completion() .await - .with_context(|| format!("wait for {} timeline initial uploads to complete", kind))?; + .context("wait for timeline initial uploads to complete")?; - loaded_timeline.activate(self.clone(), broker_client, None, ctx); + // The creating task is responsible for activating the timeline. + // We do this after `wait_completion()` so that we don't spin up tasks that start + // doing stuff before the IndexPart is durable in S3, which is done by the previous section. + let activated_timeline = match result { + CreateTimelineResult::Created(timeline) => { + timeline.activate(self.clone(), broker_client, None, ctx); + timeline + } + CreateTimelineResult::Idempotent(timeline) => { + info!( + "request was deemed idempotent, activation will be done by the creating task" + ); + timeline + } + }; - Ok(loaded_timeline) + Ok(activated_timeline) } pub(crate) async fn delete_timeline( @@ -3747,16 +3787,16 @@ impl Tenant { /// timeline background tasks are launched, except the flush loop. #[cfg(test)] async fn branch_timeline_test( - &self, + self: &Arc, src_timeline: &Arc, dst_id: TimelineId, ancestor_lsn: Option, ctx: &RequestContext, ) -> Result, CreateTimelineError> { - let create_guard = self.create_timeline_create_guard(dst_id).unwrap(); let tl = self - .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, create_guard, ctx) - .await?; + .branch_timeline_impl(src_timeline, dst_id, ancestor_lsn, ctx) + .await? + .into_timeline_for_test(); tl.set_state(TimelineState::Active); Ok(tl) } @@ -3765,7 +3805,7 @@ impl Tenant { #[cfg(test)] #[allow(clippy::too_many_arguments)] pub async fn branch_timeline_test_with_layers( - &self, + self: &Arc, src_timeline: &Arc, dst_id: TimelineId, ancestor_lsn: Option, @@ -3813,28 +3853,24 @@ impl Tenant { } /// Branch an existing timeline. - /// - /// The caller is responsible for activating the returned timeline. 
async fn branch_timeline( - &self, + self: &Arc, src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, - ) -> Result, CreateTimelineError> { - self.branch_timeline_impl(src_timeline, dst_id, start_lsn, timeline_create_guard, ctx) + ) -> Result { + self.branch_timeline_impl(src_timeline, dst_id, start_lsn, ctx) .await } async fn branch_timeline_impl( - &self, + self: &Arc, src_timeline: &Arc, dst_id: TimelineId, start_lsn: Option, - timeline_create_guard: TimelineCreateGuard<'_>, _ctx: &RequestContext, - ) -> Result, CreateTimelineError> { + ) -> Result { let src_id = src_timeline.timeline_id; // We will validate our ancestor LSN in this function. Acquire the GC lock so that @@ -3849,6 +3885,23 @@ impl Tenant { lsn }); + // we finally have determined the ancestor_start_lsn, so we can get claim exclusivity now + let timeline_create_guard = match self + .start_creating_timeline( + dst_id, + CreateTimelineIdempotency::Branch { + ancestor_timeline_id: src_timeline.timeline_id, + ancestor_start_lsn: start_lsn, + }, + ) + .await? + { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(timeline) => { + return Ok(CreateTimelineResult::Idempotent(timeline)); + } + }; + // Ensure that `start_lsn` is valid, i.e. the LSN is within the PITR // horizon on the source timeline // @@ -3934,28 +3987,110 @@ impl Tenant { .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; - Ok(new_timeline) + Ok(CreateTimelineResult::Created(new_timeline)) } /// For unit tests, make this visible so that other modules can directly create timelines #[cfg(test)] #[tracing::instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id))] pub(crate) async fn bootstrap_timeline_test( - &self, + self: &Arc, timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, ctx: &RequestContext, ) -> anyhow::Result> { - let create_guard = self.create_timeline_create_guard(timeline_id).unwrap(); - self.bootstrap_timeline( - timeline_id, - pg_version, - load_existing_initdb, - create_guard, - ctx, - ) - .await + self.bootstrap_timeline(timeline_id, pg_version, load_existing_initdb, ctx) + .await + .map_err(anyhow::Error::new) + .map(|r| r.into_timeline_for_test()) + } + + /// Get exclusive access to the timeline ID for creation. + /// + /// Timeline-creating code paths must use this function before making changes + /// to in-memory or persistent state. + /// + /// The `state` parameter is a description of the timeline creation operation + /// we intend to perform. + /// If the timeline was already created in the meantime, we check whether this + /// request conflicts or is idempotent , based on `state`. + async fn start_creating_timeline( + &self, + new_timeline_id: TimelineId, + idempotency: CreateTimelineIdempotency, + ) -> Result, CreateTimelineError> { + match self.create_timeline_create_guard(new_timeline_id) { + Ok(create_guard) => { + pausable_failpoint!("timeline-creation-after-uninit"); + Ok(StartCreatingTimelineResult::CreateGuard(create_guard)) + } + Err(TimelineExclusionError::AlreadyCreating) => { + // Creation is in progress, we cannot create it again, and we cannot + // check if this request matches the existing one, so caller must try + // again later. 
+ Err(CreateTimelineError::AlreadyCreating) + } + Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)), + Err(TimelineExclusionError::AlreadyExists(existing)) => { + debug!("timeline already exists"); + + // Idempotency: creating the same timeline twice is not an error, unless + // the second creation has different parameters. + // + // TODO: this is a crutch; we should store the CreateTimelineState as an + // immutable attribute in the index part, and compare them using derive(`Eq`). + match idempotency { + CreateTimelineIdempotency::Bootstrap { pg_version } => { + if existing.pg_version != pg_version { + info!("timeline already exists with different pg_version"); + return Err(CreateTimelineError::Conflict); + } + if existing.get_ancestor_timeline_id().is_some() { + info!("timeline already exists with an ancestor"); + return Err(CreateTimelineError::Conflict); + } + if existing.get_ancestor_lsn() != Lsn::INVALID { + info!("timeline already exists with an ancestor LSN"); + return Err(CreateTimelineError::Conflict); + } + } + CreateTimelineIdempotency::Branch { + ancestor_timeline_id, + ancestor_start_lsn, + } => { + if existing.get_ancestor_timeline_id() != Some(ancestor_timeline_id) { + info!("timeline already exists with different ancestor"); + return Err(CreateTimelineError::Conflict); + } + if existing.get_ancestor_lsn() != ancestor_start_lsn { + info!("timeline already exists with different ancestor LSN"); + return Err(CreateTimelineError::Conflict); + } + } + } + + // Wait for uploads to complete, so that when we return Ok, the timeline + // is known to be durable on remote storage. Just like we do at the end of + // this function, after we have created the timeline ourselves. + // + // We only really care that the initial version of `index_part.json` has + // been uploaded. That's enough to remember that the timeline + // exists. However, there is no function to wait specifically for that so + // we just wait for all in-progress uploads to finish. + existing + .remote_client + .wait_completion() + .await + .context("wait for timeline uploads to complete")?; + + // TODO: shouldn't we also wait for timeline to become active? + // Code before this(https://github.com/neondatabase/neon/pull/9366) refactoring + // didn't do it. + + Ok(StartCreatingTimelineResult::Idempotent(existing)) + } + } } async fn upload_initdb( @@ -4009,16 +4144,26 @@ impl Tenant { /// - run initdb to init temporary instance and get bootstrap data /// - after initialization completes, tar up the temp dir and upload it to S3. - /// - /// The caller is responsible for activating the returned timeline. async fn bootstrap_timeline( - &self, + self: &Arc, timeline_id: TimelineId, pg_version: u32, load_existing_initdb: Option, - timeline_create_guard: TimelineCreateGuard<'_>, ctx: &RequestContext, - ) -> anyhow::Result> { + ) -> Result { + let timeline_create_guard = match self + .start_creating_timeline( + timeline_id, + CreateTimelineIdempotency::Bootstrap { pg_version }, + ) + .await? + { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(timeline) => { + return Ok(CreateTimelineResult::Idempotent(timeline)) + } + }; + // create a `tenant/{tenant_id}/timelines/basebackup-{timeline_id}.{TEMP_FILE_SUFFIX}/` // temporary directory for basebackup files for the given timeline. 
@@ -4082,7 +4227,9 @@ impl Tenant { .context("extract initdb tar")?; } else { // Init temporarily repo to get bootstrap data, this creates a directory in the `pgdata_path` path - run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?; + run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel) + .await + .context("run initdb")?; // Upload the created data dir to S3 if self.tenant_shard_id().is_shard_zero() { @@ -4136,7 +4283,9 @@ impl Tenant { })?; fail::fail_point!("before-checkpoint-new-timeline", |_| { - anyhow::bail!("failpoint before-checkpoint-new-timeline"); + Err(CreateTimelineError::Other(anyhow::anyhow!( + "failpoint before-checkpoint-new-timeline" + ))) }); unfinished_timeline @@ -4151,7 +4300,7 @@ impl Tenant { // All done! let timeline = raw_timeline.finish_creation()?; - Ok(timeline) + Ok(CreateTimelineResult::Created(timeline)) } fn build_timeline_remote_client(&self, timeline_id: TimelineId) -> RemoteTimelineClient { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 2cde1d6a3d..a2a6e63dd2 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3130,9 +3130,11 @@ impl Service { .await?; // Propagate the LSN that shard zero picked, if caller didn't provide one - if create_req.ancestor_timeline_id.is_some() && create_req.ancestor_start_lsn.is_none() - { - create_req.ancestor_start_lsn = timeline_info.ancestor_lsn; + match &mut create_req.mode { + models::TimelineCreateRequestMode::Branch { ancestor_start_lsn, .. } if ancestor_start_lsn.is_none() => { + *ancestor_start_lsn = timeline_info.ancestor_lsn; + }, + _ => {} } // Create timeline on remaining shards with number >0 diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 18d65cb7de..db83c3ec89 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -476,12 +476,13 @@ class PageserverHttpClient(requests.Session, MetricsGetter): ) -> dict[Any, Any]: body: dict[str, Any] = { "new_timeline_id": str(new_timeline_id), - "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None, - "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None, - "existing_initdb_timeline_id": str(existing_initdb_timeline_id) - if existing_initdb_timeline_id - else None, } + if ancestor_timeline_id: + body["ancestor_timeline_id"] = str(ancestor_timeline_id) + if ancestor_start_lsn: + body["ancestor_start_lsn"] = str(ancestor_start_lsn) + if existing_initdb_timeline_id: + body["existing_initdb_timeline_id"] = str(existing_initdb_timeline_id) if pg_version != PgVersion.NOT_SET: body["pg_version"] = int(pg_version) From b3bedda6fd6acee7d65ca5040adef9b62300c391 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 25 Oct 2024 11:15:53 +0100 Subject: [PATCH 080/239] pageserver/walingest: log on gappy rel extend (#9502) ## Problem https://github.com/neondatabase/neon/pull/9492 added a metric to track the total count of block gaps filled on rel extend. More context is needed to understand when this happens. The current theory is that it may only happen on pg 14 and pg 15 since they do not WAL log relation extends. ## Summary of Changes A rate limited log is added. 
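The log itself is guarded by the pageserver's internal `utils::rate_limit::RateLimit` helper, with one limiter per supported Postgres version to avoid one version skewing the picture. As a rough, self-contained sketch of the idea (the `SimpleRateLimit` type below is invented for illustration and is not the real helper), a "log at most once per window" wrapper looks like this:

```rust
use std::sync::Mutex;
use std::time::{Duration, Instant};

// Minimal sketch of a rate limiter: run the closure at most once per `period`,
// silently dropping calls that arrive inside the window.
struct SimpleRateLimit {
    period: Duration,
    last: Mutex<Option<Instant>>,
}

impl SimpleRateLimit {
    fn new(period: Duration) -> Self {
        Self { period, last: Mutex::new(None) }
    }

    fn call(&self, f: impl FnOnce()) {
        let mut last = self.last.lock().unwrap();
        let now = Instant::now();
        if last.map_or(true, |t| now.duration_since(t) >= self.period) {
            *last = Some(now);
            f();
        }
    }
}

fn main() {
    let limiter = SimpleRateLimit::new(Duration::from_secs(30));
    for i in 0..5 {
        // Only the first call in each 30-second window actually logs.
        limiter.call(|| println!("filled gap blocks on rel extend (call {i})"));
    }
}
```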
--- pageserver/src/walingest.rs | 53 +++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 8a4c0554f8..d81552ac77 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -2419,6 +2419,59 @@ impl WalIngest { WAL_INGEST .gap_blocks_zeroed_on_rel_extend .inc_by(gap_blocks_filled); + + // Log something when relation extends cause use to fill gaps + // with zero pages. Logging is rate limited per pg version to + // avoid skewing. + if gap_blocks_filled > 0 { + use once_cell::sync::Lazy; + use std::sync::Mutex; + use utils::rate_limit::RateLimit; + + struct RateLimitPerPgVersion { + rate_limiters: [Lazy>; 4], + } + + impl RateLimitPerPgVersion { + const fn new() -> Self { + Self { + rate_limiters: [const { + Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(30)))) + }; 4], + } + } + + const fn rate_limiter( + &self, + pg_version: u32, + ) -> Option<&Lazy>> { + const MIN_PG_VERSION: u32 = 14; + const MAX_PG_VERSION: u32 = 17; + + if pg_version < MIN_PG_VERSION || pg_version > MAX_PG_VERSION { + return None; + } + + Some(&self.rate_limiters[(pg_version - MIN_PG_VERSION) as usize]) + } + } + + static LOGGED: RateLimitPerPgVersion = RateLimitPerPgVersion::new(); + if let Some(rate_limiter) = LOGGED.rate_limiter(modification.tline.pg_version) { + if let Ok(mut locked) = rate_limiter.try_lock() { + locked.call(|| { + info!( + lsn=%modification.get_lsn(), + pg_version=%modification.tline.pg_version, + rel=%rel, + "Filled {} gap blocks on rel extend to {} from {}", + gap_blocks_filled, + new_nblocks, + old_nblocks); + }); + } + } + } } Ok(()) } From 4d9036bf1f3fec2a2285bf5ced349195c36c56f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 25 Oct 2024 12:32:46 +0200 Subject: [PATCH 081/239] Support offloaded timelines during shard split (#9489) Before, we didn't copy over the `index-part.json` of offloaded timelines to the new shard's location, resulting in the new shard not knowing the timeline even exists. In #9444, we copy over the manifest, but we also need to do this for `index-part.json`. As the operations to do are mostly the same between offloaded and non-offloaded timelines, we can iterate over all of them in the same loop, after the introduction of a `TimelineOrOffloadedArcRef` type to generalize over the two cases. This is analogous to the deletion code added in #8907. The added test also ensures that the sharded archival config endpoint works, something that has not yet been ensured by tests. 
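The `TimelineOrOffloadedArcRef` type added here is essentially a borrowed "either of these two Arcs". As a toy illustration of the pattern (the names and types below are simplified stand-ins, not the pageserver code), a single loop can then walk both the live and the offloaded timeline collections:

```rust
use std::sync::Arc;

struct Timeline { id: u32 }
struct OffloadedTimeline { id: u32 }

// Borrow either kind of timeline behind one enum, so shared logic is written once.
enum EitherRef<'a> {
    Timeline(&'a Arc<Timeline>),
    Offloaded(&'a Arc<OffloadedTimeline>),
}

impl EitherRef<'_> {
    fn id(&self) -> u32 {
        match self {
            EitherRef::Timeline(t) => t.id,
            EitherRef::Offloaded(o) => o.id,
        }
    }
}

fn main() {
    let timelines = vec![Arc::new(Timeline { id: 1 })];
    let offloaded = vec![Arc::new(OffloadedTimeline { id: 2 })];

    // One loop over both collections, mirroring the shard-split code path
    // that now uploads index_part for offloaded timelines too.
    for t in timelines
        .iter()
        .map(EitherRef::Timeline)
        .chain(offloaded.iter().map(EitherRef::Offloaded))
    {
        println!("would copy index_part for timeline {}", t.id());
    }
}
```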
Part of #8088 --- pageserver/src/tenant.rs | 113 ++++++++++---- .../src/tenant/remote_timeline_client.rs | 6 +- test_runner/fixtures/neon_fixtures.py | 27 +++- test_runner/fixtures/pageserver/http.py | 25 +++ test_runner/regress/test_sharding.py | 143 +++++++++++++++++- 5 files changed, 283 insertions(+), 31 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d8ce916bcb..968d093a80 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -597,17 +597,21 @@ pub enum TimelineOrOffloaded { } impl TimelineOrOffloaded { - pub fn tenant_shard_id(&self) -> TenantShardId { + pub fn arc_ref(&self) -> TimelineOrOffloadedArcRef<'_> { match self { - TimelineOrOffloaded::Timeline(timeline) => timeline.tenant_shard_id, - TimelineOrOffloaded::Offloaded(offloaded) => offloaded.tenant_shard_id, + TimelineOrOffloaded::Timeline(timeline) => { + TimelineOrOffloadedArcRef::Timeline(timeline) + } + TimelineOrOffloaded::Offloaded(offloaded) => { + TimelineOrOffloadedArcRef::Offloaded(offloaded) + } } } + pub fn tenant_shard_id(&self) -> TenantShardId { + self.arc_ref().tenant_shard_id() + } pub fn timeline_id(&self) -> TimelineId { - match self { - TimelineOrOffloaded::Timeline(timeline) => timeline.timeline_id, - TimelineOrOffloaded::Offloaded(offloaded) => offloaded.timeline_id, - } + self.arc_ref().timeline_id() } pub fn delete_progress(&self) -> &Arc> { match self { @@ -615,7 +619,7 @@ impl TimelineOrOffloaded { TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, } } - pub fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { + fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { match self { TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { @@ -632,6 +636,38 @@ impl TimelineOrOffloaded { } } +pub enum TimelineOrOffloadedArcRef<'a> { + Timeline(&'a Arc), + Offloaded(&'a Arc), +} + +impl TimelineOrOffloadedArcRef<'_> { + pub fn tenant_shard_id(&self) -> TenantShardId { + match self { + TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.tenant_shard_id, + TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.tenant_shard_id, + } + } + pub fn timeline_id(&self) -> TimelineId { + match self { + TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.timeline_id, + TimelineOrOffloadedArcRef::Offloaded(offloaded) => offloaded.timeline_id, + } + } +} + +impl<'a> From<&'a Arc> for TimelineOrOffloadedArcRef<'a> { + fn from(timeline: &'a Arc) -> Self { + Self::Timeline(timeline) + } +} + +impl<'a> From<&'a Arc> for TimelineOrOffloadedArcRef<'a> { + fn from(timeline: &'a Arc) -> Self { + Self::Offloaded(timeline) + } +} + #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { #[error("Timeline is shutting down")] @@ -2940,33 +2976,58 @@ impl Tenant { &self, child_shards: &Vec, ) -> anyhow::Result<()> { - let timelines = self.timelines.lock().unwrap().clone(); - for timeline in timelines.values() { + let (timelines, offloaded) = { + let timelines = self.timelines.lock().unwrap(); + let offloaded = self.timelines_offloaded.lock().unwrap(); + (timelines.clone(), offloaded.clone()) + }; + let timelines_iter = timelines + .values() + .map(TimelineOrOffloadedArcRef::<'_>::from) + .chain( + offloaded + .values() + .map(TimelineOrOffloadedArcRef::<'_>::from), + ); + for timeline in timelines_iter { // We do not block timeline creation/deletion during splits inside the pageserver: it is up 
to higher levels // to ensure that they do not start a split if currently in the process of doing these. - // Upload an index from the parent: this is partly to provide freshness for the - // child tenants that will copy it, and partly for general ease-of-debugging: there will - // always be a parent shard index in the same generation as we wrote the child shard index. - tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index"); - timeline - .remote_client - .schedule_index_upload_for_file_changes()?; - timeline.remote_client.wait_completion().await?; + let timeline_id = timeline.timeline_id(); + + if let TimelineOrOffloadedArcRef::Timeline(timeline) = timeline { + // Upload an index from the parent: this is partly to provide freshness for the + // child tenants that will copy it, and partly for general ease-of-debugging: there will + // always be a parent shard index in the same generation as we wrote the child shard index. + tracing::info!(%timeline_id, "Uploading index"); + timeline + .remote_client + .schedule_index_upload_for_file_changes()?; + timeline.remote_client.wait_completion().await?; + } + + let remote_client = match timeline { + TimelineOrOffloadedArcRef::Timeline(timeline) => timeline.remote_client.clone(), + TimelineOrOffloadedArcRef::Offloaded(offloaded) => { + let remote_client = self + .build_timeline_client(offloaded.timeline_id, self.remote_storage.clone()); + Arc::new(remote_client) + } + }; // Shut down the timeline's remote client: this means that the indices we write // for child shards will not be invalidated by the parent shard deleting layers. - tracing::info!(timeline_id=%timeline.timeline_id, "Shutting down remote storage client"); - timeline.remote_client.shutdown().await; + tracing::info!(%timeline_id, "Shutting down remote storage client"); + remote_client.shutdown().await; // Download methods can still be used after shutdown, as they don't flow through the remote client's // queue. In principal the RemoteTimelineClient could provide this without downloading it, but this // operation is rare, so it's simpler to just download it (and robustly guarantees that the index // we use here really is the remotely persistent one). 
- tracing::info!(timeline_id=%timeline.timeline_id, "Downloading index_part from parent"); - let result = timeline.remote_client + tracing::info!(%timeline_id, "Downloading index_part from parent"); + let result = remote_client .download_index_file(&self.cancel) - .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline.timeline_id)) + .instrument(info_span!("download_index_file", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), %timeline_id)) .await?; let index_part = match result { MaybeDeletedIndexPart::Deleted(_) => { @@ -2976,11 +3037,11 @@ impl Tenant { }; for child_shard in child_shards { - tracing::info!(timeline_id=%timeline.timeline_id, "Uploading index_part for child {}", child_shard.to_index()); + tracing::info!(%timeline_id, "Uploading index_part for child {}", child_shard.to_index()); upload_index_part( &self.remote_storage, child_shard, - &timeline.timeline_id, + &timeline_id, self.generation, &index_part, &self.cancel, @@ -2989,8 +3050,6 @@ impl Tenant { } } - // TODO: also copy index files of offloaded timelines - let tenant_manifest = self.tenant_manifest(); // TODO: generation support let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 066fd12a9a..1c72c7fff8 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1278,10 +1278,14 @@ impl RemoteTimelineClient { let fut = { let mut guard = self.upload_queue.lock().unwrap(); let upload_queue = match &mut *guard { - UploadQueue::Stopped(_) => return, + UploadQueue::Stopped(_) => { + scopeguard::ScopeGuard::into_inner(sg); + return; + } UploadQueue::Uninitialized => { // transition into Stopped state self.stop_impl(&mut guard); + scopeguard::ScopeGuard::into_inner(sg); return; } UploadQueue::Initialized(ref mut init) => init, diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 747c2c0d63..a1ea056213 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -44,7 +44,14 @@ from urllib3.util.retry import Retry from fixtures import overlayfs from fixtures.auth_tokens import AuthKeys, TokenScope -from fixtures.common_types import Lsn, NodeId, TenantId, TenantShardId, TimelineId +from fixtures.common_types import ( + Lsn, + NodeId, + TenantId, + TenantShardId, + TimelineArchivalState, + TimelineId, +) from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics @@ -2132,6 +2139,24 @@ class NeonStorageController(MetricsGetter, LogUtils): response.raise_for_status() return response.json() + def timeline_archival_config( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + state: TimelineArchivalState, + ): + config = {"state": state.value} + log.info( + f"requesting timeline archival config {config} for tenant {tenant_id} and timeline {timeline_id}" + ) + res = self.request( + "PUT", + f"{self.api}/v1/tenant/{tenant_id}/timeline/{timeline_id}/archival_config", + json=config, + headers=self.headers(TokenScope.ADMIN), + ) + return res.json() + def configure_failpoints(self, config_strings: tuple[str, str] | list[tuple[str, str]]): if isinstance(config_strings, tuple): pairs = [config_strings] diff --git 
a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index db83c3ec89..706bc550e5 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -142,6 +142,19 @@ class TenantConfig: ) +@dataclass +class TimelinesInfoAndOffloaded: + timelines: list[dict[str, Any]] + offloaded: list[dict[str, Any]] + + @classmethod + def from_json(cls, d: dict[str, Any]) -> TimelinesInfoAndOffloaded: + return TimelinesInfoAndOffloaded( + timelines=d["timelines"], + offloaded=d["offloaded"], + ) + + class PageserverHttpClient(requests.Session, MetricsGetter): def __init__( self, @@ -464,6 +477,18 @@ class PageserverHttpClient(requests.Session, MetricsGetter): assert isinstance(res_json, list) return res_json + def timeline_and_offloaded_list( + self, + tenant_id: Union[TenantId, TenantShardId], + ) -> TimelinesInfoAndOffloaded: + res = self.get( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline_and_offloaded", + ) + self.verbose_error(res) + res_json = res.json() + assert isinstance(res_json, dict) + return TimelinesInfoAndOffloaded.from_json(res_json) + def timeline_create( self, pg_version: PgVersion, diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index b1abcaa763..6c2a059098 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -3,11 +3,11 @@ from __future__ import annotations import os import time from collections import defaultdict -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Any import pytest import requests -from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.compute_reconfigure import ComputeReconfigure from fixtures.log_helper import log from fixtures.neon_fixtures import ( @@ -353,6 +353,145 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: workload.validate() +def test_sharding_split_offloading(neon_env_builder: NeonEnvBuilder): + """ + Test that during a split, we don't miss archived and offloaded timelines. + """ + + TENANT_CONF = { + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # no PITR horizon, we specify the horizon when we request on-demand GC + "pitr_interval": "3600s", + # disable background compaction, GC and offloading. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + # Disable automatic creation of image layers, as we will create them explicitly when we want them + "image_creation_threshold": 9999, + "image_layer_creation_check_threshold": 0, + "lsn_lease_length": "0s", + } + + neon_env_builder.storage_controller_config = { + # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. 
+ "max_offline": "30s", + "max_warming_up": "300s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=TENANT_CONF) + tenant_id = env.initial_tenant + timeline_id_main = env.initial_timeline + + # Check that we created with an unsharded TenantShardId: this is the default, + # but check it in case we change the default in future + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 0)) is not None + + workload_main = Workload(env, tenant_id, timeline_id_main, branch_name="main") + workload_main.init() + workload_main.write_rows(256) + workload_main.validate() + workload_main.stop() + + # Create two timelines, archive one, offload the other + timeline_id_archived = env.create_branch("archived_not_offloaded") + timeline_id_offloaded = env.create_branch("archived_offloaded") + + def timeline_id_set_for(list: list[dict[str, Any]]) -> set[TimelineId]: + return set( + map( + lambda t: TimelineId(t["timeline_id"]), + list, + ) + ) + + expected_offloaded_set = {timeline_id_offloaded} + expected_timeline_set = {timeline_id_main, timeline_id_archived} + + with env.get_tenant_pageserver(tenant_id).http_client() as http_client: + http_client.timeline_archival_config( + tenant_id, timeline_id_archived, TimelineArchivalState.ARCHIVED + ) + http_client.timeline_archival_config( + tenant_id, timeline_id_offloaded, TimelineArchivalState.ARCHIVED + ) + http_client.timeline_offload(tenant_id, timeline_id_offloaded) + list = http_client.timeline_and_offloaded_list(tenant_id) + assert timeline_id_set_for(list.offloaded) == expected_offloaded_set + assert timeline_id_set_for(list.timelines) == expected_timeline_set + + # Do a full image layer generation before splitting + http_client.timeline_checkpoint( + tenant_id, timeline_id_main, force_image_layer_creation=True, wait_until_uploaded=True + ) + + # Split one shard into two + shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=2) + + # Let all shards move into their stable locations, so that during subsequent steps we + # don't have reconciles in progress (simpler to reason about what messages we expect in logs) + env.storage_controller.reconcile_until_idle() + + # Check we got the shard IDs we expected + assert env.storage_controller.inspect(TenantShardId(tenant_id, 0, 2)) is not None + assert env.storage_controller.inspect(TenantShardId(tenant_id, 1, 2)) is not None + + workload_main.validate() + workload_main.stop() + + env.storage_controller.consistency_check() + + # Ensure each shard has the same list of timelines and offloaded timelines + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + list = ps.http_client().timeline_and_offloaded_list(shard) + assert timeline_id_set_for(list.offloaded) == expected_offloaded_set + assert timeline_id_set_for(list.timelines) == expected_timeline_set + + ps.http_client().timeline_compact(shard, timeline_id_main) + + # Check that we can still read all the data + workload_main.validate() + + # Force a restart, which requires the state to be persisted. 
+ env.pageserver.stop() + env.pageserver.start() + + # Ensure each shard has the same list of timelines and offloaded timelines + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + list = ps.http_client().timeline_and_offloaded_list(shard) + assert timeline_id_set_for(list.offloaded) == expected_offloaded_set + assert timeline_id_set_for(list.timelines) == expected_timeline_set + + ps.http_client().timeline_compact(shard, timeline_id_main) + + # Compaction shouldn't make anything unreadable + workload_main.validate() + + # Do sharded unarchival + env.storage_controller.timeline_archival_config( + tenant_id, timeline_id_offloaded, TimelineArchivalState.UNARCHIVED + ) + env.storage_controller.timeline_archival_config( + tenant_id, timeline_id_archived, TimelineArchivalState.UNARCHIVED + ) + + for shard in shards: + ps = env.get_tenant_pageserver(shard) + + list = ps.http_client().timeline_and_offloaded_list(shard) + assert timeline_id_set_for(list.offloaded) == set() + assert timeline_id_set_for(list.timelines) == { + timeline_id_main, + timeline_id_archived, + timeline_id_offloaded, + } + + def test_sharding_split_smoke( neon_env_builder: NeonEnvBuilder, ): From db900ae9d0bc01aa38d95938325b29b30eb4b4cf Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Fri, 25 Oct 2024 07:50:47 -0400 Subject: [PATCH 082/239] fix(test): remove too strict layers_removed==0 check in test_readonly_node_gc (#9506) Fixes #9098 ## Problem `test_readonly_node_gc` is flaky. As shown in [Allure Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9469/11444519440/index.html#suites/3ccffb1d100105b98aed3dc19b717917/2c02073738fa2b39), we would get a `AssertionError: No layers should be removed, old layers are guarded by leases.` after the test restarts pageservers or after reconfigure pageservers. During the investigation, we found that the layers has LSN (`0/1563088`) greater than the LSN (`0x1562000`) protected by the lease. For instance, **Layers removed**

000000067F00000005000034540100000000-000000067F00000005000040050100000000__0000000001563088-00000001 (shard 0002)

000000068000000000000017E20000000001-010000000100000001000000000000000001__0000000001563088-00000001 (shard 0002)
**Lsn Lease Granted**
handle_make_lsn_lease{lsn=0/1562000 shard_id=0002 shard_id=0002}: lease created, valid until 2024-10-21
This means that these layers are not guarded by the leases: they are in "future", not visible to the static endpoint. ## Summary of changes - Remove the assertion layers_removed == 0 after trigger timeline GC while holding the lease. Instead rely on the successful execution of the`SELECT` query to test lease validity. - Improve test logging Signed-off-by: Yuchen Liang --- test_runner/regress/test_readonly_node.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 30c69cb883..8151160477 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -169,23 +169,24 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) return last_flush_lsn - def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint): + def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str): """ Trigger GC manually on all pageservers. Then run an `SELECT` query. """ for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + # Note: cannot assert on `layers_removed` here because it could be layers + # not guarded by the lease. Rely on successful execution of the query instead. log.info(f"{gc_result=}") - assert ( - gc_result["layers_removed"] == 0 - ), "No layers should be removed, old layers are guarded by leases." - with ep_static.cursor() as cur: + # Following query should succeed if pages are properly guarded by leases. cur.execute("SELECT count(*) FROM t0") assert cur.fetchone() == (ROW_COUNT,) + log.info(f"`SELECT` query succeed after GC, {ctx=}") + # Insert some records on main branch with env.endpoints.create_start("main") as ep_main: with ep_main.cursor() as cur: @@ -210,9 +211,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): # Wait for static compute to renew lease at least once. time.sleep(LSN_LEASE_LENGTH / 2) - generate_updates_on_main(env, ep_main, i, end=100) + generate_updates_on_main(env, ep_main, 3, end=100) - trigger_gc_and_select(env, ep_static) + trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart") # Trigger Pageserver restarts for ps in env.pageservers: @@ -221,7 +222,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): time.sleep(LSN_LEASE_LENGTH / 2) ps.start() - trigger_gc_and_select(env, ep_static) + trigger_gc_and_select(env, ep_static, ctx="After pageservers restart") # Reconfigure pageservers env.pageservers[0].stop() @@ -230,7 +231,7 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) env.storage_controller.reconcile_until_idle() - trigger_gc_and_select(env, ep_static) + trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline") # Do some update so we can increment latest_gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) From 9768f09f6bf73b7ed35d88355a96afc3d96049c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Fri, 25 Oct 2024 14:04:41 +0200 Subject: [PATCH 083/239] proxy: don't follow redirects for user provided JWKS urls + set custom user agent (#9514) partially fixes https://github.com/neondatabase/cloud/issues/19249 ref https://docs.rs/reqwest/latest/reqwest/redirect/index.html > By default, a Client will automatically handle HTTP redirects, having a maximum redirect chain of 10 hops. 
To customize this behavior, a redirect::Policy can be used with a ClientBuilder. --- .gitignore | 2 ++ proxy/src/auth/backend/jwt.rs | 17 ++++++++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/.gitignore b/.gitignore index 2c38cdcc59..a07a65ccef 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,8 @@ __pycache__/ test_output/ .vscode .idea +*.swp +tags neon.iml /.neon /integration_tests/.neon diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 2185677159..69ab4b8ccb 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -5,6 +5,7 @@ use std::time::{Duration, SystemTime}; use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; +use reqwest::{redirect, Client}; use serde::de::Visitor; use serde::{Deserialize, Deserializer}; use signature::Verifier; @@ -24,6 +25,7 @@ const MIN_RENEW: Duration = Duration::from_secs(30); const AUTO_RENEW: Duration = Duration::from_secs(300); const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_JWK_BODY_SIZE: usize = 64 * 1024; +const JWKS_USER_AGENT: &str = "neon-proxy"; /// How to get the JWT auth rules pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { @@ -50,7 +52,6 @@ pub(crate) struct AuthRule { pub(crate) role_names: Vec, } -#[derive(Default)] pub struct JwkCache { client: reqwest::Client, @@ -357,6 +358,20 @@ impl JwkCache { } } +impl Default for JwkCache { + fn default() -> Self { + let client = Client::builder() + .user_agent(JWKS_USER_AGENT) + .redirect(redirect::Policy::none()) + .build() + .expect("using &str and standard redirect::Policy"); + JwkCache { + client, + map: DashMap::default(), + } + } +} + fn verify_ec_signature(data: &[u8], sig: &[u8], key: &jose_jwk::Ec) -> Result<(), JwtError> { use ecdsa::Signature; use signature::Verifier; From 6f5c2626844e984abebb8704928ecdcbc87ad49e Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 25 Oct 2024 14:16:45 +0200 Subject: [PATCH 084/239] pageserver: add testing API to scan layers for disposable keys (#9393) This PR adds a pageserver mgmt API to scan a layer file for disposable keys. It hooks it up to the sharding compaction test, demonstrating that we're not filtering out all disposable keys. This is extracted from PGDATA import (https://github.com/neondatabase/neon/pull/9218) where I do the filtering of layer files based on `is_key_disposable`. 
--- libs/pageserver_api/src/models.rs | 6 ++ pageserver/src/http/routes.rs | 97 +++++++++++++++++++ .../src/tenant/storage_layer/delta_layer.rs | 14 ++- .../src/tenant/storage_layer/image_layer.rs | 15 +++ pageserver/src/tenant/storage_layer/layer.rs | 27 +++--- .../src/tenant/storage_layer/layer_desc.rs | 28 ++++++ pageserver/src/tenant/timeline/compaction.rs | 9 +- .../src/tenant/timeline/layer_manager.rs | 7 +- test_runner/fixtures/neon_fixtures.py | 51 +++++++++- test_runner/fixtures/pageserver/http.py | 30 ++++++ test_runner/regress/test_sharding.py | 16 ++- 11 files changed, 277 insertions(+), 23 deletions(-) diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 8684927554..d37f62185c 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -1068,6 +1068,12 @@ pub mod virtual_file { } } +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct ScanDisposableKeysResponse { + pub disposable_count: usize, + pub not_disposable_count: usize, +} + // Wrapped in libpq CopyData #[derive(PartialEq, Eq, Debug)] pub enum PagestreamFeMessage { diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index bc03df9ad2..3943f62ac0 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1293,6 +1293,99 @@ async fn layer_map_info_handler( json_response(StatusCode::OK, layer_map_info) } +#[instrument(skip_all, fields(tenant_id, shard_id, timeline_id, layer_name))] +async fn timeline_layer_scan_disposable_keys( + request: Request, + cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let layer_name: LayerName = parse_request_param(&request, "layer_name")?; + + tracing::Span::current().record( + "tenant_id", + tracing::field::display(&tenant_shard_id.tenant_id), + ); + tracing::Span::current().record( + "shard_id", + tracing::field::display(tenant_shard_id.shard_slug()), + ); + tracing::Span::current().record("timeline_id", tracing::field::display(&timeline_id)); + tracing::Span::current().record("layer_name", tracing::field::display(&layer_name)); + + let state = get_state(&request); + + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + + // technically the timeline need not be active for this scan to complete + let timeline = + active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id) + .await?; + + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); + + let guard = timeline.layers.read().await; + let Some(layer) = guard.try_get_from_key(&layer_name.clone().into()) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Layer {tenant_shard_id}/{timeline_id}/{layer_name} not found").into(), + )); + }; + + let resident_layer = layer + .download_and_keep_resident() + .await + .map_err(|err| match err { + tenant::storage_layer::layer::DownloadError::TimelineShutdown + | tenant::storage_layer::layer::DownloadError::DownloadCancelled => { + ApiError::ShuttingDown + } + tenant::storage_layer::layer::DownloadError::ContextAndConfigReallyDeniesDownloads + | tenant::storage_layer::layer::DownloadError::DownloadRequired + | tenant::storage_layer::layer::DownloadError::NotFile(_) + | tenant::storage_layer::layer::DownloadError::DownloadFailed + | tenant::storage_layer::layer::DownloadError::PreStatFailed(_) => { + ApiError::InternalServerError(err.into()) + 
} + #[cfg(test)] + tenant::storage_layer::layer::DownloadError::Failpoint(_) => { + ApiError::InternalServerError(err.into()) + } + })?; + + let keys = resident_layer + .load_keys(&ctx) + .await + .map_err(ApiError::InternalServerError)?; + + let shard_identity = timeline.get_shard_identity(); + + let mut disposable_count = 0; + let mut not_disposable_count = 0; + let cancel = cancel.clone(); + for (i, key) in keys.into_iter().enumerate() { + if shard_identity.is_key_disposable(&key) { + disposable_count += 1; + tracing::debug!(key = %key, key.dbg=?key, "disposable key"); + } else { + not_disposable_count += 1; + } + #[allow(clippy::collapsible_if)] + if i % 10000 == 0 { + if cancel.is_cancelled() || timeline.cancel.is_cancelled() || timeline.is_stopping() { + return Err(ApiError::ShuttingDown); + } + } + } + + json_response( + StatusCode::OK, + pageserver_api::models::ScanDisposableKeysResponse { + disposable_count, + not_disposable_count, + }, + ) +} + async fn layer_download_handler( request: Request, _cancel: CancellationToken, @@ -3155,6 +3248,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name", |r| api_handler(r, evict_timeline_layer_handler), ) + .post( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_name/scan_disposable_keys", + |r| testing_api_handler("timeline_layer_scan_disposable_keys", r, timeline_layer_scan_disposable_keys), + ) .post( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/block_gc", |r| api_handler(r, timeline_gc_blocking_handler), diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index ceae1d4b1a..641729d681 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -1084,7 +1084,7 @@ impl DeltaLayerInner { } } - pub(super) async fn load_keys<'a>( + pub(crate) async fn index_entries<'a>( &'a self, ctx: &RequestContext, ) -> Result>> { @@ -1346,7 +1346,7 @@ impl DeltaLayerInner { tree_reader.dump().await?; - let keys = self.load_keys(ctx).await?; + let keys = self.index_entries(ctx).await?; async fn dump_blob(val: &ValueRef<'_>, ctx: &RequestContext) -> anyhow::Result { let buf = val.load_raw(ctx).await?; @@ -1453,6 +1453,16 @@ impl DeltaLayerInner { ), } } + + /// NB: not super efficient, but not terrible either. Should prob be an iterator. + // + // We're reusing the index traversal logical in plan_reads; would be nice to + // factor that out. + pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result> { + self.index_entries(ctx) + .await + .map(|entries| entries.into_iter().map(|entry| entry.key).collect()) + } } /// A set of data associated with a delta layer key and its value diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index ff2be1780e..3f90df312d 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -673,6 +673,21 @@ impl ImageLayerInner { ), } } + + /// NB: not super efficient, but not terrible either. Should prob be an iterator. + // + // We're reusing the index traversal logical in plan_reads; would be nice to + // factor that out. 
+ pub(crate) async fn load_keys(&self, ctx: &RequestContext) -> anyhow::Result> { + let plan = self + .plan_reads(KeySpace::single(self.key_range.clone()), None, ctx) + .await?; + Ok(plan + .into_iter() + .flat_map(|read| read.blobs_at) + .map(|(_, blob_meta)| blob_meta.key) + .collect()) + } } /// A builder object for constructing a new image layer. diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index 38a7cd09af..a9f1189b41 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -19,7 +19,7 @@ use crate::task_mgr::TaskKind; use crate::tenant::timeline::{CompactionError, GetVectoredError}; use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline}; -use super::delta_layer::{self, DeltaEntry}; +use super::delta_layer::{self}; use super::image_layer::{self}; use super::{ AsLayerDesc, ImageLayerWriter, LayerAccessStats, LayerAccessStatsReset, LayerName, @@ -1841,23 +1841,22 @@ impl ResidentLayer { pub(crate) async fn load_keys<'a>( &'a self, ctx: &RequestContext, - ) -> anyhow::Result>> { + ) -> anyhow::Result> { use LayerKind::*; let owner = &self.owner.0; - match self.downloaded.get(owner, ctx).await? { - Delta(ref d) => { - // this is valid because the DownloadedLayer::kind is a OnceCell, not a - // Mutex, so we cannot go and deinitialize the value with OnceCell::take - // while it's being held. - self.owner.record_access(ctx); + let inner = self.downloaded.get(owner, ctx).await?; - delta_layer::DeltaLayerInner::load_keys(d, ctx) - .await - .with_context(|| format!("Layer index is corrupted for {self}")) - } - Image(_) => anyhow::bail!(format!("cannot load_keys on a image layer {self}")), - } + // this is valid because the DownloadedLayer::kind is a OnceCell, not a + // Mutex, so we cannot go and deinitialize the value with OnceCell::take + // while it's being held. 
+ self.owner.record_access(ctx); + + let res = match inner { + Delta(ref d) => delta_layer::DeltaLayerInner::load_keys(d, ctx).await, + Image(ref i) => image_layer::ImageLayerInner::load_keys(i, ctx).await, + }; + res.with_context(|| format!("Layer index is corrupted for {self}")) } /// Read all they keys in this layer which match the ShardIdentity, and write them all to diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index e90ff3c4b2..a30c25d780 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -57,6 +57,34 @@ impl std::fmt::Display for PersistentLayerKey { } } +impl From for PersistentLayerKey { + fn from(image_layer_name: ImageLayerName) -> Self { + Self { + key_range: image_layer_name.key_range, + lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer_name.lsn), + is_delta: false, + } + } +} + +impl From for PersistentLayerKey { + fn from(delta_layer_name: DeltaLayerName) -> Self { + Self { + key_range: delta_layer_name.key_range, + lsn_range: delta_layer_name.lsn_range, + is_delta: true, + } + } +} + +impl From for PersistentLayerKey { + fn from(layer_name: LayerName) -> Self { + match layer_name { + LayerName::Image(i) => i.into(), + LayerName::Delta(d) => d.into(), + } + } +} impl PersistentLayerDesc { pub fn key(&self) -> PersistentLayerKey { PersistentLayerKey { diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 6aa5b30f07..73e4f0e87c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -834,7 +834,12 @@ impl Timeline { if self.cancel.is_cancelled() { return Err(CompactionError::ShuttingDown); } - all_keys.extend(l.load_keys(ctx).await.map_err(CompactionError::Other)?); + let delta = l.get_as_delta(ctx).await.map_err(CompactionError::Other)?; + let keys = delta + .index_entries(ctx) + .await + .map_err(CompactionError::Other)?; + all_keys.extend(keys); } // The current stdlib sorting implementation is designed in a way where it is // particularly fast where the slice is made up of sorted sub-ranges. @@ -2438,7 +2443,7 @@ impl CompactionDeltaLayer for ResidentDeltaLayer { type DeltaEntry<'a> = DeltaEntry<'a>; async fn load_keys<'a>(&self, ctx: &RequestContext) -> anyhow::Result>> { - self.0.load_keys(ctx).await + self.0.get_as_delta(ctx).await?.index_entries(ctx).await } } diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 8f20d84401..4293a44dca 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -45,13 +45,16 @@ impl LayerManager { pub(crate) fn get_from_key(&self, key: &PersistentLayerKey) -> Layer { // The assumption for the `expect()` is that all code maintains the following invariant: // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor. 
- self.layers() - .get(key) + self.try_get_from_key(key) .with_context(|| format!("get layer from key: {key}")) .expect("not found") .clone() } + pub(crate) fn try_get_from_key(&self, key: &PersistentLayerKey) -> Option<&Layer> { + self.layers().get(key) + } + pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer { self.get_from_key(&desc.key()) } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a1ea056213..6491069f20 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -61,7 +61,11 @@ from fixtures.pageserver.allowed_errors import ( DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) from fixtures.pageserver.common_types import LayerName, parse_layer_file_name -from fixtures.pageserver.http import PageserverHttpClient +from fixtures.pageserver.http import ( + HistoricLayerInfo, + PageserverHttpClient, + ScanDisposableKeysResponse, +) from fixtures.pageserver.utils import ( wait_for_last_record_lsn, ) @@ -2670,6 +2674,51 @@ class NeonPageserver(PgProtocol, LogUtils): layers = self.list_layers(tenant_id, timeline_id) return layer_name in [parse_layer_file_name(p.name) for p in layers] + def timeline_scan_no_disposable_keys( + self, tenant_shard_id: TenantShardId, timeline_id: TimelineId + ) -> TimelineAssertNoDisposableKeysResult: + """ + Scan all keys in all layers of the tenant/timeline for disposable keys. + Disposable keys are keys that are present in a layer referenced by the shard + but are not going to be accessed by the shard. + For example, after shard split, the child shards will reference the parent's layer + files until new data is ingested and/or compaction rewrites the layers. + """ + + ps_http = self.http_client() + tally = ScanDisposableKeysResponse(0, 0) + per_layer = [] + with concurrent.futures.ThreadPoolExecutor(max_workers=5) as executor: + futs = [] + shard_layer_map = ps_http.layer_map_info(tenant_shard_id, timeline_id) + for layer in shard_layer_map.historic_layers: + + def do_layer( + shard_ps_http: PageserverHttpClient, + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + layer: HistoricLayerInfo, + ) -> tuple[HistoricLayerInfo, ScanDisposableKeysResponse]: + return ( + layer, + shard_ps_http.timeline_layer_scan_disposable_keys( + tenant_shard_id, timeline_id, layer.layer_file_name + ), + ) + + futs.append(executor.submit(do_layer, ps_http, tenant_shard_id, timeline_id, layer)) + for fut in futs: + layer, result = fut.result() + tally += result + per_layer.append((layer, result)) + return TimelineAssertNoDisposableKeysResult(tally, per_layer) + + +@dataclass +class TimelineAssertNoDisposableKeysResult: + tally: ScanDisposableKeysResponse + per_layer: list[tuple[HistoricLayerInfo, ScanDisposableKeysResponse]] + class PgBin: """A helper class for executing postgres binaries""" diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 706bc550e5..175a1870d4 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -129,6 +129,26 @@ class LayerMapInfo: return set(x.layer_file_name for x in self.historic_layers) +@dataclass +class ScanDisposableKeysResponse: + disposable_count: int + not_disposable_count: int + + def __add__(self, b): + a = self + assert isinstance(a, ScanDisposableKeysResponse) + assert isinstance(b, ScanDisposableKeysResponse) + return ScanDisposableKeysResponse( + a.disposable_count + b.disposable_count, a.not_disposable_count + b.not_disposable_count + 
) + + @classmethod + def from_json(cls, d: dict[str, Any]) -> ScanDisposableKeysResponse: + disposable_count = d["disposable_count"] + not_disposable_count = d["not_disposable_count"] + return ScanDisposableKeysResponse(disposable_count, not_disposable_count) + + @dataclass class TenantConfig: tenant_specific_overrides: dict[str, Any] @@ -905,6 +925,16 @@ class PageserverHttpClient(requests.Session, MetricsGetter): self.verbose_error(res) return LayerMapInfo.from_json(res.json()) + def timeline_layer_scan_disposable_keys( + self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str + ) -> ScanDisposableKeysResponse: + res = self.post( + f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/layer/{layer_name}/scan_disposable_keys", + ) + self.verbose_error(res) + assert res.status_code == 200 + return ScanDisposableKeysResponse.from_json(res.json()) + def download_layer( self, tenant_id: Union[TenantId, TenantShardId], timeline_id: TimelineId, layer_name: str ): diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 6c2a059098..3a249bbdb4 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -188,7 +188,9 @@ def test_sharding_split_unsharded( "compact-shard-ancestors-persistent", ], ) -def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: Optional[str]): +def test_sharding_split_compaction( + neon_env_builder: NeonEnvBuilder, failpoint: Optional[str], build_type: str +): """ Test that after a split, we clean up parent layer data in the child shards via compaction. """ @@ -322,9 +324,19 @@ def test_sharding_split_compaction(neon_env_builder: NeonEnvBuilder, failpoint: # Physical size should shrink because layers are smaller assert detail_after["current_physical_size"] < detail_before["current_physical_size"] - # Validate size statistics + # Validate filtering compaction actually happened for shard in shards: ps = env.get_tenant_pageserver(shard) + + log.info("scan all layer files for disposable keys, there shouldn't be any") + result = ps.timeline_scan_no_disposable_keys(shard, timeline_id) + tally = result.tally + raw_page_count = tally.not_disposable_count + tally.disposable_count + assert tally.not_disposable_count > ( + raw_page_count // 2 + ), "compaction doesn't rewrite layers that are >=50pct local" + + log.info("check sizes") timeline_info = ps.http_client().timeline_detail(shard, timeline_id) reported_size = timeline_info["current_physical_size"] layer_paths = ps.list_layers(shard, timeline_id) From e0c7f1ce15d3c62951055a6a8cbc591103249384 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 25 Oct 2024 14:20:53 +0200 Subject: [PATCH 085/239] remote_storage(local_fs): return correct file sizes (#9511) ## Problem `local_fs` doesn't return file sizes, which I need in PGDATA import (#9218) ## Solution Include file sizes in the result. I would have liked to add a unit test, and started doing that in * https://github.com/neondatabase/neon/pull/9510 by extending the common object storage tests (`libs/remote_storage/tests/common/tests.rs`) to check for sizes as well. But it turns out that localfs is not even covered by the common object storage tests and upon closer inspection, it seems that this area needs more attention. 
=> punt the effort into https://github.com/neondatabase/neon/pull/9510 --- libs/remote_storage/src/local_fs.rs | 35 +++++++++++++---------------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index 93a052139b..553153826e 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -357,22 +357,20 @@ impl RemoteStorage for LocalFs { .list_recursive(prefix) .await .map_err(DownloadError::Other)?; - let objects = keys - .into_iter() - .filter_map(|k| { - let path = k.with_base(&self.storage_root); - if path.is_dir() { - None - } else { - Some(ListingObject { - key: k.clone(), - // LocalFs is just for testing, so just specify a dummy time - last_modified: SystemTime::now(), - size: 0, - }) - } - }) - .collect(); + let mut objects = Vec::with_capacity(keys.len()); + for key in keys { + let path = key.with_base(&self.storage_root); + let metadata = file_metadata(&path).await?; + if metadata.is_dir() { + continue; + } + objects.push(ListingObject { + key: key.clone(), + last_modified: metadata.modified()?, + size: metadata.len(), + }); + } + let objects = objects; if let ListingMode::NoDelimiter = mode { result.keys = objects; @@ -410,9 +408,8 @@ impl RemoteStorage for LocalFs { } else { result.keys.push(ListingObject { key: RemotePath::from_string(&relative_key).unwrap(), - // LocalFs is just for testing - last_modified: SystemTime::now(), - size: 0, + last_modified: object.last_modified, + size: object.size, }); } } From c6cf5e7c0f89a9bd742337d2a4a960c2d16175d6 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 25 Oct 2024 16:13:46 +0300 Subject: [PATCH 086/239] Make test_pageserver_lsn_wait_error_safekeeper_stop less aggressive. (#9517) Previously it inserted ~150MiB of WAL while expecting page fetching to work in 1s (wait_lsn_timeout=1s). It failed in CI in debug builds. Instead, just directly wait for the wanted condition, i.e. needed safekeepers are reported in pageserver timed out waiting for WAL error message. Also set NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES to 1 in this test and neighbour one, it reduces execution time from 2.5m to ~10s. --- test_runner/regress/test_wal_receiver.py | 76 ++++++++++++++---------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index be2aa2b346..294f86ffa7 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -1,11 +1,12 @@ from __future__ import annotations -import time +import os from typing import TYPE_CHECKING from fixtures.common_types import Lsn, TenantId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder +from fixtures.utils import wait_until if TYPE_CHECKING: from typing import Any @@ -19,6 +20,10 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.http_client() + # In this test we force 'Timed out while waiting for WAL record error' while + # fetching basebackup and don't want any retries. 
+ os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1" + tenant_id, timeline_id = env.create_tenant() expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") @@ -49,11 +54,14 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder): def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuilder): # Trigger WAL wait timeout faster def customize_pageserver_toml(ps_cfg: dict[str, Any]): - ps_cfg["wait_lsn_timeout"] = "1s" + ps_cfg["wait_lsn_timeout"] = "2s" tenant_config = ps_cfg.setdefault("tenant_config", {}) tenant_config["walreceiver_connect_timeout"] = "2s" tenant_config["lagging_wal_timeout"] = "2s" + # In this test we force 'Timed out while waiting for WAL record error' while + # fetching basebackup and don't want any retries. + os.environ["NEON_COMPUTE_TESTING_BASEBACKUP_RETRIES"] = "1" neon_env_builder.pageserver_config_override = customize_pageserver_toml # Have notable SK ids to ensure we check logs for their presence, not some other random numbers @@ -64,7 +72,6 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil tenant_id, timeline_id = env.create_tenant() - elements_to_insert = 1_000_000 expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive" env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*") # we configure wait_lsn_timeout to a shorter value than the lagging_wal_timeout / walreceiver_connect_timeout @@ -74,45 +81,50 @@ def test_pageserver_lsn_wait_error_safekeeper_stop(neon_env_builder: NeonEnvBuil ".*ingesting record with timestamp lagging more than wait_lsn_timeout.*" ) - insert_test_elements(env, tenant_id, start=0, count=elements_to_insert) + insert_test_elements(env, tenant_id, start=0, count=1) - try: - trigger_wait_lsn_timeout(env, tenant_id) - except Exception as e: - exception_string = str(e) - assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" - - for safekeeper in env.safekeepers: + def all_sks_in_wareceiver_state(): + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + exception_string = str(e) assert ( - str(safekeeper.id) in exception_string - ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" + expected_timeout_error in exception_string + ), "Should time out during waiting for WAL" + + for safekeeper in env.safekeepers: + assert ( + str(safekeeper.id) in exception_string + ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" + + wait_until(60, 0.5, all_sks_in_wareceiver_state) stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper_id = stopped_safekeeper.id log.info(f"Stopping safekeeper {stopped_safekeeper.id}") stopped_safekeeper.stop() - # sleep until stopped safekeeper is removed from candidates - time.sleep(2) - # Spend some more time inserting, to ensure SKs report updated statuses and walreceiver in PS have time to update its connection stats. 
- insert_test_elements(env, tenant_id, start=elements_to_insert + 1, count=elements_to_insert) + def all_but_stopped_sks_in_wareceiver_state(): + try: + trigger_wait_lsn_timeout(env, tenant_id) + except Exception as e: + # Strip out the part before stdout, as it contains full command with the list of all safekeepers + exception_string = str(e).split("stdout", 1)[-1] + assert ( + expected_timeout_error in exception_string + ), "Should time out during waiting for WAL" - try: - trigger_wait_lsn_timeout(env, tenant_id) - except Exception as e: - # Strip out the part before stdout, as it contains full command with the list of all safekeepers - exception_string = str(e).split("stdout", 1)[-1] - assert expected_timeout_error in exception_string, "Should time out during waiting for WAL" + for safekeeper in env.safekeepers: + if safekeeper.id == stopped_safekeeper_id: + assert ( + str(safekeeper.id) not in exception_string + ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + else: + assert ( + str(safekeeper.id) in exception_string + ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" - for safekeeper in env.safekeepers: - if safekeeper.id == stopped_safekeeper_id: - assert ( - str(safekeeper.id) not in exception_string - ), f"Should not have stopped safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" - else: - assert ( - str(safekeeper.id) in exception_string - ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" + wait_until(60, 0.5, all_but_stopped_sks_in_wareceiver_state) def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): From 05eff3a67ecf613bbb88d92d0f2b758d5608bf39 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 25 Oct 2024 07:41:44 -0600 Subject: [PATCH 087/239] Move logical replication slot monitor neon.c is getting crowded and the logical replication slot monitor is a good candidate for reorganization. It is very self-contained, and being in a separate file will make it that much easier to find. 
Signed-off-by: Tristan Partin --- pgxn/neon/Makefile | 1 + pgxn/neon/logical_replication_monitor.c | 253 ++++++++++++++++++++++++ pgxn/neon/logical_replication_monitor.h | 6 + pgxn/neon/neon.c | 245 +---------------------- 4 files changed, 261 insertions(+), 244 deletions(-) create mode 100644 pgxn/neon/logical_replication_monitor.c create mode 100644 pgxn/neon/logical_replication_monitor.h diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 1503b856f7..42f2a8efda 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -8,6 +8,7 @@ OBJS = \ file_cache.o \ hll.o \ libpagestore.o \ + logical_replication_monitor.o \ neon.o \ neon_pgversioncompat.o \ neon_perf_counters.o \ diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c new file mode 100644 index 0000000000..2de429b83d --- /dev/null +++ b/pgxn/neon/logical_replication_monitor.c @@ -0,0 +1,253 @@ +#include +#include +#include +#include + +#include "postgres.h" + +#include "miscadmin.h" +#include "postmaster/bgworker.h" +#include "postmaster/interrupt.h" +#include "replication/slot.h" +#include "storage/fd.h" +#include "storage/procsignal.h" +#include "tcop/tcopprot.h" +#include "utils/guc.h" +#include "utils/wait_event.h" + +#include "logical_replication_monitor.h" + +#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ + +static int logical_replication_max_snap_files = 300; + +PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); + +static int +LsnDescComparator(const void *a, const void *b) +{ + XLogRecPtr lsn1 = *((const XLogRecPtr *) a); + XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + + if (lsn1 < lsn2) + return 1; + else if (lsn1 == lsn2) + return 0; + else + return -1; +} + +/* + * Look at .snap files and calculate minimum allowed restart_lsn of slot so that + * next gc would leave not more than logical_replication_max_snap_files; all + * slots having lower restart_lsn should be dropped. 
+ */ +static XLogRecPtr +get_num_snap_files_lsn_threshold(void) +{ + DIR *dirdesc; + struct dirent *de; + char *snap_path = "pg_logical/snapshots/"; + int lsns_allocated = 1024; + int lsns_num = 0; + XLogRecPtr *lsns; + XLogRecPtr cutoff; + + if (logical_replication_max_snap_files < 0) + return 0; + + lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); + + /* find all .snap files and get their lsns */ + dirdesc = AllocateDir(snap_path); + while ((de = ReadDir(dirdesc, snap_path)) != NULL) + { + XLogRecPtr lsn; + uint32 hi; + uint32 lo; + + if (strcmp(de->d_name, ".") == 0 || + strcmp(de->d_name, "..") == 0) + continue; + + if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2) + { + ereport(LOG, + (errmsg("could not parse file name as .snap file \"%s\"", de->d_name))); + continue; + } + + lsn = ((uint64) hi) << 32 | lo; + elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); + if (lsns_allocated == lsns_num) + { + lsns_allocated *= 2; + lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); + } + lsns[lsns_num++] = lsn; + } + /* sort by lsn desc */ + qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); + /* and take cutoff at logical_replication_max_snap_files */ + if (logical_replication_max_snap_files > lsns_num) + cutoff = 0; + /* have less files than cutoff */ + else + { + cutoff = lsns[logical_replication_max_snap_files - 1]; + elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", + LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); + } + pfree(lsns); + FreeDir(dirdesc); + return cutoff; +} + +void +InitLogicalReplicationMonitor(void) +{ + BackgroundWorker bgw; + + DefineCustomIntVariable( + "neon.logical_replication_max_snap_files", + "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", + NULL, + &logical_replication_max_snap_files, + 300, -1, INT_MAX, + PGC_SIGHUP, + 0, + NULL, NULL, NULL); + + memset(&bgw, 0, sizeof(bgw)); + bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; + bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; + snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); + snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain"); + snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor"); + snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor"); + bgw.bgw_restart_time = 5; + bgw.bgw_notify_pid = 0; + bgw.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&bgw); +} + +/* + * Unused logical replication slots pins WAL and prevents deletion of snapshots. + * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which + * need too many .snap files. + */ +void +LogicalSlotsMonitorMain(Datum main_arg) +{ + /* Establish signal handlers. */ + pqsignal(SIGUSR1, procsignal_sigusr1_handler); + pqsignal(SIGHUP, SignalHandlerForConfigReload); + pqsignal(SIGTERM, die); + + BackgroundWorkerUnblockSignals(); + + for (;;) + { + XLogRecPtr cutoff_lsn; + + /* In case of a SIGHUP, just reload the configuration. */ + if (ConfigReloadPending) + { + ConfigReloadPending = false; + ProcessConfigFile(PGC_SIGHUP); + } + + /* + * If there are too many .snap files, just drop all logical slots to + * prevent aux files bloat. 
+ */ + cutoff_lsn = get_num_snap_files_lsn_threshold(); + if (cutoff_lsn > 0) + { + for (int i = 0; i < max_replication_slots; i++) + { + char slot_name[NAMEDATALEN]; + ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; + XLogRecPtr restart_lsn; + + /* find the name */ + LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); + /* Consider only logical repliction slots */ + if (!s->in_use || !SlotIsLogical(s)) + { + LWLockRelease(ReplicationSlotControlLock); + continue; + } + + /* do we need to drop it? */ + SpinLockAcquire(&s->mutex); + restart_lsn = s->data.restart_lsn; + SpinLockRelease(&s->mutex); + if (restart_lsn >= cutoff_lsn) + { + LWLockRelease(ReplicationSlotControlLock); + continue; + } + + strlcpy(slot_name, s->data.name.data, NAMEDATALEN); + elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", + slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); + LWLockRelease(ReplicationSlotControlLock); + + /* now try to drop it, killing owner before if any */ + for (;;) + { + pid_t active_pid; + + SpinLockAcquire(&s->mutex); + active_pid = s->active_pid; + SpinLockRelease(&s->mutex); + + if (active_pid == 0) + { + /* + * Slot is releasted, try to drop it. Though of course + * it could have been reacquired, so drop can ERROR + * out. Similarly it could have been dropped in the + * meanwhile. + * + * In principle we could remove pg_try/pg_catch, that + * would restart the whole bgworker. + */ + ConditionVariableCancelSleep(); + PG_TRY(); + { + ReplicationSlotDrop(slot_name, true); + elog(LOG, "ls_monitor: slot %s dropped", slot_name); + } + PG_CATCH(); + { + /* log ERROR and reset elog stack */ + EmitErrorReport(); + FlushErrorState(); + elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); + } + PG_END_TRY(); + break; + } + else + { + /* kill the owner and wait for release */ + elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); + (void) kill(active_pid, SIGTERM); + /* We shouldn't get stuck, but to be safe add timeout. 
*/ + ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); + } + } + } + } + + (void) WaitLatch(MyLatch, + WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, + LS_MONITOR_CHECK_INTERVAL, + PG_WAIT_EXTENSION); + ResetLatch(MyLatch); + CHECK_FOR_INTERRUPTS(); + } +} diff --git a/pgxn/neon/logical_replication_monitor.h b/pgxn/neon/logical_replication_monitor.h new file mode 100644 index 0000000000..a2f9949b19 --- /dev/null +++ b/pgxn/neon/logical_replication_monitor.h @@ -0,0 +1,6 @@ +#ifndef __NEON_LOGICAL_REPLICATION_MONITOR_H__ +#define __NEON_LOGICAL_REPLICATION_MONITOR_H__ + +void InitLogicalReplicationMonitor(void); + +#endif diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index c3ed96710a..f8ec725c18 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -14,32 +14,22 @@ #include "miscadmin.h" #include "access/subtrans.h" #include "access/twophase.h" -#include "access/xact.h" #include "access/xlog.h" -#include "storage/buf_internals.h" -#include "storage/bufmgr.h" -#include "catalog/pg_type.h" -#include "postmaster/bgworker.h" -#include "postmaster/interrupt.h" #include "replication/logical.h" #include "replication/slot.h" #include "replication/walsender.h" #include "storage/proc.h" -#include "storage/procsignal.h" -#include "tcop/tcopprot.h" #include "funcapi.h" #include "access/htup_details.h" #include "utils/builtins.h" #include "utils/pg_lsn.h" #include "utils/guc.h" #include "utils/guc_tables.h" -#include "utils/wait_event.h" #include "extension_server.h" #include "neon.h" -#include "walproposer.h" -#include "pagestore_client.h" #include "control_plane_connector.h" +#include "logical_replication_monitor.h" #include "walsender_hooks.h" #if PG_MAJORVERSION_NUM >= 16 #include "storage/ipc.h" @@ -48,7 +38,6 @@ PG_MODULE_MAGIC; void _PG_init(void); -static int logical_replication_max_snap_files = 300; static int running_xacts_overflow_policy; @@ -82,237 +71,6 @@ static const struct config_enum_entry running_xacts_overflow_policies[] = { {NULL, 0, false} }; -static void -InitLogicalReplicationMonitor(void) -{ - BackgroundWorker bgw; - - DefineCustomIntVariable( - "neon.logical_replication_max_snap_files", - "Maximum allowed logical replication .snap files. When exceeded, slots are dropped until the limit is met. -1 disables the limit.", - NULL, - &logical_replication_max_snap_files, - 300, -1, INT_MAX, - PGC_SIGHUP, - 0, - NULL, NULL, NULL); - - memset(&bgw, 0, sizeof(bgw)); - bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; - bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; - snprintf(bgw.bgw_library_name, BGW_MAXLEN, "neon"); - snprintf(bgw.bgw_function_name, BGW_MAXLEN, "LogicalSlotsMonitorMain"); - snprintf(bgw.bgw_name, BGW_MAXLEN, "Logical replication monitor"); - snprintf(bgw.bgw_type, BGW_MAXLEN, "Logical replication monitor"); - bgw.bgw_restart_time = 5; - bgw.bgw_notify_pid = 0; - bgw.bgw_main_arg = (Datum) 0; - - RegisterBackgroundWorker(&bgw); -} - -static int -LsnDescComparator(const void *a, const void *b) -{ - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); - - if (lsn1 < lsn2) - return 1; - else if (lsn1 == lsn2) - return 0; - else - return -1; -} - -/* - * Look at .snap files and calculate minimum allowed restart_lsn of slot so that - * next gc would leave not more than logical_replication_max_snap_files; all - * slots having lower restart_lsn should be dropped. 
- */ -static XLogRecPtr -get_num_snap_files_lsn_threshold(void) -{ - DIR *dirdesc; - struct dirent *de; - char *snap_path = "pg_logical/snapshots/"; - int lsns_allocated = 1024; - int lsns_num = 0; - XLogRecPtr *lsns; - XLogRecPtr cutoff; - - if (logical_replication_max_snap_files < 0) - return 0; - - lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); - - /* find all .snap files and get their lsns */ - dirdesc = AllocateDir(snap_path); - while ((de = ReadDir(dirdesc, snap_path)) != NULL) - { - XLogRecPtr lsn; - uint32 hi; - uint32 lo; - - if (strcmp(de->d_name, ".") == 0 || - strcmp(de->d_name, "..") == 0) - continue; - - if (sscanf(de->d_name, "%X-%X.snap", &hi, &lo) != 2) - { - ereport(LOG, - (errmsg("could not parse file name as .snap file \"%s\"", de->d_name))); - continue; - } - - lsn = ((uint64) hi) << 32 | lo; - elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); - if (lsns_allocated == lsns_num) - { - lsns_allocated *= 2; - lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); - } - lsns[lsns_num++] = lsn; - } - /* sort by lsn desc */ - qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); - /* and take cutoff at logical_replication_max_snap_files */ - if (logical_replication_max_snap_files > lsns_num) - cutoff = 0; - /* have less files than cutoff */ - else - { - cutoff = lsns[logical_replication_max_snap_files - 1]; - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", - LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); - } - pfree(lsns); - FreeDir(dirdesc); - return cutoff; -} - -#define LS_MONITOR_CHECK_INTERVAL 10000 /* ms */ - -/* - * Unused logical replication slots pins WAL and prevents deletion of snapshots. - * WAL bloat is guarded by max_slot_wal_keep_size; this bgw removes slots which - * need too many .snap files. - */ -PGDLLEXPORT void -LogicalSlotsMonitorMain(Datum main_arg) -{ - /* Establish signal handlers. */ - pqsignal(SIGUSR1, procsignal_sigusr1_handler); - pqsignal(SIGHUP, SignalHandlerForConfigReload); - pqsignal(SIGTERM, die); - - BackgroundWorkerUnblockSignals(); - - for (;;) - { - XLogRecPtr cutoff_lsn; - - /* In case of a SIGHUP, just reload the configuration. */ - if (ConfigReloadPending) - { - ConfigReloadPending = false; - ProcessConfigFile(PGC_SIGHUP); - } - - /* - * If there are too many .snap files, just drop all logical slots to - * prevent aux files bloat. - */ - cutoff_lsn = get_num_snap_files_lsn_threshold(); - if (cutoff_lsn > 0) - { - for (int i = 0; i < max_replication_slots; i++) - { - char slot_name[NAMEDATALEN]; - ReplicationSlot *s = &ReplicationSlotCtl->replication_slots[i]; - XLogRecPtr restart_lsn; - - /* find the name */ - LWLockAcquire(ReplicationSlotControlLock, LW_SHARED); - /* Consider only logical repliction slots */ - if (!s->in_use || !SlotIsLogical(s)) - { - LWLockRelease(ReplicationSlotControlLock); - continue; - } - - /* do we need to drop it? 
*/ - SpinLockAcquire(&s->mutex); - restart_lsn = s->data.restart_lsn; - SpinLockRelease(&s->mutex); - if (restart_lsn >= cutoff_lsn) - { - LWLockRelease(ReplicationSlotControlLock); - continue; - } - - strlcpy(slot_name, s->data.name.data, NAMEDATALEN); - elog(LOG, "ls_monitor: dropping slot %s with restart_lsn %X/%X below horizon %X/%X", - slot_name, LSN_FORMAT_ARGS(restart_lsn), LSN_FORMAT_ARGS(cutoff_lsn)); - LWLockRelease(ReplicationSlotControlLock); - - /* now try to drop it, killing owner before if any */ - for (;;) - { - pid_t active_pid; - - SpinLockAcquire(&s->mutex); - active_pid = s->active_pid; - SpinLockRelease(&s->mutex); - - if (active_pid == 0) - { - /* - * Slot is releasted, try to drop it. Though of course - * it could have been reacquired, so drop can ERROR - * out. Similarly it could have been dropped in the - * meanwhile. - * - * In principle we could remove pg_try/pg_catch, that - * would restart the whole bgworker. - */ - ConditionVariableCancelSleep(); - PG_TRY(); - { - ReplicationSlotDrop(slot_name, true); - elog(LOG, "ls_monitor: slot %s dropped", slot_name); - } - PG_CATCH(); - { - /* log ERROR and reset elog stack */ - EmitErrorReport(); - FlushErrorState(); - elog(LOG, "ls_monitor: failed to drop slot %s", slot_name); - } - PG_END_TRY(); - break; - } - else - { - /* kill the owner and wait for release */ - elog(LOG, "ls_monitor: killing slot %s owner %d", slot_name, active_pid); - (void) kill(active_pid, SIGTERM); - /* We shouldn't get stuck, but to be safe add timeout. */ - ConditionVariableTimedSleep(&s->active_cv, 1000, WAIT_EVENT_REPLICATION_SLOT_DROP); - } - } - } - } - - (void) WaitLatch(MyLatch, - WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | WL_TIMEOUT, - LS_MONITOR_CHECK_INTERVAL, - PG_WAIT_EXTENSION); - ResetLatch(MyLatch); - CHECK_FOR_INTERRUPTS(); - } -} - /* * XXX: These private to procarray.c, but we need them here. */ @@ -667,7 +425,6 @@ _PG_init(void) SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; InitLogicalReplicationMonitor(); - InitControlPlaneConnector(); pg_init_extension_server(); From 2090e928d158913c9075ba23a936c956668a234f Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Fri, 25 Oct 2024 15:44:20 +0200 Subject: [PATCH 088/239] refactor(timeline creation): idempotency checking (#9501) # Context In the PGDATA import code (https://github.com/neondatabase/neon/pull/9218) I add a third way to create timelines, namely, by importing from a copy of a vanilla PGDATA directory in object storage. For idempotency, I'm using the PGDATA object storage location specification, which is stored in the IndexPart for the entire lifespan of the timeline. When loading the timeline from remote storage, that value gets stored inside `struct Timeline` and timeline creation compares the creation argument with that value to determine idempotency of the request. # Changes This PR refactors the existing idempotency handling of Timeline bootstrap and branching such that we simply compare the `CreateTimelineIdempotency` struct, using the derive-generated `PartialEq` implementation. Also, by spelling idempotency out in the type names, I find it adds a lot of clarity. The pathway to idempotency via requester-provided idempotency key also becomes very straight-forward, if we ever want to do this in the future. 
# Refs * platform context: https://github.com/neondatabase/neon/pull/9218 * product context: https://github.com/neondatabase/cloud/issues/17507 * stacks on top of https://github.com/neondatabase/neon/pull/9366 --- pageserver/src/http/openapi_spec.yml | 4 + pageserver/src/tenant.rs | 158 +++++++++++++---------- pageserver/src/tenant/timeline.rs | 6 + pageserver/src/tenant/timeline/delete.rs | 1 + pageserver/src/tenant/timeline/uninit.rs | 19 ++- 5 files changed, 119 insertions(+), 69 deletions(-) diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml index 42086dc2e6..2bc7f5ad39 100644 --- a/pageserver/src/http/openapi_spec.yml +++ b/pageserver/src/http/openapi_spec.yml @@ -597,6 +597,10 @@ paths: Create a timeline. Returns new timeline id on success. Recreating the same timeline will succeed if the parameters match the existing timeline. If no pg_version is specified, assume DEFAULT_PG_VERSION hardcoded in the pageserver. + + To ensure durability, the caller must retry the creation until success. + Just because the timeline is visible via other endpoints does not mean it is durable. + Future versions may stop showing timelines that are not yet durable. requestBody: content: application/json: diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 968d093a80..d4f6384d9b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -800,15 +800,25 @@ pub(crate) struct CreateTimelineParamsBranch { pub(crate) ancestor_start_lsn: Option, } -/// What is used to determine idempotency of a [`Tenant::create_timeline`] call. +/// What is used to determine idempotency of a [`Tenant::create_timeline`] call in [`Tenant::start_creating_timeline`]. /// -/// Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN. +/// Each [`Timeline`] object holds [`Self`] as an immutable property in [`Timeline::create_idempotency`]. /// -/// We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`] -/// is not considered for idempotency. +/// We lower timeline creation requests to [`Self`], and then use [`PartialEq::eq`] to compare [`Timeline::create_idempotency`] with the request. +/// If they are equal, we return a reference to the existing timeline, otherwise it's an idempotency conflict. /// -/// We can improve on this over time. +/// There is special treatment for [`Self::FailWithConflict`] to always return an idempotency conflict. +/// It would be nice to have more advanced derive macros to make that special treatment declarative. +/// +/// Notes: +/// - Unlike [`CreateTimelineParams`], ancestor LSN is fixed, so, branching will be at a deterministic LSN. +/// - We make some trade-offs though, e.g., [`CreateTimelineParamsBootstrap::existing_initdb_timeline_id`] +/// is not considered for idempotency. We can improve on this over time if we deem it necessary. +/// +#[derive(Debug, Clone, PartialEq, Eq)] pub(crate) enum CreateTimelineIdempotency { + /// NB: special treatment, see comment in [`Self`]. 
+ FailWithConflict, Bootstrap { pg_version: u32, }, @@ -836,6 +846,12 @@ enum CreateTimelineResult { } impl CreateTimelineResult { + fn discriminant(&self) -> &'static str { + match self { + Self::Created(_) => "Created", + Self::Idempotent(_) => "Idempotent", + } + } fn timeline(&self) -> &Arc { match self { Self::Created(t) | Self::Idempotent(t) => t, @@ -989,12 +1005,24 @@ impl Tenant { ) -> anyhow::Result<()> { let tenant_id = self.tenant_shard_id; + let idempotency = if metadata.ancestor_timeline().is_none() { + CreateTimelineIdempotency::Bootstrap { + pg_version: metadata.pg_version(), + } + } else { + CreateTimelineIdempotency::Branch { + ancestor_timeline_id: metadata.ancestor_timeline().unwrap(), + ancestor_start_lsn: metadata.ancestor_lsn(), + } + }; + let timeline = self.create_timeline_struct( timeline_id, &metadata, ancestor.clone(), resources, CreateTimelineCause::Load, + idempotency.clone(), )?; let disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -2061,16 +2089,17 @@ impl Tenant { self.timelines.lock().unwrap().keys().cloned().collect() } - /// This is used to create the initial 'main' timeline during bootstrapping, - /// or when importing a new base backup. The caller is expected to load an - /// initial image of the datadir to the new timeline after this. + /// This is used by tests & import-from-basebackup. /// - /// Until that happens, the on-disk state is invalid (disk_consistent_lsn=Lsn(0)) - /// and the timeline will fail to load at a restart. + /// The returned [`UninitializedTimeline`] contains no data nor metadata and it is in + /// a state that will fail [`Tenant::load_remote_timeline`] because `disk_consistent_lsn=Lsn(0)`. /// - /// For tests, use `DatadirModification::init_empty_test_timeline` + `commit` to setup the - /// minimum amount of keys required to get a writable timeline. - /// (Without it, `put` might fail due to `repartition` failing.) + /// The caller is responsible for getting the timeline into a state that will be accepted + /// by [`Tenant::load_remote_timeline`] / [`Tenant::attach`]. + /// Then they may call [`UninitializedTimeline::finish_creation`] to add the timeline + /// to the [`Tenant::timelines`]. + /// + /// Tests should use `Tenant::create_test_timeline` to set up the minimum required metadata keys. pub(crate) async fn create_empty_timeline( &self, new_timeline_id: TimelineId, @@ -2084,7 +2113,15 @@ impl Tenant { ); // Protect against concurrent attempts to use this TimelineId - let create_guard = self.create_timeline_create_guard(new_timeline_id)?; + let create_guard = match self + .start_creating_timeline(new_timeline_id, CreateTimelineIdempotency::FailWithConflict) + .await? + { + StartCreatingTimelineResult::CreateGuard(guard) => guard, + StartCreatingTimelineResult::Idempotent(_) => { + unreachable!("FailWithConflict implies we get an error instead") + } + }; let new_metadata = TimelineMetadata::new( // Initialize disk_consistent LSN to 0, The caller must import some data to @@ -2294,8 +2331,17 @@ impl Tenant { // At this point we have dropped our guard on [`Self::timelines_creating`], and // the timeline is visible in [`Self::timelines`], but it is _not_ durable yet. We must - // not send a success to the caller until it is. The same applies to handling retries, - // that is done in [`Self::start_creating_timeline`]. + // not send a success to the caller until it is. The same applies to idempotent retries. 
+ // + // TODO: the timeline is already visible in [`Self::timelines`]; a caller could incorrectly + // assume that, because they can see the timeline via API, that the creation is done and + // that it is durable. Ideally, we would keep the timeline hidden (in [`Self::timelines_creating`]) + // until it is durable, e.g., by extending the time we hold the creation guard. This also + // interacts with UninitializedTimeline and is generally a bit tricky. + // + // To re-emphasize: the only correct way to create a timeline is to repeat calling the + // creation API until it returns success. Only then is durability guaranteed. + info!(creation_result=%result.discriminant(), "waiting for timeline to be durable"); result .timeline() .remote_client @@ -3332,6 +3378,7 @@ impl Tenant { ancestor: Option>, resources: TimelineResources, cause: CreateTimelineCause, + create_idempotency: CreateTimelineIdempotency, ) -> anyhow::Result> { let state = match cause { CreateTimelineCause::Load => { @@ -3361,6 +3408,7 @@ impl Tenant { pg_version, state, self.attach_wal_lag_cooldown.clone(), + create_idempotency, self.cancel.child_token(), ); @@ -4046,6 +4094,8 @@ impl Tenant { .schedule_index_upload_for_full_metadata_update(&metadata) .context("branch initial metadata upload")?; + // Callers are responsible to wait for uploads to complete and for activating the timeline. + Ok(CreateTimelineResult::Created(new_timeline)) } @@ -4079,7 +4129,7 @@ impl Tenant { new_timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, ) -> Result, CreateTimelineError> { - match self.create_timeline_create_guard(new_timeline_id) { + match self.create_timeline_create_guard(new_timeline_id, idempotency) { Ok(create_guard) => { pausable_failpoint!("timeline-creation-after-uninit"); Ok(StartCreatingTimelineResult::CreateGuard(create_guard)) @@ -4091,62 +4141,30 @@ impl Tenant { Err(CreateTimelineError::AlreadyCreating) } Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)), - Err(TimelineExclusionError::AlreadyExists(existing)) => { - debug!("timeline already exists"); + Err(TimelineExclusionError::AlreadyExists { existing, arg }) => { + { + let existing = &existing.create_idempotency; + let _span = info_span!("idempotency_check", ?existing, ?arg).entered(); - // Idempotency: creating the same timeline twice is not an error, unless - // the second creation has different parameters. - // - // TODO: this is a crutch; we should store the CreateTimelineState as an - // immutable attribute in the index part, and compare them using derive(`Eq`). 
- match idempotency { - CreateTimelineIdempotency::Bootstrap { pg_version } => { - if existing.pg_version != pg_version { - info!("timeline already exists with different pg_version"); + match (existing, &arg) { + // FailWithConflict => no idempotency check + (CreateTimelineIdempotency::FailWithConflict, _) + | (_, CreateTimelineIdempotency::FailWithConflict) => { + warn!("timeline already exists, failing request"); return Err(CreateTimelineError::Conflict); } - if existing.get_ancestor_timeline_id().is_some() { - info!("timeline already exists with an ancestor"); - return Err(CreateTimelineError::Conflict); + // Idempotent <=> CreateTimelineIdempotency is identical + (x, y) if x == y => { + info!("timeline already exists and idempotency matches, succeeding request"); + // fallthrough } - if existing.get_ancestor_lsn() != Lsn::INVALID { - info!("timeline already exists with an ancestor LSN"); - return Err(CreateTimelineError::Conflict); - } - } - CreateTimelineIdempotency::Branch { - ancestor_timeline_id, - ancestor_start_lsn, - } => { - if existing.get_ancestor_timeline_id() != Some(ancestor_timeline_id) { - info!("timeline already exists with different ancestor"); - return Err(CreateTimelineError::Conflict); - } - if existing.get_ancestor_lsn() != ancestor_start_lsn { - info!("timeline already exists with different ancestor LSN"); + (_, _) => { + warn!("idempotency conflict, failing request"); return Err(CreateTimelineError::Conflict); } } } - // Wait for uploads to complete, so that when we return Ok, the timeline - // is known to be durable on remote storage. Just like we do at the end of - // this function, after we have created the timeline ourselves. - // - // We only really care that the initial version of `index_part.json` has - // been uploaded. That's enough to remember that the timeline - // exists. However, there is no function to wait specifically for that so - // we just wait for all in-progress uploads to finish. - existing - .remote_client - .wait_completion() - .await - .context("wait for timeline uploads to complete")?; - - // TODO: shouldn't we also wait for timeline to become active? - // Code before this(https://github.com/neondatabase/neon/pull/9366) refactoring - // didn't do it. - Ok(StartCreatingTimelineResult::Idempotent(existing)) } } @@ -4359,6 +4377,8 @@ impl Tenant { // All done! let timeline = raw_timeline.finish_creation()?; + // Callers are responsible to wait for uploads to complete and for activating the timeline. + Ok(CreateTimelineResult::Created(timeline)) } @@ -4409,6 +4429,7 @@ impl Tenant { ancestor, resources, CreateTimelineCause::Load, + create_guard.idempotency.clone(), ) .context("Failed to create timeline data structure")?; @@ -4449,12 +4470,14 @@ impl Tenant { fn create_timeline_create_guard( &self, timeline_id: TimelineId, + idempotency: CreateTimelineIdempotency, ) -> Result { let tenant_shard_id = self.tenant_shard_id; let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - let create_guard = TimelineCreateGuard::new(self, timeline_id, timeline_path.clone())?; + let create_guard = + TimelineCreateGuard::new(self, timeline_id, timeline_path.clone(), idempotency)?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. 
@@ -5090,7 +5113,10 @@ mod tests { .await { Ok(_) => panic!("duplicate timeline creation should fail"), - Err(e) => assert_eq!(e.to_string(), "Already exists".to_string()), + Err(e) => assert_eq!( + e.to_string(), + "timeline already exists with different parameters".to_string() + ), } Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 7b40a24c54..f8d61dac5e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -424,6 +424,9 @@ pub struct Timeline { pub(crate) handles: handle::PerTimelineState, pub(crate) attach_wal_lag_cooldown: Arc>, + + /// Cf. [`crate::tenant::CreateTimelineIdempotency`]. + pub(crate) create_idempotency: crate::tenant::CreateTimelineIdempotency, } pub type TimelineDeleteProgress = Arc>; @@ -2136,6 +2139,7 @@ impl Timeline { pg_version: u32, state: TimelineState, attach_wal_lag_cooldown: Arc>, + create_idempotency: crate::tenant::CreateTimelineIdempotency, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2274,6 +2278,8 @@ impl Timeline { handles: Default::default(), attach_wal_lag_cooldown, + + create_idempotency, }; result.repartition_threshold = diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 4799aab436..a664bb59e1 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -313,6 +313,7 @@ impl DeleteTimelineFlow { // Important. We dont pass ancestor above because it can be missing. // Thus we need to skip the validation here. CreateTimelineCause::Delete, + crate::tenant::CreateTimelineIdempotency::FailWithConflict, // doesn't matter what we put here ) .context("create_timeline_struct")?; diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 2b60e670ea..7d66c5aec8 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -5,7 +5,11 @@ use camino::Utf8PathBuf; use tracing::{error, info, info_span}; use utils::{fs_ext, id::TimelineId, lsn::Lsn}; -use crate::{context::RequestContext, import_datadir, tenant::Tenant}; +use crate::{ + context::RequestContext, + import_datadir, + tenant::{CreateTimelineIdempotency, Tenant}, +}; use super::Timeline; @@ -165,13 +169,17 @@ pub(crate) struct TimelineCreateGuard<'t> { owning_tenant: &'t Tenant, timeline_id: TimelineId, pub(crate) timeline_path: Utf8PathBuf, + pub(crate) idempotency: CreateTimelineIdempotency, } /// Errors when acquiring exclusive access to a timeline ID for creation #[derive(thiserror::Error, Debug)] pub(crate) enum TimelineExclusionError { #[error("Already exists")] - AlreadyExists(Arc), + AlreadyExists { + existing: Arc, + arg: CreateTimelineIdempotency, + }, #[error("Already creating")] AlreadyCreating, @@ -185,6 +193,7 @@ impl<'t> TimelineCreateGuard<'t> { owning_tenant: &'t Tenant, timeline_id: TimelineId, timeline_path: Utf8PathBuf, + idempotency: CreateTimelineIdempotency, ) -> Result { // Lock order: this is the only place we take both locks. 
During drop() we only // lock creating_timelines @@ -195,7 +204,10 @@ impl<'t> TimelineCreateGuard<'t> { > = owning_tenant.timelines_creating.lock().unwrap(); if let Some(existing) = timelines.get(&timeline_id) { - Err(TimelineExclusionError::AlreadyExists(existing.clone())) + Err(TimelineExclusionError::AlreadyExists { + existing: existing.clone(), + arg: idempotency, + }) } else if creating_timelines.contains(&timeline_id) { Err(TimelineExclusionError::AlreadyCreating) } else { @@ -204,6 +216,7 @@ impl<'t> TimelineCreateGuard<'t> { owning_tenant, timeline_id, timeline_path, + idempotency, }) } } From 8297f7a1816e43fcde83dbf4c01551d2bdef49b7 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 25 Oct 2024 15:09:02 +0100 Subject: [PATCH 089/239] pageserver: fix N^2 I/O when processing relation drops in transaction abort (#9507) ## Problem We have some known N^2 behaviors when it comes to large relation counts, due to the monolithic encoding and full rewrites of of RelDirectory each time a relation is added. Ordinarily our backpressure mechanisms give "slow but steady" performance when creating/dropping/truncating relations. However, in the case of a transaction abort, it is possible for a single WAL record to drop an unbounded number of relations. The results in an unavailable compute, as when it sends one of these records, it can stall the pageserver's ingest for many minutes, even though the compute only sent a small amount of WAL. Closes https://github.com/neondatabase/neon/issues/9505 ## Summary of changes - Rewrite relation-dropping code to do one read/modify/write cycle of RelDirectory, instead of doing it separately for each relation in a loop. - Add a test for the bug scenario encountered: `test_tx_abort_with_many_relations` The test has ~40s runtime on my workstation. About 1 second of that is the part where we wait for ingest to catch up after a rollback, the rest is the slowness of creating and truncating a large number of relations. --------- Co-authored-by: Heikki Linnakangas --- pageserver/src/pgdatadir_mapping.rs | 55 ++++++++------- pageserver/src/walingest.rs | 36 +++++----- test_runner/regress/test_pg_regress.py | 97 ++++++++++++++++++++++++++ 3 files changed, 146 insertions(+), 42 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index f2a11e65c1..19233a28cc 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -1506,35 +1506,42 @@ impl<'a> DatadirModification<'a> { Ok(()) } - /// Drop a relation. 
- pub async fn put_rel_drop(&mut self, rel: RelTag, ctx: &RequestContext) -> anyhow::Result<()> { - anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode); + /// Drop some relations + pub(crate) async fn put_rel_drops( + &mut self, + drop_relations: HashMap<(u32, u32), Vec>, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + for ((spc_node, db_node), rel_tags) in drop_relations { + let dir_key = rel_dir_to_key(spc_node, db_node); + let buf = self.get(dir_key, ctx).await?; + let mut dir = RelDirectory::des(&buf)?; - // Remove it from the directory entry - let dir_key = rel_dir_to_key(rel.spcnode, rel.dbnode); - let buf = self.get(dir_key, ctx).await?; - let mut dir = RelDirectory::des(&buf)?; + let mut dirty = false; + for rel_tag in rel_tags { + if dir.rels.remove(&(rel_tag.relnode, rel_tag.forknum)) { + dirty = true; - self.pending_directory_entries - .push((DirectoryKind::Rel, dir.rels.len())); + // update logical size + let size_key = rel_size_to_key(rel_tag); + let old_size = self.get(size_key, ctx).await?.get_u32_le(); + self.pending_nblocks -= old_size as i64; - if dir.rels.remove(&(rel.relnode, rel.forknum)) { - self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); - } else { - warn!("dropped rel {} did not exist in rel directory", rel); + // Remove entry from relation size cache + self.tline.remove_cached_rel_size(&rel_tag); + + // Delete size entry, as well as all blocks + self.delete(rel_key_range(rel_tag)); + } + } + + if dirty { + self.put(dir_key, Value::Image(Bytes::from(RelDirectory::ser(&dir)?))); + self.pending_directory_entries + .push((DirectoryKind::Rel, dir.rels.len())); + } } - // update logical size - let size_key = rel_size_to_key(rel); - let old_size = self.get(size_key, ctx).await?.get_u32_le(); - self.pending_nblocks -= old_size as i64; - - // Remove enty from relation size cache - self.tline.remove_cached_rel_size(&rel); - - // Delete size entry, as well as all blocks - self.delete(rel_key_range(rel)); - Ok(()) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d81552ac77..9e43e10801 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,6 +21,7 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::collections::HashMap; use std::sync::Arc; use std::sync::OnceLock; use std::time::Duration; @@ -1620,6 +1621,12 @@ impl WalIngest { }, )?; + // Group relations to drop by dbNode. This map will contain all relations that _might_ + // exist, we will reduce it to which ones really exist later. This map can be huge if + // the transaction touches a huge number of relations (there is no bound on this in + // postgres). + let mut drop_relations: HashMap<(u32, u32), Vec> = HashMap::new(); + for xnode in &parsed.xnodes { for forknum in MAIN_FORKNUM..=INIT_FORKNUM { let rel = RelTag { @@ -1628,15 +1635,16 @@ impl WalIngest { dbnode: xnode.dbnode, relnode: xnode.relnode, }; - if modification - .tline - .get_rel_exists(rel, Version::Modified(modification), ctx) - .await? 
- { - self.put_rel_drop(modification, rel, ctx).await?; - } + drop_relations + .entry((xnode.spcnode, xnode.dbnode)) + .or_default() + .push(rel); } } + + // Execute relation drops in a batch: the number may be huge, so deleting individually is prohibitively expensive + modification.put_rel_drops(drop_relations, ctx).await?; + if origin_id != 0 { modification .set_replorigin(origin_id, parsed.origin_lsn) @@ -2346,16 +2354,6 @@ impl WalIngest { Ok(()) } - async fn put_rel_drop( - &mut self, - modification: &mut DatadirModification<'_>, - rel: RelTag, - ctx: &RequestContext, - ) -> Result<()> { - modification.put_rel_drop(rel, ctx).await?; - Ok(()) - } - async fn handle_rel_extend( &mut self, modification: &mut DatadirModification<'_>, @@ -2869,7 +2867,9 @@ mod tests { // Drop rel let mut m = tline.begin_modification(Lsn(0x30)); - walingest.put_rel_drop(&mut m, TESTREL_A, &ctx).await?; + let mut rel_drops = HashMap::new(); + rel_drops.insert((TESTREL_A.spcnode, TESTREL_A.dbnode), vec![TESTREL_A]); + m.put_rel_drops(rel_drops, &ctx).await?; m.commit(&ctx).await?; // Check that rel is not visible anymore diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index 45ce5b1c5b..b97fccddf5 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -3,10 +3,13 @@ # from __future__ import annotations +import os +from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import TYPE_CHECKING, cast import pytest +from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, NeonEnv, @@ -324,3 +327,97 @@ def test_sql_regress( pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath) post_checks(env, test_output_dir, DBNAME, endpoint) + + +@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +def test_tx_abort_with_many_relations( + neon_env_builder: NeonEnvBuilder, +): + """ + This is not a pg_regress test as such, but perhaps it should be -- this test exercises postgres + behavior when aborting a transaction with lots of relations. + + Reproducer for https://github.com/neondatabase/neon/issues/9505 + """ + + env = neon_env_builder.init_start() + ep = env.endpoints.create_start( + "main", + tenant_id=env.initial_tenant, + config_lines=[ + "shared_buffers=1000MB", + "max_locks_per_transaction=16384", + ], + ) + + # How many relations: this number is tuned to be long enough to take tens of seconds + # if the rollback code path is buggy, tripping the test's timeout. + n = 4000 + + def create(): + # Create many relations + log.info(f"Creating {n} relations...") + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN 1..{n} LOOP + table_name := 'table_' || i; + EXECUTE 'CREATE TABLE IF NOT EXISTS ' || table_name || ' (id SERIAL PRIMARY KEY, data TEXT)'; + END LOOP; + END $$; + """, + "COMMIT", + ] + ) + + def truncate(): + # Truncate relations, then roll back the transaction containing the truncations + log.info(f"Truncating {n} relations...") + ep.safe_psql_many( + [ + "BEGIN", + f"""DO $$ + DECLARE + i INT; + table_name TEXT; + BEGIN + FOR i IN 1..{n} LOOP + table_name := 'table_' || i; + EXECUTE 'TRUNCATE ' || table_name ; + END LOOP; + END $$; + """, + ] + ) + + def rollback_and_wait(): + log.info(f"Rolling back after truncating {n} relations...") + ep.safe_psql("ROLLBACK") + + # Restart the endpoint: this ensures that we can read back what we just wrote, i.e. 
pageserver + # ingest has caught up. + ep.stop() + log.info(f"Starting endpoint after truncating {n} relations...") + ep.start() + log.info(f"Started endpoint after truncating {n} relations...") + + # Actual create & truncate phases may be slow, these involves lots of WAL records. We do not + # apply a special timeout, they are expected to complete within general test timeout + create() + truncate() + + # Run in a thread because the failure case is to take pathologically long time, and we don't want + # to block the test executor on that. + with ThreadPoolExecutor(max_workers=1) as exec: + try: + # Rollback phase should be fast: this is one WAL record that we should process efficiently + fut = exec.submit(rollback_and_wait) + fut.result(timeout=5) + except: + exec.shutdown(wait=False, cancel_futures=True) + raise From dbadb0f9bbb3409f366a9f44ead3854d60676fd3 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 25 Oct 2024 15:34:19 +0100 Subject: [PATCH 090/239] proxy: propagate session IDs (#9509) fixes #9367 by sending session IDs to local_proxy, and also returns session IDs to the client for easier debugging. --- proxy/src/serverless/mod.rs | 23 +++++++++++++++++++++-- proxy/src/serverless/sql_over_http.rs | 10 ++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 29ff7b9d91..8fb7a771d9 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -32,6 +32,7 @@ use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; use rand::SeedableRng; +use sql_over_http::{uuid_to_header_value, NEON_REQUEST_ID}; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::{TcpListener, TcpStream}; use tokio::time::timeout; @@ -309,7 +310,18 @@ async fn connection_handler( hyper_util::rt::TokioIo::new(conn), hyper::service::service_fn(move |req: hyper::Request| { // First HTTP request shares the same session ID - let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + let mut session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); + + if matches!(backend.auth_backend, crate::auth::Backend::Local(_)) { + // take session_id from request, if given. + if let Some(id) = req + .headers() + .get(&NEON_REQUEST_ID) + .and_then(|id| uuid::Uuid::try_parse_ascii(id.as_bytes()).ok()) + { + session_id = id; + } + } // Cancel the current inflight HTTP request if the requets stream is closed. 
// This is slightly different to `_cancel_connection` in that @@ -335,8 +347,15 @@ async fn connection_handler( .map_ok_or_else(api_error_into_response, |r| r), ); async move { - let res = handler.await; + let mut res = handler.await; cancel_request.disarm(); + + // add the session ID to the response + if let Ok(resp) = &mut res { + resp.headers_mut() + .append(&NEON_REQUEST_ID, uuid_to_header_value(session_id)); + } + res } }), diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 8e2d4c126a..1f3eec6d19 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -23,6 +23,7 @@ use typed_json::json; use url::Url; use urlencoding; use utils::http::error::ApiError; +use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; use super::conn_pool::{AuthData, ConnInfoWithAuth}; @@ -63,6 +64,8 @@ enum Payload { Batch(BatchQueryData), } +pub(super) static NEON_REQUEST_ID: HeaderName = HeaderName::from_static("neon-request-id"); + static CONN_STRING: HeaderName = HeaderName::from_static("neon-connection-string"); static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); @@ -706,6 +709,12 @@ static HEADERS_TO_FORWARD: &[&HeaderName] = &[ &TXN_DEFERRABLE, ]; +pub(crate) fn uuid_to_header_value(id: Uuid) -> HeaderValue { + let mut uuid = [0; uuid::fmt::Hyphenated::LENGTH]; + HeaderValue::from_str(id.as_hyphenated().encode_lower(&mut uuid[..])) + .expect("uuid hyphenated format should be all valid header characters") +} + async fn handle_auth_broker_inner( ctx: &RequestMonitoring, request: Request, @@ -732,6 +741,7 @@ async fn handle_auth_broker_inner( req = req.header(h, hv); } } + req = req.header(&NEON_REQUEST_ID, uuid_to_header_value(ctx.session_id())); let req = req .body(body) From 700b102b0ffb7447f577a94e3b79b33ff48ba519 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Fri, 25 Oct 2024 17:48:29 +0300 Subject: [PATCH 091/239] safekeeper: retry eviction. (#9485) Without this manager may sleep forever after eviction failure without retries. --- safekeeper/src/bin/safekeeper.rs | 2 ++ safekeeper/src/timeline_eviction.rs | 9 +++++---- safekeeper/src/timeline_manager.rs | 7 ++++++- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 1e5f963a4f..1248428d33 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -193,6 +193,8 @@ struct Args { /// Usually, timeline eviction has to wait for `partial_backup_timeout` before being eligible for eviction, /// but if a timeline is un-evicted and then _not_ written to, it would immediately flap to evicting again, /// if it weren't for `eviction_min_resident` preventing that. + /// + /// Also defines interval for eviction retries. #[arg(long, value_parser = humantime::parse_duration, default_value = DEFAULT_EVICTION_MIN_RESIDENT)] eviction_min_resident: Duration, } diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index fae6571277..f5363ae9b0 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -66,15 +66,15 @@ impl Manager { ready } - /// Evict the timeline to remote storage. + /// Evict the timeline to remote storage. Returns whether the eviction was successful. 
#[instrument(name = "evict_timeline", skip_all)] - pub(crate) async fn evict_timeline(&mut self) { + pub(crate) async fn evict_timeline(&mut self) -> bool { assert!(!self.is_offloaded); let partial_backup_uploaded = match &self.partial_backup_uploaded { Some(p) => p.clone(), None => { warn!("no partial backup uploaded, skipping eviction"); - return; + return false; } }; @@ -91,11 +91,12 @@ impl Manager { if let Err(e) = do_eviction(self, &partial_backup_uploaded).await { warn!("failed to evict timeline: {:?}", e); - return; + return false; } info!("successfully evicted timeline"); NUM_EVICTED_TIMELINES.inc(); + true } /// Attempt to restore evicted timeline from remote storage; it must be diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 2129e86baa..f0583dd3ff 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -297,7 +297,12 @@ pub async fn main_task( match mgr.global_rate_limiter.try_acquire_eviction() { Some(_permit) => { mgr.set_status(Status::EvictTimeline); - mgr.evict_timeline().await; + if !mgr.evict_timeline().await { + // eviction failed, try again later + mgr.evict_not_before = + Instant::now() + rand_duration(&mgr.conf.eviction_min_resident); + update_next_event(&mut next_event, mgr.evict_not_before); + } } None => { // we can't evict timeline now, will try again later From 9909551f47e86d39e2df1cdeba79774e87744f17 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 25 Oct 2024 17:22:35 +0200 Subject: [PATCH 092/239] safekeeper: fix version in `TimelinePersistentState::empty()` (#9521) ## Problem The Postgres version in `TimelinePersistentState::empty()` is incorrect: the major version should be multiplied by 10000. ## Summary of changes Multiply the version by 10000. --- safekeeper/src/state.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 8dd873ee77..0826a148ec 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -143,8 +143,8 @@ impl TimelinePersistentState { TimelinePersistentState::new( &TenantTimelineId::empty(), ServerInfo { - pg_version: 17, /* Postgres server version */ - system_id: 0, /* Postgres system identifier */ + pg_version: 170000, /* Postgres server version (major * 10000) */ + system_id: 0, /* Postgres system identifier */ wal_seg_size: 16 * 1024 * 1024, }, vec![], From b54b632c6a4c7f0da88cd789ff92a644562d056e Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 25 Oct 2024 19:19:52 +0200 Subject: [PATCH 093/239] safekeeper: don't pass conf into storage constructors (#9523) ## Problem The storage components take an entire `SafekeeperConf` during construction, but only actually use the `no_sync` field. This makes it hard to understand the storage inputs (which fields do they actually care about?), and is also inconvenient for tests and benchmarks that need to set up a lot of unnecessary boilerplate. ## Summary of changes * Don't take the entire config, but pass in the `no_sync` field explicitly. * Take the timeline dir instead of `ttid` as an input, since it's the only thing it cares about. * Fix a couple of tests to not leak tempdirs. * Various minor tweaks. 
--- safekeeper/src/control_file.rs | 136 ++++++++----------------- safekeeper/src/copy_timeline.rs | 2 +- safekeeper/src/lib.rs | 1 + safekeeper/src/timeline.rs | 14 ++- safekeeper/src/timelines_global_map.rs | 4 +- safekeeper/src/wal_storage.rs | 19 ++-- 6 files changed, 67 insertions(+), 109 deletions(-) diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs index cd82e43780..06e5afbf74 100644 --- a/safekeeper/src/control_file.rs +++ b/safekeeper/src/control_file.rs @@ -14,12 +14,10 @@ use std::path::Path; use std::time::Instant; use crate::control_file_upgrade::downgrade_v9_to_v8; +use crate::control_file_upgrade::upgrade_control_file; use crate::metrics::PERSIST_CONTROL_FILE_SECONDS; use crate::state::{EvictionState, TimelinePersistentState}; -use crate::{control_file_upgrade::upgrade_control_file, timeline::get_timeline_dir}; -use utils::{bin_ser::LeSer, id::TenantTimelineId}; - -use crate::SafeKeeperConf; +use utils::bin_ser::LeSer; pub const SK_MAGIC: u32 = 0xcafeceefu32; pub const SK_FORMAT_VERSION: u32 = 9; @@ -54,13 +52,12 @@ pub struct FileStorage { impl FileStorage { /// Initialize storage by loading state from disk. - pub fn restore_new(ttid: &TenantTimelineId, conf: &SafeKeeperConf) -> Result { - let timeline_dir = get_timeline_dir(conf, ttid); - let state = Self::load_control_file_from_dir(&timeline_dir)?; + pub fn restore_new(timeline_dir: &Utf8Path, no_sync: bool) -> Result { + let state = Self::load_control_file_from_dir(timeline_dir)?; Ok(FileStorage { - timeline_dir, - no_sync: conf.no_sync, + timeline_dir: timeline_dir.to_path_buf(), + no_sync, state, last_persist_at: Instant::now(), }) @@ -71,16 +68,16 @@ impl FileStorage { /// Note: we normally call this in temp directory for atomic init, so /// interested in FileStorage as a result only in tests. 
pub async fn create_new( - dir: Utf8PathBuf, - conf: &SafeKeeperConf, + timeline_dir: &Utf8Path, state: TimelinePersistentState, + no_sync: bool, ) -> Result { // we don't support creating new timelines in offloaded state assert!(matches!(state.eviction_state, EvictionState::Present)); let mut store = FileStorage { - timeline_dir: dir, - no_sync: conf.no_sync, + timeline_dir: timeline_dir.to_path_buf(), + no_sync, state: state.clone(), last_persist_at: Instant::now(), }; @@ -239,89 +236,46 @@ mod test { use tokio::fs; use utils::lsn::Lsn; - fn stub_conf() -> SafeKeeperConf { - let workdir = camino_tempfile::tempdir().unwrap().into_path(); - SafeKeeperConf { - workdir, - ..SafeKeeperConf::dummy() - } - } + const NO_SYNC: bool = true; - async fn load_from_control_file( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - ) -> Result<(FileStorage, TimelinePersistentState)> { - let timeline_dir = get_timeline_dir(conf, ttid); - fs::create_dir_all(&timeline_dir) - .await - .expect("failed to create timeline dir"); - Ok(( - FileStorage::restore_new(ttid, conf)?, - FileStorage::load_control_file_from_dir(&timeline_dir)?, - )) - } + #[tokio::test] + async fn test_read_write_safekeeper_state() -> anyhow::Result<()> { + let tempdir = camino_tempfile::tempdir()?; + let mut state = TimelinePersistentState::empty(); + let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; - async fn create( - conf: &SafeKeeperConf, - ttid: &TenantTimelineId, - ) -> Result<(FileStorage, TimelinePersistentState)> { - let timeline_dir = get_timeline_dir(conf, ttid); - fs::create_dir_all(&timeline_dir) - .await - .expect("failed to create timeline dir"); - let state = TimelinePersistentState::empty(); - let storage = FileStorage::create_new(timeline_dir, conf, state.clone()).await?; - Ok((storage, state)) + // Make a change. + state.commit_lsn = Lsn(42); + storage.persist(&state).await?; + + // Reload the state. It should match the previously persisted state. 
+ let loaded_state = FileStorage::load_control_file_from_dir(tempdir.path())?; + assert_eq!(loaded_state, state); + Ok(()) } #[tokio::test] - async fn test_read_write_safekeeper_state() { - let conf = stub_conf(); - let ttid = TenantTimelineId::generate(); - { - let (mut storage, mut state) = - create(&conf, &ttid).await.expect("failed to create state"); - // change something - state.commit_lsn = Lsn(42); - storage - .persist(&state) - .await - .expect("failed to persist state"); - } - - let (_, state) = load_from_control_file(&conf, &ttid) - .await - .expect("failed to read state"); - assert_eq!(state.commit_lsn, Lsn(42)); - } - - #[tokio::test] - async fn test_safekeeper_state_checksum_mismatch() { - let conf = stub_conf(); - let ttid = TenantTimelineId::generate(); - { - let (mut storage, mut state) = - create(&conf, &ttid).await.expect("failed to read state"); - - // change something - state.commit_lsn = Lsn(42); - storage - .persist(&state) - .await - .expect("failed to persist state"); - } - let control_path = get_timeline_dir(&conf, &ttid).join(CONTROL_FILE_NAME); - let mut data = fs::read(&control_path).await.unwrap(); - data[0] += 1; // change the first byte of the file to fail checksum validation - fs::write(&control_path, &data) - .await - .expect("failed to write control file"); - - match load_from_control_file(&conf, &ttid).await { - Err(err) => assert!(err - .to_string() - .contains("safekeeper control file checksum mismatch")), - Ok(_) => panic!("expected error"), + async fn test_safekeeper_state_checksum_mismatch() -> anyhow::Result<()> { + let tempdir = camino_tempfile::tempdir()?; + let mut state = TimelinePersistentState::empty(); + let mut storage = FileStorage::create_new(tempdir.path(), state.clone(), NO_SYNC).await?; + + // Make a change. + state.commit_lsn = Lsn(42); + storage.persist(&state).await?; + + // Change the first byte to fail checksum validation. + let ctrl_path = tempdir.path().join(CONTROL_FILE_NAME); + let mut data = fs::read(&ctrl_path).await?; + data[0] += 1; + fs::write(&ctrl_path, &data).await?; + + // Loading the file should fail checksum validation. 
+ if let Err(err) = FileStorage::load_control_file_from_dir(tempdir.path()) { + assert!(err.to_string().contains("control file checksum mismatch")) + } else { + panic!("expected checksum error") } + Ok(()) } } diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 52b13dc5e3..1bf0cc668f 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -154,7 +154,7 @@ pub async fn handle_request(request: Request) -> Result<()> { new_state.peer_horizon_lsn = request.until_lsn; new_state.backup_lsn = new_backup_lsn; - FileStorage::create_new(tli_dir_path.clone(), conf, new_state.clone()).await?; + FileStorage::create_new(&tli_dir_path, new_state.clone(), conf.no_sync).await?; // now we have a ready timeline in a temp directory validate_temp_timeline(conf, request.destination_ttid, &tli_dir_path).await?; diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 277becb96b..b1cddaf062 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -113,6 +113,7 @@ impl SafeKeeperConf { impl SafeKeeperConf { #[cfg(test)] + #[allow(unused)] fn dummy() -> Self { SafeKeeperConf { workdir: Utf8PathBuf::from("./"), diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index dd4d161226..c737dfcf9b 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -328,15 +328,19 @@ impl SharedState { /// Restore SharedState from control file. If file doesn't exist, bails out. fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { let timeline_dir = get_timeline_dir(conf, ttid); - let control_store = control_file::FileStorage::restore_new(ttid, conf)?; + let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?; if control_store.server.wal_seg_size == 0 { bail!(TimelineError::UninitializedWalSegSize(*ttid)); } let sk = match control_store.eviction_state { EvictionState::Present => { - let wal_store = - wal_storage::PhysicalStorage::new(ttid, timeline_dir, conf, &control_store)?; + let wal_store = wal_storage::PhysicalStorage::new( + ttid, + &timeline_dir, + &control_store, + conf.no_sync, + )?; StateSK::Loaded(SafeKeeper::new( TimelineState::new(control_store), wal_store, @@ -1046,9 +1050,9 @@ impl ManagerTimeline { // trying to restore WAL storage let wal_store = wal_storage::PhysicalStorage::new( &self.ttid, - self.timeline_dir.clone(), - &conf, + &self.timeline_dir, shared.sk.state(), + conf.no_sync, )?; // updating control file diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 538bb6e5d2..33d94da034 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -244,7 +244,7 @@ impl GlobalTimelines { // immediately initialize first WAL segment as well. 
let state = TimelinePersistentState::new(&ttid, server_info, vec![], commit_lsn, local_start_lsn)?; - control_file::FileStorage::create_new(tmp_dir_path.clone(), &conf, state).await?; + control_file::FileStorage::create_new(&tmp_dir_path, state, conf.no_sync).await?; let timeline = GlobalTimelines::load_temp_timeline(ttid, &tmp_dir_path, true).await?; Ok(timeline) } @@ -596,7 +596,7 @@ pub async fn validate_temp_timeline( bail!("wal_seg_size is not set"); } - let wal_store = wal_storage::PhysicalStorage::new(&ttid, path.clone(), conf, &control_store)?; + let wal_store = wal_storage::PhysicalStorage::new(&ttid, path, &control_store, conf.no_sync)?; let commit_lsn = control_store.commit_lsn; let flush_lsn = wal_store.flush_lsn(); diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 61d7825ae6..33b8bfe28e 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -29,7 +29,6 @@ use crate::metrics::{ }; use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; -use crate::SafeKeeperConf; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; use postgres_ffi::XLOG_BLCKSZ; @@ -87,7 +86,9 @@ pub trait Storage { pub struct PhysicalStorage { metrics: WalStorageMetrics, timeline_dir: Utf8PathBuf, - conf: SafeKeeperConf, + + /// Disables fsync if true. + no_sync: bool, /// Size of WAL segment in bytes. wal_seg_size: usize, @@ -151,9 +152,9 @@ impl PhysicalStorage { /// the disk. Otherwise, all LSNs are set to zero. pub fn new( ttid: &TenantTimelineId, - timeline_dir: Utf8PathBuf, - conf: &SafeKeeperConf, + timeline_dir: &Utf8Path, state: &TimelinePersistentState, + no_sync: bool, ) -> Result { let wal_seg_size = state.server.wal_seg_size as usize; @@ -198,8 +199,8 @@ impl PhysicalStorage { Ok(PhysicalStorage { metrics: WalStorageMetrics::default(), - timeline_dir, - conf: conf.clone(), + timeline_dir: timeline_dir.to_path_buf(), + no_sync, wal_seg_size, pg_version: state.server.pg_version, system_id: state.server.system_id, @@ -224,7 +225,7 @@ impl PhysicalStorage { /// Call fdatasync if config requires so. async fn fdatasync_file(&mut self, file: &File) -> Result<()> { - if !self.conf.no_sync { + if !self.no_sync { self.metrics .observe_flush_seconds(time_io_closure(file.sync_data()).await?); } @@ -263,9 +264,7 @@ impl PhysicalStorage { // Note: this doesn't get into observe_flush_seconds metric. But // segment init should be separate metric, if any. - if let Err(e) = - durable_rename(&tmp_path, &wal_file_partial_path, !self.conf.no_sync).await - { + if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. remove_file(wal_file_partial_path) From 76328ada0597acd49d5150c6eec5804f22c47b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Fri, 25 Oct 2024 22:06:27 +0200 Subject: [PATCH 094/239] Fix unoffload_timeline races with creation (#9525) This PR does two things: 1. Obtain a `TimelineCreateGuard` object in `unoffload_timeline`. This prevents two unoffload tasks from racing with each other. While they already obtain locks for `timelines` and `offloaded_timelines`, they aren't sufficient, as we have already constructed an entire timeline at that point. We shouldn't ever have two `Timeline` objects in the same process at the same time. 2. don't allow timeline creations for timelines that have been offloaded. 
Obviously they already exist, so we should not allow creation. the previous logic only looked at the timelines list. Part of #8088 --- pageserver/src/tenant.rs | 61 +++++++++++++++++++++--- pageserver/src/tenant/timeline/uninit.rs | 40 ++++++++++------ 2 files changed, 79 insertions(+), 22 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d4f6384d9b..f846e145c5 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -294,11 +294,11 @@ pub struct Tenant { /// During timeline creation, we first insert the TimelineId to the /// creating map, then `timelines`, then remove it from the creating map. - /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_creating` + /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_creating: std::sync::Mutex>, /// Possibly offloaded and archived timelines - /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_offloaded` + /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, // This mutex prevents creation of new timelines during GC. @@ -584,13 +584,19 @@ impl OffloadedTimeline { } } +impl fmt::Debug for OffloadedTimeline { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "OffloadedTimeline<{}>", self.timeline_id) + } +} + #[derive(Copy, Clone, PartialEq, Eq, Hash, Debug)] pub enum MaybeOffloaded { Yes, No, } -#[derive(Clone)] +#[derive(Clone, Debug)] pub enum TimelineOrOffloaded { Timeline(Arc), Offloaded(Arc), @@ -1815,6 +1821,8 @@ impl Tenant { } /// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline + /// + /// Counterpart to [`offload_timeline`]. async fn unoffload_timeline( self: &Arc, timeline_id: TimelineId, @@ -1823,6 +1831,24 @@ impl Tenant { ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); let cancel = self.cancel.clone(); + + // Protect against concurrent attempts to use this TimelineId + // We don't care much about idempotency, as it's ensured a layer above. + let allow_offloaded = true; + let _create_guard = self + .create_timeline_create_guard( + timeline_id, + CreateTimelineIdempotency::FailWithConflict, + allow_offloaded, + ) + .map_err(|err| match err { + TimelineExclusionError::AlreadyCreating => TimelineArchivalError::AlreadyInProgress, + TimelineExclusionError::AlreadyExists { .. 
} => { + TimelineArchivalError::Other(anyhow::anyhow!("Timeline already exists")) + } + TimelineExclusionError::Other(e) => TimelineArchivalError::Other(e), + })?; + let timeline_preload = self .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel.clone()) .await; @@ -4129,7 +4155,8 @@ impl Tenant { new_timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, ) -> Result, CreateTimelineError> { - match self.create_timeline_create_guard(new_timeline_id, idempotency) { + let allow_offloaded = false; + match self.create_timeline_create_guard(new_timeline_id, idempotency, allow_offloaded) { Ok(create_guard) => { pausable_failpoint!("timeline-creation-after-uninit"); Ok(StartCreatingTimelineResult::CreateGuard(create_guard)) @@ -4141,10 +4168,21 @@ impl Tenant { Err(CreateTimelineError::AlreadyCreating) } Err(TimelineExclusionError::Other(e)) => Err(CreateTimelineError::Other(e)), - Err(TimelineExclusionError::AlreadyExists { existing, arg }) => { + Err(TimelineExclusionError::AlreadyExists { + existing: TimelineOrOffloaded::Offloaded(_existing), + .. + }) => { + info!("timeline already exists but is offloaded"); + Err(CreateTimelineError::Conflict) + } + Err(TimelineExclusionError::AlreadyExists { + existing: TimelineOrOffloaded::Timeline(existing), + arg, + }) => { { let existing = &existing.create_idempotency; let _span = info_span!("idempotency_check", ?existing, ?arg).entered(); + debug!("timeline already exists"); match (existing, &arg) { // FailWithConflict => no idempotency check @@ -4467,17 +4505,26 @@ impl Tenant { /// Get a guard that provides exclusive access to the timeline directory, preventing /// concurrent attempts to create the same timeline. + /// + /// The `allow_offloaded` parameter controls whether to tolerate the existence of + /// offloaded timelines or not. fn create_timeline_create_guard( &self, timeline_id: TimelineId, idempotency: CreateTimelineIdempotency, + allow_offloaded: bool, ) -> Result { let tenant_shard_id = self.tenant_shard_id; let timeline_path = self.conf.timeline_path(&tenant_shard_id, &timeline_id); - let create_guard = - TimelineCreateGuard::new(self, timeline_id, timeline_path.clone(), idempotency)?; + let create_guard = TimelineCreateGuard::new( + self, + timeline_id, + timeline_path.clone(), + idempotency, + allow_offloaded, + )?; // At this stage, we have got exclusive access to in-memory state for this timeline ID // for creation. diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index 7d66c5aec8..c398289a5c 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -8,7 +8,7 @@ use utils::{fs_ext, id::TimelineId, lsn::Lsn}; use crate::{ context::RequestContext, import_datadir, - tenant::{CreateTimelineIdempotency, Tenant}, + tenant::{CreateTimelineIdempotency, Tenant, TimelineOrOffloaded}, }; use super::Timeline; @@ -177,7 +177,7 @@ pub(crate) struct TimelineCreateGuard<'t> { pub(crate) enum TimelineExclusionError { #[error("Already exists")] AlreadyExists { - existing: Arc, + existing: TimelineOrOffloaded, arg: CreateTimelineIdempotency, }, #[error("Already creating")] @@ -194,31 +194,41 @@ impl<'t> TimelineCreateGuard<'t> { timeline_id: TimelineId, timeline_path: Utf8PathBuf, idempotency: CreateTimelineIdempotency, + allow_offloaded: bool, ) -> Result { // Lock order: this is the only place we take both locks. 
During drop() we only // lock creating_timelines let timelines = owning_tenant.timelines.lock().unwrap(); + let timelines_offloaded = owning_tenant.timelines_offloaded.lock().unwrap(); let mut creating_timelines: std::sync::MutexGuard< '_, std::collections::HashSet, > = owning_tenant.timelines_creating.lock().unwrap(); if let Some(existing) = timelines.get(&timeline_id) { - Err(TimelineExclusionError::AlreadyExists { - existing: existing.clone(), + return Err(TimelineExclusionError::AlreadyExists { + existing: TimelineOrOffloaded::Timeline(existing.clone()), arg: idempotency, - }) - } else if creating_timelines.contains(&timeline_id) { - Err(TimelineExclusionError::AlreadyCreating) - } else { - creating_timelines.insert(timeline_id); - Ok(Self { - owning_tenant, - timeline_id, - timeline_path, - idempotency, - }) + }); } + if !allow_offloaded { + if let Some(existing) = timelines_offloaded.get(&timeline_id) { + return Err(TimelineExclusionError::AlreadyExists { + existing: TimelineOrOffloaded::Offloaded(existing.clone()), + arg: idempotency, + }); + } + } + if creating_timelines.contains(&timeline_id) { + return Err(TimelineExclusionError::AlreadyCreating); + } + creating_timelines.insert(timeline_id); + Ok(Self { + owning_tenant, + timeline_id, + timeline_path, + idempotency, + }) } } From 85b954f44992eae7d3684cec2dff86c614214cbc Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Fri, 25 Oct 2024 16:30:57 -0400 Subject: [PATCH 095/239] pageserver: add tokio-epoll-uring slots waiters queue depth metrics (#9482) In complement to https://github.com/neondatabase/tokio-epoll-uring/pull/56. ## Problem We want to make tokio-epoll-uring slots waiters queue depth observable via Prometheus. ## Summary of changes - Add `pageserver_tokio_epoll_uring_slots_submission_queue_depth` metrics as a `Histogram`. - Each thread-local tokio-epoll-uring system is given a `LocalHistogram` to observe the metrics. - Keep a list of `Arc` used on-demand to flush data to the shared histogram. - Extend `Collector::collect` to report `pageserver_tokio_epoll_uring_slots_submission_queue_depth`. 
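For illustration, the pattern in miniature, using the `prometheus` crate directly (a sketch, not the pageserver code): each thread-local system observes into a `LocalHistogram`, and the collector flushes the buffered samples into the shared `Histogram` at scrape time.

```rust
use prometheus::{Histogram, HistogramOpts};

fn main() {
    // Shared histogram; in the pageserver this lives in the shared metrics storage.
    let shared = Histogram::with_opts(
        HistogramOpts::new("slots_submission_queue_depth", "example queue depth")
            .buckets(vec![1.0, 2.0, 4.0, 8.0]),
    )
    .unwrap();

    // Each thread-local system owns a LocalHistogram; observe() only touches
    // thread-local buffers, so the hot path does not contend on shared state.
    let local = shared.local();
    local.observe(3.0);
    local.observe(7.0);

    // The collector flushes the buffered samples into the shared histogram
    // when metrics are gathered.
    local.flush();
    assert_eq!(shared.get_sample_count(), 2);
}
```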
Signed-off-by: Yuchen Liang Co-authored-by: Christian Schwarz --- Cargo.lock | 4 +- libs/metrics/src/lib.rs | 1 + pageserver/src/metrics.rs | 115 +++++++++++++++++- .../io_engine/tokio_epoll_uring_ext.rs | 18 ++- test_runner/fixtures/metrics.py | 1 + 5 files changed, 129 insertions(+), 10 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ad29fa4634..7fa5df29fd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6272,7 +6272,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" dependencies = [ "futures", "nix 0.26.4", @@ -6788,7 +6788,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#08ccfa94ff5507727bf4d8d006666b5b192e04c6" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" dependencies = [ "bytes", "io-uring", diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index cd4526c089..64e56cb691 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -19,6 +19,7 @@ use once_cell::sync::Lazy; use prometheus::core::{ Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec, }; +pub use prometheus::local::LocalHistogram; pub use prometheus::opts; pub use prometheus::register; pub use prometheus::Error; diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 8f697558d6..1473729186 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -3040,13 +3040,111 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub mod tokio_epoll_uring { - use metrics::{register_int_counter, UIntGauge}; + use std::{ + collections::HashMap, + sync::{Arc, Mutex}, + }; + + use metrics::{register_histogram, register_int_counter, Histogram, LocalHistogram, UIntGauge}; use once_cell::sync::Lazy; + /// Shared storage for tokio-epoll-uring thread local metrics. + pub(crate) static THREAD_LOCAL_METRICS_STORAGE: Lazy = + Lazy::new(|| { + let slots_submission_queue_depth = register_histogram!( + "pageserver_tokio_epoll_uring_slots_submission_queue_depth", + "The slots waiters queue depth of each tokio_epoll_uring system", + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0, 512.0, 1024.0], + ) + .expect("failed to define a metric"); + ThreadLocalMetricsStorage { + observers: Mutex::new(HashMap::new()), + slots_submission_queue_depth, + } + }); + + pub struct ThreadLocalMetricsStorage { + /// List of thread local metrics observers. + observers: Mutex>>, + /// A histogram shared between all thread local systems + /// for collecting slots submission queue depth. + slots_submission_queue_depth: Histogram, + } + + /// Each thread-local [`tokio_epoll_uring::System`] gets one of these as its + /// [`tokio_epoll_uring::metrics::PerSystemMetrics`] generic. + /// + /// The System makes observations into [`Self`] and periodically, the collector + /// comes along and flushes [`Self`] into the shared storage [`THREAD_LOCAL_METRICS_STORAGE`]. + /// + /// [`LocalHistogram`] is `!Send`, so, we need to put it behind a [`Mutex`]. + /// But except for the periodic flush, the lock is uncontended so there's no waiting + /// for cache coherence protocol to get an exclusive cache line. 
+ pub struct ThreadLocalMetrics { + /// Local observer of thread local tokio-epoll-uring system's slots waiters queue depth. + slots_submission_queue_depth: Mutex, + } + + impl ThreadLocalMetricsStorage { + /// Registers a new thread local system. Returns a thread local metrics observer. + pub fn register_system(&self, id: u64) -> Arc { + let per_system_metrics = Arc::new(ThreadLocalMetrics::new( + self.slots_submission_queue_depth.local(), + )); + let mut g = self.observers.lock().unwrap(); + g.insert(id, Arc::clone(&per_system_metrics)); + per_system_metrics + } + + /// Removes metrics observer for a thread local system. + /// This should be called before dropping a thread local system. + pub fn remove_system(&self, id: u64) { + let mut g = self.observers.lock().unwrap(); + g.remove(&id); + } + + /// Flush all thread local metrics to the shared storage. + pub fn flush_thread_local_metrics(&self) { + let g = self.observers.lock().unwrap(); + g.values().for_each(|local| { + local.flush(); + }); + } + } + + impl ThreadLocalMetrics { + pub fn new(slots_submission_queue_depth: LocalHistogram) -> Self { + ThreadLocalMetrics { + slots_submission_queue_depth: Mutex::new(slots_submission_queue_depth), + } + } + + /// Flushes the thread local metrics to shared aggregator. + pub fn flush(&self) { + let Self { + slots_submission_queue_depth, + } = self; + slots_submission_queue_depth.lock().unwrap().flush(); + } + } + + impl tokio_epoll_uring::metrics::PerSystemMetrics for ThreadLocalMetrics { + fn observe_slots_submission_queue_depth(&self, queue_depth: u64) { + let Self { + slots_submission_queue_depth, + } = self; + slots_submission_queue_depth + .lock() + .unwrap() + .observe(queue_depth as f64); + } + } + pub struct Collector { descs: Vec, systems_created: UIntGauge, systems_destroyed: UIntGauge, + thread_local_metrics_storage: &'static ThreadLocalMetricsStorage, } impl metrics::core::Collector for Collector { @@ -3056,7 +3154,7 @@ pub mod tokio_epoll_uring { fn collect(&self) -> Vec { let mut mfs = Vec::with_capacity(Self::NMETRICS); - let tokio_epoll_uring::metrics::Metrics { + let tokio_epoll_uring::metrics::GlobalMetrics { systems_created, systems_destroyed, } = tokio_epoll_uring::metrics::global(); @@ -3064,12 +3162,21 @@ pub mod tokio_epoll_uring { mfs.extend(self.systems_created.collect()); self.systems_destroyed.set(systems_destroyed); mfs.extend(self.systems_destroyed.collect()); + + self.thread_local_metrics_storage + .flush_thread_local_metrics(); + + mfs.extend( + self.thread_local_metrics_storage + .slots_submission_queue_depth + .collect(), + ); mfs } } impl Collector { - const NMETRICS: usize = 2; + const NMETRICS: usize = 3; #[allow(clippy::new_without_default)] pub fn new() -> Self { @@ -3101,6 +3208,7 @@ pub mod tokio_epoll_uring { descs, systems_created, systems_destroyed, + thread_local_metrics_storage: &THREAD_LOCAL_METRICS_STORAGE, } } } @@ -3460,6 +3568,7 @@ pub fn preinitialize_metrics() { Lazy::force(&RECONSTRUCT_TIME); Lazy::force(&BASEBACKUP_QUERY_TIME); Lazy::force(&COMPUTE_COMMANDS_COUNTERS); + Lazy::force(&tokio_epoll_uring::THREAD_LOCAL_METRICS_STORAGE); tenant_throttling::preinitialize_global_metrics(); } diff --git a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs index 6ea19d6b2d..c67215492f 100644 --- a/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs +++ b/pageserver/src/virtual_file/io_engine/tokio_epoll_uring_ext.rs @@ -16,18 +16,24 @@ use 
tokio_epoll_uring::{System, SystemHandle}; use crate::virtual_file::on_fatal_io_error; -use crate::metrics::tokio_epoll_uring as metrics; +use crate::metrics::tokio_epoll_uring::{self as metrics, THREAD_LOCAL_METRICS_STORAGE}; #[derive(Clone)] struct ThreadLocalState(Arc); struct ThreadLocalStateInner { - cell: tokio::sync::OnceCell, + cell: tokio::sync::OnceCell>, launch_attempts: AtomicU32, /// populated through fetch_add from [`THREAD_LOCAL_STATE_ID`] thread_local_state_id: u64, } +impl Drop for ThreadLocalStateInner { + fn drop(&mut self) { + THREAD_LOCAL_METRICS_STORAGE.remove_system(self.thread_local_state_id); + } +} + impl ThreadLocalState { pub fn new() -> Self { Self(Arc::new(ThreadLocalStateInner { @@ -71,7 +77,8 @@ pub async fn thread_local_system() -> Handle { &fake_cancel, ) .await; - let res = System::launch() + let per_system_metrics = metrics::THREAD_LOCAL_METRICS_STORAGE.register_system(inner.thread_local_state_id); + let res = System::launch_with_metrics(per_system_metrics) // this might move us to another executor thread => loop outside the get_or_try_init, not inside it .await; match res { @@ -86,6 +93,7 @@ pub async fn thread_local_system() -> Handle { emit_launch_failure_process_stats(); }); metrics::THREAD_LOCAL_LAUNCH_FAILURES.inc(); + metrics::THREAD_LOCAL_METRICS_STORAGE.remove_system(inner.thread_local_state_id); Err(()) } // abort the process instead of panicking because pageserver usually becomes half-broken if we panic somewhere. @@ -115,7 +123,7 @@ fn emit_launch_failure_process_stats() { // number of threads // rss / system memory usage generally - let tokio_epoll_uring::metrics::Metrics { + let tokio_epoll_uring::metrics::GlobalMetrics { systems_created, systems_destroyed, } = tokio_epoll_uring::metrics::global(); @@ -182,7 +190,7 @@ fn emit_launch_failure_process_stats() { pub struct Handle(ThreadLocalState); impl std::ops::Deref for Handle { - type Target = SystemHandle; + type Target = SystemHandle; fn deref(&self) -> &Self::Target { self.0 diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index e056ea77d4..39c8f70a9c 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -150,6 +150,7 @@ PAGESERVER_GLOBAL_METRICS: tuple[str, ...] = ( counter("pageserver_tenant_throttling_count_accounted_finish_global"), counter("pageserver_tenant_throttling_wait_usecs_sum_global"), counter("pageserver_tenant_throttling_count_global"), + *histogram("pageserver_tokio_epoll_uring_slots_submission_queue_depth"), ) PAGESERVER_PER_TENANT_METRICS: tuple[str, ...] = ( From 80262e724fcb8081c441b440aa3e7dce0ab11d4d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 26 Oct 2024 08:24:15 +0100 Subject: [PATCH 096/239] build(deps): bump werkzeug from 3.0.3 to 3.0.6 (#9527) --- poetry.lock | 10 +++++----- pyproject.toml | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/poetry.lock b/poetry.lock index e307b873f3..7abd794235 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -3118,13 +3118,13 @@ files = [ [[package]] name = "werkzeug" -version = "3.0.3" +version = "3.0.6" description = "The comprehensive WSGI web application library." 
optional = false python-versions = ">=3.8" files = [ - {file = "werkzeug-3.0.3-py3-none-any.whl", hash = "sha256:fc9645dc43e03e4d630d23143a04a7f947a9a3b5727cd535fdfe155a17cc48c8"}, - {file = "werkzeug-3.0.3.tar.gz", hash = "sha256:097e5bfda9f0aba8da6b8545146def481d06aa7d3266e7448e2cccf67dd8bd18"}, + {file = "werkzeug-3.0.6-py3-none-any.whl", hash = "sha256:1bc0c2310d2fbb07b1dd1105eba2f7af72f322e1e455f2f93c993bee8c8a5f17"}, + {file = "werkzeug-3.0.6.tar.gz", hash = "sha256:a8dd59d4de28ca70471a34cba79bed5f7ef2e036a76b3ab0835474246eb41f8d"}, ] [package.dependencies] @@ -3406,4 +3406,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "f52632571e34b0e51b059c280c35d6ff6f69f6a8c9586caca78282baf635be91" +content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf" diff --git a/pyproject.toml b/pyproject.toml index 862ed49638..d4926cfb9a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,7 +23,7 @@ backoff = "^2.2.1" pytest-lazy-fixture = "^0.6.3" prometheus-client = "^0.14.1" pytest-timeout = "^2.1.0" -Werkzeug = "^3.0.3" +Werkzeug = "^3.0.6" pytest-order = "^1.1.0" allure-pytest = "^2.13.2" pytest-asyncio = "^0.21.0" From e7277885b3e95e41f3e2cab6c52d4e9e3981e27d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Sat, 26 Oct 2024 15:27:57 +0200 Subject: [PATCH 097/239] Don't consider archived timelines for synthetic size calculation (#9497) Archived timelines should not count towards synthetic size. Closes #9384. Part of #8088. --- pageserver/src/tenant/size.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs index 4a4c698b56..6c3276ea3c 100644 --- a/pageserver/src/tenant/size.rs +++ b/pageserver/src/tenant/size.rs @@ -187,6 +187,8 @@ pub(super) async fn gather_inputs( // but it is unlikely to cause any issues. In the worst case, // the calculation will error out. timelines.retain(|t| t.is_active()); + // Also filter out archived timelines. + timelines.retain(|t| t.is_archived() != Some(true)); // Build a map of branch points. let mut branchpoints: HashMap> = HashMap::new(); From 923974d4da4f6f0df754f598149c7679aab0dad2 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Oct 2024 08:47:12 +0000 Subject: [PATCH 098/239] safekeeper: don't un-evict timelines during snapshot API handler (#9428) ## Problem When we use pull_timeline API on an evicted timeline, it gets downloaded to serve the snapshot API request. That means that to evacuate all the timelines from a node, the node needs enough disk space to download partial segments from all timelines, which may not be physically the case. Closes: #8833 ## Summary of changes - Add a "try" variant of acquiring a residence guard, that returns None if the timeline is offloaded - During snapshot API handler, take a different code path if the timeline isn't resident, where we just read the checkpoint and don't try to read any segments. 
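Sketched below with hypothetical, simplified types (not the actual safekeeper structs) is the shape of the two paths described above: the snapshot handler asks for residence without forcing it, so an offloaded timeline is never downloaded just to serve a snapshot.

```rust
// Hypothetical, simplified types for illustration only.
struct ResidenceGuard;

struct Timeline {
    resident: bool,
}

impl Timeline {
    // Blocking variant: triggers un-eviction (segment download) if needed.
    fn wal_residence_guard(&self) -> ResidenceGuard {
        // ...download the partial segment from remote storage if evicted...
        ResidenceGuard
    }

    // Non-blocking variant used by the snapshot handler: never un-evicts.
    fn try_wal_residence_guard(&self) -> Option<ResidenceGuard> {
        self.resident.then_some(ResidenceGuard)
    }
}

fn stream_snapshot(tli: &Timeline) {
    match tli.try_wal_residence_guard() {
        // Resident: stream the control file plus the on-disk WAL segments.
        Some(_guard) => println!("resident snapshot"),
        // Offloaded: stream only a patched control file; nothing is downloaded.
        None => println!("offloaded snapshot"),
    }
}

fn main() {
    stream_snapshot(&Timeline { resident: false });

    let resident = Timeline { resident: true };
    let _guard = resident.wal_residence_guard(); // other callers still force residence
    stream_snapshot(&resident);
}
```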
--- safekeeper/src/http/routes.rs | 8 -- safekeeper/src/pull_timeline.rs | 140 ++++++++++++++++++++--- safekeeper/src/timeline.rs | 59 +++++++--- safekeeper/src/timeline_eviction.rs | 3 + safekeeper/src/timeline_manager.rs | 27 +++++ test_runner/regress/test_wal_acceptor.py | 103 +++++++++++++++++ 6 files changed, 298 insertions(+), 42 deletions(-) diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index b4590fe3e5..df68f8a68e 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -262,14 +262,6 @@ async fn timeline_snapshot_handler(request: Request) -> Result, // so create the chan and write to it in another task. diff --git a/safekeeper/src/pull_timeline.rs b/safekeeper/src/pull_timeline.rs index c7f5165f90..c700e18cc7 100644 --- a/safekeeper/src/pull_timeline.rs +++ b/safekeeper/src/pull_timeline.rs @@ -8,6 +8,7 @@ use serde::{Deserialize, Serialize}; use std::{ cmp::min, io::{self, ErrorKind}, + sync::Arc, }; use tokio::{fs::OpenOptions, io::AsyncWrite, sync::mpsc, task}; use tokio_tar::{Archive, Builder, Header}; @@ -25,8 +26,8 @@ use crate::{ routes::TimelineStatus, }, safekeeper::Term, - state::TimelinePersistentState, - timeline::WalResidentTimeline, + state::{EvictionState, TimelinePersistentState}, + timeline::{Timeline, WalResidentTimeline}, timelines_global_map::{create_temp_timeline_dir, validate_temp_timeline}, wal_backup, wal_storage::open_wal_file, @@ -43,18 +44,33 @@ use utils::{ /// Stream tar archive of timeline to tx. #[instrument(name = "snapshot", skip_all, fields(ttid = %tli.ttid))] pub async fn stream_snapshot( - tli: WalResidentTimeline, + tli: Arc, source: NodeId, destination: NodeId, tx: mpsc::Sender>, ) { - if let Err(e) = stream_snapshot_guts(tli, source, destination, tx.clone()).await { - // Error type/contents don't matter as they won't can't reach the client - // (hyper likely doesn't do anything with it), but http stream will be - // prematurely terminated. It would be nice to try to send the error in - // trailers though. - tx.send(Err(anyhow!("snapshot failed"))).await.ok(); - error!("snapshot failed: {:#}", e); + match tli.try_wal_residence_guard().await { + Err(e) => { + tx.send(Err(anyhow!("Error checking residence: {:#}", e))) + .await + .ok(); + } + Ok(maybe_resident_tli) => { + if let Err(e) = match maybe_resident_tli { + Some(resident_tli) => { + stream_snapshot_resident_guts(resident_tli, source, destination, tx.clone()) + .await + } + None => stream_snapshot_offloaded_guts(tli, source, destination, tx.clone()).await, + } { + // Error type/contents don't matter as they won't can't reach the client + // (hyper likely doesn't do anything with it), but http stream will be + // prematurely terminated. It would be nice to try to send the error in + // trailers though. + tx.send(Err(anyhow!("snapshot failed"))).await.ok(); + error!("snapshot failed: {:#}", e); + } + } } } @@ -80,12 +96,10 @@ impl Drop for SnapshotContext { } } -pub async fn stream_snapshot_guts( - tli: WalResidentTimeline, - source: NodeId, - destination: NodeId, +/// Build a tokio_tar stream that sends encoded bytes into a Bytes channel. +fn prepare_tar_stream( tx: mpsc::Sender>, -) -> Result<()> { +) -> tokio_tar::Builder { // tokio-tar wants Write implementor, but we have mpsc tx >; // use SinkWriter as a Write impl. That is, // - create Sink from the tx. It returns PollSendError if chan is closed. 
@@ -100,12 +114,38 @@ pub async fn stream_snapshot_guts( // - SinkWriter (not surprisingly) wants sink of &[u8], not bytes, so wrap // into CopyToBytes. This is a data copy. let copy_to_bytes = CopyToBytes::new(oksink); - let mut writer = SinkWriter::new(copy_to_bytes); - let pinned_writer = std::pin::pin!(writer); + let writer = SinkWriter::new(copy_to_bytes); + let pinned_writer = Box::pin(writer); // Note that tokio_tar append_* funcs use tokio::io::copy with 8KB buffer // which is also likely suboptimal. - let mut ar = Builder::new_non_terminated(pinned_writer); + Builder::new_non_terminated(pinned_writer) +} + +/// Implementation of snapshot for an offloaded timeline, only reads control file +pub(crate) async fn stream_snapshot_offloaded_guts( + tli: Arc, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) -> Result<()> { + let mut ar = prepare_tar_stream(tx); + + tli.snapshot_offloaded(&mut ar, source, destination).await?; + + ar.finish().await?; + + Ok(()) +} + +/// Implementation of snapshot for a timeline which is resident (includes some segment data) +pub async fn stream_snapshot_resident_guts( + tli: WalResidentTimeline, + source: NodeId, + destination: NodeId, + tx: mpsc::Sender>, +) -> Result<()> { + let mut ar = prepare_tar_stream(tx); let bctx = tli.start_snapshot(&mut ar, source, destination).await?; pausable_failpoint!("sk-snapshot-after-list-pausable"); @@ -138,6 +178,70 @@ pub async fn stream_snapshot_guts( Ok(()) } +impl Timeline { + /// Simple snapshot for an offloaded timeline: we will only upload a renamed partial segment and + /// pass a modified control file into the provided tar stream (nothing with data segments on disk, since + /// we are offloaded and there aren't any) + async fn snapshot_offloaded( + self: &Arc, + ar: &mut tokio_tar::Builder, + source: NodeId, + destination: NodeId, + ) -> Result<()> { + // Take initial copy of control file, then release state lock + let mut control_file = { + let shared_state = self.write_shared_state().await; + + let control_file = TimelinePersistentState::clone(shared_state.sk.state()); + + // Rare race: we got unevicted between entering function and reading control file. + // We error out and let API caller retry. + if !matches!(control_file.eviction_state, EvictionState::Offloaded(_)) { + bail!("Timeline was un-evicted during snapshot, please retry"); + } + + control_file + }; + + // Modify the partial segment of the in-memory copy for the control file to + // point to the destination safekeeper. + let replace = control_file + .partial_backup + .replace_uploaded_segment(source, destination)?; + + let Some(replace) = replace else { + // In Manager:: ready_for_eviction, we do not permit eviction unless the timeline + // has a partial segment. It is unexpected that + anyhow::bail!("Timeline has no partial segment, cannot generate snapshot"); + }; + + tracing::info!("Replacing uploaded partial segment in in-mem control file: {replace:?}"); + + // Optimistically try to copy the partial segment to the destination's path: this + // can fail if the timeline was un-evicted and modified in the background. 
+ let remote_timeline_path = &self.remote_path; + wal_backup::copy_partial_segment( + &replace.previous.remote_path(remote_timeline_path), + &replace.current.remote_path(remote_timeline_path), + ) + .await?; + + // Since the S3 copy succeeded with the path given in our control file snapshot, and + // we are sending that snapshot in our response, we are giving the caller a consistent + // snapshot even if our local Timeline was unevicted or otherwise modified in the meantime. + let buf = control_file + .write_to_buf() + .with_context(|| "failed to serialize control store")?; + let mut header = Header::new_gnu(); + header.set_size(buf.len().try_into().expect("never breaches u64")); + ar.append_data(&mut header, CONTROL_FILE_NAME, buf.as_slice()) + .await + .with_context(|| "failed to append to archive")?; + + Ok(()) + } +} + impl WalResidentTimeline { /// Start streaming tar archive with timeline: /// 1) stream control file under lock; diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index c737dfcf9b..f0113978c4 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -797,14 +797,17 @@ impl Timeline { state.sk.term_bump(to).await } - /// Get the timeline guard for reading/writing WAL files. - /// If WAL files are not present on disk (evicted), they will be automatically - /// downloaded from remote storage. This is done in the manager task, which is - /// responsible for issuing all guards. - /// - /// NB: don't use this function from timeline_manager, it will deadlock. - /// NB: don't use this function while holding shared_state lock. - pub async fn wal_residence_guard(self: &Arc) -> Result { + /// Guts of [`Self::wal_residence_guard`] and [`Self::try_wal_residence_guard`] + async fn do_wal_residence_guard( + self: &Arc, + block: bool, + ) -> Result> { + let op_label = if block { + "wal_residence_guard" + } else { + "try_wal_residence_guard" + }; + if self.is_cancelled() { bail!(TimelineError::Cancelled(self.ttid)); } @@ -816,10 +819,13 @@ impl Timeline { // Wait 30 seconds for the guard to be acquired. It can time out if someone is // holding the lock (e.g. during `SafeKeeper::process_msg()`) or manager task // is stuck. - let res = tokio::time::timeout_at( - started_at + Duration::from_secs(30), - self.manager_ctl.wal_residence_guard(), - ) + let res = tokio::time::timeout_at(started_at + Duration::from_secs(30), async { + if block { + self.manager_ctl.wal_residence_guard().await.map(Some) + } else { + self.manager_ctl.try_wal_residence_guard().await + } + }) .await; let guard = match res { @@ -827,14 +833,14 @@ impl Timeline { let finished_at = Instant::now(); let elapsed = finished_at - started_at; MISC_OPERATION_SECONDS - .with_label_values(&["wal_residence_guard"]) + .with_label_values(&[op_label]) .observe(elapsed.as_secs_f64()); guard } Ok(Err(e)) => { warn!( - "error while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + "error acquiring in {op_label}, statuses {:?} => {:?}", status_before, self.mgr_status.get() ); @@ -842,7 +848,7 @@ impl Timeline { } Err(_) => { warn!( - "timeout while acquiring WalResidentTimeline guard, statuses {:?} => {:?}", + "timeout acquiring in {op_label} guard, statuses {:?} => {:?}", status_before, self.mgr_status.get() ); @@ -850,7 +856,28 @@ impl Timeline { } }; - Ok(WalResidentTimeline::new(self.clone(), guard)) + Ok(guard.map(|g| WalResidentTimeline::new(self.clone(), g))) + } + + /// Get the timeline guard for reading/writing WAL files. 
+ /// If WAL files are not present on disk (evicted), they will be automatically + /// downloaded from remote storage. This is done in the manager task, which is + /// responsible for issuing all guards. + /// + /// NB: don't use this function from timeline_manager, it will deadlock. + /// NB: don't use this function while holding shared_state lock. + pub async fn wal_residence_guard(self: &Arc) -> Result { + self.do_wal_residence_guard(true) + .await + .map(|m| m.expect("Always get Some in block=true mode")) + } + + /// Get the timeline guard for reading/writing WAL files if the timeline is resident, + /// else return None + pub(crate) async fn try_wal_residence_guard( + self: &Arc, + ) -> Result> { + self.do_wal_residence_guard(false).await } pub async fn backup_partial_reset(self: &Arc) -> Result> { diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index f5363ae9b0..303421c837 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -56,6 +56,9 @@ impl Manager { // This also works for the first segment despite last_removed_segno // being 0 on init because this 0 triggers run of wal_removal_task // on success of which manager updates the horizon. + // + // **Note** pull_timeline functionality assumes that evicted timelines always have + // a partial segment: if we ever change this condition, must also update that code. && self .partial_backup_uploaded .as_ref() diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f0583dd3ff..79200fff8d 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -100,6 +100,8 @@ const REFRESH_INTERVAL: Duration = Duration::from_millis(300); pub enum ManagerCtlMessage { /// Request to get a guard for WalResidentTimeline, with WAL files available locally. GuardRequest(tokio::sync::oneshot::Sender>), + /// Get a guard for WalResidentTimeline if the timeline is not currently offloaded, else None + TryGuardRequest(tokio::sync::oneshot::Sender>), /// Request to drop the guard. GuardDrop(GuardId), /// Request to reset uploaded partial backup state. @@ -110,6 +112,7 @@ impl std::fmt::Debug for ManagerCtlMessage { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { ManagerCtlMessage::GuardRequest(_) => write!(f, "GuardRequest"), + ManagerCtlMessage::TryGuardRequest(_) => write!(f, "TryGuardRequest"), ManagerCtlMessage::GuardDrop(id) => write!(f, "GuardDrop({:?})", id), ManagerCtlMessage::BackupPartialReset(_) => write!(f, "BackupPartialReset"), } @@ -152,6 +155,19 @@ impl ManagerCtl { .and_then(std::convert::identity) } + /// Issue a new guard if the timeline is currently not offloaded, else return None + /// Sends a message to the manager and waits for the response. + /// Can be blocked indefinitely if the manager is stuck. + pub async fn try_wal_residence_guard(&self) -> anyhow::Result> { + let (tx, rx) = tokio::sync::oneshot::channel(); + self.manager_tx + .send(ManagerCtlMessage::TryGuardRequest(tx))?; + + // wait for the manager to respond with the guard + rx.await + .map_err(|e| anyhow::anyhow!("response read fail: {:?}", e)) + } + /// Request timeline manager to reset uploaded partial segment state and /// wait for the result. 
pub async fn backup_partial_reset(&self) -> anyhow::Result> { @@ -674,6 +690,17 @@ impl Manager { warn!("failed to reply with a guard, receiver dropped"); } } + Some(ManagerCtlMessage::TryGuardRequest(tx)) => { + let result = if self.is_offloaded { + None + } else { + Some(self.access_service.create_guard()) + }; + + if tx.send(result).is_err() { + warn!("failed to reply with a guard, receiver dropped"); + } + } Some(ManagerCtlMessage::GuardDrop(guard_id)) => { self.access_service.drop_guard(guard_id); } diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d803cd7c78..157390c01c 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1998,6 +1998,109 @@ def test_pull_timeline_term_change(neon_env_builder: NeonEnvBuilder): pt_handle.join() +def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): + """ + Verify that when pull_timeline is used on an evicted timeline, it does not result in + promoting any segments to local disk on the source, and the timeline is correctly instantiated + in evicted state on the destination. This behavior is important to avoid ballooning disk + usage when doing mass migration of timelines. + """ + neon_env_builder.num_safekeepers = 4 + neon_env_builder.enable_safekeeper_remote_storage(default_remote_storage()) + + # Configure safekeepers with ultra-fast eviction policy + neon_env_builder.safekeeper_extra_opts = [ + "--enable-offload", + "--partial-backup-timeout", + "50ms", + "--control-file-save-interval", + "1s", + # Safekeepers usually wait a while before evicting something: for this test we want them to + # evict things as soon as they are inactive. + "--eviction-min-resident=100ms", + "--delete-offloaded-wal", + ] + + initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"} + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + (src_sk, dst_sk) = (env.safekeepers[0], env.safekeepers[-1]) + log.info(f"Will pull_timeline on destination {dst_sk.id} from source {src_sk.id}") + + ep = env.endpoints.create("main") + ep.active_safekeepers = [s.id for s in env.safekeepers if s.id != dst_sk.id] + log.info(f"Compute writing initially to safekeepers: {ep.active_safekeepers}") + ep.active_safekeepers = [1, 2, 3] # Exclude dst_sk from set written by compute initially + ep.start() + ep.safe_psql("CREATE TABLE t(i int)") + ep.safe_psql("INSERT INTO t VALUES (0)") + ep.stop() + + wait_lsn_force_checkpoint_at_sk(src_sk, tenant_id, timeline_id, env.pageserver) + + src_http = src_sk.http_client() + dst_http = dst_sk.http_client() + + def evicted_on_source(): + # Wait for timeline to go into evicted state + assert src_http.get_eviction_state(timeline_id) != "Present" + assert ( + src_http.get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "evict"} + ) + or 0 > 0 + ) + assert src_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0 + # Check that on source no segment files are present + assert src_sk.list_segments(tenant_id, timeline_id) == [] + + wait_until(60, 1, evicted_on_source) + + # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, + # destination should import the control file only & go into evicted mode immediately + dst_sk.pull_timeline([src_sk], tenant_id, timeline_id) + + # Check that on source and destination no segment files are present + assert 
src_sk.list_segments(tenant_id, timeline_id) == [] + assert dst_sk.list_segments(tenant_id, timeline_id) == [] + + # Check that the timeline on the destination is in the expected evicted state. + evicted_on_source() # It should still be evicted on the source + + def evicted_on_destination(): + assert dst_http.get_eviction_state(timeline_id) != "Present" + assert dst_http.get_metric_value("safekeeper_evicted_timelines") or 0 > 0 + + # This should be fast, it is a wait_until because eviction state is updated + # in the background wrt pull_timeline. + wait_until(10, 0.1, evicted_on_destination) + + # Delete the timeline on the source, to prove that deletion works on an + # evicted timeline _and_ that the final compute test is really not using + # the original location + src_sk.http_client().timeline_delete(tenant_id, timeline_id, only_local=True) + + # Check that using the timeline correctly un-evicts it on the new location + ep.active_safekeepers = [2, 3, 4] + ep.start() + ep.safe_psql("INSERT INTO t VALUES (0)") + ep.stop() + + def unevicted_on_dest(): + assert ( + dst_http.get_metric_value( + "safekeeper_eviction_events_completed_total", {"kind": "restore"} + ) + or 0 > 0 + ) + n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted == 0 + + wait_until(10, 1, unevicted_on_dest) + + # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries # when compute is active, but there are no writes to the timeline. In that case # pageserver should maintain a single connection to safekeeper and don't attempt From 33baca07b69bf674113d53a4c7f6e53b8e7a3396 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Oct 2024 09:26:01 +0000 Subject: [PATCH 099/239] storcon: add an API to cancel ongoing reconciler (#9520) ## Problem If something goes wrong with a live migration, we currently only have awkward ways to interrupt that: - Restart the storage controller - Ask it to do some other modification/migration on the shard, which we don't really want. ## Summary of changes - Add a new `/cancel` control API, and storcon_cli wrapper for it, which fires the Reconciler's cancellation token. This is just for on-call use and we do not expect it to be used by any other services. --- control_plane/storcon_cli/src/main.rs | 14 +++++++ storage_controller/src/http.rs | 32 ++++++++++++++++ storage_controller/src/service.rs | 37 +++++++++++++++++++ storage_controller/src/tenant_shard.rs | 6 +++ .../regress/test_storage_controller.py | 15 +++++++- 5 files changed, 103 insertions(+), 1 deletion(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index 73d89699ed..b7f38c6286 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -111,6 +111,11 @@ enum Command { #[arg(long)] node: NodeId, }, + /// Cancel any ongoing reconciliation for this shard + TenantShardCancelReconcile { + #[arg(long)] + tenant_shard_id: TenantShardId, + }, /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure /// that is passed through to pageservers, and does not affect storage controller behavior. 
TenantConfig { @@ -535,6 +540,15 @@ async fn main() -> anyhow::Result<()> { ) .await?; } + Command::TenantShardCancelReconcile { tenant_shard_id } => { + storcon_client + .dispatch::<(), ()>( + Method::PUT, + format!("control/v1/tenant/{tenant_shard_id}/cancel_reconcile"), + None, + ) + .await?; + } Command::TenantConfig { tenant_id, config } => { let tenant_conf = serde_json::from_str(&config)?; diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index afefe8598c..face3d2c2d 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -968,6 +968,28 @@ async fn handle_tenant_shard_migrate( ) } +async fn handle_tenant_shard_cancel_reconcile( + service: Arc, + req: Request, +) -> Result, ApiError> { + check_permissions(&req, Scope::Admin)?; + + let req = match maybe_forward(req).await { + ForwardOutcome::Forwarded(res) => { + return res; + } + ForwardOutcome::NotForwarded(req) => req, + }; + + let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?; + json_response( + StatusCode::OK, + service + .tenant_shard_cancel_reconcile(tenant_shard_id) + .await?, + ) +} + async fn handle_tenant_update_policy(req: Request) -> Result, ApiError> { check_permissions(&req, Scope::Admin)?; @@ -1776,6 +1798,16 @@ pub fn make_router( RequestName("control_v1_tenant_migrate"), ) }) + .put( + "/control/v1/tenant/:tenant_shard_id/cancel_reconcile", + |r| { + tenant_service_handler( + r, + handle_tenant_shard_cancel_reconcile, + RequestName("control_v1_tenant_cancel_reconcile"), + ) + }, + ) .put("/control/v1/tenant/:tenant_id/shard_split", |r| { tenant_service_handler( r, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index a2a6e63dd2..32029c1232 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4834,6 +4834,43 @@ impl Service { Ok(TenantShardMigrateResponse {}) } + /// 'cancel' in this context means cancel any ongoing reconcile + pub(crate) async fn tenant_shard_cancel_reconcile( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), ApiError> { + // Take state lock and fire the cancellation token, after which we drop lock and wait for any ongoing reconcile to complete + let waiter = { + let locked = self.inner.write().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return Err(ApiError::NotFound( + anyhow::anyhow!("Tenant shard not found").into(), + )); + }; + + let waiter = shard.get_waiter(); + match waiter { + None => { + tracing::info!("Shard does not have an ongoing Reconciler"); + return Ok(()); + } + Some(waiter) => { + tracing::info!("Cancelling Reconciler"); + shard.cancel_reconciler(); + waiter + } + } + }; + + // Cancellation should be prompt. If this fails we have still done our job of firing the + // cancellation token, but by returning an ApiError we will indicate to the caller that + // the Reconciler is misbehaving and not respecting the cancellation token + self.await_waiters(vec![waiter], SHORT_RECONCILE_TIMEOUT) + .await?; + + Ok(()) + } + /// This is for debug/support only: we simply drop all state for a tenant, without /// detaching or deleting it on pageservers. 
pub(crate) async fn tenant_drop(&self, tenant_id: TenantId) -> Result<(), ApiError> { diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index e696c72ba7..27c97d3b86 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -1317,6 +1317,12 @@ impl TenantShard { }) } + pub(crate) fn cancel_reconciler(&self) { + if let Some(handle) = self.reconciler.as_ref() { + handle.cancel.cancel() + } + } + /// Get a waiter for any reconciliation in flight, but do not start reconciliation /// if it is not already running pub(crate) fn get_waiter(&self) -> Option { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index d4bc4b1a4f..40fee7661a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -872,6 +872,14 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): assert sum(v["shard_count"] for v in response.json()["nodes"].values()) == 3 assert all(v["may_schedule"] for v in response.json()["nodes"].values()) + # Reconciler cancel API should be a no-op when nothing is in flight + env.storage_controller.request( + "PUT", + f"{env.storage_controller_api}/control/v1/tenant/{tenant_id}-0102/cancel_reconcile", + headers=env.storage_controller.headers(TokenScope.ADMIN), + ) + + # Node unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/node/{env.pageservers[1].id}/drop", @@ -879,6 +887,7 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): ) assert len(env.storage_controller.node_list()) == 1 + # Tenant unclean drop API response = env.storage_controller.request( "POST", f"{env.storage_controller_api}/debug/v1/tenant/{tenant_id}/drop", @@ -892,7 +901,6 @@ def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): headers=env.storage_controller.headers(TokenScope.ADMIN), ) assert len(response.json()) == 1 - # Check that the 'drop' APIs didn't leave things in a state that would fail a consistency check: they're # meant to be unclean wrt the pageserver state, but not leave a broken storage controller behind. env.storage_controller.consistency_check() @@ -1660,6 +1668,11 @@ def test_storcon_cli(neon_env_builder: NeonEnvBuilder): storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) assert "Stop" in storcon_cli(["tenants"])[3] + # Cancel ongoing reconcile on a tenant + storcon_cli( + ["tenant-shard-cancel-reconcile", "--tenant-shard-id", f"{env.initial_tenant}-0104"] + ) + # Change a tenant's placement storcon_cli( ["tenant-policy", "--tenant-id", str(env.initial_tenant), "--placement", "secondary"] From 93987b5a4a1defe6d6e99a8e63c3652b26eace1f Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 28 Oct 2024 11:11:12 +0000 Subject: [PATCH 100/239] tests: add test_storage_controller_onboard_detached (#9431) ## Problem We haven't historically taken this API route where we would onboard a tenant to the controller in detached state. It worked, but we didn't have test coverage. ## Summary of changes - Add a test that onboards a tenant to the storage controller in Detached mode, and checks that deleting it without attaching it works as expected. 
--- .../regress/test_storage_controller.py | 98 +++++++++++++++++-- 1 file changed, 91 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 40fee7661a..c8de292588 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -18,6 +18,7 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + NeonPageserver, PageserverAvailability, PageserverSchedulingPolicy, PgBin, @@ -298,17 +299,20 @@ def test_storage_controller_restart(neon_env_builder: NeonEnvBuilder): env.storage_controller.consistency_check() -@pytest.mark.parametrize("warm_up", [True, False]) -def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): +def prepare_onboarding_env( + neon_env_builder: NeonEnvBuilder, +) -> tuple[NeonEnv, NeonPageserver, TenantId, int]: """ - We onboard tenants to the sharding service by treating it as a 'virtual pageserver' - which provides the /location_config API. This is similar to creating a tenant, - but imports the generation number. + For tests that do onboarding of a tenant to the storage controller, a small dance to + set up one pageserver that won't be managed by the storage controller and create + a tenant there. """ - # One pageserver to simulate legacy environment, two to be managed by storage controller neon_env_builder.num_pageservers = 3 + # Enable tests to use methods that require real S3 API + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # Start services by hand so that we can skip registration on one of the pageservers env = neon_env_builder.init_configs() env.broker.start() @@ -329,7 +333,6 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up # will be attached after onboarding env.pageservers[1].start() env.pageservers[2].start() - virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) for sk in env.safekeepers: sk.start() @@ -339,6 +342,23 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up generation = 123 origin_ps.tenant_create(tenant_id, generation=generation) + origin_ps.http_client().timeline_create(PgVersion.NOT_SET, tenant_id, TimelineId.generate()) + + return (env, origin_ps, tenant_id, generation) + + +@pytest.mark.parametrize("warm_up", [True, False]) +def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up: bool): + """ + We onboard tenants to the sharding service by treating it as a 'virtual pageserver' + which provides the /location_config API. This is similar to creating a tenant, + but imports the generation number. + """ + + env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) + + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + # As if doing a live migration, first configure origin into stale mode r = origin_ps.http_client().tenant_location_conf( tenant_id, @@ -475,6 +495,70 @@ def test_storage_controller_onboarding(neon_env_builder: NeonEnvBuilder, warm_up env.storage_controller.consistency_check() +@run_only_on_default_postgres("this test doesn't start an endpoint") +def test_storage_controller_onboard_detached(neon_env_builder: NeonEnvBuilder): + """ + Sometimes, the control plane wants to delete a tenant that wasn't attached to any pageserver, + and also wasn't ever registered with the storage controller. 
+ + It may do this by calling /location_conf in mode Detached and then calling the delete API + as normal. + """ + + env, origin_ps, tenant_id, generation = prepare_onboarding_env(neon_env_builder) + + remote_prefix = "/".join( + ( + "tenants", + str(tenant_id), + ) + ) + + # Detach it from its original pageserver. + origin_ps.http_client().tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + # Since we will later assert that remote data is gone, as a control also check it was ever there + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + # Register with storage controller in Detached state + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + generation += 1 + r = virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": generation, + }, + ) + assert len(r["shards"]) == 0 # location_conf tells us there are no attached shards + + # Onboarding in Detached state shouldn't have attached it to any pageserver + for ps in env.pageservers: + assert ps.http_client().tenant_list() == [] + + # Delete it via the storage controller + virtual_ps_http.tenant_delete(tenant_id) + + # Check that we really deleted it + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=remote_prefix, + ) + + def test_storage_controller_compute_hook( httpserver: HTTPServer, neon_env_builder: NeonEnvBuilder, From 01b6843e1214496343d3401081e4bede17d8a025 Mon Sep 17 00:00:00 2001 From: Arthur Petukhovsky Date: Mon, 28 Oct 2024 12:09:47 +0000 Subject: [PATCH 101/239] Route pgbouncer logs to virtio-serial (#9488) virtio-serial is much more performant than /dev/console emulation, therefore, is much more suitable for the verbose logs inside vm. This commit changes routing for pgbouncer logs, since we've recently noticed it can emit large volumes of logs. Manually tested on staging by pinning a compute image to my test project. 
Should help with https://github.com/neondatabase/cloud/issues/19072 --- compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 51a55b513f..79f894c289 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -18,7 +18,7 @@ commands: - name: pgbouncer user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index 43e57a4ed5..ff04b9e4c6 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -18,7 +18,7 @@ commands: - name: pgbouncer user: postgres sysvInitAction: respawn - shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini 2>&1 > /dev/virtio-ports/tech.neon.log.0' - name: local_proxy user: postgres sysvInitAction: respawn From 8dd555d3964ae28cabec65a7e59a7047b47bac25 Mon Sep 17 00:00:00 2001 From: Rahul Patil Date: Mon, 28 Oct 2024 13:17:09 +0100 Subject: [PATCH 102/239] ci(proxy): Update GH action flag on proxy deployment (#9535) ## Problem Based on a recent proxy deployment issue, we deployed another proxy version (proxy-scram), which was not needed when deploying a specific proxy type. we have [PR](https://github.com/neondatabase/infra/pull/2142) to update on the infra branch and need to update CI in this repo which triggers proxy deployment. ## Summary of changes - Update proxy deployment flag ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .github/workflows/build_and_test.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 0d3ea7db28..82a24b29d3 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1116,7 +1116,11 @@ jobs: gh workflow --repo neondatabase/infra run deploy-proxy-prod.yml --ref main \ -f deployPgSniRouter=true \ - -f deployProxy=true \ + -f deployProxyLink=true \ + -f deployPrivatelinkProxy=true \ + -f deployLegacyProxyScram=true \ + -f deployProxyScram=true \ + -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} else From 25f1e5cfebe865e0c7126f0b0e0ca9e00be0731b Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 28 Oct 2024 15:02:20 +0000 Subject: [PATCH 103/239] [proxy] demote warnings and remove dead-argument (#9512) fixes https://github.com/neondatabase/cloud/issues/19000 --- proxy/src/auth/backend/hacks.rs | 6 +++--- proxy/src/bin/proxy.rs | 6 ------ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 8ab8d5d37f..28bdacd769 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -1,5 +1,5 @@ use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::{info, warn}; +use tracing::{debug, info}; use super::{ComputeCredentials, ComputeUserInfo, ComputeUserInfoNoEndpoint}; use crate::auth::{self, AuthFlow}; @@ -21,7 +21,7 @@ pub(crate) async fn authenticate_cleartext( secret: AuthSecret, config: &'static AuthenticationConfig, ) -> auth::Result { - warn!("cleartext auth flow override is enabled, proceeding"); + debug!("cleartext auth flow override is enabled, proceeding"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client @@ -61,7 +61,7 @@ pub(crate) async fn password_hack_no_authentication( info: ComputeUserInfoNoEndpoint, client: &mut stream::PqStream>, ) -> auth::Result<(ComputeUserInfo, Vec)> { - warn!("project not specified, resorting to the password hack auth flow"); + debug!("project not specified, resorting to the password hack auth flow"); ctx.set_auth_method(crate::context::AuthMethod::Cleartext); // pause the timer while we communicate with the client diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 6e190029aa..82c259efc8 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -137,9 +137,6 @@ struct ProxyCliArgs { /// size of the threadpool for password hashing #[clap(long, default_value_t = 4)] scram_thread_pool_size: u8, - /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour. - #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] - disable_dynamic_rate_limiter: bool, /// Endpoint rate limiter max number of requests per second. /// /// Provided in the form `@`. 
@@ -615,9 +612,6 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { and metric-collection-interval must be specified" ), }; - if !args.disable_dynamic_rate_limiter { - bail!("dynamic rate limiter should be disabled"); - } let config::ConcurrencyLockOptions { shards, From 3d64a7ddcdf23f8eefc343258438c91251d58488 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Oct 2024 11:23:30 -0500 Subject: [PATCH 104/239] Add pg_mooncake to compute-node.Dockerfile Signed-off-by: Tristan Partin --- .github/workflows/build_and_test.yml | 1 + compute/compute-node.Dockerfile | 37 ++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 5 deletions(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 82a24b29d3..c308c41efd 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -839,6 +839,7 @@ jobs: - name: Build vm image run: | ./vm-builder \ + -size=2G \ -spec=compute/vm-image-spec-${{ matrix.version.debian }}.yaml \ -src=neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} \ -dst=neondatabase/vm-compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }} diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6451e309f0..dfed01daae 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -666,7 +666,7 @@ RUN apt-get update && \ # # Use new version only for v17 # because Release_2024_09_1 has some backward incompatible changes -# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 +# https://github.com/rdkit/rdkit/releases/tag/Release_2024_09_1 ENV PATH="/usr/local/pgsql/bin/:/usr/local/pgsql/:$PATH" RUN case "${PG_VERSION}" in \ "v17") \ @@ -860,13 +860,14 @@ ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" USER nonroot WORKDIR /home/nonroot -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 is not supported yet by pgrx. Quit" && exit 0;; \ - esac && \ - curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ chmod +x rustup-init && \ ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ rm rustup-init && \ + case "${PG_VERSION}" in \ + 'v17') \ + echo 'v17 is not supported yet by pgrx. 
Quit' && exit 0;; \ + esac && \ cargo install --locked --version 0.11.3 cargo-pgrx && \ /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' @@ -1041,6 +1042,31 @@ RUN wget https://github.com/pgpartman/pg_partman/archive/refs/tags/v5.1.0.tar.gz make -j $(getconf _NPROCESSORS_ONLN) install && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_partman.control +######################################################################################### +# +# Layer "pg_mooncake" +# compile pg_mooncake extension +# +######################################################################################### +FROM rust-extensions-build AS pg-mooncake-build +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1 +ENV PATH="/usr/local/pgsql/bin/:$PATH" + +RUN case "${PG_VERSION}" in \ + 'v14') \ + echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ + esac && \ + git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ + cd pg_mooncake-src && \ + git checkout "${PG_MOONCAKE_VERSION}" && \ + git submodule update --init --depth 1 --recursive && \ + make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) && \ + make BUILD_TYPE=release -j $(getconf _NPROCESSORS_ONLN) install && \ + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_mooncake.control + ######################################################################################### # # Layer "neon-pg-ext-build" @@ -1084,6 +1110,7 @@ COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-partman-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pg-mooncake-build /usr/local/pgsql/ /usr/local/pgsql/ COPY pgxn/ pgxn/ RUN make -j $(getconf _NPROCESSORS_ONLN) \ From 3bad52543fa018a11beded31885d95150e6f907a Mon Sep 17 00:00:00 2001 From: Sergey Melnikov Date: Mon, 28 Oct 2024 17:42:35 +0100 Subject: [PATCH 105/239] We don't have legacy proxies anymore (#9544) We don't have legacy scram proxies anymore: cc: https://github.com/neondatabase/cloud/issues/9745 --- .github/workflows/build_and_test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index c308c41efd..bba51ddc92 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -1119,7 +1119,6 @@ jobs: -f deployPgSniRouter=true \ -f deployProxyLink=true \ -f deployPrivatelinkProxy=true \ - -f deployLegacyProxyScram=true \ -f deployProxyScram=true \ -f deployProxyAuthBroker=true \ -f branch=main \ From 248558dee85849fd95fece7f8e0a730c14eb0660 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 28 Oct 2024 18:18:37 +0100 Subject: [PATCH 106/239] safekeeper: refactor `WalAcceptor` to be event-driven (#9462) ## Problem The `WalAcceptor` main loop currently uses two nested loops to consume inbound messages. This makes it hard to slot in periodic events like metrics collection. It also duplicates the event processing code, and assumes all messages in steady state are AppendRequests (other messages types may be dropped if following an AppendRequest). ## Summary of changes Refactor the `WalAcceptor` loop to be event driven. 
--- safekeeper/src/receive_wal.rs | 120 ++++++++++++++++++---------------- 1 file changed, 63 insertions(+), 57 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 3dbf72298f..f97e127a17 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -21,18 +21,15 @@ use postgres_backend::QueryError; use pq_proto::BeMessage; use serde::Deserialize; use serde::Serialize; +use std::future; use std::net::SocketAddr; use std::sync::Arc; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; -use tokio::sync::mpsc::channel; -use tokio::sync::mpsc::error::TryRecvError; -use tokio::sync::mpsc::Receiver; -use tokio::sync::mpsc::Sender; +use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::task; use tokio::task::JoinHandle; -use tokio::time::Duration; -use tokio::time::Instant; +use tokio::time::{Duration, MissedTickBehavior}; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -444,9 +441,9 @@ async fn network_write( } } -// Send keepalive messages to walproposer, to make sure it receives updates -// even when it writes a steady stream of messages. -const KEEPALIVE_INTERVAL: Duration = Duration::from_secs(1); +/// The WAL flush interval. This ensures we periodically flush the WAL and send AppendResponses to +/// walproposer, even when it's writing a steady stream of messages. +const FLUSH_INTERVAL: Duration = Duration::from_secs(1); /// Encapsulates a task which takes messages from msg_rx, processes and pushes /// replies to reply_tx. @@ -494,67 +491,76 @@ impl WalAcceptor { async fn run(&mut self) -> anyhow::Result<()> { let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - // After this timestamp we will stop processing AppendRequests and send a response - // to the walproposer. walproposer sends at least one AppendRequest per second, - // we will send keepalives by replying to these requests once per second. - let mut next_keepalive = Instant::now(); + // Periodically flush the WAL. + let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL); + flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); + flush_ticker.tick().await; // skip the initial, immediate tick - while let Some(mut next_msg) = self.msg_rx.recv().await { - // Update walreceiver state in shmem for reporting. - if let ProposerAcceptorMessage::Elected(_) = &next_msg { - walreceiver_guard.get().status = WalReceiverStatus::Streaming; - } + // Tracks unflushed appends. + let mut dirty = false; - let reply_msg = if matches!(next_msg, ProposerAcceptorMessage::AppendRequest(_)) { - // Loop through AppendRequests while available to write as many WAL records as - // possible without fsyncing. - // - // Make sure the WAL is flushed before returning, see: - // https://github.com/neondatabase/neon/issues/9259 - // - // Note: this will need to be rewritten if we want to read non-AppendRequest messages here. - // Otherwise, we might end up in a situation where we read a message, but don't - // process it. - while let ProposerAcceptorMessage::AppendRequest(append_request) = next_msg { - let noflush_msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); - - if let Some(reply) = self.tli.process_msg(&noflush_msg).await? { - if self.reply_tx.send(reply).await.is_err() { - break; // disconnected, flush WAL and return on next send/recv - } - } - - // get out of this loop if keepalive time is reached - if Instant::now() >= next_keepalive { + loop { + let reply = tokio::select! { + // Process inbound message. 
+ msg = self.msg_rx.recv() => { + // If disconnected, break to flush WAL and return. + let Some(mut msg) = msg else { break; + }; + + // Update walreceiver state in shmem for reporting. + if let ProposerAcceptorMessage::Elected(_) = &msg { + walreceiver_guard.get().status = WalReceiverStatus::Streaming; } - // continue pulling AppendRequests if available - match self.msg_rx.try_recv() { - Ok(msg) => next_msg = msg, - Err(TryRecvError::Empty) => break, - // on disconnect, flush WAL and return on next send/recv - Err(TryRecvError::Disconnected) => break, - }; + // Don't flush the WAL on every append, only periodically via flush_ticker. + // This batches multiple appends per fsync. If the channel is empty after + // sending the reply, we'll schedule an immediate flush. + if let ProposerAcceptorMessage::AppendRequest(append_request) = msg { + msg = ProposerAcceptorMessage::NoFlushAppendRequest(append_request); + dirty = true; + } + + self.tli.process_msg(&msg).await? } - // flush all written WAL to the disk - self.tli - .process_msg(&ProposerAcceptorMessage::FlushWAL) - .await? - } else { - // process message other than AppendRequest - self.tli.process_msg(&next_msg).await? + // While receiving AppendRequests, flush the WAL periodically and respond with an + // AppendResponse to let walproposer know we're still alive. + _ = flush_ticker.tick(), if dirty => { + dirty = false; + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? + } + + // If there are no pending messages, flush the WAL immediately. + // + // TODO: this should be done via flush_ticker.reset_immediately(), but that's always + // delayed by 1ms due to this bug: https://github.com/tokio-rs/tokio/issues/6866. + _ = future::ready(()), if dirty && self.msg_rx.is_empty() => { + dirty = false; + flush_ticker.reset(); + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await? + } }; - if let Some(reply) = reply_msg { + // Send reply, if any. + if let Some(reply) = reply { if self.reply_tx.send(reply).await.is_err() { - return Ok(()); // chan closed, streaming terminated + break; // disconnected, break to flush WAL and return } - // reset keepalive time - next_keepalive = Instant::now() + KEEPALIVE_INTERVAL; } } + + // Flush WAL on disconnect, see https://github.com/neondatabase/neon/issues/9259. + if dirty { + self.tli + .process_msg(&ProposerAcceptorMessage::FlushWAL) + .await?; + } + Ok(()) } } From 57c21aff9f7a3074292f20efac319e3b248da484 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Oct 2024 15:51:14 -0400 Subject: [PATCH 107/239] refactor(pageserver): remove aux v1 configs (#9494) ## Problem Part of https://github.com/neondatabase/neon/issues/8623 ## Summary of changes Removed all aux-v1 config processing code. Note that we persisted it into the index part file, so we cannot really remove the field from index part. I also kept the config item within the tenant config, but we will not read it any more. 
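A minimal sketch of the compatibility pattern this relies on (hypothetical struct and field types, not the actual `IndexPart` definition): the legacy field remains (de)serializable so previously written `index_part.json` files still round-trip, but no code path reads it.

```rust
use serde::{Deserialize, Serialize};

#[derive(Serialize, Deserialize)]
struct IndexPartSketch {
    // ...the real index fields are elided here...
    /// Legacy aux file policy marker: kept only so that existing
    /// index_part.json files keep parsing; nothing consults it anymore.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    last_aux_file_policy: Option<String>,
}
```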
--------- Signed-off-by: Alex Chi Z --- control_plane/src/pageserver.rs | 12 +- libs/pageserver_api/src/config.rs | 7 - libs/pageserver_api/src/models.rs | 129 ------------------ pageserver/pagebench/src/cmd/aux_files.rs | 7 +- pageserver/src/tenant.rs | 1 - pageserver/src/tenant/config.rs | 9 -- storage_controller/src/service.rs | 11 +- test_runner/fixtures/neon_cli.py | 9 -- test_runner/fixtures/neon_fixtures.py | 12 -- test_runner/fixtures/parametrize.py | 11 -- test_runner/fixtures/utils.py | 16 --- .../performance/test_logical_replication.py | 3 +- .../regress/test_attach_tenant_config.py | 1 - .../regress/test_logical_replication.py | 17 --- 14 files changed, 5 insertions(+), 240 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 5b5828c6ed..8df0a714ec 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -17,7 +17,7 @@ use std::time::Duration; use anyhow::{bail, Context}; use camino::Utf8PathBuf; -use pageserver_api::models::{self, AuxFilePolicy, TenantInfo, TimelineInfo}; +use pageserver_api::models::{self, TenantInfo, TimelineInfo}; use pageserver_api::shard::TenantShardId; use pageserver_client::mgmt_api; use postgres_backend::AuthType; @@ -399,11 +399,6 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_aux_file_policy: settings - .remove("switch_aux_file_policy") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'switch_aux_file_policy'")?, lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") @@ -499,11 +494,6 @@ impl PageServerNode { .map(serde_json::from_str) .transpose() .context("parse `timeline_get_throttle` from json")?, - switch_aux_file_policy: settings - .remove("switch_aux_file_policy") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'switch_aux_file_policy'")?, lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 896a5d8069..6b2d6cf625 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -250,12 +250,6 @@ pub struct TenantConfigToml { // Expresed in multiples of checkpoint distance. pub image_layer_creation_check_threshold: u8, - /// Switch to a new aux file policy. Switching this flag requires the user has not written any aux file into - /// the storage before, and this flag cannot be switched back. Otherwise there will be data corruptions. - /// There is a `last_aux_file_policy` flag which gets persisted in `index_part.json` once the first aux - /// file is written. - pub switch_aux_file_policy: crate::models::AuxFilePolicy, - /// The length for an explicit LSN lease request. /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. 
#[serde(with = "humantime_serde")] @@ -475,7 +469,6 @@ impl Default for TenantConfigToml { lazy_slru_download: false, timeline_get_throttle: crate::models::ThrottleConfig::disabled(), image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, - switch_aux_file_policy: crate::models::AuxFilePolicy::default_tenant_config(), lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index d37f62185c..0a4992aea4 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -10,7 +10,6 @@ use std::{ io::{BufRead, Read}, num::{NonZeroU32, NonZeroU64, NonZeroUsize}, str::FromStr, - sync::atomic::AtomicUsize, time::{Duration, SystemTime}, }; @@ -309,7 +308,6 @@ pub struct TenantConfig { pub lazy_slru_download: Option, pub timeline_get_throttle: Option, pub image_layer_creation_check_threshold: Option, - pub switch_aux_file_policy: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, } @@ -350,68 +348,6 @@ pub enum AuxFilePolicy { CrossValidation, } -impl AuxFilePolicy { - pub fn is_valid_migration_path(from: Option, to: Self) -> bool { - matches!( - (from, to), - (None, _) | (Some(AuxFilePolicy::CrossValidation), AuxFilePolicy::V2) - ) - } - - /// If a tenant writes aux files without setting `switch_aux_policy`, this value will be used. - pub fn default_tenant_config() -> Self { - Self::V2 - } -} - -/// The aux file policy memory flag. Users can store `Option` into this atomic flag. 0 == unspecified. -pub struct AtomicAuxFilePolicy(AtomicUsize); - -impl AtomicAuxFilePolicy { - pub fn new(policy: Option) -> Self { - Self(AtomicUsize::new( - policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), - )) - } - - pub fn load(&self) -> Option { - match self.0.load(std::sync::atomic::Ordering::Acquire) { - 0 => None, - other => Some(AuxFilePolicy::from_usize(other)), - } - } - - pub fn store(&self, policy: Option) { - self.0.store( - policy.map(AuxFilePolicy::to_usize).unwrap_or_default(), - std::sync::atomic::Ordering::Release, - ); - } -} - -impl AuxFilePolicy { - pub fn to_usize(self) -> usize { - match self { - Self::V1 => 1, - Self::CrossValidation => 2, - Self::V2 => 3, - } - } - - pub fn try_from_usize(this: usize) -> Option { - match this { - 1 => Some(Self::V1), - 2 => Some(Self::CrossValidation), - 3 => Some(Self::V2), - _ => None, - } - } - - pub fn from_usize(this: usize) -> Self { - Self::try_from_usize(this).unwrap() - } -} - #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "kind")] pub enum EvictionPolicy { @@ -1633,71 +1569,6 @@ mod tests { } } - #[test] - fn test_aux_file_migration_path() { - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::V1 - )); - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::V2 - )); - assert!(AuxFilePolicy::is_valid_migration_path( - None, - AuxFilePolicy::CrossValidation - )); - // Self-migration is not a valid migration path, and the caller should handle it by itself. 
- assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::V2 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::CrossValidation - )); - // Migrations not allowed - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::V2 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::V1 - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V2), - AuxFilePolicy::CrossValidation - )); - assert!(!AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::V1), - AuxFilePolicy::CrossValidation - )); - // Migrations allowed - assert!(AuxFilePolicy::is_valid_migration_path( - Some(AuxFilePolicy::CrossValidation), - AuxFilePolicy::V2 - )); - } - - #[test] - fn test_aux_parse() { - assert_eq!(AuxFilePolicy::from_str("V2").unwrap(), AuxFilePolicy::V2); - assert_eq!(AuxFilePolicy::from_str("v2").unwrap(), AuxFilePolicy::V2); - assert_eq!( - AuxFilePolicy::from_str("cross-validation").unwrap(), - AuxFilePolicy::CrossValidation - ); - } - #[test] fn test_image_compression_algorithm_parsing() { use ImageCompressionAlgorithm::*; diff --git a/pageserver/pagebench/src/cmd/aux_files.rs b/pageserver/pagebench/src/cmd/aux_files.rs index bce3285606..923a7f1f18 100644 --- a/pageserver/pagebench/src/cmd/aux_files.rs +++ b/pageserver/pagebench/src/cmd/aux_files.rs @@ -1,4 +1,4 @@ -use pageserver_api::models::{AuxFilePolicy, TenantConfig, TenantConfigRequest}; +use pageserver_api::models::{TenantConfig, TenantConfigRequest}; use pageserver_api::shard::TenantShardId; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -66,10 +66,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> { mgmt_api_client .tenant_config(&TenantConfigRequest { tenant_id: timeline.tenant_id, - config: TenantConfig { - switch_aux_file_policy: Some(AuxFilePolicy::V2), - ..Default::default() - }, + config: TenantConfig::default(), }) .await?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index f846e145c5..64e871cada 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4853,7 +4853,6 @@ pub(crate) mod harness { image_layer_creation_check_threshold: Some( tenant_conf.image_layer_creation_check_threshold, ), - switch_aux_file_policy: Some(tenant_conf.switch_aux_file_policy), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index 502cb62fe8..ce686c89ef 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -9,7 +9,6 @@ //! may lead to a data loss. //! 
pub(crate) use pageserver_api::config::TenantConfigToml as TenantConf; -use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::CompactionAlgorithmSettings; use pageserver_api::models::EvictionPolicy; use pageserver_api::models::{self, ThrottleConfig}; @@ -341,10 +340,6 @@ pub struct TenantConfOpt { #[serde(skip_serializing_if = "Option::is_none")] pub image_layer_creation_check_threshold: Option, - #[serde(skip_serializing_if = "Option::is_none")] - #[serde(default)] - pub switch_aux_file_policy: Option, - #[serde(skip_serializing_if = "Option::is_none")] #[serde(with = "humantime_serde")] #[serde(default)] @@ -410,9 +405,6 @@ impl TenantConfOpt { image_layer_creation_check_threshold: self .image_layer_creation_check_threshold .unwrap_or(global_conf.image_layer_creation_check_threshold), - switch_aux_file_policy: self - .switch_aux_file_policy - .unwrap_or(global_conf.switch_aux_file_policy), lsn_lease_length: self .lsn_lease_length .unwrap_or(global_conf.lsn_lease_length), @@ -470,7 +462,6 @@ impl From for models::TenantConfig { lazy_slru_download: value.lazy_slru_download, timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from), image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, - switch_aux_file_policy: value.switch_aux_file_policy, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), } diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 32029c1232..3f6cbfef59 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -4958,16 +4958,7 @@ impl Service { stripe_size, }, placement_policy: Some(PlacementPolicy::Attached(0)), // No secondaries, for convenient debug/hacking - - // There is no way to know what the tenant's config was: revert to defaults - // - // TODO: remove `switch_aux_file_policy` once we finish auxv2 migration - // - // we write to both v1+v2 storage, so that the test case can use either storage format for testing - config: TenantConfig { - switch_aux_file_policy: Some(models::AuxFilePolicy::CrossValidation), - ..TenantConfig::default() - }, + config: TenantConfig::default(), }) .await?; diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 1b2767e296..d220ea57a2 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -16,7 +16,6 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.common_types import IndexPartDump from fixtures.pg_version import PgVersion -from fixtures.utils import AuxFileStore if TYPE_CHECKING: from typing import ( @@ -201,7 +200,6 @@ class NeonLocalCli(AbstractNeonCli): shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, ): """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -223,13 +221,6 @@ class NeonLocalCli(AbstractNeonCli): ) ) - if aux_file_policy is AuxFileStore.V2: - args.extend(["-c", "switch_aux_file_policy:v2"]) - elif aux_file_policy is AuxFileStore.V1: - args.extend(["-c", "switch_aux_file_policy:v1"]) - elif aux_file_policy is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross-validation"]) - if set_default: args.append("--set-default") diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6491069f20..a8ec144fe9 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -94,7 +94,6 @@ from fixtures.utils import ( subprocess_capture, wait_until, ) -from fixtures.utils import AuxFileStore as AuxFileStore # reexport from .neon_api import NeonAPI, NeonApiEndpoint @@ -353,7 +352,6 @@ class NeonEnvBuilder: initial_tenant: Optional[TenantId] = None, initial_timeline: Optional[TimelineId] = None, pageserver_virtual_file_io_engine: Optional[str] = None, - pageserver_aux_file_policy: Optional[AuxFileStore] = None, pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, @@ -405,8 +403,6 @@ class NeonEnvBuilder: f"Overriding pageserver default compaction algorithm to {self.pageserver_default_tenant_config_compaction_algorithm}" ) - self.pageserver_aux_file_policy = pageserver_aux_file_policy - self.safekeeper_extra_opts = safekeeper_extra_opts self.storage_controller_port_override = storage_controller_port_override @@ -467,7 +463,6 @@ class NeonEnvBuilder: timeline_id=env.initial_timeline, shard_count=initial_tenant_shard_count, shard_stripe_size=initial_tenant_shard_stripe_size, - aux_file_policy=self.pageserver_aux_file_policy, ) assert env.initial_tenant == initial_tenant assert env.initial_timeline == initial_timeline @@ -1027,7 +1022,6 @@ class NeonEnv: self.control_plane_compute_hook_api = config.control_plane_compute_hook_api self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine - self.pageserver_aux_file_policy = config.pageserver_aux_file_policy self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` @@ -1323,7 +1317,6 @@ class NeonEnv: shard_stripe_size: Optional[int] = None, placement_policy: Optional[str] = None, set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, ) -> tuple[TenantId, TimelineId]: """ Creates a new tenant, returns its id and its initial timeline's id. 
@@ -1340,7 +1333,6 @@ class NeonEnv: shard_stripe_size=shard_stripe_size, placement_policy=placement_policy, set_default=set_default, - aux_file_policy=aux_file_policy, ) return tenant_id, timeline_id @@ -1398,7 +1390,6 @@ def neon_simple_env( compatibility_pg_distrib_dir: Path, pg_version: PgVersion, pageserver_virtual_file_io_engine: str, - pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: @@ -1431,7 +1422,6 @@ def neon_simple_env( test_name=request.node.name, test_output_dir=test_output_dir, pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, - pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, combination=combination, @@ -1458,7 +1448,6 @@ def neon_env_builder( top_output_dir: Path, pageserver_virtual_file_io_engine: str, pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: @@ -1501,7 +1490,6 @@ def neon_env_builder( test_name=request.node.name, test_output_dir=test_output_dir, test_overlay_dir=test_overlay_dir, - pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index 4114c2fcb3..1131bf090f 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -10,12 +10,6 @@ from _pytest.python import Metafunc from fixtures.pg_version import PgVersion -if TYPE_CHECKING: - from typing import Any, Optional - - from fixtures.utils import AuxFileStore - - if TYPE_CHECKING: from typing import Any, Optional @@ -50,11 +44,6 @@ def pageserver_virtual_file_io_mode() -> Optional[str]: return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") -@pytest.fixture(scope="function", autouse=True) -def pageserver_aux_file_policy() -> Optional[AuxFileStore]: - return None - - def get_pageserver_default_tenant_config_compaction_algorithm() -> Optional[dict[str, Any]]: toml_table = os.getenv("PAGESERVER_DEFAULT_TENANT_CONFIG_COMPACTION_ALGORITHM") if toml_table is None: diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index d12fa59abc..01b7cf1026 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -1,7 +1,6 @@ from __future__ import annotations import contextlib -import enum import json import os import re @@ -515,21 +514,6 @@ def assert_no_errors(log_file: Path, service: str, allowed_errors: list[str]): assert not errors, f"First log error on {service}: {errors[0]}\nHint: use scripts/check_allowed_errors.sh to test any new allowed_error you add" -@enum.unique -class AuxFileStore(str, enum.Enum): - V1 = "v1" - V2 = "v2" - CrossValidation = "cross-validation" - - @override - def __repr__(self) -> str: - return f"'aux-{self.value}'" - - @override - def __str__(self) -> str: - return f"'aux-{self.value}'" - - def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str]): """ This is 
essentially: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 815d186ab9..8b2a296bdd 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -9,7 +9,7 @@ import pytest from fixtures.benchmark_fixture import MetricReport from fixtures.common_types import Lsn from fixtures.log_helper import log -from fixtures.neon_fixtures import AuxFileStore, logical_replication_sync +from fixtures.neon_fixtures import logical_replication_sync if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker @@ -17,7 +17,6 @@ if TYPE_CHECKING: from fixtures.neon_fixtures import NeonEnv, PgBin -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.V2]) @pytest.mark.timeout(1000) def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): env = neon_simple_env diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 4a7017994d..83d003a5cc 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -172,7 +172,6 @@ def test_fully_custom_config(positive_env: NeonEnv): }, "walreceiver_connect_timeout": "13m", "image_layer_creation_check_threshold": 1, - "switch_aux_file_policy": "cross-validation", "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", } diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index c26bf058e2..30027463df 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -5,11 +5,9 @@ from functools import partial from random import choice from string import ascii_lowercase -import pytest from fixtures.common_types import Lsn from fixtures.log_helper import log from fixtures.neon_fixtures import ( - AuxFileStore, NeonEnv, NeonEnvBuilder, PgProtocol, @@ -23,17 +21,6 @@ def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) -@pytest.mark.parametrize( - "pageserver_aux_file_policy", [AuxFileStore.V2, AuxFileStore.CrossValidation] -) -def test_aux_file_v2_flag(neon_simple_env: NeonEnv, pageserver_aux_file_policy: AuxFileStore): - env = neon_simple_env - with env.pageserver.http_client() as client: - tenant_config = client.tenant_config(env.initial_tenant).effective_config - assert pageserver_aux_file_policy == tenant_config["switch_aux_file_policy"] - - -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -173,7 +160,6 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): def slot_removed(ep): assert ( @@ -350,7 +336,6 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. 
-@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -395,7 +380,6 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env @@ -467,7 +451,6 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): ws_cur.execute("select pg_create_logical_replication_slot('my_slot', 'pgoutput')") -@pytest.mark.parametrize("pageserver_aux_file_policy", [AuxFileStore.CrossValidation]) def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env From f7c61e856f05e4a796ef82bff53e7b9a01b3d0f3 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 28 Oct 2024 16:03:02 -0400 Subject: [PATCH 108/239] fix(pageserver): bump tokio-epoll-uring (#9546) Includes https://github.com/neondatabase/tokio-epoll-uring/pull/58 that fixes the clippy error. ## Summary of changes Update the version of tokio-epoll-uring Signed-off-by: Alex Chi Z --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7fa5df29fd..610b607482 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6272,7 +6272,7 @@ dependencies = [ [[package]] name = "tokio-epoll-uring" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" dependencies = [ "futures", "nix 0.26.4", @@ -6788,7 +6788,7 @@ dependencies = [ [[package]] name = "uring-common" version = "0.1.0" -source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#cb2dcea2058034bc209e7917b01c5097712a3168" +source = "git+https://github.com/neondatabase/tokio-epoll-uring.git?branch=main#33e00106a268644d02ba0461bbd64476073b0ee1" dependencies = [ "bytes", "io-uring", From 7d5f6b6a528de4068ace6eac2d45305d91a7d011 Mon Sep 17 00:00:00 2001 From: George MacKerron Date: Mon, 28 Oct 2024 20:06:36 +0000 Subject: [PATCH 109/239] Build `pgrag` extensions x3 (#8486) Build the pgrag extensions (rag, rag_bge_small_en_v15, and rag_jina_reranker_v1_tiny_en) as part of the compute node Dockerfile. --------- Co-authored-by: Alexander Bayandin --- compute/compute-node.Dockerfile | 81 +++++++++++++++++++++++++++++++++ 1 file changed, 81 insertions(+) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index dfed01daae..1b2167ea11 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -873,6 +873,85 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux USER root +######################################################################################### +# +# Layer "rust extensions pgrx12" +# +# pgrx started to support Postgres 17 since version 12, +# but some older extension aren't compatible with it. 
+# This layer should be used as a base for new pgrx extensions, +# and eventually get merged with `rust-extensions-build` +# +######################################################################################### +FROM build-deps AS rust-extensions-build-pgrx12 +ARG PG_VERSION +COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ + +RUN apt-get update && \ + apt-get install --no-install-recommends -y curl libclang-dev && \ + useradd -ms /bin/bash nonroot -b /home + +ENV HOME=/home/nonroot +ENV PATH="/home/nonroot/.cargo/bin:/usr/local/pgsql/bin/:$PATH" +USER nonroot +WORKDIR /home/nonroot + +RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && \ + chmod +x rustup-init && \ + ./rustup-init -y --no-modify-path --profile minimal --default-toolchain stable && \ + rm rustup-init && \ + cargo install --locked --version 0.12.6 cargo-pgrx && \ + /bin/bash -c 'cargo pgrx init --pg${PG_VERSION:1}=/usr/local/pgsql/bin/pg_config' + +USER root + +######################################################################################### +# +# Layers "pg-onnx-build" and "pgrag-pg-build" +# Compile "pgrag" extensions +# +######################################################################################### + +FROM rust-extensions-build-pgrx12 AS pg-onnx-build + +# cmake 3.26 or higher is required, so installing it using pip (bullseye-backports has cmake 3.25). +# Install it using virtual environment, because Python 3.11 (the default version on Debian 12 (Bookworm)) complains otherwise +RUN apt-get update && apt-get install -y python3 python3-pip python3-venv && \ + python3 -m venv venv && \ + . venv/bin/activate && \ + python3 -m pip install cmake==3.30.5 && \ + wget https://github.com/microsoft/onnxruntime/archive/refs/tags/v1.18.1.tar.gz -O onnxruntime.tar.gz && \ + mkdir onnxruntime-src && cd onnxruntime-src && tar xzf ../onnxruntime.tar.gz --strip-components=1 -C . && \ + ./build.sh --config Release --parallel --skip_submodule_sync --skip_tests --allow_running_as_root + + +FROM pg-onnx-build AS pgrag-pg-build + +RUN apt-get install -y protobuf-compiler && \ + wget https://github.com/neondatabase-labs/pgrag/archive/refs/tags/v0.0.0.tar.gz -O pgrag.tar.gz && \ + echo "2cbe394c1e74fc8bcad9b52d5fbbfb783aef834ca3ce44626cfd770573700bb4 pgrag.tar.gz" | sha256sum --check && \ + mkdir pgrag-src && cd pgrag-src && tar xzf ../pgrag.tar.gz --strip-components=1 -C . 
&& \ + \ + cd exts/rag && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + cargo pgrx install --release && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag.control && \ + \ + cd ../rag_bge_small_en_v15 && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/bge_small_en_v15.onnx \ + cargo pgrx install --release --features remote_onnx && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_bge_small_en_v15.control && \ + \ + cd ../rag_jina_reranker_v1_tiny_en && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + ORT_LIB_LOCATION=/home/nonroot/onnxruntime-src/build/Linux \ + REMOTE_ONNX_URL=http://pg-ext-s3-gateway/pgrag-data/jina_reranker_v1_tiny_en.onnx \ + cargo pgrx install --release --features remote_onnx && \ + echo "trusted = true" >> /usr/local/pgsql/share/extension/rag_jina_reranker_v1_tiny_en.control + + ######################################################################################### # # Layer "pg-jsonschema-pg-build" @@ -1085,6 +1164,7 @@ COPY --from=h3-pg-build /h3/usr / COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/ +COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-jsonschema-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-graphql-pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY --from=pg-tiktoken-pg-build /usr/local/pgsql/ /usr/local/pgsql/ @@ -1274,6 +1354,7 @@ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ COPY --from=pgjwt-pg-build /pgjwt.tar.gz /ext-src +#COPY --from=pgrag-pg-build /usr/local/pgsql/ /usr/local/pgsql/ #COPY --from=pg-jsonschema-pg-build /home/nonroot/pg_jsonschema.tar.gz /ext-src #COPY --from=pg-graphql-pg-build /home/nonroot/pg_graphql.tar.gz /ext-src #COPY --from=pg-tiktoken-pg-build /home/nonroot/pg_tiktoken.tar.gz /ext-src From 062456561783d09cbe8eeedcf0a244d13866bd50 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 29 Oct 2024 00:47:15 +0200 Subject: [PATCH 110/239] Create the notion of unstable extensions As a DBaaS provider, Neon needs to provide a stable platform for customers to build applications upon. At the same time however, we also need to enable customers to use the latest and greatest technology, so they can prototype their work, and we can solicit feedback. If all extensions are treated the same in terms of stability, it is hard to meet that goal. There are now two new GUCs created by the Neon extension: neon.allow_unstable_extensions: This is a session GUC which allows a session to install and load unstable extensions. neon.unstable_extensions: This is a comma-separated list of extension names. We can check if a CREATE EXTENSION statement is attempting to install an unstable extension, and if so, deny the request if neon.allow_unstable_extensions is not set to true. 
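For illustration only (not part of this patch), the intended flow is roughly the sketch below: a session is denied `CREATE EXTENSION` for anything listed in `neon.unstable_extensions` until it opts in. The snippet assumes a psycopg2 connection to an endpoint configured with `neon.unstable_extensions='pg_prewarm'` and a role privileged enough to change the SUSET GUC; the DSN is a placeholder.

```python
# Sketch only: how a client session is expected to interact with the new GUCs.
# Assumes the endpoint was started with neon.unstable_extensions='pg_prewarm'
# and that the connecting role may change the SUSET GUC. DSN is a placeholder.
import psycopg2
from psycopg2.errors import InsufficientPrivilege

conn = psycopg2.connect("dbname=postgres")  # placeholder DSN
conn.autocommit = True

with conn.cursor() as cur:
    try:
        # Rejected while neon.allow_unstable_extensions is false (the default).
        cur.execute("CREATE EXTENSION pg_prewarm")
    except InsufficientPrivilege:
        # Opt in for this session, then retry.
        cur.execute("SET neon.allow_unstable_extensions TO true")
        cur.execute("CREATE EXTENSION pg_prewarm")
```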
Signed-off-by: Tristan Partin Co-authored-by: Konstantin Knizhnik --- pgxn/neon/Makefile | 1 + pgxn/neon/neon.c | 2 + pgxn/neon/neon_pgversioncompat.c | 1 + pgxn/neon/unstable_extensions.c | 129 ++++++++++++++++++ pgxn/neon/unstable_extensions.h | 6 + .../regress/test_unstable_extensions.py | 50 +++++++ 6 files changed, 189 insertions(+) create mode 100644 pgxn/neon/unstable_extensions.c create mode 100644 pgxn/neon/unstable_extensions.h create mode 100644 test_runner/regress/test_unstable_extensions.py diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 42f2a8efda..c87ae59fd6 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -16,6 +16,7 @@ OBJS = \ neon_walreader.o \ pagestore_smgr.o \ relsize_cache.o \ + unstable_extensions.o \ walproposer.o \ walproposer_pg.o \ control_plane_connector.o \ diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index f8ec725c18..dc87d79e87 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -30,6 +30,7 @@ #include "neon.h" #include "control_plane_connector.h" #include "logical_replication_monitor.h" +#include "unstable_extensions.h" #include "walsender_hooks.h" #if PG_MAJORVERSION_NUM >= 16 #include "storage/ipc.h" @@ -424,6 +425,7 @@ _PG_init(void) LogicalFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; SlotFuncs_Custom_XLogReaderRoutines = NeonOnDemandXLogReaderRoutines; + InitUnstableExtensionsSupport(); InitLogicalReplicationMonitor(); InitControlPlaneConnector(); diff --git a/pgxn/neon/neon_pgversioncompat.c b/pgxn/neon/neon_pgversioncompat.c index a0dbddde4b..7c404fb5a9 100644 --- a/pgxn/neon/neon_pgversioncompat.c +++ b/pgxn/neon/neon_pgversioncompat.c @@ -42,3 +42,4 @@ InitMaterializedSRF(FunctionCallInfo fcinfo, bits32 flags) MemoryContextSwitchTo(old_context); } #endif + diff --git a/pgxn/neon/unstable_extensions.c b/pgxn/neon/unstable_extensions.c new file mode 100644 index 0000000000..a3445cb268 --- /dev/null +++ b/pgxn/neon/unstable_extensions.c @@ -0,0 +1,129 @@ +#include +#include + +#include "postgres.h" + +#include "nodes/plannodes.h" +#include "nodes/parsenodes.h" +#include "tcop/utility.h" +#include "utils/errcodes.h" +#include "utils/guc.h" + +#include "neon_pgversioncompat.h" +#include "unstable_extensions.h" + +static bool allow_unstable_extensions = false; +static char *unstable_extensions = NULL; + +static ProcessUtility_hook_type PreviousProcessUtilityHook = NULL; + +static bool +list_contains(char const* comma_separated_list, char const* val) +{ + char const* occ = comma_separated_list; + size_t val_len = strlen(val); + + if (val_len == 0) + return false; + + while ((occ = strstr(occ, val)) != NULL) + { + if ((occ == comma_separated_list || occ[-1] == ',') + && (occ[val_len] == '\0' || occ[val_len] == ',')) + { + return true; + } + occ += val_len; + } + + return false; +} + + +static void +CheckUnstableExtension( + PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + Node *parseTree = pstmt->utilityStmt; + + if (allow_unstable_extensions || unstable_extensions == NULL) + goto process; + + switch (nodeTag(parseTree)) + { + case T_CreateExtensionStmt: + { + CreateExtensionStmt *stmt = castNode(CreateExtensionStmt, parseTree); + if (list_contains(unstable_extensions, stmt->extname)) + { + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), + errmsg("installing %s is currently prohibited", stmt->extname), + errhint("Set 
neon.allow_unstable_extensions to true"))); + } + break; + } + default: + goto process; + } + +process: + if (PreviousProcessUtilityHook) + { + PreviousProcessUtilityHook( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } + else + { + standard_ProcessUtility( + pstmt, + queryString, + readOnlyTree, + context, + params, + queryEnv, + dest, + qc); + } +} + +void +InitUnstableExtensionsSupport(void) +{ + DefineCustomBoolVariable( + "neon.allow_unstable_extensions", + "Allow unstable extensions to be installed and used", + NULL, + &allow_unstable_extensions, + false, + PGC_SUSET, + 0, + NULL, NULL, NULL); + + DefineCustomStringVariable( + "neon.unstable_extensions", + "Allow unstable extensions to be installed and used", + NULL, + &unstable_extensions, + NULL, + PGC_SUSET, + 0, + NULL, NULL, NULL); + + PreviousProcessUtilityHook = ProcessUtility_hook; + ProcessUtility_hook = CheckUnstableExtension; +} diff --git a/pgxn/neon/unstable_extensions.h b/pgxn/neon/unstable_extensions.h new file mode 100644 index 0000000000..3c695e9fb2 --- /dev/null +++ b/pgxn/neon/unstable_extensions.h @@ -0,0 +1,6 @@ +#ifndef __NEON_UNSTABLE_EXTENSIONS_H__ +#define __NEON_UNSTABLE_EXTENSIONS_H__ + +void InitUnstableExtensionsSupport(void); + +#endif diff --git a/test_runner/regress/test_unstable_extensions.py b/test_runner/regress/test_unstable_extensions.py new file mode 100644 index 0000000000..06a62ccfd8 --- /dev/null +++ b/test_runner/regress/test_unstable_extensions.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +import pytest +from psycopg2.errors import InsufficientPrivilege + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv + + +def test_unstable_extensions_installation(neon_simple_env: NeonEnv): + """ + Test that the unstable extension support within the neon extension can + block extension installation. + """ + env = neon_simple_env + + neon_unstable_extensions = "pg_prewarm,amcheck" + + endpoint = env.endpoints.create( + "main", + config_lines=[ + "neon.allow_unstable_extensions=false", + f"neon.unstable_extensions='{neon_unstable_extensions}'", + ], + ) + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + with endpoint.cursor() as cursor: + cursor.execute("SELECT current_setting('neon.unstable_extensions')") + result = cursor.fetchone() + assert result is not None + setting = cast("str", result[0]) + assert setting == neon_unstable_extensions + + with pytest.raises(InsufficientPrivilege): + cursor.execute("CREATE EXTENSION pg_prewarm") + + with pytest.raises(InsufficientPrivilege): + cursor.execute("CREATE EXTENSION amcheck") + + # Make sure that we can install a "stable" extension + cursor.execute("CREATE EXTENSION pageinspect") + + cursor.execute("BEGIN") + cursor.execute("SET neon.allow_unstable_extensions TO true") + cursor.execute("CREATE EXTENSION pg_prewarm") + cursor.execute("COMMIT") From 4df3987054a7cef88322713b9c4a0e3b1a706131 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 28 Oct 2024 18:21:45 -0500 Subject: [PATCH 111/239] Get role name when not a C string We will only have a C string if the specified role is a string. Otherwise, we need to resolve references to public, current_role, current_user, and session_user. 
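As a rough illustration (not part of this patch), these are the statement shapes the DDL-forwarding hook now has to handle; only the first carries a literal role name (a C string), the rest are role specs that must be resolved to an actual user name. The connection DSN and `some_role` are placeholders.

```python
# Sketch only: ALTER ROLE targets that reach the DDL-forwarding hook.
# 'some_role' and the DSN are placeholders.
import psycopg2

conn = psycopg2.connect("dbname=postgres")  # placeholder DSN
conn.autocommit = True

with conn.cursor() as cur:
    cur.execute("ALTER ROLE some_role WITH PASSWORD 'x'")     # ROLESPEC_CSTRING: literal name
    cur.execute("ALTER ROLE current_user WITH PASSWORD 'x'")  # resolved to the current user
    cur.execute("ALTER ROLE current_role WITH PASSWORD 'x'")  # resolved to the current role
    cur.execute("ALTER ROLE session_user WITH PASSWORD 'x'")  # resolved to the session user
    # ALTER ROLE public is rejected by Postgres itself.
```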
Fixes: https://github.com/neondatabase/cloud/issues/19323 Signed-off-by: Tristan Partin --- pgxn/neon/control_plane_connector.c | 12 +++++++- test_runner/regress/test_ddl_forwarding.py | 32 ++++++++++++++++++++++ 2 files changed, 43 insertions(+), 1 deletion(-) diff --git a/pgxn/neon/control_plane_connector.c b/pgxn/neon/control_plane_connector.c index 4713103909..b47b22cd20 100644 --- a/pgxn/neon/control_plane_connector.c +++ b/pgxn/neon/control_plane_connector.c @@ -18,6 +18,7 @@ * *------------------------------------------------------------------------- */ + #include "postgres.h" #include @@ -508,6 +509,8 @@ NeonXactCallback(XactEvent event, void *arg) static bool RoleIsNeonSuperuser(const char *role_name) { + Assert(role_name); + return strcmp(role_name, "neon_superuser") == 0; } @@ -670,7 +673,7 @@ HandleCreateRole(CreateRoleStmt *stmt) static void HandleAlterRole(AlterRoleStmt *stmt) { - const char *role_name = stmt->role->rolename; + char *role_name; DefElem *dpass; ListCell *option; bool found = false; @@ -678,6 +681,7 @@ HandleAlterRole(AlterRoleStmt *stmt) InitRoleTableIfNeeded(); + role_name = get_rolespec_name(stmt->role); if (RoleIsNeonSuperuser(role_name) && !superuser()) elog(ERROR, "can't ALTER neon_superuser"); @@ -689,9 +693,13 @@ HandleAlterRole(AlterRoleStmt *stmt) if (strcmp(defel->defname, "password") == 0) dpass = defel; } + /* We only care about updates to the password */ if (!dpass) + { + pfree(role_name); return; + } entry = hash_search(CurrentDdlTable->role_table, role_name, @@ -704,6 +712,8 @@ HandleAlterRole(AlterRoleStmt *stmt) else entry->password = NULL; entry->type = Op_Set; + + pfree(role_name); } static void diff --git a/test_runner/regress/test_ddl_forwarding.py b/test_runner/regress/test_ddl_forwarding.py index 96657b3ce4..e517e83e6f 100644 --- a/test_runner/regress/test_ddl_forwarding.py +++ b/test_runner/regress/test_ddl_forwarding.py @@ -7,6 +7,7 @@ import psycopg2 import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, VanillaPostgres +from psycopg2.errors import UndefinedObject from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -335,3 +336,34 @@ def test_ddl_forwarding_invalid_db(neon_simple_env: NeonEnv): if not result: raise AssertionError("Could not count databases") assert result[0] == 0, "Database 'failure' still exists after restart" + + +def test_ddl_forwarding_role_specs(neon_simple_env: NeonEnv): + """ + Postgres has a concept of role specs: + + ROLESPEC_CSTRING: ALTER ROLE xyz + ROLESPEC_CURRENT_USER: ALTER ROLE current_user + ROLESPEC_CURRENT_ROLE: ALTER ROLE current_role + ROLESPEC_SESSION_USER: ALTER ROLE session_user + ROLESPEC_PUBLIC: ALTER ROLE public + + The extension is required to serialize these special role spec into + usernames for the purpose of DDL forwarding. 
+ """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + with endpoint.cursor() as cur: + # ROLESPEC_CSTRING + cur.execute("ALTER ROLE cloud_admin WITH PASSWORD 'york'") + # ROLESPEC_CURRENT_USER + cur.execute("ALTER ROLE current_user WITH PASSWORD 'pork'") + # ROLESPEC_CURRENT_ROLE + cur.execute("ALTER ROLE current_role WITH PASSWORD 'cork'") + # ROLESPEC_SESSION_USER + cur.execute("ALTER ROLE session_user WITH PASSWORD 'bork'") + # ROLESPEC_PUBLIC + with pytest.raises(UndefinedObject): + cur.execute("ALTER ROLE public WITH PASSWORD 'dork'") From 62f5d484d994be08eaedd7b6627f194b91e7b93e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 29 Oct 2024 01:36:05 +0100 Subject: [PATCH 112/239] Assert the tenant to be active in `unoffload_timeline` (#9539) Currently, all callers of `unoffload_timeline` ensure that the tenant the unoffload operation is called on is active. We rely on it being active as we activate the timeline below and don't want to race with the activation code of the tenant (in the worst case, activating a timeline twice). Therefore, add this assertion. Part of #8088 --- pageserver/src/tenant.rs | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 64e871cada..7011ae9e63 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1830,6 +1830,18 @@ impl Tenant { ctx: RequestContext, ) -> Result, TimelineArchivalError> { info!("unoffloading timeline"); + + // We activate the timeline below manually, so this must be called on an active timeline. + // We expect callers of this function to ensure this. + match self.current_state() { + TenantState::Activating { .. } + | TenantState::Attaching + | TenantState::Broken { .. } => { + panic!("Timeline expected to be active") + } + TenantState::Stopping { .. } => return Err(TimelineArchivalError::Cancelled), + TenantState::Active => {} + } let cancel = self.cancel.clone(); // Protect against concurrent attempts to use this TimelineId From 07b974480c642bc79a63cfd0d456a607533fe966 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Tue, 29 Oct 2024 10:00:34 +0000 Subject: [PATCH 113/239] pageserver: move things around to prepare for decoding logic (#9504) ## Problem We wish to have high level WAL decoding logic in `wal_decoder::decoder` module. ## Summary of Changes For this we need the `Value` and `NeonWalRecord` types accessible there, so: 1. Move `Value` and `NeonWalRecord` to `pageserver::value` and `pageserver::record` respectively. 2. Get rid of `pageserver::repository` (follow up from (1)) 3. Move PG specific WAL record types to `postgres_ffi::walrecord`. In theory they could live in `wal_decoder`, but it would create a circular dependency between `wal_decoder` and `postgres_ffi`. Long term it makes sense for those types to be PG version specific, so that will work out nicely. 4. 
Move higher level WAL record types (to be ingested by pageserver) into `wal_decoder::models` Related: https://github.com/neondatabase/neon/issues/9335 Epic: https://github.com/neondatabase/neon/issues/9329 --- Cargo.lock | 16 + Cargo.toml | 2 + libs/pageserver_api/src/lib.rs | 2 + libs/pageserver_api/src/record.rs | 113 +++ .../pageserver_api/src/value.rs | 80 +- libs/postgres_ffi/Cargo.toml | 1 + libs/postgres_ffi/src/lib.rs | 1 + .../postgres_ffi}/src/walrecord.rs | 942 ++++++++---------- libs/wal_decoder/Cargo.toml | 18 + libs/wal_decoder/src/decoder.rs | 1 + libs/wal_decoder/src/lib.rs | 2 + libs/wal_decoder/src/models.rs | 167 ++++ pageserver/Cargo.toml | 3 +- pageserver/benches/bench_ingest.rs | 3 +- pageserver/benches/bench_layer_map.rs | 2 +- pageserver/benches/bench_walredo.rs | 3 +- pageserver/ctl/src/draw_timeline_dir.rs | 2 +- pageserver/ctl/src/layer_map_analyzer.rs | 2 +- pageserver/ctl/src/layers.rs | 2 +- pageserver/src/deletion_queue.rs | 3 +- pageserver/src/http/routes.rs | 4 +- pageserver/src/import_datadir.rs | 3 +- pageserver/src/lib.rs | 2 - pageserver/src/pgdatadir_mapping.rs | 6 +- pageserver/src/tenant.rs | 32 +- pageserver/src/tenant/gc_result.rs | 57 ++ pageserver/src/tenant/layer_map.rs | 2 +- pageserver/src/tenant/mgr.rs | 2 +- pageserver/src/tenant/storage_layer.rs | 4 +- .../storage_layer/batch_split_writer.rs | 3 +- .../src/tenant/storage_layer/delta_layer.rs | 14 +- .../tenant/storage_layer/filter_iterator.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 5 +- .../tenant/storage_layer/inmemory_layer.rs | 3 +- .../src/tenant/storage_layer/layer/tests.rs | 4 +- .../src/tenant/storage_layer/layer_desc.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 2 +- .../tenant/storage_layer/merge_iterator.rs | 19 +- pageserver/src/tenant/timeline.rs | 21 +- pageserver/src/tenant/timeline/compaction.rs | 9 +- .../walreceiver/walreceiver_connection.rs | 2 +- pageserver/src/walingest.rs | 151 +-- pageserver/src/walredo.rs | 9 +- pageserver/src/walredo/apply_neon.rs | 4 +- pageserver/src/walredo/process.rs | 2 +- 45 files changed, 925 insertions(+), 806 deletions(-) create mode 100644 libs/pageserver_api/src/record.rs rename pageserver/src/repository.rs => libs/pageserver_api/src/value.rs (73%) rename {pageserver => libs/postgres_ffi}/src/walrecord.rs (88%) create mode 100644 libs/wal_decoder/Cargo.toml create mode 100644 libs/wal_decoder/src/decoder.rs create mode 100644 libs/wal_decoder/src/lib.rs create mode 100644 libs/wal_decoder/src/models.rs create mode 100644 pageserver/src/tenant/gc_result.rs diff --git a/Cargo.lock b/Cargo.lock index 610b607482..c5af247e8b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3749,6 +3749,7 @@ dependencies = [ "tracing", "url", "utils", + "wal_decoder", "walkdir", "workspace_hack", ] @@ -4186,6 +4187,7 @@ dependencies = [ "regex", "serde", "thiserror", + "tracing", "utils", ] @@ -6954,6 +6956,20 @@ dependencies = [ "utils", ] +[[package]] +name = "wal_decoder" +version = "0.1.0" +dependencies = [ + "anyhow", + "bytes", + "pageserver_api", + "postgres_ffi", + "serde", + "tracing", + "utils", + "workspace_hack", +] + [[package]] name = "walkdir" version = "2.3.3" diff --git a/Cargo.toml b/Cargo.toml index 4c6a24ecde..7f9a766ff9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -33,6 +33,7 @@ members = [ "libs/postgres_ffi/wal_craft", "libs/vm_monitor", "libs/walproposer", + "libs/wal_decoder", ] [workspace.package] @@ -238,6 +239,7 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" } utils = { version = "0.1", path = 
"./libs/utils/" } vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" } walproposer = { version = "0.1", path = "./libs/walproposer/" } +wal_decoder = { version = "0.1", path = "./libs/wal_decoder" } ## Common library dependency workspace_hack = { version = "0.1", path = "./workspace_hack/" } diff --git a/libs/pageserver_api/src/lib.rs b/libs/pageserver_api/src/lib.rs index 532185a366..ff705e79cd 100644 --- a/libs/pageserver_api/src/lib.rs +++ b/libs/pageserver_api/src/lib.rs @@ -5,9 +5,11 @@ pub mod controller_api; pub mod key; pub mod keyspace; pub mod models; +pub mod record; pub mod reltag; pub mod shard; /// Public API types pub mod upcall_api; +pub mod value; pub mod config; diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs new file mode 100644 index 0000000000..b80ed2f203 --- /dev/null +++ b/libs/pageserver_api/src/record.rs @@ -0,0 +1,113 @@ +//! This module defines the WAL record format used within the pageserver. + +use bytes::Bytes; +use postgres_ffi::walrecord::{describe_postgres_wal_record, MultiXactMember}; +use postgres_ffi::{MultiXactId, MultiXactOffset, TimestampTz, TransactionId}; +use serde::{Deserialize, Serialize}; +use utils::bin_ser::DeserializeError; + +/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper +/// around a PostgreSQL WAL record, or a custom neon-specific "record". +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub enum NeonWalRecord { + /// Native PostgreSQL WAL record + Postgres { will_init: bool, rec: Bytes }, + + /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) + ClearVisibilityMapFlags { + new_heap_blkno: Option, + old_heap_blkno: Option, + flags: u8, + }, + /// Mark transaction IDs as committed on a CLOG page + ClogSetCommitted { + xids: Vec, + timestamp: TimestampTz, + }, + /// Mark transaction IDs as aborted on a CLOG page + ClogSetAborted { xids: Vec }, + /// Extend multixact offsets SLRU + MultixactOffsetCreate { + mid: MultiXactId, + moff: MultiXactOffset, + }, + /// Extend multixact members SLRU. + MultixactMembersCreate { + moff: MultiXactOffset, + members: Vec, + }, + /// Update the map of AUX files, either writing or dropping an entry + AuxFile { + file_path: String, + content: Option, + }, + + /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. + #[cfg(feature = "testing")] + Test { + /// Append a string to the image. + append: String, + /// Clear the image before appending. + clear: bool, + /// Treat this record as an init record. `clear` should be set to true if this field is set + /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and + /// its references in `timeline.rs`. + will_init: bool, + }, +} + +impl NeonWalRecord { + /// Does replaying this WAL record initialize the page from scratch, or does + /// it need to be applied over the previous image of the page? + pub fn will_init(&self) -> bool { + // If you change this function, you'll also need to change ValueBytes::will_init + match self { + NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, + #[cfg(feature = "testing")] + NeonWalRecord::Test { will_init, .. 
} => *will_init, + // None of the special neon record types currently initialize the page + _ => false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_append(s: impl AsRef) -> Self { + Self::Test { + append: s.as_ref().to_string(), + clear: false, + will_init: false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_clear() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: false, + } + } + + #[cfg(feature = "testing")] + pub fn wal_init() -> Self { + Self::Test { + append: "".to_string(), + clear: true, + will_init: true, + } + } +} + +/// Build a human-readable string to describe a WAL record +/// +/// For debugging purposes +pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { + match rec { + NeonWalRecord::Postgres { will_init, rec } => Ok(format!( + "will_init: {}, {}", + will_init, + describe_postgres_wal_record(rec)? + )), + _ => Ok(format!("{:?}", rec)), + } +} diff --git a/pageserver/src/repository.rs b/libs/pageserver_api/src/value.rs similarity index 73% rename from pageserver/src/repository.rs rename to libs/pageserver_api/src/value.rs index e4ebafd927..1f8ed30a9a 100644 --- a/pageserver/src/repository.rs +++ b/libs/pageserver_api/src/value.rs @@ -1,13 +1,16 @@ -use crate::walrecord::NeonWalRecord; -use anyhow::Result; +//! This module defines the value type used by the storage engine. +//! +//! A [`Value`] represents either a completely new value for one Key ([`Value::Image`]), +//! or a "delta" of how to get from previous version of the value to the new one +//! ([`Value::WalRecord`]]) +//! +//! Note that the [`Value`] type is used for the permananent storage format, so any +//! changes to it must be backwards compatible. + +use crate::record::NeonWalRecord; use bytes::Bytes; use serde::{Deserialize, Serialize}; -use std::ops::AddAssign; -use std::time::Duration; -pub use pageserver_api::key::{Key, KEY_SIZE}; - -/// A 'value' stored for a one Key. #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] pub enum Value { /// An Image value contains a full copy of the value @@ -20,10 +23,12 @@ pub enum Value { } impl Value { + #[inline(always)] pub fn is_image(&self) -> bool { matches!(self, Value::Image(_)) } + #[inline(always)] pub fn will_init(&self) -> bool { match self { Value::Image(_) => true, @@ -33,17 +38,18 @@ impl Value { } #[derive(Debug, PartialEq)] -pub(crate) enum InvalidInput { +pub enum InvalidInput { TooShortValue, TooShortPostgresRecord, } /// We could have a ValueRef where everything is `serde(borrow)`. Before implementing that, lets /// use this type for querying if a slice looks some particular way. -pub(crate) struct ValueBytes; +pub struct ValueBytes; impl ValueBytes { - pub(crate) fn will_init(raw: &[u8]) -> Result { + #[inline(always)] + pub fn will_init(raw: &[u8]) -> Result { if raw.len() < 12 { return Err(InvalidInput::TooShortValue); } @@ -79,6 +85,7 @@ impl ValueBytes { mod test { use super::*; + use bytes::Bytes; use utils::bin_ser::BeSer; macro_rules! roundtrip { @@ -229,56 +236,3 @@ mod test { assert!(!ValueBytes::will_init(&expected).unwrap()); } } - -/// -/// Result of performing GC -/// -#[derive(Default, Serialize, Debug)] -pub struct GcResult { - pub layers_total: u64, - pub layers_needed_by_cutoff: u64, - pub layers_needed_by_pitr: u64, - pub layers_needed_by_branches: u64, - pub layers_needed_by_leases: u64, - pub layers_not_updated: u64, - pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. 
- - #[serde(serialize_with = "serialize_duration_as_millis")] - pub elapsed: Duration, - - /// The layers which were garbage collected. - /// - /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be - /// dropped in tests. - #[cfg(feature = "testing")] - #[serde(skip)] - pub(crate) doomed_layers: Vec, -} - -// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds -fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result -where - S: serde::Serializer, -{ - d.as_millis().serialize(serializer) -} - -impl AddAssign for GcResult { - fn add_assign(&mut self, other: Self) { - self.layers_total += other.layers_total; - self.layers_needed_by_pitr += other.layers_needed_by_pitr; - self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; - self.layers_needed_by_branches += other.layers_needed_by_branches; - self.layers_needed_by_leases += other.layers_needed_by_leases; - self.layers_not_updated += other.layers_not_updated; - self.layers_removed += other.layers_removed; - - self.elapsed += other.elapsed; - - #[cfg(feature = "testing")] - { - let mut other = other; - self.doomed_layers.append(&mut other.doomed_layers); - } - } -} diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml index ef17833a48..e1f5443cbe 100644 --- a/libs/postgres_ffi/Cargo.toml +++ b/libs/postgres_ffi/Cargo.toml @@ -15,6 +15,7 @@ memoffset.workspace = true thiserror.workspace = true serde.workspace = true utils.workspace = true +tracing.workspace = true [dev-dependencies] env_logger.workspace = true diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 0d46ed6aac..6b219488ac 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -217,6 +217,7 @@ macro_rules! enum_pgversion { pub mod pg_constants; pub mod relfile_utils; +pub mod walrecord; // Export some widely used datatypes that are unlikely to change across Postgres versions pub use v14::bindings::RepOriginId; diff --git a/pageserver/src/walrecord.rs b/libs/postgres_ffi/src/walrecord.rs similarity index 88% rename from pageserver/src/walrecord.rs rename to libs/postgres_ffi/src/walrecord.rs index dd199e2c55..dedbaef64d 100644 --- a/pageserver/src/walrecord.rs +++ b/libs/postgres_ffi/src/walrecord.rs @@ -1,107 +1,144 @@ +//! This module houses types used in decoding of PG WAL +//! records. //! -//! Functions for parsing WAL records. -//! +//! TODO: Generate separate types for each supported PG version -use anyhow::Result; +use crate::pg_constants; +use crate::XLogRecord; +use crate::{ + BlockNumber, MultiXactId, MultiXactOffset, MultiXactStatus, Oid, RepOriginId, TimestampTz, + TransactionId, +}; +use crate::{BLCKSZ, XLOG_SIZE_OF_XLOG_RECORD}; use bytes::{Buf, Bytes}; -use postgres_ffi::dispatch_pgversion; -use postgres_ffi::pg_constants; -use postgres_ffi::BLCKSZ; -use postgres_ffi::{BlockNumber, TimestampTz}; -use postgres_ffi::{MultiXactId, MultiXactOffset, MultiXactStatus, Oid, TransactionId}; -use postgres_ffi::{RepOriginId, XLogRecord, XLOG_SIZE_OF_XLOG_RECORD}; use serde::{Deserialize, Serialize}; -use tracing::*; -use utils::{bin_ser::DeserializeError, lsn::Lsn}; +use utils::bin_ser::DeserializeError; +use utils::lsn::Lsn; -/// Each update to a page is represented by a NeonWalRecord. It can be a wrapper -/// around a PostgreSQL WAL record, or a custom neon-specific "record". 
-#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub enum NeonWalRecord { - /// Native PostgreSQL WAL record - Postgres { will_init: bool, rec: Bytes }, - - /// Clear bits in heap visibility map. ('flags' is bitmap of bits to clear) - ClearVisibilityMapFlags { - new_heap_blkno: Option, - old_heap_blkno: Option, - flags: u8, - }, - /// Mark transaction IDs as committed on a CLOG page - ClogSetCommitted { - xids: Vec, - timestamp: TimestampTz, - }, - /// Mark transaction IDs as aborted on a CLOG page - ClogSetAborted { xids: Vec }, - /// Extend multixact offsets SLRU - MultixactOffsetCreate { - mid: MultiXactId, - moff: MultiXactOffset, - }, - /// Extend multixact members SLRU. - MultixactMembersCreate { - moff: MultiXactOffset, - members: Vec, - }, - /// Update the map of AUX files, either writing or dropping an entry - AuxFile { - file_path: String, - content: Option, - }, - - /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. - #[cfg(test)] - Test { - /// Append a string to the image. - append: String, - /// Clear the image before appending. - clear: bool, - /// Treat this record as an init record. `clear` should be set to true if this field is set - /// to true. This record does not need the history WALs to reconstruct. See [`NeonWalRecord::will_init`] and - /// its references in `timeline.rs`. - will_init: bool, - }, +#[repr(C)] +#[derive(Debug)] +pub struct XlMultiXactCreate { + pub mid: MultiXactId, + /* new MultiXact's ID */ + pub moff: MultiXactOffset, + /* its starting offset in members file */ + pub nmembers: u32, + /* number of member XIDs */ + pub members: Vec, } -impl NeonWalRecord { - /// Does replaying this WAL record initialize the page from scratch, or does - /// it need to be applied over the previous image of the page? - pub fn will_init(&self) -> bool { - // If you change this function, you'll also need to change ValueBytes::will_init - match self { - NeonWalRecord::Postgres { will_init, rec: _ } => *will_init, - #[cfg(test)] - NeonWalRecord::Test { will_init, .. 
} => *will_init, - // None of the special neon record types currently initialize the page - _ => false, +impl XlMultiXactCreate { + pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { + let mid = buf.get_u32_le(); + let moff = buf.get_u32_le(); + let nmembers = buf.get_u32_le(); + let mut members = Vec::new(); + for _ in 0..nmembers { + members.push(MultiXactMember::decode(buf)); + } + XlMultiXactCreate { + mid, + moff, + nmembers, + members, } } +} - #[cfg(test)] - pub(crate) fn wal_append(s: impl AsRef) -> Self { - Self::Test { - append: s.as_ref().to_string(), - clear: false, - will_init: false, +#[repr(C)] +#[derive(Debug)] +pub struct XlMultiXactTruncate { + pub oldest_multi_db: Oid, + /* to-be-truncated range of multixact offsets */ + pub start_trunc_off: MultiXactId, + /* just for completeness' sake */ + pub end_trunc_off: MultiXactId, + + /* to-be-truncated range of multixact members */ + pub start_trunc_memb: MultiXactOffset, + pub end_trunc_memb: MultiXactOffset, +} + +impl XlMultiXactTruncate { + pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { + XlMultiXactTruncate { + oldest_multi_db: buf.get_u32_le(), + start_trunc_off: buf.get_u32_le(), + end_trunc_off: buf.get_u32_le(), + start_trunc_memb: buf.get_u32_le(), + end_trunc_memb: buf.get_u32_le(), } } +} - #[cfg(test)] - pub(crate) fn wal_clear() -> Self { - Self::Test { - append: "".to_string(), - clear: true, - will_init: false, +#[repr(C)] +#[derive(Debug)] +pub struct XlRelmapUpdate { + pub dbid: Oid, /* database ID, or 0 for shared map */ + pub tsid: Oid, /* database's tablespace, or pg_global */ + pub nbytes: i32, /* size of relmap data */ +} + +impl XlRelmapUpdate { + pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { + XlRelmapUpdate { + dbid: buf.get_u32_le(), + tsid: buf.get_u32_le(), + nbytes: buf.get_i32_le(), } } +} - #[cfg(test)] - pub(crate) fn wal_init() -> Self { - Self::Test { - append: "".to_string(), - clear: true, - will_init: true, +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginDrop { + pub node_id: RepOriginId, +} + +impl XlReploriginDrop { + pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { + XlReploriginDrop { + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug)] +pub struct XlReploriginSet { + pub remote_lsn: Lsn, + pub node_id: RepOriginId, +} + +impl XlReploriginSet { + pub fn decode(buf: &mut Bytes) -> XlReploriginSet { + XlReploriginSet { + remote_lsn: Lsn(buf.get_u64_le()), + node_id: buf.get_u16_le(), + } + } +} + +#[repr(C)] +#[derive(Debug, Clone, Copy)] +pub struct RelFileNode { + pub spcnode: Oid, /* tablespace */ + pub dbnode: Oid, /* database */ + pub relnode: Oid, /* relation */ +} + +#[repr(C)] +#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] +pub struct MultiXactMember { + pub xid: TransactionId, + pub status: MultiXactStatus, +} + +impl MultiXactMember { + pub fn decode(buf: &mut Bytes) -> MultiXactMember { + MultiXactMember { + xid: buf.get_u32_le(), + status: buf.get_u32_le(), } } } @@ -164,17 +201,17 @@ impl DecodedWALRecord { /// Check if this WAL record represents a legacy "copy" database creation, which populates new relations /// by reading other existing relations' data blocks. This is more complex to apply than new-style database /// creations which simply include all the desired blocks in the WAL, so we need a helper function to detect this case. 
- pub(crate) fn is_dbase_create_copy(&self, pg_version: u32) -> bool { + pub fn is_dbase_create_copy(&self, pg_version: u32) -> bool { if self.xl_rmid == pg_constants::RM_DBASE_ID { let info = self.xl_info & pg_constants::XLR_RMGR_INFO_MASK; match pg_version { 14 => { // Postgres 14 database creations are always the legacy kind - info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE + info == crate::v14::bindings::XLOG_DBASE_CREATE } - 15 => info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 16 => info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, - 17 => info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 15 => info == crate::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 16 => info == crate::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY, + 17 => info == crate::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY, _ => { panic!("Unsupported postgres version {pg_version}") } @@ -185,35 +222,294 @@ impl DecodedWALRecord { } } -#[repr(C)] -#[derive(Debug, Clone, Copy)] -pub struct RelFileNode { - pub spcnode: Oid, /* tablespace */ - pub dbnode: Oid, /* database */ - pub relnode: Oid, /* relation */ -} +/// Main routine to decode a WAL record and figure out which blocks are modified +// +// See xlogrecord.h for details +// The overall layout of an XLOG record is: +// Fixed-size header (XLogRecord struct) +// XLogRecordBlockHeader struct +// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows +// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an +// XLogRecordBlockCompressHeader struct follows. +// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows +// BlockNumber follows +// XLogRecordBlockHeader struct +// ... +// XLogRecordDataHeader[Short|Long] struct +// block data +// block data +// ... +// main data +// +// +// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. +// It would be more natural for this function to return a DecodedWALRecord as return value, +// but reusing the caller-supplied struct avoids an allocation. +// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. +// +pub fn decode_wal_record( + record: Bytes, + decoded: &mut DecodedWALRecord, + pg_version: u32, +) -> anyhow::Result<()> { + let mut rnode_spcnode: u32 = 0; + let mut rnode_dbnode: u32 = 0; + let mut rnode_relnode: u32 = 0; + let mut got_rnode = false; + let mut origin_id: u16 = 0; -#[repr(C)] -#[derive(Debug)] -pub struct XlRelmapUpdate { - pub dbid: Oid, /* database ID, or 0 for shared map */ - pub tsid: Oid, /* database's tablespace, or pg_global */ - pub nbytes: i32, /* size of relmap data */ -} + let mut buf = record.clone(); -impl XlRelmapUpdate { - pub fn decode(buf: &mut Bytes) -> XlRelmapUpdate { - XlRelmapUpdate { - dbid: buf.get_u32_le(), - tsid: buf.get_u32_le(), - nbytes: buf.get_i32_le(), + // 1. Parse XLogRecord struct + + // FIXME: assume little-endian here + let xlogrec = XLogRecord::from_bytes(&mut buf)?; + + tracing::trace!( + "decode_wal_record xl_rmid = {} xl_info = {}", + xlogrec.xl_rmid, + xlogrec.xl_info + ); + + let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; + + if buf.remaining() != remaining { + //TODO error + } + + let mut max_block_id = 0; + let mut blocks_total_len: u32 = 0; + let mut main_data_len = 0; + let mut datatotal: u32 = 0; + decoded.blocks.clear(); + + // 2. Decode the headers. 
+ // XLogRecordBlockHeaders if any, + // XLogRecordDataHeader[Short|Long] + while buf.remaining() > datatotal as usize { + let block_id = buf.get_u8(); + + match block_id { + pg_constants::XLR_BLOCK_ID_DATA_SHORT => { + /* XLogRecordDataHeaderShort */ + main_data_len = buf.get_u8() as u32; + datatotal += main_data_len; + } + + pg_constants::XLR_BLOCK_ID_DATA_LONG => { + /* XLogRecordDataHeaderLong */ + main_data_len = buf.get_u32_le(); + datatotal += main_data_len; + } + + pg_constants::XLR_BLOCK_ID_ORIGIN => { + // RepOriginId is uint16 + origin_id = buf.get_u16_le(); + } + + pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { + // TransactionId is uint32 + buf.advance(4); + } + + 0..=pg_constants::XLR_MAX_BLOCK_ID => { + /* XLogRecordBlockHeader */ + let mut blk = DecodedBkpBlock::new(); + + if block_id <= max_block_id { + // TODO + //report_invalid_record(state, + // "out-of-order block_id %u at %X/%X", + // block_id, + // (uint32) (state->ReadRecPtr >> 32), + // (uint32) state->ReadRecPtr); + // goto err; + } + max_block_id = block_id; + + let fork_flags: u8 = buf.get_u8(); + blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; + blk.flags = fork_flags; + blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; + blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; + blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; + blk.data_len = buf.get_u16_le(); + + /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ + + datatotal += blk.data_len as u32; + blocks_total_len += blk.data_len as u32; + + if blk.has_image { + blk.bimg_len = buf.get_u16_le(); + blk.hole_offset = buf.get_u16_le(); + blk.bimg_info = buf.get_u8(); + + blk.apply_image = dispatch_pgversion!( + pg_version, + (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 + ); + + let blk_img_is_compressed = + crate::bkpimage_is_compressed(blk.bimg_info, pg_version); + + if blk_img_is_compressed { + tracing::debug!("compressed block image , pg_version = {}", pg_version); + } + + if blk_img_is_compressed { + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { + blk.hole_length = buf.get_u16_le(); + } else { + blk.hole_length = 0; + } + } else { + blk.hole_length = BLCKSZ - blk.bimg_len; + } + datatotal += blk.bimg_len as u32; + blocks_total_len += blk.bimg_len as u32; + + /* + * cross-check that hole_offset > 0, hole_length > 0 and + * bimg_len < BLCKSZ if the HAS_HOLE flag is set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 + && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) + { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that hole_offset == 0 and hole_length == 0 if + * the HAS_HOLE flag is not set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 + && (blk.hole_offset != 0 || blk.hole_length != 0) + { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", + (unsigned int) blk->hole_offset, + (unsigned int) blk->hole_length, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED + * flag is set. 
+ */ + if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", + (unsigned int) blk->bimg_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + + /* + * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor + * IS_COMPRESSED flag is set. + */ + if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 + && !blk_img_is_compressed + && blk.bimg_len != BLCKSZ + { + // TODO + /* + report_invalid_record(state, + "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", + (unsigned int) blk->data_len, + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; + */ + } + } + if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { + rnode_spcnode = buf.get_u32_le(); + rnode_dbnode = buf.get_u32_le(); + rnode_relnode = buf.get_u32_le(); + got_rnode = true; + } else if !got_rnode { + // TODO + /* + report_invalid_record(state, + "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", + (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); + goto err; */ + } + + blk.rnode_spcnode = rnode_spcnode; + blk.rnode_dbnode = rnode_dbnode; + blk.rnode_relnode = rnode_relnode; + + blk.blkno = buf.get_u32_le(); + tracing::trace!( + "this record affects {}/{}/{} blk {}", + rnode_spcnode, + rnode_dbnode, + rnode_relnode, + blk.blkno + ); + + decoded.blocks.push(blk); + } + + _ => { + // TODO: invalid block_id + } } } + + // 3. Decode blocks. + let mut ptr = record.len() - buf.remaining(); + for blk in decoded.blocks.iter_mut() { + if blk.has_image { + blk.bimg_offset = ptr as u32; + ptr += blk.bimg_len as usize; + } + if blk.has_data { + ptr += blk.data_len as usize; + } + } + // We don't need them, so just skip blocks_total_len bytes + buf.advance(blocks_total_len as usize); + assert_eq!(ptr, record.len() - buf.remaining()); + + let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; + + // 4. Decode main_data + if main_data_len > 0 { + assert_eq!(buf.remaining(), main_data_len as usize); + } + + decoded.xl_xid = xlogrec.xl_xid; + decoded.xl_info = xlogrec.xl_info; + decoded.xl_rmid = xlogrec.xl_rmid; + decoded.record = record; + decoded.origin_id = origin_id; + decoded.main_data_offset = main_data_offset; + + Ok(()) } pub mod v14 { + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; #[repr(C)] #[derive(Debug)] @@ -383,8 +679,8 @@ pub mod v15 { pub mod v16 { pub use super::v14::{XlHeapInsert, XlHeapLockUpdated, XlHeapMultiInsert, XlParameterChange}; + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; pub struct XlHeapDelete { pub xmax: TransactionId, @@ -450,8 +746,8 @@ pub mod v16 { /* Since PG16, we have the Neon RMGR (RM_NEON_ID) to manage Neon-flavored WAL. 
*/ pub mod rm_neon { + use crate::{OffsetNumber, TransactionId}; use bytes::{Buf, Bytes}; - use postgres_ffi::{OffsetNumber, TransactionId}; #[repr(C)] #[derive(Debug)] @@ -563,8 +859,8 @@ pub mod v16 { pub mod v17 { pub use super::v14::XlHeapLockUpdated; + pub use crate::{TimeLineID, TimestampTz}; use bytes::{Buf, Bytes}; - pub use postgres_ffi::{TimeLineID, TimestampTz}; pub use super::v16::rm_neon; pub use super::v16::{ @@ -742,7 +1038,7 @@ impl XlXactParsedRecord { let spcnode = buf.get_u32_le(); let dbnode = buf.get_u32_le(); let relnode = buf.get_u32_le(); - trace!( + tracing::trace!( "XLOG_XACT_COMMIT relfilenode {}/{}/{}", spcnode, dbnode, @@ -756,9 +1052,9 @@ impl XlXactParsedRecord { } } - if xinfo & postgres_ffi::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { + if xinfo & crate::v15::bindings::XACT_XINFO_HAS_DROPPED_STATS != 0 { let nitems = buf.get_i32_le(); - debug!( + tracing::debug!( "XLOG_XACT_COMMIT-XACT_XINFO_HAS_DROPPED_STAT nitems {}", nitems ); @@ -778,7 +1074,7 @@ impl XlXactParsedRecord { if xinfo & pg_constants::XACT_XINFO_HAS_TWOPHASE != 0 { xid = buf.get_u32_le(); - debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); + tracing::debug!("XLOG_XACT_COMMIT-XACT_XINFO_HAS_TWOPHASE xid {}", xid); } let origin_lsn = if xinfo & pg_constants::XACT_XINFO_HAS_ORIGIN != 0 { @@ -822,78 +1118,6 @@ impl XlClogTruncate { } } -#[repr(C)] -#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)] -pub struct MultiXactMember { - pub xid: TransactionId, - pub status: MultiXactStatus, -} - -impl MultiXactMember { - pub fn decode(buf: &mut Bytes) -> MultiXactMember { - MultiXactMember { - xid: buf.get_u32_le(), - status: buf.get_u32_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlMultiXactCreate { - pub mid: MultiXactId, - /* new MultiXact's ID */ - pub moff: MultiXactOffset, - /* its starting offset in members file */ - pub nmembers: u32, - /* number of member XIDs */ - pub members: Vec, -} - -impl XlMultiXactCreate { - pub fn decode(buf: &mut Bytes) -> XlMultiXactCreate { - let mid = buf.get_u32_le(); - let moff = buf.get_u32_le(); - let nmembers = buf.get_u32_le(); - let mut members = Vec::new(); - for _ in 0..nmembers { - members.push(MultiXactMember::decode(buf)); - } - XlMultiXactCreate { - mid, - moff, - nmembers, - members, - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlMultiXactTruncate { - pub oldest_multi_db: Oid, - /* to-be-truncated range of multixact offsets */ - pub start_trunc_off: MultiXactId, - /* just for completeness' sake */ - pub end_trunc_off: MultiXactId, - - /* to-be-truncated range of multixact members */ - pub start_trunc_memb: MultiXactOffset, - pub end_trunc_memb: MultiXactOffset, -} - -impl XlMultiXactTruncate { - pub fn decode(buf: &mut Bytes) -> XlMultiXactTruncate { - XlMultiXactTruncate { - oldest_multi_db: buf.get_u32_le(), - start_trunc_off: buf.get_u32_le(), - end_trunc_off: buf.get_u32_le(), - start_trunc_memb: buf.get_u32_le(), - end_trunc_memb: buf.get_u32_le(), - } - } -} - #[repr(C)] #[derive(Debug)] pub struct XlLogicalMessage { @@ -950,337 +1174,7 @@ impl XlRunningXacts { } } -#[repr(C)] -#[derive(Debug)] -pub struct XlReploriginDrop { - pub node_id: RepOriginId, -} - -impl XlReploriginDrop { - pub fn decode(buf: &mut Bytes) -> XlReploriginDrop { - XlReploriginDrop { - node_id: buf.get_u16_le(), - } - } -} - -#[repr(C)] -#[derive(Debug)] -pub struct XlReploriginSet { - pub remote_lsn: Lsn, - pub node_id: RepOriginId, -} - -impl XlReploriginSet { - pub fn decode(buf: &mut Bytes) -> XlReploriginSet 
{ - XlReploriginSet { - remote_lsn: Lsn(buf.get_u64_le()), - node_id: buf.get_u16_le(), - } - } -} - -/// Main routine to decode a WAL record and figure out which blocks are modified -// -// See xlogrecord.h for details -// The overall layout of an XLOG record is: -// Fixed-size header (XLogRecord struct) -// XLogRecordBlockHeader struct -// If pg_constants::BKPBLOCK_HAS_IMAGE, an XLogRecordBlockImageHeader struct follows -// If pg_constants::BKPIMAGE_HAS_HOLE and pg_constants::BKPIMAGE_IS_COMPRESSED, an -// XLogRecordBlockCompressHeader struct follows. -// If pg_constants::BKPBLOCK_SAME_REL is not set, a RelFileNode follows -// BlockNumber follows -// XLogRecordBlockHeader struct -// ... -// XLogRecordDataHeader[Short|Long] struct -// block data -// block data -// ... -// main data -// -// -// For performance reasons, the caller provides the DecodedWALRecord struct and the function just fills it in. -// It would be more natural for this function to return a DecodedWALRecord as return value, -// but reusing the caller-supplied struct avoids an allocation. -// This code is in the hot path for digesting incoming WAL, and is very performance sensitive. -// -pub fn decode_wal_record( - record: Bytes, - decoded: &mut DecodedWALRecord, - pg_version: u32, -) -> Result<()> { - let mut rnode_spcnode: u32 = 0; - let mut rnode_dbnode: u32 = 0; - let mut rnode_relnode: u32 = 0; - let mut got_rnode = false; - let mut origin_id: u16 = 0; - - let mut buf = record.clone(); - - // 1. Parse XLogRecord struct - - // FIXME: assume little-endian here - let xlogrec = XLogRecord::from_bytes(&mut buf)?; - - trace!( - "decode_wal_record xl_rmid = {} xl_info = {}", - xlogrec.xl_rmid, - xlogrec.xl_info - ); - - let remaining: usize = xlogrec.xl_tot_len as usize - XLOG_SIZE_OF_XLOG_RECORD; - - if buf.remaining() != remaining { - //TODO error - } - - let mut max_block_id = 0; - let mut blocks_total_len: u32 = 0; - let mut main_data_len = 0; - let mut datatotal: u32 = 0; - decoded.blocks.clear(); - - // 2. Decode the headers. 
- // XLogRecordBlockHeaders if any, - // XLogRecordDataHeader[Short|Long] - while buf.remaining() > datatotal as usize { - let block_id = buf.get_u8(); - - match block_id { - pg_constants::XLR_BLOCK_ID_DATA_SHORT => { - /* XLogRecordDataHeaderShort */ - main_data_len = buf.get_u8() as u32; - datatotal += main_data_len; - } - - pg_constants::XLR_BLOCK_ID_DATA_LONG => { - /* XLogRecordDataHeaderLong */ - main_data_len = buf.get_u32_le(); - datatotal += main_data_len; - } - - pg_constants::XLR_BLOCK_ID_ORIGIN => { - // RepOriginId is uint16 - origin_id = buf.get_u16_le(); - } - - pg_constants::XLR_BLOCK_ID_TOPLEVEL_XID => { - // TransactionId is uint32 - buf.advance(4); - } - - 0..=pg_constants::XLR_MAX_BLOCK_ID => { - /* XLogRecordBlockHeader */ - let mut blk = DecodedBkpBlock::new(); - - if block_id <= max_block_id { - // TODO - //report_invalid_record(state, - // "out-of-order block_id %u at %X/%X", - // block_id, - // (uint32) (state->ReadRecPtr >> 32), - // (uint32) state->ReadRecPtr); - // goto err; - } - max_block_id = block_id; - - let fork_flags: u8 = buf.get_u8(); - blk.forknum = fork_flags & pg_constants::BKPBLOCK_FORK_MASK; - blk.flags = fork_flags; - blk.has_image = (fork_flags & pg_constants::BKPBLOCK_HAS_IMAGE) != 0; - blk.has_data = (fork_flags & pg_constants::BKPBLOCK_HAS_DATA) != 0; - blk.will_init = (fork_flags & pg_constants::BKPBLOCK_WILL_INIT) != 0; - blk.data_len = buf.get_u16_le(); - - /* TODO cross-check that the HAS_DATA flag is set iff data_length > 0 */ - - datatotal += blk.data_len as u32; - blocks_total_len += blk.data_len as u32; - - if blk.has_image { - blk.bimg_len = buf.get_u16_le(); - blk.hole_offset = buf.get_u16_le(); - blk.bimg_info = buf.get_u8(); - - blk.apply_image = dispatch_pgversion!( - pg_version, - (blk.bimg_info & pgv::bindings::BKPIMAGE_APPLY) != 0 - ); - - let blk_img_is_compressed = - postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version); - - if blk_img_is_compressed { - debug!("compressed block image , pg_version = {}", pg_version); - } - - if blk_img_is_compressed { - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 { - blk.hole_length = buf.get_u16_le(); - } else { - blk.hole_length = 0; - } - } else { - blk.hole_length = BLCKSZ - blk.bimg_len; - } - datatotal += blk.bimg_len as u32; - blocks_total_len += blk.bimg_len as u32; - - /* - * cross-check that hole_offset > 0, hole_length > 0 and - * bimg_len < BLCKSZ if the HAS_HOLE flag is set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE != 0 - && (blk.hole_offset == 0 || blk.hole_length == 0 || blk.bimg_len == BLCKSZ) - { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_HAS_HOLE set, but hole offset %u length %u block image length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that hole_offset == 0 and hole_length == 0 if - * the HAS_HOLE flag is not set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && (blk.hole_offset != 0 || blk.hole_length != 0) - { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_HAS_HOLE not set, but hole offset %u length %u at %X/%X", - (unsigned int) blk->hole_offset, - (unsigned int) blk->hole_length, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that bimg_len < BLCKSZ if the IS_COMPRESSED - * flag is set. 
- */ - if !blk_img_is_compressed && blk.bimg_len == BLCKSZ { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length %u at %X/%X", - (unsigned int) blk->bimg_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - - /* - * cross-check that bimg_len = BLCKSZ if neither HAS_HOLE nor - * IS_COMPRESSED flag is set. - */ - if blk.bimg_info & pg_constants::BKPIMAGE_HAS_HOLE == 0 - && !blk_img_is_compressed - && blk.bimg_len != BLCKSZ - { - // TODO - /* - report_invalid_record(state, - "neither pg_constants::BKPIMAGE_HAS_HOLE nor pg_constants::BKPIMAGE_IS_COMPRESSED set, but block image length is %u at %X/%X", - (unsigned int) blk->data_len, - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; - */ - } - } - if fork_flags & pg_constants::BKPBLOCK_SAME_REL == 0 { - rnode_spcnode = buf.get_u32_le(); - rnode_dbnode = buf.get_u32_le(); - rnode_relnode = buf.get_u32_le(); - got_rnode = true; - } else if !got_rnode { - // TODO - /* - report_invalid_record(state, - "pg_constants::BKPBLOCK_SAME_REL set but no previous rel at %X/%X", - (uint32) (state->ReadRecPtr >> 32), (uint32) state->ReadRecPtr); - goto err; */ - } - - blk.rnode_spcnode = rnode_spcnode; - blk.rnode_dbnode = rnode_dbnode; - blk.rnode_relnode = rnode_relnode; - - blk.blkno = buf.get_u32_le(); - trace!( - "this record affects {}/{}/{} blk {}", - rnode_spcnode, - rnode_dbnode, - rnode_relnode, - blk.blkno - ); - - decoded.blocks.push(blk); - } - - _ => { - // TODO: invalid block_id - } - } - } - - // 3. Decode blocks. - let mut ptr = record.len() - buf.remaining(); - for blk in decoded.blocks.iter_mut() { - if blk.has_image { - blk.bimg_offset = ptr as u32; - ptr += blk.bimg_len as usize; - } - if blk.has_data { - ptr += blk.data_len as usize; - } - } - // We don't need them, so just skip blocks_total_len bytes - buf.advance(blocks_total_len as usize); - assert_eq!(ptr, record.len() - buf.remaining()); - - let main_data_offset = (xlogrec.xl_tot_len - main_data_len) as usize; - - // 4. Decode main_data - if main_data_len > 0 { - assert_eq!(buf.remaining(), main_data_len as usize); - } - - decoded.xl_xid = xlogrec.xl_xid; - decoded.xl_info = xlogrec.xl_info; - decoded.xl_rmid = xlogrec.xl_rmid; - decoded.record = record; - decoded.origin_id = origin_id; - decoded.main_data_offset = main_data_offset; - - Ok(()) -} - -/// -/// Build a human-readable string to describe a WAL record -/// -/// For debugging purposes -pub fn describe_wal_record(rec: &NeonWalRecord) -> Result { - match rec { - NeonWalRecord::Postgres { will_init, rec } => Ok(format!( - "will_init: {}, {}", - will_init, - describe_postgres_wal_record(rec)? - )), - _ => Ok(format!("{:?}", rec)), - } -} - -fn describe_postgres_wal_record(record: &Bytes) -> Result { +pub fn describe_postgres_wal_record(record: &Bytes) -> Result { // TODO: It would be nice to use the PostgreSQL rmgrdesc infrastructure for this. // Maybe use the postgres wal redo process, the same used for replaying WAL records? 
// Or could we compile the rmgrdesc routines into the dump_layer_file() binary directly, diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml new file mode 100644 index 0000000000..3f80f8fcdb --- /dev/null +++ b/libs/wal_decoder/Cargo.toml @@ -0,0 +1,18 @@ +[package] +name = "wal_decoder" +version = "0.1.0" +edition.workspace = true +license.workspace = true + +[features] +testing = [] + +[dependencies] +anyhow.workspace = true +bytes.workspace = true +pageserver_api.workspace = true +postgres_ffi.workspace = true +serde.workspace = true +tracing.workspace = true +utils.workspace = true +workspace_hack = { version = "0.1", path = "../../workspace_hack" } diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs new file mode 100644 index 0000000000..8b13789179 --- /dev/null +++ b/libs/wal_decoder/src/decoder.rs @@ -0,0 +1 @@ + diff --git a/libs/wal_decoder/src/lib.rs b/libs/wal_decoder/src/lib.rs new file mode 100644 index 0000000000..05349d17c9 --- /dev/null +++ b/libs/wal_decoder/src/lib.rs @@ -0,0 +1,2 @@ +pub mod decoder; +pub mod models; diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs new file mode 100644 index 0000000000..58f8e1b2da --- /dev/null +++ b/libs/wal_decoder/src/models.rs @@ -0,0 +1,167 @@ +//! This module houses types which represent decoded PG WAL records +//! ready for the pageserver to interpret. They are derived from the original +//! WAL records, so that each struct corresponds closely to one WAL record of +//! a specific kind. They contain the same information as the original WAL records, +//! just decoded into structs and fields for easier access. +//! +//! The ingestion code uses these structs to help with parsing the WAL records, +//! and it splits them into a stream of modifications to the key-value pairs that +//! are ultimately stored in delta layers. See also the split-out counterparts in +//! [`postgres_ffi::walrecord`]. +//! +//! The pipeline which processes WAL records is not super obvious, so let's follow +//! the flow of an example XACT_COMMIT Postgres record: +//! +//! (Postgres XACT_COMMIT record) +//! | +//! |--> pageserver::walingest::WalIngest::decode_xact_record +//! | +//! |--> ([`XactRecord::Commit`]) +//! | +//! |--> pageserver::walingest::WalIngest::ingest_xact_record +//! | +//! |--> (NeonWalRecord::ClogSetCommitted) +//! | +//! 
|--> write to KV store within the pageserver + +use bytes::Bytes; +use pageserver_api::reltag::{RelTag, SlruKind}; +use postgres_ffi::walrecord::{ + XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, + XlSmgrTruncate, XlXactParsedRecord, +}; +use postgres_ffi::{Oid, TransactionId}; +use utils::lsn::Lsn; + +pub enum HeapamRecord { + ClearVmBits(ClearVmBits), +} + +pub struct ClearVmBits { + pub new_heap_blkno: Option, + pub old_heap_blkno: Option, + pub vm_rel: RelTag, + pub flags: u8, +} + +pub enum NeonrmgrRecord { + ClearVmBits(ClearVmBits), +} + +pub enum SmgrRecord { + Create(SmgrCreate), + Truncate(XlSmgrTruncate), +} + +pub struct SmgrCreate { + pub rel: RelTag, +} + +pub enum DbaseRecord { + Create(DbaseCreate), + Drop(DbaseDrop), +} + +pub struct DbaseCreate { + pub db_id: Oid, + pub tablespace_id: Oid, + pub src_db_id: Oid, + pub src_tablespace_id: Oid, +} + +pub struct DbaseDrop { + pub db_id: Oid, + pub tablespace_ids: Vec, +} + +pub enum ClogRecord { + ZeroPage(ClogZeroPage), + Truncate(ClogTruncate), +} + +pub struct ClogZeroPage { + pub segno: u32, + pub rpageno: u32, +} + +pub struct ClogTruncate { + pub pageno: u32, + pub oldest_xid: TransactionId, + pub oldest_xid_db: Oid, +} + +pub enum XactRecord { + Commit(XactCommon), + Abort(XactCommon), + CommitPrepared(XactCommon), + AbortPrepared(XactCommon), + Prepare(XactPrepare), +} + +pub struct XactCommon { + pub parsed: XlXactParsedRecord, + pub origin_id: u16, + // Fields below are only used for logging + pub xl_xid: TransactionId, + pub lsn: Lsn, +} + +pub struct XactPrepare { + pub xl_xid: TransactionId, + pub data: Bytes, +} + +pub enum MultiXactRecord { + ZeroPage(MultiXactZeroPage), + Create(XlMultiXactCreate), + Truncate(XlMultiXactTruncate), +} + +pub struct MultiXactZeroPage { + pub slru_kind: SlruKind, + pub segno: u32, + pub rpageno: u32, +} + +pub enum RelmapRecord { + Update(RelmapUpdate), +} + +pub struct RelmapUpdate { + pub update: XlRelmapUpdate, + pub buf: Bytes, +} + +pub enum XlogRecord { + Raw(RawXlogRecord), +} + +pub struct RawXlogRecord { + pub info: u8, + pub lsn: Lsn, + pub buf: Bytes, +} + +pub enum LogicalMessageRecord { + Put(PutLogicalMessage), + #[cfg(feature = "testing")] + Failpoint, +} + +pub struct PutLogicalMessage { + pub path: String, + pub buf: Bytes, +} + +pub enum StandbyRecord { + RunningXacts(StandbyRunningXacts), +} + +pub struct StandbyRunningXacts { + pub oldest_running_xid: TransactionId, +} + +pub enum ReploriginRecord { + Set(XlReploriginSet), + Drop(XlReploriginDrop), +} diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 2531abc7a1..ecb8fa7491 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -8,7 +8,7 @@ license.workspace = true default = [] # Enables test-only APIs, incuding failpoints. 
In particular, enables the `fail_point!` macro, # which adds some runtime cost to run tests on outage conditions -testing = ["fail/failpoints", "pageserver_api/testing" ] +testing = ["fail/failpoints", "pageserver_api/testing", "wal_decoder/testing"] [dependencies] anyhow.workspace = true @@ -83,6 +83,7 @@ enum-map.workspace = true enumset = { workspace = true, features = ["serde"]} strum.workspace = true strum_macros.workspace = true +wal_decoder.workspace = true [target.'cfg(target_os = "linux")'.dependencies] procfs.workspace = true diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index d98b23acce..0a1ad9cd6b 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -8,13 +8,12 @@ use pageserver::{ context::{DownloadBehavior, RequestContext}, l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, - repository::Value, task_mgr::TaskKind, tenant::storage_layer::inmemory_layer::SerializedBatch, tenant::storage_layer::InMemoryLayer, virtual_file, }; -use pageserver_api::{key::Key, shard::TenantShardId}; +use pageserver_api::{key::Key, shard::TenantShardId, value::Value}; use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs index 1353e79f7c..5c5b52db44 100644 --- a/pageserver/benches/bench_layer_map.rs +++ b/pageserver/benches/bench_layer_map.rs @@ -1,9 +1,9 @@ use criterion::measurement::WallTime; use pageserver::keyspace::{KeyPartitioning, KeySpace}; -use pageserver::repository::Key; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::storage_layer::PersistentLayerDesc; +use pageserver_api::key::Key; use pageserver_api::shard::TenantShardId; use rand::prelude::{SeedableRng, SliceRandom, StdRng}; use std::cmp::{max, min}; diff --git a/pageserver/benches/bench_walredo.rs b/pageserver/benches/bench_walredo.rs index 45936cb3fa..d3551b56e1 100644 --- a/pageserver/benches/bench_walredo.rs +++ b/pageserver/benches/bench_walredo.rs @@ -60,7 +60,8 @@ use anyhow::Context; use bytes::{Buf, Bytes}; use criterion::{BenchmarkId, Criterion}; use once_cell::sync::Lazy; -use pageserver::{config::PageServerConf, walrecord::NeonWalRecord, walredo::PostgresRedoManager}; +use pageserver::{config::PageServerConf, walredo::PostgresRedoManager}; +use pageserver_api::record::NeonWalRecord; use pageserver_api::{key::Key, shard::TenantShardId}; use std::{ future::Future, diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index bc939f9688..177e65ef79 100644 --- a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -51,7 +51,7 @@ //! 
use anyhow::{Context, Result}; -use pageserver::repository::Key; +use pageserver_api::key::Key; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 7dd2a5d05c..451d2a1d69 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -14,12 +14,12 @@ use std::ops::Range; use std::{fs, str}; use pageserver::page_cache::{self, PAGE_SZ}; -use pageserver::repository::{Key, KEY_SIZE}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; use pageserver::tenant::storage_layer::range_overlaps; use pageserver::virtual_file::{self, VirtualFile}; +use pageserver_api::key::{Key, KEY_SIZE}; use utils::{bin_ser::BeSer, lsn::Lsn}; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index c0b2b6ae89..22627d72c8 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -14,13 +14,13 @@ use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; use pageserver::{ - repository::{Key, KEY_SIZE}, tenant::{ block_io::FileBlockReader, disk_btree::VisitDirection, storage_layer::delta_layer::DELTA_KEY_SIZE, }, virtual_file::VirtualFile, }; +use pageserver_api::key::{Key, KEY_SIZE}; use std::fs; use utils::bin_ser::BeSer; use utils::id::{TenantId, TimelineId}; diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 73bdc90213..7733bdb640 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -696,7 +696,7 @@ impl DeletionQueue { mod test { use camino::Utf8Path; use hex_literal::hex; - use pageserver_api::{shard::ShardIndex, upcall_api::ReAttachResponseTenant}; + use pageserver_api::{key::Key, shard::ShardIndex, upcall_api::ReAttachResponseTenant}; use std::{io::ErrorKind, time::Duration}; use tracing::info; @@ -705,7 +705,6 @@ mod test { use crate::{ controller_upcall_client::RetryForeverError, - repository::Key, tenant::{harness::TenantHarness, storage_layer::DeltaLayerName}, }; diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 3943f62ac0..2d8f4309ca 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2232,13 +2232,13 @@ async fn getpage_at_lsn_handler( check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); - struct Key(crate::repository::Key); + struct Key(pageserver_api::key::Key); impl std::str::FromStr for Key { type Err = anyhow::Error; fn from_str(s: &str) -> std::result::Result { - crate::repository::Key::from_hex(s).map(Key) + pageserver_api::key::Key::from_hex(s).map(Key) } } diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index ca87f1d080..530c91c4da 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -19,12 +19,11 @@ use crate::metrics::WAL_INGEST; use crate::pgdatadir_mapping::*; use crate::tenant::Timeline; use crate::walingest::WalIngest; -use crate::walrecord::decode_wal_record; -use crate::walrecord::DecodedWALRecord; use pageserver_api::reltag::{RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::*; use postgres_ffi::waldecoder::WalStreamDecoder; +use 
postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use postgres_ffi::ControlFileData; use postgres_ffi::DBState_DB_SHUTDOWNED; use postgres_ffi::Oid; diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index d51931c768..ef6711397a 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -24,7 +24,6 @@ pub mod metrics; pub mod page_cache; pub mod page_service; pub mod pgdatadir_mapping; -pub mod repository; pub mod span; pub(crate) mod statvfs; pub mod task_mgr; @@ -32,7 +31,6 @@ pub mod tenant; pub mod utilization; pub mod virtual_file; pub mod walingest; -pub mod walrecord; pub mod walredo; use camino::Utf8Path; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 19233a28cc..dc2dc08b53 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -7,14 +7,14 @@ //! Clarify that) //! use super::tenant::{PageReconstructError, Timeline}; +use crate::aux_file; use crate::context::RequestContext; use crate::keyspace::{KeySpace, KeySpaceAccum}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id_no_shard_id; -use crate::walrecord::NeonWalRecord; -use crate::{aux_file, repository::*}; use anyhow::{ensure, Context}; use bytes::{Buf, Bytes, BytesMut}; use enum_map::Enum; +use pageserver_api::key::Key; use pageserver_api::key::{ dbdir_key_range, rel_block_to_key, rel_dir_to_key, rel_key_range, rel_size_to_key, relmap_file_key, repl_origin_key, repl_origin_key_range, slru_block_to_key, slru_dir_to_key, @@ -22,7 +22,9 @@ use pageserver_api::key::{ CompactKey, AUX_FILES_KEY, CHECKPOINT_KEY, CONTROLFILE_KEY, DBDIR_KEY, TWOPHASEDIR_KEY, }; use pageserver_api::keyspace::SparseKeySpace; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::value::Value; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; use postgres_ffi::{Oid, RepOriginId, TimestampTz, TransactionId}; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7011ae9e63..8445601d29 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -92,11 +92,11 @@ use crate::metrics::{ remove_tenant_metrics, BROKEN_TENANTS_SET, CIRCUIT_BREAKERS_BROKEN, CIRCUIT_BREAKERS_UNBROKEN, TENANT_STATE_METRIC, TENANT_SYNTHETIC_SIZE_METRIC, }; -use crate::repository::GcResult; use crate::task_mgr; use crate::task_mgr::TaskKind; use crate::tenant::config::LocationMode; use crate::tenant::config::TenantConfOpt; +use crate::tenant::gc_result::GcResult; pub use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::remote_initdb_archive_path; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; @@ -160,6 +160,7 @@ pub(crate) mod timeline; pub mod size; mod gc_block; +mod gc_result; pub(crate) mod throttle; pub(crate) use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -467,10 +468,10 @@ impl WalRedoManager { /// This method is cancellation-safe. 
pub async fn request_redo( &self, - key: crate::repository::Key, + key: pageserver_api::key::Key, lsn: Lsn, base_img: Option<(Lsn, bytes::Bytes)>, - records: Vec<(Lsn, crate::walrecord::NeonWalRecord)>, + records: Vec<(Lsn, pageserver_api::record::NeonWalRecord)>, pg_version: u32, ) -> Result { match self { @@ -4818,7 +4819,8 @@ pub(crate) mod harness { use crate::deletion_queue::mock::MockDeletionQueue; use crate::l0_flush::L0FlushConfig; use crate::walredo::apply_neon; - use crate::{repository::Key, walrecord::NeonWalRecord}; + use pageserver_api::key::Key; + use pageserver_api::record::NeonWalRecord; use super::*; use hex_literal::hex; @@ -5087,25 +5089,30 @@ mod tests { use super::*; use crate::keyspace::KeySpaceAccum; - use crate::repository::{Key, Value}; use crate::tenant::harness::*; use crate::tenant::timeline::CompactFlags; - use crate::walrecord::NeonWalRecord; use crate::DEFAULT_PG_VERSION; use bytes::{Bytes, BytesMut}; use hex_literal::hex; use itertools::Itertools; - use pageserver_api::key::{AUX_KEY_PREFIX, NON_INHERITED_RANGE}; + use pageserver_api::key::{Key, AUX_KEY_PREFIX, NON_INHERITED_RANGE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; + use pageserver_api::value::Value; use rand::{thread_rng, Rng}; use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; use tests::timeline::{GetVectoredError, ShutdownMode}; - use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; - use timeline::{DeltaLayerTestDesc, GcInfo}; + use timeline::DeltaLayerTestDesc; use utils::id::TenantId; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; + #[cfg(feature = "testing")] + use timeline::compaction::{KeyHistoryRetention, KeyLogAtLsn}; + #[cfg(feature = "testing")] + use timeline::GcInfo; + static TEST_KEY: Lazy = Lazy::new(|| Key::from_slice(&hex!("010000000033333333444444445500000001"))); @@ -7670,6 +7677,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_neon_test_record() -> anyhow::Result<()> { let harness = TenantHarness::create("test_neon_test_record").await?; @@ -7861,6 +7869,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; @@ -8057,6 +8066,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_generate_key_retention() -> anyhow::Result<()> { let harness = TenantHarness::create("test_generate_key_retention").await?; @@ -8404,6 +8414,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns() -> anyhow::Result<()> { let harness = @@ -8644,6 +8655,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_with_retain_lsns_single_key() -> anyhow::Result<()> { @@ -8852,6 +8864,7 @@ mod tests { Ok(()) } + #[cfg(feature = "testing")] #[tokio::test] async fn test_simple_bottom_most_compaction_on_branch() -> anyhow::Result<()> { let harness = TenantHarness::create("test_simple_bottom_most_compaction_on_branch").await?; @@ -9053,6 +9066,7 @@ mod tests { // // When querying the key range [A, B) we need to read at different LSN ranges // for [A, C) and [C, B). This test checks that the described edge case is handled correctly. 
+ #[cfg(feature = "testing")] #[tokio::test] async fn test_vectored_read_with_nested_image_layer() -> anyhow::Result<()> { let harness = TenantHarness::create("test_vectored_read_with_nested_image_layer").await?; diff --git a/pageserver/src/tenant/gc_result.rs b/pageserver/src/tenant/gc_result.rs new file mode 100644 index 0000000000..c805aafeab --- /dev/null +++ b/pageserver/src/tenant/gc_result.rs @@ -0,0 +1,57 @@ +use anyhow::Result; +use serde::Serialize; +use std::ops::AddAssign; +use std::time::Duration; + +/// +/// Result of performing GC +/// +#[derive(Default, Serialize, Debug)] +pub struct GcResult { + pub layers_total: u64, + pub layers_needed_by_cutoff: u64, + pub layers_needed_by_pitr: u64, + pub layers_needed_by_branches: u64, + pub layers_needed_by_leases: u64, + pub layers_not_updated: u64, + pub layers_removed: u64, // # of layer files removed because they have been made obsolete by newer ondisk files. + + #[serde(serialize_with = "serialize_duration_as_millis")] + pub elapsed: Duration, + + /// The layers which were garbage collected. + /// + /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be + /// dropped in tests. + #[cfg(feature = "testing")] + #[serde(skip)] + pub(crate) doomed_layers: Vec, +} + +// helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds +fn serialize_duration_as_millis(d: &Duration, serializer: S) -> Result +where + S: serde::Serializer, +{ + d.as_millis().serialize(serializer) +} + +impl AddAssign for GcResult { + fn add_assign(&mut self, other: Self) { + self.layers_total += other.layers_total; + self.layers_needed_by_pitr += other.layers_needed_by_pitr; + self.layers_needed_by_cutoff += other.layers_needed_by_cutoff; + self.layers_needed_by_branches += other.layers_needed_by_branches; + self.layers_needed_by_leases += other.layers_needed_by_leases; + self.layers_not_updated += other.layers_not_updated; + self.layers_removed += other.layers_removed; + + self.elapsed += other.elapsed; + + #[cfg(feature = "testing")] + { + let mut other = other; + self.doomed_layers.append(&mut other.doomed_layers); + } + } +} diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs index 707233b003..7f15baed10 100644 --- a/pageserver/src/tenant/layer_map.rs +++ b/pageserver/src/tenant/layer_map.rs @@ -48,9 +48,9 @@ mod layer_coverage; use crate::context::RequestContext; use crate::keyspace::KeyPartitioning; -use crate::repository::Key; use crate::tenant::storage_layer::InMemoryLayer; use anyhow::Result; +use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceAccum}; use range_set_blaze::{CheckSortedDisjoint, RangeSetBlaze}; use std::collections::{HashMap, VecDeque}; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 0567f8f3a7..a4c458b737 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -2811,7 +2811,7 @@ where } use { - crate::repository::GcResult, pageserver_api::models::TimelineGcRequest, + crate::tenant::gc_result::GcResult, pageserver_api::models::TimelineGcRequest, utils::http::error::ApiError, }; diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 4a63491e90..8f4219bbbc 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -11,11 +11,11 @@ mod layer_name; pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; -use crate::repository::Value; -use 
crate::walrecord::NeonWalRecord; use bytes::Bytes; use pageserver_api::key::Key; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; use std::cmp::{Ordering, Reverse}; use std::collections::hash_map::Entry; use std::collections::{BinaryHeap, HashMap}; diff --git a/pageserver/src/tenant/storage_layer/batch_split_writer.rs b/pageserver/src/tenant/storage_layer/batch_split_writer.rs index 272e422c90..8a397ceb7a 100644 --- a/pageserver/src/tenant/storage_layer/batch_split_writer.rs +++ b/pageserver/src/tenant/storage_layer/batch_split_writer.rs @@ -5,7 +5,8 @@ use pageserver_api::key::{Key, KEY_SIZE}; use utils::{id::TimelineId, lsn::Lsn, shard::TenantShardId}; use crate::tenant::storage_layer::Layer; -use crate::{config::PageServerConf, context::RequestContext, repository::Value, tenant::Timeline}; +use crate::{config::PageServerConf, context::RequestContext, tenant::Timeline}; +use pageserver_api::value::Value; use super::layer::S3_UPLOAD_LIMIT; use super::{ diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 641729d681..10165b1d06 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -30,7 +30,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; -use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, BlockCursor, BlockLease, BlockReader, FileBlockReader}; use crate::tenant::disk_btree::{ @@ -46,7 +45,7 @@ use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::{FullSlice, IoBufExt}; use crate::virtual_file::IoBufferMut; use crate::virtual_file::{self, MaybeFatalIo, VirtualFile}; -use crate::{walrecord, TEMP_FILE_SUFFIX}; +use crate::TEMP_FILE_SUFFIX; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{anyhow, bail, ensure, Context, Result}; use camino::{Utf8Path, Utf8PathBuf}; @@ -54,9 +53,11 @@ use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::DBDIR_KEY; +use pageserver_api::key::{Key, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; @@ -1293,7 +1294,7 @@ impl DeltaLayerInner { // is it an image or will_init walrecord? 
// FIXME: this could be handled by threading the BlobRef to the // VectoredReadBuilder - let will_init = crate::repository::ValueBytes::will_init(&data) + let will_init = pageserver_api::value::ValueBytes::will_init(&data) .inspect_err(|_e| { #[cfg(feature = "testing")] tracing::error!(data=?utils::Hex(&data), err=?_e, %key, %lsn, "failed to parse will_init out of serialized value"); @@ -1356,7 +1357,7 @@ impl DeltaLayerInner { format!(" img {} bytes", img.len()) } Value::WalRecord(rec) => { - let wal_desc = walrecord::describe_wal_record(&rec)?; + let wal_desc = pageserver_api::record::describe_wal_record(&rec)?; format!( " rec {} bytes will_init: {} {}", buf.len(), @@ -1610,7 +1611,6 @@ pub(crate) mod test { use rand::RngCore; use super::*; - use crate::repository::Value; use crate::tenant::harness::TIMELINE_ID; use crate::tenant::storage_layer::{Layer, ResidentLayer}; use crate::tenant::vectored_blob_io::StreamingVectoredReadPlanner; @@ -1622,6 +1622,7 @@ pub(crate) mod test { DEFAULT_PG_VERSION, }; use bytes::Bytes; + use pageserver_api::value::Value; /// Construct an index for a fictional delta layer and and then /// traverse in order to plan vectored reads for a query. Finally, @@ -1974,8 +1975,8 @@ pub(crate) mod test { #[tokio::test] async fn copy_delta_prefix_smoke() { - use crate::walrecord::NeonWalRecord; use bytes::Bytes; + use pageserver_api::record::NeonWalRecord; let h = crate::tenant::harness::TenantHarness::create("truncate_delta_smoke") .await @@ -2198,6 +2199,7 @@ pub(crate) mod test { (k1, l1).cmp(&(k2, l2)) } + #[cfg(feature = "testing")] pub(crate) fn sort_delta_value( (k1, l1, v1): &(Key, Lsn, Value), (k2, l2, v2): &(Key, Lsn, Value), diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index f45dd4b801..ccfcf68e8f 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -7,7 +7,7 @@ use pageserver_api::{ }; use utils::lsn::Lsn; -use crate::repository::Value; +use pageserver_api::value::Value; use super::merge_iterator::MergeIterator; @@ -121,8 +121,8 @@ mod tests { #[tokio::test] async fn filter_keyspace_iterator() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("filter_iterator_filter_keyspace_iterator") .await diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3f90df312d..c0d183dc08 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -28,7 +28,6 @@ use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; use crate::page_cache::{self, FileId, PAGE_SZ}; -use crate::repository::{Key, Value, KEY_SIZE}; use crate::tenant::blob_io::BlobWriter; use crate::tenant::block_io::{BlockBuf, FileBlockReader}; use crate::tenant::disk_btree::{ @@ -51,8 +50,10 @@ use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; use pageserver_api::key::DBDIR_KEY; +use pageserver_api::key::{Key, KEY_SIZE}; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; +use pageserver_api::value::Value; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; use std::collections::VecDeque; @@ -1125,6 +1126,7 @@ mod test { use pageserver_api::{ key::Key, shard::{ShardCount, ShardIdentity, 
ShardNumber, ShardStripeSize}, + value::Value, }; use utils::{ generation::Generation, @@ -1134,7 +1136,6 @@ mod test { use crate::{ context::RequestContext, - repository::Value, tenant::{ config::TenantConf, harness::{TenantHarness, TIMELINE_ID}, diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 7573ddb5cc..df448a0963 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -7,7 +7,6 @@ use crate::assert_u64_eq_usize::{u64_to_usize, U64IsUsize, UsizeIsU64}; use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext, RequestContextBuilder}; -use crate::repository::{Key, Value}; use crate::tenant::ephemeral_file::EphemeralFile; use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; @@ -16,9 +15,11 @@ use crate::{l0_flush, page_cache}; use anyhow::{anyhow, Context, Result}; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; +use pageserver_api::key::Key; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::InMemoryLayerInfo; use pageserver_api::shard::TenantShardId; +use pageserver_api::value::Value; use std::collections::{BTreeMap, HashMap}; use std::sync::{Arc, OnceLock}; use std::time::Instant; diff --git a/pageserver/src/tenant/storage_layer/layer/tests.rs b/pageserver/src/tenant/storage_layer/layer/tests.rs index 9de70f14ee..36dcc8d805 100644 --- a/pageserver/src/tenant/storage_layer/layer/tests.rs +++ b/pageserver/src/tenant/storage_layer/layer/tests.rs @@ -760,8 +760,8 @@ async fn evict_and_wait_does_not_wait_for_download() { /// Also checks that the same does not happen on a non-evicted layer (regression test). #[tokio::test(start_paused = true)] async fn eviction_cancellation_on_drop() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; // this is the runtime on which Layer spawns the blocking tasks on let handle = tokio::runtime::Handle::current(); @@ -782,7 +782,7 @@ async fn eviction_cancellation_on_drop() { let mut writer = timeline.writer().await; writer .put( - crate::repository::Key::from_i128(5), + pageserver_api::key::Key::from_i128(5), Lsn(0x20), &Value::Image(Bytes::from_static(b"this does not matter either")), &ctx, diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs index a30c25d780..2097e90764 100644 --- a/pageserver/src/tenant/storage_layer/layer_desc.rs +++ b/pageserver/src/tenant/storage_layer/layer_desc.rs @@ -3,7 +3,7 @@ use pageserver_api::shard::TenantShardId; use std::ops::Range; use utils::{id::TimelineId, lsn::Lsn}; -use crate::repository::Key; +use pageserver_api::key::Key; use super::{DeltaLayerName, ImageLayerName, LayerName}; diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs index 8e750e1187..2b98d74f9f 100644 --- a/pageserver/src/tenant/storage_layer/layer_name.rs +++ b/pageserver/src/tenant/storage_layer/layer_name.rs @@ -1,7 +1,7 @@ //! //! Helper functions for dealing with filenames of the image and delta layer files. //! 
-use crate::repository::Key; +use pageserver_api::key::Key; use std::borrow::Cow; use std::cmp::Ordering; use std::fmt; diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index f91e27241d..980202f12c 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -7,7 +7,8 @@ use anyhow::bail; use pageserver_api::key::Key; use utils::lsn::Lsn; -use crate::{context::RequestContext, repository::Value}; +use crate::context::RequestContext; +use pageserver_api::value::Value; use super::{ delta_layer::{DeltaLayerInner, DeltaLayerIterator}, @@ -291,12 +292,16 @@ mod tests { use crate::{ tenant::{ harness::{TenantHarness, TIMELINE_ID}, - storage_layer::delta_layer::test::{produce_delta_layer, sort_delta, sort_delta_value}, + storage_layer::delta_layer::test::{produce_delta_layer, sort_delta}, }, - walrecord::NeonWalRecord, DEFAULT_PG_VERSION, }; + #[cfg(feature = "testing")] + use crate::tenant::storage_layer::delta_layer::test::sort_delta_value; + #[cfg(feature = "testing")] + use pageserver_api::record::NeonWalRecord; + async fn assert_merge_iter_equal( merge_iter: &mut MergeIterator<'_>, expect: &[(Key, Lsn, Value)], @@ -319,8 +324,8 @@ mod tests { #[tokio::test] async fn merge_in_between() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_merge_in_between") .await @@ -384,8 +389,8 @@ mod tests { #[tokio::test] async fn delta_merge() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_merge") .await @@ -458,10 +463,11 @@ mod tests { // TODO: test layers are loaded only when needed, reducing num of active iterators in k-merge } + #[cfg(feature = "testing")] #[tokio::test] async fn delta_image_mixed_merge() { - use crate::repository::Value; use bytes::Bytes; + use pageserver_api::value::Value; let harness = TenantHarness::create("merge_iterator_delta_image_mixed_merge") .await @@ -586,5 +592,6 @@ mod tests { is_send(merge_iter); } + #[cfg(feature = "testing")] fn is_send(_: impl Send) {} } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index f8d61dac5e..d765a7c987 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -125,11 +125,12 @@ use utils::{ simple_rcu::{Rcu, RcuReadGuard}, }; -use crate::repository::GcResult; -use crate::repository::{Key, Value}; use crate::task_mgr; use crate::task_mgr::TaskKind; +use crate::tenant::gc_result::GcResult; use crate::ZERO_PAGE; +use pageserver_api::key::Key; +use pageserver_api::value::Value; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -5822,17 +5823,15 @@ fn is_send() { #[cfg(test)] mod tests { use pageserver_api::key::Key; + use pageserver_api::value::Value; use utils::{id::TimelineId, lsn::Lsn}; - use crate::{ - repository::Value, - tenant::{ - harness::{test_img, TenantHarness}, - layer_map::LayerMap, - storage_layer::{Layer, LayerName}, - timeline::{DeltaLayerTestDesc, EvictionError}, - Timeline, - }, + use crate::tenant::{ + harness::{test_img, TenantHarness}, + layer_map::LayerMap, + storage_layer::{Layer, LayerName}, + timeline::{DeltaLayerTestDesc, EvictionError}, + Timeline, }; #[tokio::test] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 
73e4f0e87c..70f93656cd 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -49,9 +49,10 @@ use pageserver_api::config::tenant_conf_defaults::{ DEFAULT_CHECKPOINT_DISTANCE, DEFAULT_COMPACTION_THRESHOLD, }; -use crate::keyspace::KeySpace; -use crate::repository::{Key, Value}; -use crate::walrecord::NeonWalRecord; +use pageserver_api::key::Key; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::value::Value; use utils::lsn::Lsn; @@ -2148,7 +2149,7 @@ struct ResidentDeltaLayer(ResidentLayer); struct ResidentImageLayer(ResidentLayer); impl CompactionJobExecutor for TimelineAdaptor { - type Key = crate::repository::Key; + type Key = pageserver_api::key::Key; type Layer = OwnArc; type DeltaLayer = ResidentDeltaLayer; diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index cee259e2e0..739fadbc6b 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -31,11 +31,11 @@ use crate::{ task_mgr::{TaskKind, WALRECEIVER_RUNTIME}, tenant::{debug_assert_current_span_has_tenant_and_timeline_id, Timeline, WalReceiverInfo}, walingest::WalIngest, - walrecord::{decode_wal_record, DecodedWALRecord}, }; use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; +use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use utils::{id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 9e43e10801..27b3f93845 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -29,8 +29,10 @@ use std::time::Instant; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; +use postgres_ffi::walrecord::*; use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; +use wal_decoder::models::*; use anyhow::{bail, Context, Result}; use bytes::{Buf, Bytes, BytesMut}; @@ -44,9 +46,9 @@ use crate::pgdatadir_mapping::{DatadirModification, Version}; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; use crate::tenant::PageReconstructError; use crate::tenant::Timeline; -use crate::walrecord::*; use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; @@ -108,143 +110,6 @@ struct WarnIngestLag { timestamp_invalid_msg_ratelimit: RateLimit, } -// These structs are an intermediary representation of the PostgreSQL WAL records. -// The ones prefixed with `Xl` are lower level, while the ones that are not have -// all the required context to be acted upon by the pageserver. 
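[Editor's note: the intermediate record structs removed below correspond to the new public types in wal_decoder::models introduced earlier in this patch. As a rough, hypothetical illustration of how downstream code can pattern-match on one of those decoded records — a sketch only, not the actual walingest.rs dispatch; it assumes the field and variant names exactly as defined in models.rs above, and that utils::lsn::Lsn implements Display:]

// Hypothetical helper, not part of the patch: summarize a decoded transaction record.
use wal_decoder::models::XactRecord;

fn describe_xact_record(rec: &XactRecord) -> String {
    match rec {
        // XactCommon carries the parsed record plus the xid/lsn kept for logging.
        XactRecord::Commit(c) => format!("commit xid={} lsn={}", c.xl_xid, c.lsn),
        XactRecord::Abort(c) => format!("abort xid={} lsn={}", c.xl_xid, c.lsn),
        XactRecord::CommitPrepared(c) => format!("commit prepared xid={} lsn={}", c.xl_xid, c.lsn),
        XactRecord::AbortPrepared(c) => format!("abort prepared xid={} lsn={}", c.xl_xid, c.lsn),
        // XactPrepare keeps the raw record body around as Bytes.
        XactRecord::Prepare(p) => format!("prepare xid={} ({} bytes)", p.xl_xid, p.data.len()),
    }
}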
- -enum HeapamRecord { - ClearVmBits(ClearVmBits), -} - -struct ClearVmBits { - new_heap_blkno: Option, - old_heap_blkno: Option, - vm_rel: RelTag, - flags: u8, -} - -enum NeonrmgrRecord { - ClearVmBits(ClearVmBits), -} - -enum SmgrRecord { - Create(SmgrCreate), - Truncate(XlSmgrTruncate), -} - -struct SmgrCreate { - rel: RelTag, -} - -enum DbaseRecord { - Create(DbaseCreate), - Drop(DbaseDrop), -} - -struct DbaseCreate { - db_id: u32, - tablespace_id: u32, - src_db_id: u32, - src_tablespace_id: u32, -} - -struct DbaseDrop { - db_id: u32, - tablespace_ids: Vec, -} - -enum ClogRecord { - ZeroPage(ClogZeroPage), - Truncate(ClogTruncate), -} - -struct ClogZeroPage { - segno: u32, - rpageno: u32, -} - -struct ClogTruncate { - pageno: u32, - oldest_xid: u32, - oldest_xid_db: u32, -} - -enum XactRecord { - Commit(XactCommon), - Abort(XactCommon), - CommitPrepared(XactCommon), - AbortPrepared(XactCommon), - Prepare(XactPrepare), -} - -struct XactCommon { - parsed: XlXactParsedRecord, - origin_id: u16, - // Fields below are only used for logging - xl_xid: u32, - lsn: Lsn, -} - -struct XactPrepare { - xl_xid: u32, - data: Bytes, -} - -enum MultiXactRecord { - ZeroPage(MultiXactZeroPage), - Create(XlMultiXactCreate), - Truncate(XlMultiXactTruncate), -} - -struct MultiXactZeroPage { - slru_kind: SlruKind, - segno: u32, - rpageno: u32, -} - -enum RelmapRecord { - Update(RelmapUpdate), -} - -struct RelmapUpdate { - update: XlRelmapUpdate, - buf: Bytes, -} - -enum XlogRecord { - Raw(RawXlogRecord), -} - -struct RawXlogRecord { - info: u8, - lsn: Lsn, - buf: Bytes, -} - -enum LogicalMessageRecord { - Put(PutLogicalMessage), - #[cfg(feature = "testing")] - Failpoint, -} - -struct PutLogicalMessage { - path: String, - buf: Bytes, -} - -enum StandbyRecord { - RunningXacts(StandbyRunningXacts), -} - -struct StandbyRunningXacts { - oldest_running_xid: u32, -} - -enum ReploriginRecord { - Set(XlReploriginSet), - Drop(XlReploriginDrop), -} - impl WalIngest { pub async fn new( timeline: &Timeline, @@ -284,7 +149,6 @@ impl WalIngest { /// relations/pages that the record affects. 
/// /// This function returns `true` if the record was ingested, and `false` if it was filtered out - /// pub async fn ingest_record( &mut self, decoded: DecodedWALRecord, @@ -2218,7 +2082,7 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = crate::walrecord::XlLogicalMessage::decode(buf); + let xlrec = XlLogicalMessage::decode(buf); let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; #[cfg(feature = "testing")] @@ -2246,7 +2110,7 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_RUNNING_XACTS { - let xlrec = crate::walrecord::XlRunningXacts::decode(buf); + let xlrec = XlRunningXacts::decode(buf); return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts { oldest_running_xid: xlrec.oldest_running_xid, }))); @@ -2276,10 +2140,10 @@ impl WalIngest { ) -> anyhow::Result> { let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; if info == pg_constants::XLOG_REPLORIGIN_SET { - let xlrec = crate::walrecord::XlReploriginSet::decode(buf); + let xlrec = XlReploriginSet::decode(buf); return Ok(Some(ReploriginRecord::Set(xlrec))); } else if info == pg_constants::XLOG_REPLORIGIN_DROP { - let xlrec = crate::walrecord::XlReploriginDrop::decode(buf); + let xlrec = XlReploriginDrop::decode(buf); return Ok(Some(ReploriginRecord::Drop(xlrec))); } @@ -3146,6 +3010,7 @@ mod tests { async fn test_ingest_real_wal() { use crate::tenant::harness::*; use postgres_ffi::waldecoder::WalStreamDecoder; + use postgres_ffi::walrecord::decode_wal_record; use postgres_ffi::WAL_SEGMENT_SIZE; // Define test data path and constants. diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs index a1c9fc5651..027a6eb7d7 100644 --- a/pageserver/src/walredo.rs +++ b/pageserver/src/walredo.rs @@ -29,11 +29,11 @@ use crate::metrics::{ WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_TIME, }; -use crate::repository::Key; -use crate::walrecord::NeonWalRecord; use anyhow::Context; use bytes::{Bytes, BytesMut}; +use pageserver_api::key::Key; use pageserver_api::models::{WalRedoManagerProcessStatus, WalRedoManagerStatus}; +use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use std::future::Future; use std::sync::Arc; @@ -548,9 +548,10 @@ impl PostgresRedoManager { #[cfg(test)] mod tests { use super::PostgresRedoManager; - use crate::repository::Key; - use crate::{config::PageServerConf, walrecord::NeonWalRecord}; + use crate::config::PageServerConf; use bytes::Bytes; + use pageserver_api::key::Key; + use pageserver_api::record::NeonWalRecord; use pageserver_api::shard::TenantShardId; use std::str::FromStr; use tracing::Instrument; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index c067787f97..7aaa357318 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -1,8 +1,8 @@ -use crate::walrecord::NeonWalRecord; use anyhow::Context; use byteorder::{ByteOrder, LittleEndian}; use bytes::BytesMut; use pageserver_api::key::Key; +use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::SlruKind; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; @@ -238,7 +238,7 @@ pub(crate) fn apply_in_neon( // No-op: this record will never be created in aux v2. 
warn!("AuxFile record should not be created in aux v2"); } - #[cfg(test)] + #[cfg(feature = "testing")] NeonWalRecord::Test { append, clear, diff --git a/pageserver/src/walredo/process.rs b/pageserver/src/walredo/process.rs index f3197e68b5..7e9477cfbc 100644 --- a/pageserver/src/walredo/process.rs +++ b/pageserver/src/walredo/process.rs @@ -8,10 +8,10 @@ use crate::{ metrics::{WalRedoKillCause, WAL_REDO_PROCESS_COUNTERS, WAL_REDO_RECORD_COUNTER}, page_cache::PAGE_SZ, span::debug_assert_current_span_has_tenant_id, - walrecord::NeonWalRecord, }; use anyhow::Context; use bytes::Bytes; +use pageserver_api::record::NeonWalRecord; use pageserver_api::{reltag::RelTag, shard::TenantShardId}; use postgres_ffi::BLCKSZ; #[cfg(feature = "testing")] From a73402e646f9840fca2712045c37c37ba848dfcb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 29 Oct 2024 11:41:53 +0100 Subject: [PATCH 114/239] Offloaded timeline deletion (#9519) As pointed out in https://github.com/neondatabase/neon/pull/9489#discussion_r1814699683 , we currently didn't support deletion for offloaded timelines after the timeline has been loaded from the manifest instead of having been offloaded. This was because the upload queue hasn't been initialized yet. This PR thus initializes the timeline and shuts it down immediately. Part of #8088 --- pageserver/src/tenant.rs | 15 +--- pageserver/src/tenant/timeline/delete.rs | 31 ++++++- test_runner/regress/test_timeline_archive.py | 88 +++++++++++++------- 3 files changed, 88 insertions(+), 46 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8445601d29..7f8af67c2c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -626,19 +626,10 @@ impl TimelineOrOffloaded { TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, } } - fn remote_client_maybe_construct(&self, tenant: &Tenant) -> Arc { + fn maybe_remote_client(&self) -> Option> { match self { - TimelineOrOffloaded::Timeline(timeline) => timeline.remote_client.clone(), - TimelineOrOffloaded::Offloaded(offloaded) => match offloaded.remote_client.clone() { - Some(remote_client) => remote_client, - None => { - let remote_client = tenant.build_timeline_client( - offloaded.timeline_id, - tenant.remote_storage.clone(), - ); - Arc::new(remote_client) - } - }, + TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()), + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(), } } } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index a664bb59e1..53b65da515 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -6,7 +6,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; use tokio::sync::OwnedMutexGuard; -use tracing::{error, info, instrument, Instrument}; +use tracing::{error, info, info_span, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; use crate::{ @@ -15,7 +15,7 @@ use crate::{ tenant::{ metadata::TimelineMetadata, remote_timeline_client::{ - self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, + self, MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, }, CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, }, @@ -258,7 +258,32 @@ impl DeleteTimelineFlow { ))? 
}); - let remote_client = timeline.remote_client_maybe_construct(tenant); + let remote_client = match timeline.maybe_remote_client() { + Some(remote_client) => remote_client, + None => { + let remote_client = tenant + .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone()); + let result = remote_client + .download_index_file(&tenant.cancel) + .instrument(info_span!("download_index_file")) + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?; + let index_part = match result { + MaybeDeletedIndexPart::Deleted(p) => { + tracing::info!("Timeline already set as deleted in remote index"); + p + } + MaybeDeletedIndexPart::IndexPart(p) => p, + }; + let remote_client = Arc::new(remote_client); + + remote_client + .init_upload_queue(&index_part) + .map_err(DeleteTimelineError::Other)?; + remote_client.shutdown().await; + remote_client + } + }; set_deleted_in_remote_index(&remote_client).await?; fail::fail_point!("timeline-delete-before-schedule", |_| { diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index cb8724dd1c..77efd7b749 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -137,14 +137,17 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b } ) - # Create two branches and archive them - parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent", tenant_id) - leaf_timeline_id = env.create_branch( - "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + # Create three branches that depend on each other, starting with two + grandparent_timeline_id = env.create_branch( + "test_ancestor_branch_archive_grandparent", tenant_id + ) + parent_timeline_id = env.create_branch( + "test_ancestor_branch_archive_parent", tenant_id, "test_ancestor_branch_archive_grandparent" ) + # write some stuff to the parent with env.endpoints.create_start( - "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + "test_ancestor_branch_archive_parent", tenant_id=tenant_id ) as endpoint: endpoint.safe_psql_many( [ @@ -154,6 +157,11 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ) sum = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") + # create the third branch + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", tenant_id, "test_ancestor_branch_archive_parent" + ) + ps_http.timeline_archival_config( tenant_id, leaf_timeline_id, @@ -171,6 +179,12 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b state=TimelineArchivalState.ARCHIVED, ) + ps_http.timeline_archival_config( + tenant_id, + grandparent_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + def timeline_offloaded_logged(timeline_id: TimelineId) -> bool: return ( env.pageserver.log_contains(f".*{timeline_id}.* offloading archived timeline.*") @@ -201,30 +215,34 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b ps_http.timeline_archival_config( tenant_id, - parent_timeline_id, + grandparent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) ps_http.timeline_archival_config( tenant_id, - leaf_timeline_id, + parent_timeline_id, state=TimelineArchivalState.UNARCHIVED, ) - leaf_detail = ps_http.timeline_detail( + parent_detail = ps_http.timeline_detail( tenant_id, - leaf_timeline_id, + parent_timeline_id, ) - assert leaf_detail["is_archived"] is False + 
assert parent_detail["is_archived"] is False with env.endpoints.create_start( - "test_ancestor_branch_archive_branch1", tenant_id=tenant_id + "test_ancestor_branch_archive_parent", tenant_id=tenant_id ) as endpoint: sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key > 50") assert sum == sum_again + # Test that deletion of offloaded timelines works + ps_http.timeline_delete(tenant_id, leaf_timeline_id) + assert not timeline_offloaded_logged(initial_timeline_id) -def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): +@pytest.mark.parametrize("delete_timeline", [False, True]) +def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timeline: bool): """ Test for persistence of timeline offload state """ @@ -306,27 +324,35 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder): assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) - ps_http.timeline_archival_config( - tenant_id, - child_timeline_id, - state=TimelineArchivalState.UNARCHIVED, - ) - child_detail = ps_http.timeline_detail( - tenant_id, - child_timeline_id, - ) - assert child_detail["is_archived"] is False + if delete_timeline: + ps_http.timeline_delete(tenant_id, child_timeline_id) + with pytest.raises(PageserverApiException, match="not found"): + ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + else: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + child_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert child_detail["is_archived"] is False - with env.endpoints.create_start( - "test_archived_branch_persisted", tenant_id=tenant_id - ) as endpoint: - sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") - assert sum == sum_again + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key < 500") + assert sum == sum_again - assert_prefix_empty( - neon_env_builder.pageserver_remote_storage, - prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", - ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(env.initial_tenant)}/tenant-manifest", + ) assert not timeline_offloaded_api(root_timeline_id) From 45b558f480e76f46a61eb97504931c8bd211457b Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Tue, 29 Oct 2024 11:53:28 +0100 Subject: [PATCH 115/239] temporarily increase timeout for clickbench benchmark until regression is resolved (#9554) ## Problem click bench job in benchmarking workflow has a performance regression causing it to run in timeout of max job run. Suspected root cause: Project has been migrated from single pageserver to storage controller managed project on Oct 14th. Since then the regression shows. ## Summary of changes Increase timeout of pytest to 12 hours. 
Increase job timeout to 12 hours --- .github/workflows/benchmarking.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 5ccfe48684..69b8bc5d70 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -671,6 +671,10 @@ jobs: password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} options: --init + # Increase timeout to 12h, default timeout is 6h + # we have regression in clickbench causing it to run 2-3x longer + timeout-minutes: 720 + steps: - uses: actions/checkout@v4 @@ -716,7 +720,7 @@ jobs: test_selection: performance/test_perf_olap.py run_in_parallel: false save_perf_report: ${{ env.SAVE_PERF_REPORT }} - extra_params: -m remote_cluster --timeout 21600 -k test_clickbench + extra_params: -m remote_cluster --timeout 43200 -k test_clickbench pg_version: ${{ env.DEFAULT_PG_VERSION }} env: VIP_VAP_ACCESS_TOKEN: "${{ secrets.VIP_VAP_ACCESS_TOKEN }}" From 47c35f67c392a9642a4f0ccaeb326a53913449e4 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 29 Oct 2024 11:01:09 +0000 Subject: [PATCH 116/239] [proxy]: fix JWT handling for AWS cognito. (#9536) In the base64 payload of an aws cognito jwt, I saw the following: ``` "iss":"https:\/\/cognito-idp.us-west-2.amazonaws.com\/us-west-2_redacted" ``` issuers are supposed to be URLs, and URLs are always valid un-escaped JSON. However, `\/` is a valid escape character so what AWS is doing is technically correct... sigh... This PR refactors the test suite and adds a new regression test for cognito. --- proxy/src/auth/backend/jwt.rs | 508 +++++++++++++++++++++++++--------- 1 file changed, 383 insertions(+), 125 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 69ab4b8ccb..83c3617612 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -1,3 +1,4 @@ +use std::borrow::Cow; use std::future::Future; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -45,6 +46,7 @@ pub(crate) enum FetchAuthRulesError { RoleJwksNotConfigured, } +#[derive(Clone)] pub(crate) struct AuthRule { pub(crate) id: String, pub(crate) jwks_url: url::Url, @@ -277,7 +279,7 @@ impl JwkCacheEntryLock { // get the key from the JWKs if possible. If not, wait for the keys to update. 
let (jwk, expected_audience) = loop { - match guard.find_jwk_and_audience(kid, role_name) { + match guard.find_jwk_and_audience(&kid, role_name) { Some(jwk) => break jwk, None if guard.last_retrieved.elapsed() > MIN_RENEW => { let _paused = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); @@ -312,7 +314,9 @@ impl JwkCacheEntryLock { if let Some(aud) = expected_audience { if payload.audience.0.iter().all(|s| s != aud) { - return Err(JwtError::InvalidJwtTokenAudience); + return Err(JwtError::InvalidClaims( + JwtClaimsError::InvalidJwtTokenAudience, + )); } } @@ -320,13 +324,15 @@ impl JwkCacheEntryLock { if let Some(exp) = payload.expiration { if now >= exp + CLOCK_SKEW_LEEWAY { - return Err(JwtError::JwtTokenHasExpired); + return Err(JwtError::InvalidClaims(JwtClaimsError::JwtTokenHasExpired)); } } if let Some(nbf) = payload.not_before { if nbf >= now + CLOCK_SKEW_LEEWAY { - return Err(JwtError::JwtTokenNotYetReadyToUse); + return Err(JwtError::InvalidClaims( + JwtClaimsError::JwtTokenNotYetReadyToUse, + )); } } @@ -420,8 +426,8 @@ struct JwtHeader<'a> { #[serde(rename = "alg")] algorithm: jose_jwa::Algorithm, /// key id, must be provided for our usecase - #[serde(rename = "kid")] - key_id: Option<&'a str>, + #[serde(rename = "kid", borrow)] + key_id: Option>, } /// @@ -440,17 +446,17 @@ struct JwtPayload<'a> { // the following entries are only extracted for the sake of debug logging. /// Issuer of the JWT - #[serde(rename = "iss")] - issuer: Option<&'a str>, + #[serde(rename = "iss", borrow)] + issuer: Option>, /// Subject of the JWT (the user) - #[serde(rename = "sub")] - subject: Option<&'a str>, + #[serde(rename = "sub", borrow)] + subject: Option>, /// Unique token identifier - #[serde(rename = "jti")] - jwt_id: Option<&'a str>, + #[serde(rename = "jti", borrow)] + jwt_id: Option>, /// Unique session identifier - #[serde(rename = "sid")] - session_id: Option<&'a str>, + #[serde(rename = "sid", borrow)] + session_id: Option>, } /// `OneOrMany` supports parsing either a single item or an array of items. 
@@ -585,14 +591,8 @@ pub(crate) enum JwtError { #[error("Provided authentication token is not a valid JWT encoding")] JwtEncoding(#[from] JwtEncodingError), - #[error("invalid JWT token audience")] - InvalidJwtTokenAudience, - - #[error("JWT token has expired")] - JwtTokenHasExpired, - - #[error("JWT token is not yet ready to use")] - JwtTokenNotYetReadyToUse, + #[error(transparent)] + InvalidClaims(#[from] JwtClaimsError), #[error("invalid P256 key")] InvalidP256Key(jose_jwk::crypto::Error), @@ -644,6 +644,19 @@ pub enum JwtEncodingError { InvalidCompactForm, } +#[derive(Error, Debug, PartialEq)] +#[non_exhaustive] +pub enum JwtClaimsError { + #[error("invalid JWT token audience")] + InvalidJwtTokenAudience, + + #[error("JWT token has expired")] + JwtTokenHasExpired, + + #[error("JWT token is not yet ready to use")] + JwtTokenNotYetReadyToUse, +} + #[allow(dead_code, reason = "Debug use only")] #[derive(Debug)] pub(crate) enum KeyType { @@ -680,6 +693,8 @@ mod tests { use hyper_util::rt::TokioIo; use rand::rngs::OsRng; use rsa::pkcs8::DecodePrivateKey; + use serde::Serialize; + use serde_json::json; use signature::Signer; use tokio::net::TcpListener; @@ -693,6 +708,7 @@ mod tests { key: jose_jwk::Key::Ec(pk), prm: jose_jwk::Parameters { kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Es256)), ..Default::default() }, }; @@ -706,24 +722,47 @@ mod tests { key: jose_jwk::Key::Rsa(pk), prm: jose_jwk::Parameters { kid: Some(kid), + alg: Some(jose_jwa::Algorithm::Signing(jose_jwa::Signing::Rs256)), ..Default::default() }, }; (sk, jwk) } + fn now() -> u64 { + SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs() + } + fn build_jwt_payload(kid: String, sig: jose_jwa::Signing) -> String { + let now = now(); + let body = typed_json::json! {{ + "exp": now + 3600, + "nbf": now, + "aud": ["audience1", "neon", "audience2"], + "sub": "user1", + "sid": "session1", + "jti": "token1", + "iss": "neon-testing", + }}; + build_custom_jwt_payload(kid, body, sig) + } + + fn build_custom_jwt_payload( + kid: String, + body: impl Serialize, + sig: jose_jwa::Signing, + ) -> String { let header = JwtHeader { algorithm: jose_jwa::Algorithm::Signing(sig), - key_id: Some(&kid), + key_id: Some(Cow::Owned(kid)), }; - let body = typed_json::json! 
{{ - "exp": SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).unwrap().as_secs() + 3600, - }}; let header = base64::encode_config(serde_json::to_string(&header).unwrap(), URL_SAFE_NO_PAD); - let body = base64::encode_config(body.to_string(), URL_SAFE_NO_PAD); + let body = base64::encode_config(serde_json::to_string(&body).unwrap(), URL_SAFE_NO_PAD); format!("{header}.{body}") } @@ -738,6 +777,16 @@ mod tests { format!("{payload}.{sig}") } + fn new_custom_ec_jwt(kid: String, key: &p256::SecretKey, body: impl Serialize) -> String { + use p256::ecdsa::{Signature, SigningKey}; + + let payload = build_custom_jwt_payload(kid, body, jose_jwa::Signing::Es256); + let sig: Signature = SigningKey::from(key).sign(payload.as_bytes()); + let sig = base64::encode_config(sig.to_bytes(), URL_SAFE_NO_PAD); + + format!("{payload}.{sig}") + } + fn new_rsa_jwt(kid: String, key: rsa::RsaPrivateKey) -> String { use rsa::pkcs1v15::SigningKey; use rsa::signature::SignatureEncoding; @@ -809,37 +858,34 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL -----END PRIVATE KEY----- "; - #[tokio::test] - async fn renew() { - let (rs1, jwk1) = new_rsa_jwk(RS1, "1".into()); - let (rs2, jwk2) = new_rsa_jwk(RS2, "2".into()); - let (ec1, jwk3) = new_ec_jwk("3".into()); - let (ec2, jwk4) = new_ec_jwk("4".into()); + #[derive(Clone)] + struct Fetch(Vec); - let foo_jwks = jose_jwk::JwkSet { - keys: vec![jwk1, jwk3], - }; - let bar_jwks = jose_jwk::JwkSet { - keys: vec![jwk2, jwk4], - }; + impl FetchAuthRules for Fetch { + async fn fetch_auth_rules( + &self, + _ctx: &RequestMonitoring, + _endpoint: EndpointId, + ) -> Result, FetchAuthRulesError> { + Ok(self.0.clone()) + } + } + async fn jwks_server( + router: impl for<'a> Fn(&'a str) -> Option> + Send + Sync + 'static, + ) -> SocketAddr { + let router = Arc::new(router); let service = service_fn(move |req| { - let foo_jwks = foo_jwks.clone(); - let bar_jwks = bar_jwks.clone(); + let router = Arc::clone(&router); async move { - let jwks = match req.uri().path() { - "/foo" => &foo_jwks, - "/bar" => &bar_jwks, - _ => { - return Response::builder() - .status(404) - .body(Full::new(Bytes::new())); - } - }; - let body = serde_json::to_vec(jwks).unwrap(); - Response::builder() - .status(200) - .body(Full::new(Bytes::from(body))) + match router(req.uri().path()) { + Some(body) => Response::builder() + .status(200) + .body(Full::new(Bytes::from(body))), + None => Response::builder() + .status(404) + .body(Full::new(Bytes::new())), + } } }); @@ -854,84 +900,61 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL } }); - let client = reqwest::Client::new(); + addr + } - #[derive(Clone)] - struct Fetch(SocketAddr, Vec); + #[tokio::test] + async fn check_jwt_happy_path() { + let (rs1, jwk1) = new_rsa_jwk(RS1, "rs1".into()); + let (rs2, jwk2) = new_rsa_jwk(RS2, "rs2".into()); + let (ec1, jwk3) = new_ec_jwk("ec1".into()); + let (ec2, jwk4) = new_ec_jwk("ec2".into()); - impl FetchAuthRules for Fetch { - async fn fetch_auth_rules( - &self, - _ctx: &RequestMonitoring, - _endpoint: EndpointId, - ) -> Result, FetchAuthRulesError> { - Ok(vec![ - AuthRule { - id: "foo".to_owned(), - jwks_url: format!("http://{}/foo", self.0).parse().unwrap(), - audience: None, - role_names: self.1.clone(), - }, - AuthRule { - id: "bar".to_owned(), - jwks_url: format!("http://{}/bar", self.0).parse().unwrap(), - audience: None, - role_names: self.1.clone(), - }, - ]) - } - } + let foo_jwks = jose_jwk::JwkSet { + keys: vec![jwk1, jwk3], + }; + let bar_jwks = jose_jwk::JwkSet { + 
keys: vec![jwk2, jwk4], + }; + + let jwks_addr = jwks_server(move |path| match path { + "/foo" => Some(serde_json::to_vec(&foo_jwks).unwrap()), + "/bar" => Some(serde_json::to_vec(&bar_jwks).unwrap()), + _ => None, + }) + .await; let role_name1 = RoleName::from("anonymous"); let role_name2 = RoleName::from("authenticated"); - let fetch = Fetch( - addr, - vec![ - RoleNameInt::from(&role_name1), - RoleNameInt::from(&role_name2), - ], - ); + let roles = vec![ + RoleNameInt::from(&role_name1), + RoleNameInt::from(&role_name2), + ]; + let rules = vec![ + AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{jwks_addr}/foo").parse().unwrap(), + audience: None, + role_names: roles.clone(), + }, + AuthRule { + id: "bar".to_owned(), + jwks_url: format!("http://{jwks_addr}/bar").parse().unwrap(), + audience: None, + role_names: roles.clone(), + }, + ]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); let endpoint = EndpointId::from("ep"); - let jwk_cache = Arc::new(JwkCacheEntryLock::default()); - - let jwt1 = new_rsa_jwt("1".into(), rs1); - let jwt2 = new_rsa_jwt("2".into(), rs2); - let jwt3 = new_ec_jwt("3".into(), &ec1); - let jwt4 = new_ec_jwt("4".into(), &ec2); - - // had the wrong kid, therefore will have the wrong ecdsa signature - let bad_jwt = new_ec_jwt("3".into(), &ec2); - // this role_name is not accepted - let bad_role_name = RoleName::from("cloud_admin"); - - let err = jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &bad_jwt, - &client, - endpoint.clone(), - &role_name1, - &fetch, - ) - .await - .unwrap_err(); - assert!(err.to_string().contains("signature error")); - - let err = jwk_cache - .check_jwt( - &RequestMonitoring::test(), - &jwt1, - &client, - endpoint.clone(), - &bad_role_name, - &fetch, - ) - .await - .unwrap_err(); - assert!(err.to_string().contains("jwk not found")); + let jwt1 = new_rsa_jwt("rs1".into(), rs1); + let jwt2 = new_rsa_jwt("rs2".into(), rs2); + let jwt3 = new_ec_jwt("ec1".into(), &ec1); + let jwt4 = new_ec_jwt("ec2".into(), &ec2); let tokens = [jwt1, jwt2, jwt3, jwt4]; let role_names = [role_name1, role_name2]; @@ -940,15 +963,250 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL jwk_cache .check_jwt( &RequestMonitoring::test(), - token, - &client, endpoint.clone(), role, &fetch, + token, ) .await .unwrap(); } } } + + /// AWS Cognito escapes the `/` in the URL. + #[tokio::test] + async fn check_jwt_regression_cognito_issuer() { + let (key, jwk) = new_ec_jwk("key".into()); + + let now = now(); + let token = new_custom_ec_jwt( + "key".into(), + &key, + typed_json::json! {{ + "sub": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", + // cognito uses `\/`. I cannot replicated that easily here as serde_json will refuse + // to write that escape character. instead I will make a bogus URL using `\` instead. 
+ "iss": "https:\\\\cognito-idp.us-west-2.amazonaws.com\\us-west-2_abcdefgh", + "client_id": "abcdefghijklmnopqrstuvwxyz", + "origin_jti": "6759d132-3fe7-446e-9e90-2fe7e8017893", + "event_id": "ec9c36ab-b01d-46a0-94e4-87fde6767065", + "token_use": "access", + "scope": "aws.cognito.signin.user.admin", + "auth_time":now, + "exp":now + 60, + "iat":now, + "jti": "b241614b-0b93-4bdc-96db-0a3c7061d9c0", + "username": "dd9a73fd-e785-4a13-aae1-e691ce43e89d", + }}, + ); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + + let jwks_addr = jwks_server(move |_path| Some(serde_json::to_vec(&jwks).unwrap())).await; + + let role_name = RoleName::from("anonymous"); + let rules = vec![AuthRule { + id: "aws-cognito".to_owned(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role_name)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let endpoint = EndpointId::from("ep"); + + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + endpoint.clone(), + &role_name, + &fetch, + &token, + ) + .await + .unwrap(); + } + + #[tokio::test] + async fn check_jwt_invalid_signature() { + let (_, jwk) = new_ec_jwk("1".into()); + let (key, _) = new_ec_jwk("1".into()); + + // has a matching kid, but signed by the wrong key + let bad_jwt = new_ec_jwt("1".into(), &key); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role = RoleName::from("authenticated"); + + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + let ctx = RequestMonitoring::test(); + let err = jwk_cache + .check_jwt(&ctx, ep, &role, &fetch, &bad_jwt) + .await + .unwrap_err(); + assert!( + matches!(err, JwtError::Signature(_)), + "expected \"signature error\", got {err:?}" + ); + } + + #[tokio::test] + async fn check_jwt_unknown_role() { + let (key, jwk) = new_rsa_jwk(RS1, "1".into()); + let jwt = new_rsa_jwt("1".into(), key); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role = RoleName::from("authenticated"); + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + // this role_name is not accepted + let bad_role_name = RoleName::from("cloud_admin"); + + let ctx = RequestMonitoring::test(); + let err = jwk_cache + .check_jwt(&ctx, ep, &bad_role_name, &fetch, &jwt) + .await + .unwrap_err(); + + assert!( + matches!(err, JwtError::JwkNotFound), + "expected \"jwk not found\", got {err:?}" + ); + } + + #[tokio::test] + async fn check_jwt_invalid_claims() { + let (key, jwk) = new_ec_jwk("1".into()); + + let jwks = jose_jwk::JwkSet { keys: vec![jwk] }; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let now = SystemTime::now() + .duration_since(SystemTime::UNIX_EPOCH) + .unwrap() + .as_secs(); + + struct Test { + body: 
serde_json::Value, + error: JwtClaimsError, + } + + let table = vec![ + Test { + body: json! {{ + "nbf": now + 60, + "aud": "neon", + }}, + error: JwtClaimsError::JwtTokenNotYetReadyToUse, + }, + Test { + body: json! {{ + "exp": now - 60, + "aud": ["neon"], + }}, + error: JwtClaimsError::JwtTokenHasExpired, + }, + Test { + body: json! {{ + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": [], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": "foo", + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": ["foo"], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + Test { + body: json! {{ + "aud": ["foo", "bar"], + }}, + error: JwtClaimsError::InvalidJwtTokenAudience, + }, + ]; + + let role = RoleName::from("authenticated"); + + let rules = vec![AuthRule { + id: String::new(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: Some("neon".to_string()), + role_names: vec![RoleNameInt::from(&role)], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let ep = EndpointId::from("ep"); + + let ctx = RequestMonitoring::test(); + for test in table { + let jwt = new_custom_ec_jwt("1".into(), &key, test.body); + + match jwk_cache + .check_jwt(&ctx, ep.clone(), &role, &fetch, &jwt) + .await + { + Err(JwtError::InvalidClaims(error)) if error == test.error => {} + Err(err) => { + panic!("expected {:?}, got {err:?}", test.error) + } + Ok(_payload) => { + panic!("expected {:?}, got ok", test.error) + } + } + } + } } From d4cbc8cfeb433733d312d8761c3f3bab816df04e Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 29 Oct 2024 11:39:09 +0000 Subject: [PATCH 117/239] [auth_broker]: regress test (#9541) python based regression test setup for auth_broker. This uses a http mock for cplane as well as the JWKs url. complications: 1. We cannot just use local_proxy binary, as that requires the pg_session_jwt extension which we don't have available in the current test suite 2. We cannot use just any old http mock for local_proxy, as auth_broker requires http2 to local_proxy as such, I used the h2 library to implement an echo server - copied from the examples in the h2 docs. 
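
To make the flow concrete, here is a condensed sketch of what the new regression test exercises. It is illustrative only, not the exact test code: it leans on the `neon_authorize_jwk` and `static_auth_broker` fixtures added in this patch, and assumes pytest-asyncio is available (as in the test suite below).

```python
import pytest
from jwcrypto import jwt


@pytest.mark.asyncio
async def test_jwt_roundtrip_sketch(static_auth_broker, neon_authorize_jwk):
    # Sign a token with the fixture key; the mocked JWKS endpoint serves the
    # matching public key, so the auth broker can validate the signature.
    token = jwt.JWT(
        header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"},
        claims={"sub": "user1"},
    )
    token.make_signed_token(neon_authorize_jwk)

    # The broker "wakes" the mocked compute (the h2 echo server standing in
    # for local_proxy) and forwards the request over http2; the echoed
    # response lets the test assert what was actually forwarded.
    res = await static_auth_broker.query(
        "select 1", [], user="authenticated", token=token.serialize()
    )
    assert res["headers"]["authorization"] == f"Bearer {token.serialize()}"
```

The full fixtures and the real test follow in the diff below.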
--- poetry.lock | 45 +++-- pyproject.toml | 3 + test_runner/conftest.py | 1 + test_runner/fixtures/h2server.py | 198 ++++++++++++++++++++ test_runner/fixtures/neon_fixtures.py | 239 +++++++++++++++++++++--- test_runner/regress/test_auth_broker.py | 37 ++++ test_runner/stubs/h2/README.md | 1 + test_runner/stubs/h2/__init__.pyi | 0 test_runner/stubs/h2/config.pyi | 42 +++++ test_runner/stubs/h2/connection.pyi | 142 ++++++++++++++ test_runner/stubs/h2/errors.pyi | 17 ++ test_runner/stubs/h2/events.pyi | 106 +++++++++++ test_runner/stubs/h2/exceptions.pyi | 48 +++++ test_runner/stubs/h2/frame_buffer.pyi | 19 ++ test_runner/stubs/h2/settings.pyi | 61 ++++++ test_runner/stubs/h2/stream.pyi | 184 ++++++++++++++++++ test_runner/stubs/h2/utilities.pyi | 25 +++ test_runner/stubs/h2/windows.pyi | 13 ++ 18 files changed, 1143 insertions(+), 38 deletions(-) create mode 100644 test_runner/fixtures/h2server.py create mode 100644 test_runner/regress/test_auth_broker.py create mode 100644 test_runner/stubs/h2/README.md create mode 100644 test_runner/stubs/h2/__init__.pyi create mode 100644 test_runner/stubs/h2/config.pyi create mode 100644 test_runner/stubs/h2/connection.pyi create mode 100644 test_runner/stubs/h2/errors.pyi create mode 100644 test_runner/stubs/h2/events.pyi create mode 100644 test_runner/stubs/h2/exceptions.pyi create mode 100644 test_runner/stubs/h2/frame_buffer.pyi create mode 100644 test_runner/stubs/h2/settings.pyi create mode 100644 test_runner/stubs/h2/stream.pyi create mode 100644 test_runner/stubs/h2/utilities.pyi create mode 100644 test_runner/stubs/h2/windows.pyi diff --git a/poetry.lock b/poetry.lock index 7abd794235..36ea82a446 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1521,6 +1521,21 @@ files = [ [package.dependencies] six = "*" +[[package]] +name = "jwcrypto" +version = "1.5.6" +description = "Implementation of JOSE Web standards" +optional = false +python-versions = ">= 3.8" +files = [ + {file = "jwcrypto-1.5.6-py3-none-any.whl", hash = "sha256:150d2b0ebbdb8f40b77f543fb44ffd2baeff48788be71f67f03566692fd55789"}, + {file = "jwcrypto-1.5.6.tar.gz", hash = "sha256:771a87762a0c081ae6166958a954f80848820b2ab066937dc8b8379d65b1b039"}, +] + +[package.dependencies] +cryptography = ">=3.4" +typing-extensions = ">=4.5.0" + [[package]] name = "kafka-python" version = "2.0.2" @@ -2111,7 +2126,6 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2120,8 +2134,6 @@ files = [ {file = 
"psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2603,7 +2615,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2912,6 +2923,20 @@ files = [ {file = "tomli-2.0.1.tar.gz", hash = "sha256:de526c12914f0c550d15924c62d72abc48d6fe7364aa87328337a31007fe8a4f"}, ] +[[package]] +name = "types-jwcrypto" +version = "1.5.0.20240925" +description = "Typing stubs for jwcrypto" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-jwcrypto-1.5.0.20240925.tar.gz", hash = "sha256:50e17b790378c96239344476c7bd13b52d0c7eeb6d16c2d53723e48cc6bbf4fe"}, + {file = "types_jwcrypto-1.5.0.20240925-py3-none-any.whl", hash = "sha256:2d12a2d528240d326075e896aafec7056b9136bf3207fa6ccf3fcb8fbf9e11a1"}, +] + +[package.dependencies] +cryptography = "*" + [[package]] name = "types-psutil" version = "5.9.5.12" @@ -3159,16 +3184,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = 
"sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3406,4 +3421,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "0f4804119f417edf8e1fbd6d715d2e8d70ad731334fa9570304a2203f83339cf" +content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e" diff --git a/pyproject.toml b/pyproject.toml index d4926cfb9a..faa5f9123c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,6 +42,9 @@ pytest-repeat = "^0.9.3" websockets = "^12.0" clickhouse-connect = "^0.7.16" kafka-python = "^2.0.2" +jwcrypto = "^1.5.6" +h2 = "^4.1.0" +types-jwcrypto = "^1.5.0.20240925" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" diff --git a/test_runner/conftest.py b/test_runner/conftest.py index 4a3194c691..84eda52d33 100644 --- a/test_runner/conftest.py +++ b/test_runner/conftest.py @@ -3,6 +3,7 @@ from __future__ import annotations pytest_plugins = ( "fixtures.pg_version", "fixtures.parametrize", + "fixtures.h2server", "fixtures.httpserver", "fixtures.compute_reconfigure", "fixtures.storage_controller_proxy", diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py new file mode 100644 index 0000000000..92783e1fb2 --- /dev/null +++ b/test_runner/fixtures/h2server.py @@ -0,0 +1,198 @@ +""" +https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html + +auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :) +""" + +import asyncio +import collections +import io +import json +from collections.abc import AsyncIterable + +import pytest_asyncio +from h2.config import H2Configuration +from h2.connection import H2Connection +from h2.errors import ErrorCodes 
+from h2.events import ( + ConnectionTerminated, + DataReceived, + RemoteSettingsChanged, + RequestReceived, + StreamEnded, + StreamReset, + WindowUpdated, +) +from h2.exceptions import ProtocolError, StreamClosedError +from h2.settings import SettingCodes + +RequestData = collections.namedtuple("RequestData", ["headers", "data"]) + + +class H2Server: + def __init__(self, host, port) -> None: + self.host = host + self.port = port + + +class H2Protocol(asyncio.Protocol): + def __init__(self): + config = H2Configuration(client_side=False, header_encoding="utf-8") + self.conn = H2Connection(config=config) + self.transport = None + self.stream_data = {} + self.flow_control_futures = {} + + def connection_made(self, transport: asyncio.Transport): # type: ignore[override] + self.transport = transport + self.conn.initiate_connection() + self.transport.write(self.conn.data_to_send()) + + def connection_lost(self, _exc): + for future in self.flow_control_futures.values(): + future.cancel() + self.flow_control_futures = {} + + def data_received(self, data: bytes): + assert self.transport is not None + try: + events = self.conn.receive_data(data) + except ProtocolError: + self.transport.write(self.conn.data_to_send()) + self.transport.close() + else: + self.transport.write(self.conn.data_to_send()) + for event in events: + if isinstance(event, RequestReceived): + self.request_received(event.headers, event.stream_id) + elif isinstance(event, DataReceived): + self.receive_data(event.data, event.stream_id) + elif isinstance(event, StreamEnded): + self.stream_complete(event.stream_id) + elif isinstance(event, ConnectionTerminated): + self.transport.close() + elif isinstance(event, StreamReset): + self.stream_reset(event.stream_id) + elif isinstance(event, WindowUpdated): + self.window_updated(event.stream_id, event.delta) + elif isinstance(event, RemoteSettingsChanged): + if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings: + self.window_updated(None, 0) + + self.transport.write(self.conn.data_to_send()) + + def request_received(self, headers: list[tuple[str, str]], stream_id: int): + headers_map = collections.OrderedDict(headers) + + # Store off the request data. + request_data = RequestData(headers_map, io.BytesIO()) + self.stream_data[stream_id] = request_data + + def stream_complete(self, stream_id: int): + """ + When a stream is complete, we can send our response. + """ + try: + request_data = self.stream_data[stream_id] + except KeyError: + # Just return, we probably 405'd this already + return + + headers = request_data.headers + body = request_data.data.getvalue().decode("utf-8") + + data = json.dumps({"headers": headers, "body": body}, indent=4).encode("utf8") + + response_headers = ( + (":status", "200"), + ("content-type", "application/json"), + ("content-length", str(len(data))), + ) + self.conn.send_headers(stream_id, response_headers) + asyncio.ensure_future(self.send_data(data, stream_id)) + + def receive_data(self, data: bytes, stream_id: int): + """ + We've received some data on a stream. If that stream is one we're + expecting data on, save it off. Otherwise, reset the stream. + """ + try: + stream_data = self.stream_data[stream_id] + except KeyError: + self.conn.reset_stream(stream_id, error_code=ErrorCodes.PROTOCOL_ERROR) + else: + stream_data.data.write(data) + + def stream_reset(self, stream_id): + """ + A stream reset was sent. Stop sending data. 
+ """ + if stream_id in self.flow_control_futures: + future = self.flow_control_futures.pop(stream_id) + future.cancel() + + async def send_data(self, data, stream_id): + """ + Send data according to the flow control rules. + """ + while data: + while self.conn.local_flow_control_window(stream_id) < 1: + try: + await self.wait_for_flow_control(stream_id) + except asyncio.CancelledError: + return + + chunk_size = min( + self.conn.local_flow_control_window(stream_id), + len(data), + self.conn.max_outbound_frame_size, + ) + + try: + self.conn.send_data( + stream_id, data[:chunk_size], end_stream=(chunk_size == len(data)) + ) + except (StreamClosedError, ProtocolError): + # The stream got closed and we didn't get told. We're done + # here. + break + + assert self.transport is not None + self.transport.write(self.conn.data_to_send()) + data = data[chunk_size:] + + async def wait_for_flow_control(self, stream_id): + """ + Waits for a Future that fires when the flow control window is opened. + """ + f: asyncio.Future[None] = asyncio.Future() + self.flow_control_futures[stream_id] = f + await f + + def window_updated(self, stream_id, delta): + """ + A window update frame was received. Unblock some number of flow control + Futures. + """ + if stream_id and stream_id in self.flow_control_futures: + f = self.flow_control_futures.pop(stream_id) + f.set_result(delta) + elif not stream_id: + for f in self.flow_control_futures.values(): + f.set_result(delta) + + self.flow_control_futures = {} + + +@pytest_asyncio.fixture(scope="function") +async def http2_echoserver() -> AsyncIterable[H2Server]: + loop = asyncio.get_event_loop() + serve = await loop.create_server(H2Protocol, "127.0.0.1", 0) + (host, port) = serve.sockets[0].getsockname() + + asyncio.create_task(serve.wait_closed()) + + server = H2Server(host, port) + yield server + + serve.close() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index a8ec144fe9..1b9bc873f4 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -35,11 +35,13 @@ import toml from _pytest.config import Config from _pytest.config.argparsing import Parser from _pytest.fixtures import FixtureRequest +from jwcrypto import jwk # Type-related stuff from psycopg2.extensions import connection as PgConnection from psycopg2.extensions import cursor as PgCursor from psycopg2.extensions import make_dsn, parse_dsn +from pytest_httpserver import HTTPServer from urllib3.util.retry import Retry from fixtures import overlayfs @@ -53,6 +55,7 @@ from fixtures.common_types import ( TimelineId, ) from fixtures.endpoint.http import EndpointHttpClient +from fixtures.h2server import H2Server from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics from fixtures.neon_cli import NeonLocalCli, Pagectl @@ -3080,6 +3083,31 @@ class PSQL: ) +def generate_proxy_tls_certs(common_name: str, key_path: Path, crt_path: Path): + if not key_path.exists(): + r = subprocess.run( + [ + "openssl", + "req", + "-new", + "-x509", + "-days", + "365", + "-nodes", + "-text", + "-out", + str(crt_path), + "-keyout", + str(key_path), + "-subj", + f"/CN={common_name}", + "-addext", + f"subjectAltName = DNS:{common_name}", + ] + ) + assert r.returncode == 0 + + class NeonProxy(PgProtocol): link_auth_uri: str = "http://dummy-uri" @@ -3178,29 +3206,7 @@ class NeonProxy(PgProtocol): # generate key of it doesn't exist crt_path = self.test_output_dir / "proxy.crt" key_path = self.test_output_dir / 
"proxy.key" - - if not key_path.exists(): - r = subprocess.run( - [ - "openssl", - "req", - "-new", - "-x509", - "-days", - "365", - "-nodes", - "-text", - "-out", - str(crt_path), - "-keyout", - str(key_path), - "-subj", - "/CN=*.localtest.me", - "-addext", - "subjectAltName = DNS:*.localtest.me", - ] - ) - assert r.returncode == 0 + generate_proxy_tls_certs("*.localtest.me", key_path, crt_path) args = [ str(self.neon_binpath / "proxy"), @@ -3380,6 +3386,125 @@ class NeonProxy(PgProtocol): assert out == "ok" +class NeonAuthBroker: + class ControlPlane: + def __init__(self, endpoint: str): + self.endpoint = endpoint + + def extra_args(self) -> list[str]: + args = [ + *["--auth-backend", "console"], + *["--auth-endpoint", self.endpoint], + ] + return args + + def __init__( + self, + neon_binpath: Path, + test_output_dir: Path, + http_port: int, + mgmt_port: int, + external_http_port: int, + auth_backend: NeonAuthBroker.ControlPlane, + ): + self.domain = "apiauth.localtest.me" # resolves to 127.0.0.1 + self.host = "127.0.0.1" + self.http_port = http_port + self.external_http_port = external_http_port + self.neon_binpath = neon_binpath + self.test_output_dir = test_output_dir + self.mgmt_port = mgmt_port + self.auth_backend = auth_backend + self.http_timeout_seconds = 15 + self._popen: Optional[subprocess.Popen[bytes]] = None + + def start(self) -> NeonAuthBroker: + assert self._popen is None + + # generate key of it doesn't exist + crt_path = self.test_output_dir / "proxy.crt" + key_path = self.test_output_dir / "proxy.key" + generate_proxy_tls_certs("apiauth.localtest.me", key_path, crt_path) + + args = [ + str(self.neon_binpath / "proxy"), + *["--http", f"{self.host}:{self.http_port}"], + *["--mgmt", f"{self.host}:{self.mgmt_port}"], + *["--wss", f"{self.host}:{self.external_http_port}"], + *["-c", str(crt_path)], + *["-k", str(key_path)], + *["--sql-over-http-pool-opt-in", "false"], + *["--is-auth-broker", "true"], + *self.auth_backend.extra_args(), + ] + + logfile = open(self.test_output_dir / "proxy.log", "w") + self._popen = subprocess.Popen(args, stdout=logfile, stderr=logfile) + self._wait_until_ready() + return self + + # Sends SIGTERM to the proxy if it has been started + def terminate(self): + if self._popen: + self._popen.terminate() + + # Waits for proxy to exit if it has been opened with a default timeout of + # two seconds. Raises subprocess.TimeoutExpired if the proxy does not exit in time. + def wait_for_exit(self, timeout=2): + if self._popen: + self._popen.wait(timeout=timeout) + + @backoff.on_exception(backoff.expo, requests.exceptions.RequestException, max_time=10) + def _wait_until_ready(self): + assert ( + self._popen and self._popen.poll() is None + ), "Proxy exited unexpectedly. Check test log." 
+ requests.get(f"http://{self.host}:{self.http_port}/v1/status") + + async def query(self, query, args, **kwargs): + user = kwargs["user"] + token = kwargs["token"] + expected_code = kwargs.get("expected_code") + + log.info(f"Executing http query: {query}") + + connstr = f"postgresql://{user}@{self.domain}/postgres" + async with httpx.AsyncClient(verify=str(self.test_output_dir / "proxy.crt")) as client: + response = await client.post( + f"https://{self.domain}:{self.external_http_port}/sql", + json={"query": query, "params": args}, + headers={ + "Neon-Connection-String": connstr, + "Authorization": f"Bearer {token}", + }, + ) + + if expected_code is not None: + assert response.status_code == expected_code, f"response: {response.json()}" + return response.json() + + def get_metrics(self) -> str: + request_result = requests.get(f"http://{self.host}:{self.http_port}/metrics") + return request_result.text + + def __enter__(self) -> NeonAuthBroker: + return self + + def __exit__( + self, + _exc_type: Optional[type[BaseException]], + _exc_value: Optional[BaseException], + _traceback: Optional[TracebackType], + ): + if self._popen is not None: + self._popen.terminate() + try: + self._popen.wait(timeout=5) + except subprocess.TimeoutExpired: + log.warning("failed to gracefully terminate proxy; killing") + self._popen.kill() + + @pytest.fixture(scope="function") def link_proxy( port_distributor: PortDistributor, neon_binpath: Path, test_output_dir: Path @@ -3444,6 +3569,74 @@ def static_proxy( yield proxy +@pytest.fixture(scope="function") +def neon_authorize_jwk() -> jwk.JWK: + kid = str(uuid.uuid4()) + key = jwk.JWK.generate(kty="RSA", size=2048, alg="RS256", use="sig", kid=kid) + assert isinstance(key, jwk.JWK) + return key + + +@pytest.fixture(scope="function") +def static_auth_broker( + port_distributor: PortDistributor, + neon_binpath: Path, + test_output_dir: Path, + httpserver: HTTPServer, + neon_authorize_jwk: jwk.JWK, + http2_echoserver: H2Server, +) -> Iterable[NeonAuthBroker]: + """Neon Auth Broker that routes to a mocked local_proxy and a mocked cplane HTTP API.""" + + local_proxy_addr = f"{http2_echoserver.host}:{http2_echoserver.port}" + + # return local_proxy addr on ProxyWakeCompute. + httpserver.expect_request("/cplane/proxy_wake_compute").respond_with_json( + { + "address": local_proxy_addr, + "aux": { + "endpoint_id": "ep-foo-bar-1234", + "branch_id": "br-foo-bar", + "project_id": "foo-bar", + }, + } + ) + + # return jwks mock addr on GetEndpointJwks + httpserver.expect_request(re.compile("^/cplane/endpoints/.+/jwks$")).respond_with_json( + { + "jwks": [ + { + "id": "foo", + "jwks_url": httpserver.url_for("/authorize/jwks.json"), + "provider_name": "test", + "jwt_audience": None, + "role_names": ["anonymous", "authenticated"], + } + ] + } + ) + + # return static fixture jwks. 
+ jwk = neon_authorize_jwk.export_public(as_dict=True) + httpserver.expect_request("/authorize/jwks.json").respond_with_json({"keys": [jwk]}) + + mgmt_port = port_distributor.get_port() + http_port = port_distributor.get_port() + external_http_port = port_distributor.get_port() + + with NeonAuthBroker( + neon_binpath=neon_binpath, + test_output_dir=test_output_dir, + http_port=http_port, + mgmt_port=mgmt_port, + external_http_port=external_http_port, + auth_backend=NeonAuthBroker.ControlPlane(httpserver.url_for("/cplane")), + ) as proxy: + proxy.start() + yield proxy + + class Endpoint(PgProtocol, LogUtils): """An object representing a Postgres compute endpoint managed by the control plane.""" diff --git a/test_runner/regress/test_auth_broker.py b/test_runner/regress/test_auth_broker.py new file mode 100644 index 0000000000..11dc7d56b5 --- /dev/null +++ b/test_runner/regress/test_auth_broker.py @@ -0,0 +1,37 @@ +import json + +import pytest +from fixtures.neon_fixtures import NeonAuthBroker +from jwcrypto import jwk, jwt + + +@pytest.mark.asyncio +async def test_auth_broker_happy( + static_auth_broker: NeonAuthBroker, + neon_authorize_jwk: jwk.JWK, +): + """ + Signs a JWT and uses it to authorize a query to local_proxy. + """ + + token = jwt.JWT( + header={"kid": neon_authorize_jwk.key_id, "alg": "RS256"}, claims={"sub": "user1"} + ) + token.make_signed_token(neon_authorize_jwk) + res = await static_auth_broker.query("foo", ["arg1"], user="anonymous", token=token.serialize()) + + # local proxy mock just echos back the request + # check that we forward the correct data + + assert ( + res["headers"]["authorization"] == f"Bearer {token.serialize()}" + ), "JWT should be forwarded" + + assert ( + "anonymous" in res["headers"]["neon-connection-string"] + ), "conn string should be forwarded" + + assert json.loads(res["body"]) == { + "query": "foo", + "params": ["arg1"], + }, "Query body should be forwarded" diff --git a/test_runner/stubs/h2/README.md b/test_runner/stubs/h2/README.md new file mode 100644 index 0000000000..cdf181ff80 --- /dev/null +++ b/test_runner/stubs/h2/README.md @@ -0,0 +1 @@ +generated via `poetry run stubgen -p h2 -o test_runner/stubs` diff --git a/test_runner/stubs/h2/__init__.pyi b/test_runner/stubs/h2/__init__.pyi new file mode 100644 index 0000000000..e69de29bb2 diff --git a/test_runner/stubs/h2/config.pyi b/test_runner/stubs/h2/config.pyi new file mode 100644 index 0000000000..710005db69 --- /dev/null +++ b/test_runner/stubs/h2/config.pyi @@ -0,0 +1,42 @@ +from _typeshed import Incomplete + +class _BooleanConfigOption: + name: Incomplete + attr_name: Incomplete + def __init__(self, name) -> None: ... + def __get__(self, instance, owner): ... + def __set__(self, instance, value) -> None: ... + +class DummyLogger: + def __init__(self, *vargs) -> None: ... + def debug(self, *vargs, **kwargs) -> None: ... + def trace(self, *vargs, **kwargs) -> None: ... + +class OutputLogger: + file: Incomplete + trace_level: Incomplete + def __init__(self, file: Incomplete | None = ..., trace_level: bool = ...) -> None: ... + def debug(self, fmtstr, *args) -> None: ... + def trace(self, fmtstr, *args) -> None: ... 
+ +class H2Configuration: + client_side: Incomplete + validate_outbound_headers: Incomplete + normalize_outbound_headers: Incomplete + validate_inbound_headers: Incomplete + normalize_inbound_headers: Incomplete + logger: Incomplete + def __init__( + self, + client_side: bool = ..., + header_encoding: Incomplete | None = ..., + validate_outbound_headers: bool = ..., + normalize_outbound_headers: bool = ..., + validate_inbound_headers: bool = ..., + normalize_inbound_headers: bool = ..., + logger: Incomplete | None = ..., + ) -> None: ... + @property + def header_encoding(self): ... + @header_encoding.setter + def header_encoding(self, value) -> None: ... diff --git a/test_runner/stubs/h2/connection.pyi b/test_runner/stubs/h2/connection.pyi new file mode 100644 index 0000000000..04be18ca74 --- /dev/null +++ b/test_runner/stubs/h2/connection.pyi @@ -0,0 +1,142 @@ +from enum import Enum, IntEnum + +from _typeshed import Incomplete + +from .config import H2Configuration as H2Configuration +from .errors import ErrorCodes as ErrorCodes +from .events import AlternativeServiceAvailable as AlternativeServiceAvailable +from .events import ConnectionTerminated as ConnectionTerminated +from .events import PingAckReceived as PingAckReceived +from .events import PingReceived as PingReceived +from .events import PriorityUpdated as PriorityUpdated +from .events import RemoteSettingsChanged as RemoteSettingsChanged +from .events import SettingsAcknowledged as SettingsAcknowledged +from .events import UnknownFrameReceived as UnknownFrameReceived +from .events import WindowUpdated as WindowUpdated +from .exceptions import DenialOfServiceError as DenialOfServiceError +from .exceptions import FlowControlError as FlowControlError +from .exceptions import FrameTooLargeError as FrameTooLargeError +from .exceptions import NoAvailableStreamIDError as NoAvailableStreamIDError +from .exceptions import NoSuchStreamError as NoSuchStreamError +from .exceptions import ProtocolError as ProtocolError +from .exceptions import RFC1122Error as RFC1122Error +from .exceptions import StreamClosedError as StreamClosedError +from .exceptions import StreamIDTooLowError as StreamIDTooLowError +from .exceptions import TooManyStreamsError as TooManyStreamsError +from .frame_buffer import FrameBuffer as FrameBuffer +from .settings import SettingCodes as SettingCodes +from .settings import Settings as Settings +from .stream import H2Stream as H2Stream +from .stream import StreamClosedBy as StreamClosedBy +from .utilities import guard_increment_window as guard_increment_window +from .windows import WindowManager as WindowManager + +class ConnectionState(Enum): + IDLE: int + CLIENT_OPEN: int + SERVER_OPEN: int + CLOSED: int + +class ConnectionInputs(Enum): + SEND_HEADERS: int + SEND_PUSH_PROMISE: int + SEND_DATA: int + SEND_GOAWAY: int + SEND_WINDOW_UPDATE: int + SEND_PING: int + SEND_SETTINGS: int + SEND_RST_STREAM: int + SEND_PRIORITY: int + RECV_HEADERS: int + RECV_PUSH_PROMISE: int + RECV_DATA: int + RECV_GOAWAY: int + RECV_WINDOW_UPDATE: int + RECV_PING: int + RECV_SETTINGS: int + RECV_RST_STREAM: int + RECV_PRIORITY: int + SEND_ALTERNATIVE_SERVICE: int + RECV_ALTERNATIVE_SERVICE: int + +class AllowedStreamIDs(IntEnum): + EVEN: int + ODD: int + +class H2ConnectionStateMachine: + state: Incomplete + def __init__(self) -> None: ... + def process_input(self, input_): ... 
+ +class H2Connection: + DEFAULT_MAX_OUTBOUND_FRAME_SIZE: int + DEFAULT_MAX_INBOUND_FRAME_SIZE: Incomplete + HIGHEST_ALLOWED_STREAM_ID: Incomplete + MAX_WINDOW_INCREMENT: Incomplete + DEFAULT_MAX_HEADER_LIST_SIZE: Incomplete + MAX_CLOSED_STREAMS: Incomplete + state_machine: Incomplete + streams: Incomplete + highest_inbound_stream_id: int + highest_outbound_stream_id: int + encoder: Incomplete + decoder: Incomplete + config: Incomplete + local_settings: Incomplete + remote_settings: Incomplete + outbound_flow_control_window: Incomplete + max_outbound_frame_size: Incomplete + max_inbound_frame_size: Incomplete + incoming_buffer: Incomplete + def __init__(self, config: Incomplete | None = ...) -> None: ... + @property + def open_outbound_streams(self): ... + @property + def open_inbound_streams(self): ... + @property + def inbound_flow_control_window(self): ... + def initiate_connection(self) -> None: ... + def initiate_upgrade_connection(self, settings_header: Incomplete | None = ...): ... + def get_next_available_stream_id(self): ... + def send_headers( + self, + stream_id, + headers, + end_stream: bool = ..., + priority_weight: Incomplete | None = ..., + priority_depends_on: Incomplete | None = ..., + priority_exclusive: Incomplete | None = ..., + ) -> None: ... + def send_data( + self, stream_id, data, end_stream: bool = ..., pad_length: Incomplete | None = ... + ) -> None: ... + def end_stream(self, stream_id) -> None: ... + def increment_flow_control_window( + self, increment, stream_id: Incomplete | None = ... + ) -> None: ... + def push_stream(self, stream_id, promised_stream_id, request_headers) -> None: ... + def ping(self, opaque_data) -> None: ... + def reset_stream(self, stream_id, error_code: int = ...) -> None: ... + def close_connection( + self, + error_code: int = ..., + additional_data: Incomplete | None = ..., + last_stream_id: Incomplete | None = ..., + ) -> None: ... + def update_settings(self, new_settings) -> None: ... + def advertise_alternative_service( + self, field_value, origin: Incomplete | None = ..., stream_id: Incomplete | None = ... + ) -> None: ... + def prioritize( + self, + stream_id, + weight: Incomplete | None = ..., + depends_on: Incomplete | None = ..., + exclusive: Incomplete | None = ..., + ) -> None: ... + def local_flow_control_window(self, stream_id): ... + def remote_flow_control_window(self, stream_id): ... + def acknowledge_received_data(self, acknowledged_size, stream_id) -> None: ... + def data_to_send(self, amount: Incomplete | None = ...): ... + def clear_outbound_data_buffer(self) -> None: ... + def receive_data(self, data): ... diff --git a/test_runner/stubs/h2/errors.pyi b/test_runner/stubs/h2/errors.pyi new file mode 100644 index 0000000000..b70c632f8c --- /dev/null +++ b/test_runner/stubs/h2/errors.pyi @@ -0,0 +1,17 @@ +import enum + +class ErrorCodes(enum.IntEnum): + NO_ERROR: int + PROTOCOL_ERROR: int + INTERNAL_ERROR: int + FLOW_CONTROL_ERROR: int + SETTINGS_TIMEOUT: int + STREAM_CLOSED: int + FRAME_SIZE_ERROR: int + REFUSED_STREAM: int + CANCEL: int + COMPRESSION_ERROR: int + CONNECT_ERROR: int + ENHANCE_YOUR_CALM: int + INADEQUATE_SECURITY: int + HTTP_1_1_REQUIRED: int diff --git a/test_runner/stubs/h2/events.pyi b/test_runner/stubs/h2/events.pyi new file mode 100644 index 0000000000..75d0a9e53b --- /dev/null +++ b/test_runner/stubs/h2/events.pyi @@ -0,0 +1,106 @@ +from _typeshed import Incomplete + +from .settings import ChangedSetting as ChangedSetting + +class Event: ... 
+ +class RequestReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class ResponseReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class TrailersReceived(Event): + stream_id: Incomplete + headers: Incomplete + stream_ended: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class _HeadersSent(Event): ... +class _ResponseSent(_HeadersSent): ... +class _RequestSent(_HeadersSent): ... +class _TrailersSent(_HeadersSent): ... +class _PushedRequestSent(_HeadersSent): ... + +class InformationalResponseReceived(Event): + stream_id: Incomplete + headers: Incomplete + priority_updated: Incomplete + def __init__(self) -> None: ... + +class DataReceived(Event): + stream_id: Incomplete + data: Incomplete + flow_controlled_length: Incomplete + stream_ended: Incomplete + def __init__(self) -> None: ... + +class WindowUpdated(Event): + stream_id: Incomplete + delta: Incomplete + def __init__(self) -> None: ... + +class RemoteSettingsChanged(Event): + changed_settings: Incomplete + def __init__(self) -> None: ... + @classmethod + def from_settings(cls, old_settings, new_settings): ... + +class PingReceived(Event): + ping_data: Incomplete + def __init__(self) -> None: ... + +class PingAckReceived(Event): + ping_data: Incomplete + def __init__(self) -> None: ... + +class StreamEnded(Event): + stream_id: Incomplete + def __init__(self) -> None: ... + +class StreamReset(Event): + stream_id: Incomplete + error_code: Incomplete + remote_reset: bool + def __init__(self) -> None: ... + +class PushedStreamReceived(Event): + pushed_stream_id: Incomplete + parent_stream_id: Incomplete + headers: Incomplete + def __init__(self) -> None: ... + +class SettingsAcknowledged(Event): + changed_settings: Incomplete + def __init__(self) -> None: ... + +class PriorityUpdated(Event): + stream_id: Incomplete + weight: Incomplete + depends_on: Incomplete + exclusive: Incomplete + def __init__(self) -> None: ... + +class ConnectionTerminated(Event): + error_code: Incomplete + last_stream_id: Incomplete + additional_data: Incomplete + def __init__(self) -> None: ... + +class AlternativeServiceAvailable(Event): + origin: Incomplete + field_value: Incomplete + def __init__(self) -> None: ... + +class UnknownFrameReceived(Event): + frame: Incomplete + def __init__(self) -> None: ... diff --git a/test_runner/stubs/h2/exceptions.pyi b/test_runner/stubs/h2/exceptions.pyi new file mode 100644 index 0000000000..82019d5ec1 --- /dev/null +++ b/test_runner/stubs/h2/exceptions.pyi @@ -0,0 +1,48 @@ +from _typeshed import Incomplete + +class H2Error(Exception): ... + +class ProtocolError(H2Error): + error_code: Incomplete + +class FrameTooLargeError(ProtocolError): + error_code: Incomplete + +class FrameDataMissingError(ProtocolError): + error_code: Incomplete + +class TooManyStreamsError(ProtocolError): ... + +class FlowControlError(ProtocolError): + error_code: Incomplete + +class StreamIDTooLowError(ProtocolError): + stream_id: Incomplete + max_stream_id: Incomplete + def __init__(self, stream_id, max_stream_id) -> None: ... + +class NoAvailableStreamIDError(ProtocolError): ... + +class NoSuchStreamError(ProtocolError): + stream_id: Incomplete + def __init__(self, stream_id) -> None: ... 
+ +class StreamClosedError(NoSuchStreamError): + stream_id: Incomplete + error_code: Incomplete + def __init__(self, stream_id) -> None: ... + +class InvalidSettingsValueError(ProtocolError, ValueError): + error_code: Incomplete + def __init__(self, msg, error_code) -> None: ... + +class InvalidBodyLengthError(ProtocolError): + expected_length: Incomplete + actual_length: Incomplete + def __init__(self, expected, actual) -> None: ... + +class UnsupportedFrameError(ProtocolError): ... +class RFC1122Error(H2Error): ... + +class DenialOfServiceError(ProtocolError): + error_code: Incomplete diff --git a/test_runner/stubs/h2/frame_buffer.pyi b/test_runner/stubs/h2/frame_buffer.pyi new file mode 100644 index 0000000000..f47adab704 --- /dev/null +++ b/test_runner/stubs/h2/frame_buffer.pyi @@ -0,0 +1,19 @@ +from .exceptions import ( + FrameDataMissingError as FrameDataMissingError, +) +from .exceptions import ( + FrameTooLargeError as FrameTooLargeError, +) +from .exceptions import ( + ProtocolError as ProtocolError, +) + +CONTINUATION_BACKLOG: int + +class FrameBuffer: + data: bytes + max_frame_size: int + def __init__(self, server: bool = ...) -> None: ... + def add_data(self, data) -> None: ... + def __iter__(self): ... + def __next__(self): ... diff --git a/test_runner/stubs/h2/settings.pyi b/test_runner/stubs/h2/settings.pyi new file mode 100644 index 0000000000..a352abe53e --- /dev/null +++ b/test_runner/stubs/h2/settings.pyi @@ -0,0 +1,61 @@ +import enum +from collections.abc import MutableMapping +from typing import Any + +from _typeshed import Incomplete +from h2.errors import ErrorCodes as ErrorCodes +from h2.exceptions import InvalidSettingsValueError as InvalidSettingsValueError + +class SettingCodes(enum.IntEnum): + HEADER_TABLE_SIZE: Incomplete + ENABLE_PUSH: Incomplete + MAX_CONCURRENT_STREAMS: Incomplete + INITIAL_WINDOW_SIZE: Incomplete + MAX_FRAME_SIZE: Incomplete + MAX_HEADER_LIST_SIZE: Incomplete + ENABLE_CONNECT_PROTOCOL: Incomplete + +class ChangedSetting: + setting: Incomplete + original_value: Incomplete + new_value: Incomplete + def __init__(self, setting, original_value, new_value) -> None: ... + +class Settings(MutableMapping[str, Any]): + def __init__(self, client: bool = ..., initial_values: Incomplete | None = ...) -> None: ... + def acknowledge(self): ... + @property + def header_table_size(self): ... + @header_table_size.setter + def header_table_size(self, value) -> None: ... + @property + def enable_push(self): ... + @enable_push.setter + def enable_push(self, value) -> None: ... + @property + def initial_window_size(self): ... + @initial_window_size.setter + def initial_window_size(self, value) -> None: ... + @property + def max_frame_size(self): ... + @max_frame_size.setter + def max_frame_size(self, value) -> None: ... + @property + def max_concurrent_streams(self): ... + @max_concurrent_streams.setter + def max_concurrent_streams(self, value) -> None: ... + @property + def max_header_list_size(self): ... + @max_header_list_size.setter + def max_header_list_size(self, value) -> None: ... + @property + def enable_connect_protocol(self): ... + @enable_connect_protocol.setter + def enable_connect_protocol(self, value) -> None: ... + def __getitem__(self, key): ... + def __setitem__(self, key, value) -> None: ... + def __delitem__(self, key) -> None: ... + def __iter__(self): ... + def __len__(self) -> int: ... + def __eq__(self, other): ... + def __ne__(self, other): ... 
diff --git a/test_runner/stubs/h2/stream.pyi b/test_runner/stubs/h2/stream.pyi new file mode 100644 index 0000000000..d52ab8e72b --- /dev/null +++ b/test_runner/stubs/h2/stream.pyi @@ -0,0 +1,184 @@ +from enum import Enum, IntEnum + +from _typeshed import Incomplete + +from .errors import ErrorCodes as ErrorCodes +from .events import ( + AlternativeServiceAvailable as AlternativeServiceAvailable, +) +from .events import ( + DataReceived as DataReceived, +) +from .events import ( + InformationalResponseReceived as InformationalResponseReceived, +) +from .events import ( + PushedStreamReceived as PushedStreamReceived, +) +from .events import ( + RequestReceived as RequestReceived, +) +from .events import ( + ResponseReceived as ResponseReceived, +) +from .events import ( + StreamEnded as StreamEnded, +) +from .events import ( + StreamReset as StreamReset, +) +from .events import ( + TrailersReceived as TrailersReceived, +) +from .events import ( + WindowUpdated as WindowUpdated, +) +from .exceptions import ( + FlowControlError as FlowControlError, +) +from .exceptions import ( + InvalidBodyLengthError as InvalidBodyLengthError, +) +from .exceptions import ( + ProtocolError as ProtocolError, +) +from .exceptions import ( + StreamClosedError as StreamClosedError, +) +from .utilities import ( + HeaderValidationFlags as HeaderValidationFlags, +) +from .utilities import ( + authority_from_headers as authority_from_headers, +) +from .utilities import ( + extract_method_header as extract_method_header, +) +from .utilities import ( + guard_increment_window as guard_increment_window, +) +from .utilities import ( + is_informational_response as is_informational_response, +) +from .utilities import ( + normalize_inbound_headers as normalize_inbound_headers, +) +from .utilities import ( + normalize_outbound_headers as normalize_outbound_headers, +) +from .utilities import ( + validate_headers as validate_headers, +) +from .utilities import ( + validate_outbound_headers as validate_outbound_headers, +) +from .windows import WindowManager as WindowManager + +class StreamState(IntEnum): + IDLE: int + RESERVED_REMOTE: int + RESERVED_LOCAL: int + OPEN: int + HALF_CLOSED_REMOTE: int + HALF_CLOSED_LOCAL: int + CLOSED: int + +class StreamInputs(Enum): + SEND_HEADERS: int + SEND_PUSH_PROMISE: int + SEND_RST_STREAM: int + SEND_DATA: int + SEND_WINDOW_UPDATE: int + SEND_END_STREAM: int + RECV_HEADERS: int + RECV_PUSH_PROMISE: int + RECV_RST_STREAM: int + RECV_DATA: int + RECV_WINDOW_UPDATE: int + RECV_END_STREAM: int + RECV_CONTINUATION: int + SEND_INFORMATIONAL_HEADERS: int + RECV_INFORMATIONAL_HEADERS: int + SEND_ALTERNATIVE_SERVICE: int + RECV_ALTERNATIVE_SERVICE: int + UPGRADE_CLIENT: int + UPGRADE_SERVER: int + +class StreamClosedBy(Enum): + SEND_END_STREAM: int + RECV_END_STREAM: int + SEND_RST_STREAM: int + RECV_RST_STREAM: int + +STREAM_OPEN: Incomplete + +class H2StreamStateMachine: + state: Incomplete + stream_id: Incomplete + client: Incomplete + headers_sent: Incomplete + trailers_sent: Incomplete + headers_received: Incomplete + trailers_received: Incomplete + stream_closed_by: Incomplete + def __init__(self, stream_id) -> None: ... + def process_input(self, input_): ... + def request_sent(self, previous_state): ... + def response_sent(self, previous_state): ... + def request_received(self, previous_state): ... + def response_received(self, previous_state): ... + def data_received(self, previous_state): ... + def window_updated(self, previous_state): ... 
+ def stream_half_closed(self, previous_state): ... + def stream_ended(self, previous_state): ... + def stream_reset(self, previous_state): ... + def send_new_pushed_stream(self, previous_state): ... + def recv_new_pushed_stream(self, previous_state): ... + def send_push_promise(self, previous_state): ... + def recv_push_promise(self, previous_state): ... + def send_end_stream(self, previous_state) -> None: ... + def send_reset_stream(self, previous_state) -> None: ... + def reset_stream_on_error(self, previous_state) -> None: ... + def recv_on_closed_stream(self, previous_state) -> None: ... + def send_on_closed_stream(self, previous_state) -> None: ... + def recv_push_on_closed_stream(self, previous_state) -> None: ... + def send_push_on_closed_stream(self, previous_state) -> None: ... + def send_informational_response(self, previous_state): ... + def recv_informational_response(self, previous_state): ... + def recv_alt_svc(self, previous_state): ... + def send_alt_svc(self, previous_state) -> None: ... + +class H2Stream: + state_machine: Incomplete + stream_id: Incomplete + max_outbound_frame_size: Incomplete + request_method: Incomplete + outbound_flow_control_window: Incomplete + config: Incomplete + def __init__(self, stream_id, config, inbound_window_size, outbound_window_size) -> None: ... + @property + def inbound_flow_control_window(self): ... + @property + def open(self): ... + @property + def closed(self): ... + @property + def closed_by(self): ... + def upgrade(self, client_side) -> None: ... + def send_headers(self, headers, encoder, end_stream: bool = ...): ... + def push_stream_in_band(self, related_stream_id, headers, encoder): ... + def locally_pushed(self): ... + def send_data(self, data, end_stream: bool = ..., pad_length: Incomplete | None = ...): ... + def end_stream(self): ... + def advertise_alternative_service(self, field_value): ... + def increase_flow_control_window(self, increment): ... + def receive_push_promise_in_band(self, promised_stream_id, headers, header_encoding): ... + def remotely_pushed(self, pushed_headers): ... + def receive_headers(self, headers, end_stream, header_encoding): ... + def receive_data(self, data, end_stream, flow_control_len): ... + def receive_window_update(self, increment): ... + def receive_continuation(self) -> None: ... + def receive_alt_svc(self, frame): ... + def reset_stream(self, error_code: int = ...): ... + def stream_reset(self, frame): ... + def acknowledge_received_data(self, acknowledged_size): ... diff --git a/test_runner/stubs/h2/utilities.pyi b/test_runner/stubs/h2/utilities.pyi new file mode 100644 index 0000000000..e0a8d55d1d --- /dev/null +++ b/test_runner/stubs/h2/utilities.pyi @@ -0,0 +1,25 @@ +from typing import NamedTuple + +from _typeshed import Incomplete + +from .exceptions import FlowControlError as FlowControlError +from .exceptions import ProtocolError as ProtocolError + +UPPER_RE: Incomplete +CONNECTION_HEADERS: Incomplete + +def extract_method_header(headers): ... +def is_informational_response(headers): ... +def guard_increment_window(current, increment): ... +def authority_from_headers(headers): ... + +class HeaderValidationFlags(NamedTuple): + is_client: Incomplete + is_trailer: Incomplete + is_response_header: Incomplete + is_push_promise: Incomplete + +def validate_headers(headers, hdr_validation_flags): ... +def normalize_outbound_headers(headers, hdr_validation_flags): ... +def normalize_inbound_headers(headers, hdr_validation_flags): ... 
+def validate_outbound_headers(headers, hdr_validation_flags): ... diff --git a/test_runner/stubs/h2/windows.pyi b/test_runner/stubs/h2/windows.pyi new file mode 100644 index 0000000000..7dc78e431c --- /dev/null +++ b/test_runner/stubs/h2/windows.pyi @@ -0,0 +1,13 @@ +from _typeshed import Incomplete + +from .exceptions import FlowControlError as FlowControlError + +LARGEST_FLOW_CONTROL_WINDOW: Incomplete + +class WindowManager: + max_window_size: Incomplete + current_window_size: Incomplete + def __init__(self, max_window_size) -> None: ... + def window_consumed(self, size) -> None: ... + def window_opened(self, size) -> None: ... + def process_bytes(self, size): ... From 4ef74215e1174186c7ab8cdb41d98cb9a327d07d Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Oct 2024 13:00:03 +0000 Subject: [PATCH 118/239] pageserver: refactor generation-aware loading code into generic (#9545) ## Problem Indices used to be the only kind of object where we had to search across generations to find the most recent one. As of https://github.com/neondatabase/neon/issues/9543, manifests will need the same treatment. ## Summary of changes - Refactor download_index_part to a generic download_generation_object function, which will be usable for downloading manifest objects as well. --- .../tenant/remote_timeline_client/download.rs | 139 ++++++++++++------ 1 file changed, 91 insertions(+), 48 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 95f8f026d4..8679c68a27 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -403,59 +403,79 @@ async fn do_download_index_part( Ok((index_part, index_generation, index_part_mtime)) } -/// index_part.json objects are suffixed with a generation number, so we cannot -/// directly GET the latest index part without doing some probing. +/// Metadata objects are "generationed", meaning that they include a generation suffix. This +/// function downloads the object with the highest generation <= `my_generation`. /// -/// In this function we probe for the most recent index in a generation <= our current generation. -/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +/// Data objects (layer files) also include a generation in their path, but there is no equivalent +/// search process, because their reference from an index includes the generation. +/// +/// An expensive object listing operation is only done if necessary: the typical fast path is to issue two +/// GET operations, one to our own generation (stale attachment case), and one to the immediately preceding +/// generation (normal case when migrating/restarting). Only if both of these return 404 do we fall back +/// to listing objects. +/// +/// * `my_generation`: the value of `[crate::tenant::Tenant::generation]` +/// * `what`: for logging, what object are we downloading +/// * `prefix`: when listing objects, use this prefix (i.e. the part of the object path before the generation) +/// * `do_download`: a GET of the object in a particular generation, which should **retry indefinitely** unless +/// `cancel`` has fired. This function does not do its own retries of GET operations, and relies +/// on the function passed in to do so. +/// * `parse_path`: parse a fully qualified remote storage path to get the generation of the object. 
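+///
+/// For example (illustrative): with `my_generation` = 5 we first GET the object written by
+/// generation 5, then the one written by generation 4, and only if both return 404 do we LIST
+/// `prefix` and pick the highest generation <= 5. Besides `index_part.json`, the tenant
+/// manifest (#9543) is the next intended user of this helper.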
+#[allow(clippy::too_many_arguments)] #[tracing::instrument(skip_all, fields(generation=?my_generation))] -pub(crate) async fn download_index_part( - storage: &GenericRemoteStorage, - tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, +pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>( + storage: &'a GenericRemoteStorage, + tenant_shard_id: &'a TenantShardId, + timeline_id: &'a TimelineId, my_generation: Generation, - cancel: &CancellationToken, -) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + what: &str, + prefix: RemotePath, + do_download: DF, + parse_path: PF, + cancel: &'a CancellationToken, +) -> Result<(T, Generation, SystemTime), DownloadError> +where + DF: Fn( + &'a GenericRemoteStorage, + &'a TenantShardId, + &'a TimelineId, + Generation, + &'a CancellationToken, + ) -> DFF, + DFF: Future>, + PF: Fn(RemotePath) -> Option, + T: 'static, +{ debug_assert_current_span_has_tenant_and_timeline_id(); if my_generation.is_none() { // Operating without generations: just fetch the generation-less path - return do_download_index_part( - storage, - tenant_shard_id, - timeline_id, - my_generation, - cancel, - ) - .await; + return do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; } - // Stale case: If we were intentionally attached in a stale generation, there may already be a remote - // index in our generation. + // Stale case: If we were intentionally attached in a stale generation, the remote object may already + // exist in our generation. // // This is an optimization to avoid doing the listing for the general case below. - let res = - do_download_index_part(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; + let res = do_download(storage, tenant_shard_id, timeline_id, my_generation, cancel).await; match res { - Ok(index_part) => { - tracing::debug!( - "Found index_part from current generation (this is a stale attachment)" - ); - return Ok(index_part); + Ok(decoded) => { + tracing::debug!("Found {what} from current generation (this is a stale attachment)"); + return Ok(decoded); } Err(DownloadError::NotFound) => {} Err(e) => return Err(e), }; - // Typical case: the previous generation of this tenant was running healthily, and had uploaded - // and index part. We may safely start from this index without doing a listing, because: + // Typical case: the previous generation of this tenant was running healthily, and had uploaded the object + // we are seeking in that generation. We may safely start from this index without doing a listing, because: // - We checked for current generation case above // - generations > my_generation are to be ignored - // - any other indices that exist would have an older generation than `previous_gen`, and - // we want to find the most recent index from a previous generation. + // - any other objects that exist would have an older generation than `previous_gen`, and + // we want to find the most recent object from a previous generation. // // This is an optimization to avoid doing the listing for the general case below. 
- let res = do_download_index_part( + let res = do_download( storage, tenant_shard_id, timeline_id, @@ -464,14 +484,12 @@ pub(crate) async fn download_index_part( ) .await; match res { - Ok(index_part) => { - tracing::debug!("Found index_part from previous generation"); - return Ok(index_part); + Ok(decoded) => { + tracing::debug!("Found {what} from previous generation"); + return Ok(decoded); } Err(DownloadError::NotFound) => { - tracing::debug!( - "No index_part found from previous generation, falling back to listing" - ); + tracing::debug!("No {what} found from previous generation, falling back to listing"); } Err(e) => { return Err(e); @@ -481,12 +499,10 @@ pub(crate) async fn download_index_part( // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json // objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent // to constructing a full index path with no generation, because the generation is a suffix. - let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); - - let indices = download_retry( + let paths = download_retry( || async { storage - .list(Some(&index_prefix), ListingMode::NoDelimiter, None, cancel) + .list(Some(&prefix), ListingMode::NoDelimiter, None, cancel) .await }, "list index_part files", @@ -497,22 +513,22 @@ pub(crate) async fn download_index_part( // General case logic for which index to use: the latest index whose generation // is <= our own. See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md - let max_previous_generation = indices + let max_previous_generation = paths .into_iter() - .filter_map(|o| parse_remote_index_path(o.key)) + .filter_map(|o| parse_path(o.key)) .filter(|g| g <= &my_generation) .max(); match max_previous_generation { Some(g) => { - tracing::debug!("Found index_part in generation {g:?}"); - do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await + tracing::debug!("Found {what} in generation {g:?}"); + do_download(storage, tenant_shard_id, timeline_id, g, cancel).await } None => { // Migration from legacy pre-generation state: we have a generation but no prior // attached pageservers did. Try to load from a no-generation path. - tracing::debug!("No index_part.json* found"); - do_download_index_part( + tracing::debug!("No {what}* found"); + do_download( storage, tenant_shard_id, timeline_id, @@ -524,6 +540,33 @@ pub(crate) async fn download_index_part( } } +/// index_part.json objects are suffixed with a generation number, so we cannot +/// directly GET the latest index part without doing some probing. +/// +/// In this function we probe for the most recent index in a generation <= our current generation. 
+/// See "Finding the remote indices for timelines" in docs/rfcs/025-generation-numbers.md +pub(crate) async fn download_index_part( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + timeline_id: &TimelineId, + my_generation: Generation, + cancel: &CancellationToken, +) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); + download_generation_object( + storage, + tenant_shard_id, + timeline_id, + my_generation, + "index_part", + index_prefix, + do_download_index_part, + parse_remote_index_path, + cancel, + ) + .await +} + pub(crate) async fn download_initdb_tar_zst( conf: &'static PageServerConf, storage: &GenericRemoteStorage, From 7a1331eee56a1590ef4fb73f07e70c013c7d9c84 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Oct 2024 13:54:48 +0000 Subject: [PATCH 119/239] pageserver: make concurrent offloaded timeline operations safe wrt manifest uploads (#9557) ## Problem Uploads of the tenant manifest could race between different tasks, resulting in unexpected results in remote storage. Closes: https://github.com/neondatabase/neon/issues/9556 ## Summary of changes - Create a central function for uploads that takes a tokio::sync::Mutex - Store the latest upload in that Mutex, so that when there is lots of concurrency (e.g. archive 20 timelines at once) we can coalesce their manifest writes somewhat. --- pageserver/src/tenant.rs | 100 +++++++++++++----- .../src/tenant/remote_timeline_client.rs | 2 +- .../tenant/remote_timeline_client/manifest.rs | 4 +- pageserver/src/tenant/timeline/delete.rs | 43 +++----- pageserver/src/tenant/timeline/offload.rs | 17 +-- 5 files changed, 94 insertions(+), 72 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 7f8af67c2c..64e4eb46ce 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -302,6 +302,13 @@ pub struct Tenant { /// **Lock order**: if acquiring all (or a subset), acquire them in order `timelines`, `timelines_offloaded`, `timelines_creating` timelines_offloaded: Mutex>>, + /// Serialize writes of the tenant manifest to remote storage. If there are concurrent operations + /// affecting the manifest, such as timeline deletion and timeline offload, they must wait for + /// each other (this could be optimized to coalesce writes if necessary). + /// + /// The contents of the Mutex are the last manifest we successfully uploaded + tenant_manifest_upload: tokio::sync::Mutex>, + // This mutex prevents creation of new timelines during GC. 
// Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -741,6 +748,24 @@ pub enum TimelineArchivalError { Other(anyhow::Error), } +#[derive(thiserror::Error, Debug)] +pub(crate) enum TenantManifestError { + #[error("Remote storage error: {0}")] + RemoteStorage(anyhow::Error), + + #[error("Cancelled")] + Cancelled, +} + +impl From for TimelineArchivalError { + fn from(e: TenantManifestError) -> Self { + match e { + TenantManifestError::RemoteStorage(e) => Self::Other(e), + TenantManifestError::Cancelled => Self::Cancelled, + } + } +} + impl Debug for TimelineArchivalError { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { @@ -1526,18 +1551,7 @@ impl Tenant { offloaded_timelines_accessor.extend(offloaded_timelines_list.into_iter()); } if !offloaded_timeline_ids.is_empty() { - let manifest = self.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - generation, - &manifest, - &self.cancel, - ) - .await - .map_err(TimelineArchivalError::Other)?; + self.store_tenant_manifest().await?; } // The local filesystem contents are a cache of what's in the remote IndexPart; @@ -1918,18 +1932,7 @@ impl Tenant { }; // Upload new list of offloaded timelines to S3 - let manifest = self.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - upload_tenant_manifest( - &self.remote_storage, - &self.tenant_shard_id, - generation, - &manifest, - &cancel, - ) - .await - .map_err(TimelineArchivalError::Other)?; + self.store_tenant_manifest().await?; // Activate the timeline (if it makes sense) if !(timeline.is_broken() || timeline.is_stopping()) { @@ -3126,7 +3129,7 @@ impl Tenant { } } - let tenant_manifest = self.tenant_manifest(); + let tenant_manifest = self.build_tenant_manifest(); // TODO: generation support let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; for child_shard in child_shards { @@ -3321,7 +3324,8 @@ impl Tenant { .unwrap_or(self.conf.default_tenant_conf.lsn_lease_length) } - pub(crate) fn tenant_manifest(&self) -> TenantManifest { + /// Generate an up-to-date TenantManifest based on the state of this Tenant. + fn build_tenant_manifest(&self) -> TenantManifest { let timelines_offloaded = self.timelines_offloaded.lock().unwrap(); let mut timeline_manifests = timelines_offloaded @@ -3529,6 +3533,7 @@ impl Tenant { timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), timelines_offloaded: Mutex::new(HashMap::new()), + tenant_manifest_upload: Default::default(), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, @@ -4708,6 +4713,49 @@ impl Tenant { .max() .unwrap_or(0) } + + /// Serialize and write the latest TenantManifest to remote storage. + pub(crate) async fn store_tenant_manifest(&self) -> Result<(), TenantManifestError> { + // Only one manifest write may be done at at time, and the contents of the manifest + // must be loaded while holding this lock. This makes it safe to call this function + // from anywhere without worrying about colliding updates. + let mut guard = tokio::select! 
{ + g = self.tenant_manifest_upload.lock() => { + g + }, + _ = self.cancel.cancelled() => { + return Err(TenantManifestError::Cancelled); + } + }; + + let manifest = self.build_tenant_manifest(); + if Some(&manifest) == (*guard).as_ref() { + // Optimisation: skip uploads that don't change anything. + return Ok(()); + } + + upload_tenant_manifest( + &self.remote_storage, + &self.tenant_shard_id, + self.generation, + &manifest, + &self.cancel, + ) + .await + .map_err(|e| { + if self.cancel.is_cancelled() { + TenantManifestError::Cancelled + } else { + TenantManifestError::RemoteStorage(e) + } + })?; + + // Store the successfully uploaded manifest, so that future callers can avoid + // re-uploading the same thing. + *guard = Some(manifest); + + Ok(()) + } } /// Create the cluster temporarily in 'initdbpath' directory inside the repository diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1c72c7fff8..19e762b9fa 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -249,7 +249,7 @@ pub(crate) use download::{ list_remote_tenant_shards, list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; -pub(crate) use upload::{upload_initdb_dir, upload_tenant_manifest}; +pub(crate) use upload::upload_initdb_dir; // Occasional network issues and such can cause remote operations to fail, and // that's expected. If a download fails, we log it at info-level, and retry. diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index 7d92d45146..c4382cb648 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize}; use utils::{id::TimelineId, lsn::Lsn}; /// Tenant-shard scoped manifest -#[derive(Clone, Serialize, Deserialize)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq)] pub struct TenantManifest { /// Debugging aid describing the version of this manifest. /// Can also be used for distinguishing breaking changes later on. @@ -23,7 +23,7 @@ pub struct TenantManifest { /// Very similar to [`pageserver_api::models::OffloadedTimelineInfo`], /// but the two datastructures serve different needs, this is for a persistent disk format /// that must be backwards compatible, while the other is only for informative purposes. 
-#[derive(Clone, Serialize, Deserialize, Copy)] +#[derive(Clone, Serialize, Deserialize, Copy, PartialEq, Eq)] pub struct OffloadedTimelineManifest { pub timeline_id: TimelineId, /// Whether the timeline has a parent it has been branched off from or not diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 53b65da515..2c6161da15 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -14,10 +14,9 @@ use crate::{ task_mgr::{self, TaskKind}, tenant::{ metadata::TimelineMetadata, - remote_timeline_client::{ - self, MaybeDeletedIndexPart, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, - }, - CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, + TimelineOrOffloaded, }, }; @@ -176,32 +175,6 @@ async fn remove_maybe_offloaded_timeline_from_tenant( Ok(()) } -/// It is important that this gets called when DeletionGuard is being held. -/// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn upload_new_tenant_manifest( - tenant: &Tenant, - _: &DeletionGuard, // using it as a witness -) -> anyhow::Result<()> { - // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash - // between the deletion of the index-part.json and reaching of this code. - // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. - // However, we handle this case in tenant loading code so the next time we attach, the issue is - // resolved. - let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; - - Ok(()) -} - /// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, /// and deletes its data from both disk and s3. /// The sequence of steps: @@ -480,7 +453,15 @@ impl DeleteTimelineFlow { remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; - upload_new_tenant_manifest(tenant, &guard).await?; + // This is susceptible to race conditions, i.e. we won't continue deletions if there is a crash + // between the deletion of the index-part.json and reaching of this code. + // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. + // However, we handle this case in tenant loading code so the next time we attach, the issue is + // resolved. 
+ tenant + .store_tenant_manifest() + .await + .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?; *guard = Self::Finished; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 8e6eceb084..305c139b54 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{remote_timeline_client, OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; pub(crate) async fn offload_timeline( tenant: &Tenant, @@ -63,17 +63,10 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - let manifest = tenant.tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; - remote_timeline_client::upload_tenant_manifest( - &tenant.remote_storage, - &tenant.tenant_shard_id, - generation, - &manifest, - &tenant.cancel, - ) - .await?; + tenant + .store_tenant_manifest() + .await + .map_err(|e| anyhow::anyhow!(e))?; Ok(()) } From 793ad50b7d54c2c45c19e362b7bd9894a389d2cb Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 29 Oct 2024 14:25:23 +0000 Subject: [PATCH 120/239] fix allow_unstable_extensions GUC - make it USERSET (#9563) fix message wording --- pgxn/neon/unstable_extensions.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pgxn/neon/unstable_extensions.c b/pgxn/neon/unstable_extensions.c index a3445cb268..72de2871f4 100644 --- a/pgxn/neon/unstable_extensions.c +++ b/pgxn/neon/unstable_extensions.c @@ -65,8 +65,8 @@ CheckUnstableExtension( { ereport(ERROR, (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE), - errmsg("installing %s is currently prohibited", stmt->extname), - errhint("Set neon.allow_unstable_extensions to true"))); + errmsg("%s extension is in beta and may be unstable or introduce backward-incompatible changes.\nWe recommend testing it in a separate, dedicated Neon project.", stmt->extname), + errhint("to proceed with installation, run SET neon.allow_unstable_extensions='true'"))); } break; } @@ -110,13 +110,13 @@ InitUnstableExtensionsSupport(void) NULL, &allow_unstable_extensions, false, - PGC_SUSET, + PGC_USERSET, 0, NULL, NULL, NULL); DefineCustomStringVariable( "neon.unstable_extensions", - "Allow unstable extensions to be installed and used", + "List of unstable extensions", NULL, &unstable_extensions, NULL, From 57499640c5ab677796f61a5cd813fd6c881998e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Ko=C5=82odziejczak?= <31549762+mrl5@users.noreply.github.com> Date: Tue, 29 Oct 2024 16:44:45 +0100 Subject: [PATCH 121/239] proxy: more granular http status codes for sql-over-http errors (#9549) closes #9532 --- proxy/src/serverless/error.rs | 5 +++++ proxy/src/serverless/mod.rs | 1 + proxy/src/serverless/sql_over_http.rs | 24 +++++++++++++++++++++--- test_runner/regress/test_proxy.py | 2 +- 4 files changed, 28 insertions(+), 4 deletions(-) create mode 100644 proxy/src/serverless/error.rs diff --git a/proxy/src/serverless/error.rs b/proxy/src/serverless/error.rs new file mode 100644 index 0000000000..323c91baa5 --- /dev/null +++ 
b/proxy/src/serverless/error.rs @@ -0,0 +1,5 @@ +use http::StatusCode; + +pub trait HttpCodeError { + fn get_http_status_code(&self) -> StatusCode; +} diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 8fb7a771d9..edbb0347d3 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -6,6 +6,7 @@ mod backend; pub mod cancel_set; mod conn_pool; mod conn_pool_lib; +mod error; mod http_conn_pool; mod http_util; mod json; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 1f3eec6d19..0713c27d65 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -28,6 +28,7 @@ use uuid::Uuid; use super::backend::{LocalProxyConnError, PoolingBackend}; use super::conn_pool::{AuthData, ConnInfoWithAuth}; use super::conn_pool_lib::{self, ConnInfo}; +use super::error::HttpCodeError; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; use super::local_conn_pool; @@ -238,7 +239,6 @@ fn get_conn_info( Ok(ConnInfoWithAuth { conn_info, auth }) } -// TODO: return different http error codes pub(crate) async fn handle( config: &'static ProxyConfig, ctx: RequestMonitoring, @@ -319,9 +319,8 @@ pub(crate) async fn handle( "forwarding error to user" ); - // TODO: this shouldn't always be bad request. json_response( - StatusCode::BAD_REQUEST, + e.get_http_status_code(), json!({ "message": message, "code": code, @@ -405,6 +404,25 @@ impl UserFacingError for SqlOverHttpError { } } +impl HttpCodeError for SqlOverHttpError { + fn get_http_status_code(&self) -> StatusCode { + match self { + SqlOverHttpError::ReadPayload(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::ConnectCompute(h) => match h.get_error_kind() { + ErrorKind::User => StatusCode::BAD_REQUEST, + _ => StatusCode::INTERNAL_SERVER_ERROR, + }, + SqlOverHttpError::ConnInfo(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::RequestTooLarge(_) => StatusCode::PAYLOAD_TOO_LARGE, + SqlOverHttpError::ResponseTooLarge(_) => StatusCode::INSUFFICIENT_STORAGE, + SqlOverHttpError::InvalidIsolationLevel => StatusCode::BAD_REQUEST, + SqlOverHttpError::Postgres(_) => StatusCode::BAD_REQUEST, + SqlOverHttpError::JsonConversion(_) => StatusCode::INTERNAL_SERVER_ERROR, + SqlOverHttpError::Cancelled(_) => StatusCode::INTERNAL_SERVER_ERROR, + } + } +} + #[derive(Debug, thiserror::Error)] pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index f598900af9..e59d46e352 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -561,7 +561,7 @@ def test_sql_over_http_pool_dos(static_proxy: NeonProxy): # query generates a million rows - should hit the 10MB reponse limit quickly response = query( - 400, + 507, "select * from generate_series(1, 5000) a cross join generate_series(1, 5000) b cross join (select 'foo'::foo) c;", ) assert "response is too large (max is 10485760 bytes)" in response["message"] From 80e163004242ebb048447053a3fa3c9d432dd085 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Tue, 29 Oct 2024 15:57:52 +0000 Subject: [PATCH 122/239] Use pg_mooncake from our fork. 
(#9565) Switch to main repo once https://github.com/Mooncake-Labs/pg_mooncake/pull/3 is merged --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 1b2167ea11..85fb9c441d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1131,14 +1131,14 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PG_MOONCAKE_VERSION=0a7de4c0b5c7b1a5e2175e1c5f4625b97b7346f1 +ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ 'v14') \ echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ esac && \ - git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ + git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \ cd pg_mooncake-src && \ git checkout "${PG_MOONCAKE_VERSION}" && \ git submodule update --init --depth 1 --recursive && \ From 0c075fab3a73c89a6857b46c9b85afca968ee646 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Tue, 29 Oct 2024 18:40:10 +0200 Subject: [PATCH 123/239] Add --replica parameter to basebackup (#9553) ## Problem See https://github.com/neondatabase/neon/pull/9458 This PR separates PS related changes in #9458 from compute_ctl changes to enforce that PS is deployed before compute. ## Summary of changes This PR adds handlings of `--replica` parameters of backebackup to page server. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
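
For illustration only (the actual invocation comes from the compute side, which is
changed separately in #9458): the page server's `basebackup` command now accepts the
new flag alongside the existing optional parameters, roughly
`basebackup <tenant_id> <timeline_id> [lsn] [--gzip] [--replica]`, with the optional
parameters accepted in any order, as parsed in `page_service.rs` below.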
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/basebackup.rs | 11 ++++++++-- pageserver/src/page_service.rs | 37 +++++++++++++++++++++------------- 2 files changed, 32 insertions(+), 16 deletions(-) diff --git a/pageserver/src/basebackup.rs b/pageserver/src/basebackup.rs index 975318419f..cae0ffb980 100644 --- a/pageserver/src/basebackup.rs +++ b/pageserver/src/basebackup.rs @@ -59,6 +59,7 @@ pub async fn send_basebackup_tarball<'a, W>( req_lsn: Option, prev_lsn: Option, full_backup: bool, + replica: bool, ctx: &'a RequestContext, ) -> Result<(), BasebackupError> where @@ -110,8 +111,8 @@ where }; info!( - "taking basebackup lsn={}, prev_lsn={} (full_backup={})", - backup_lsn, prev_lsn, full_backup + "taking basebackup lsn={}, prev_lsn={} (full_backup={}, replica={})", + backup_lsn, prev_lsn, full_backup, replica ); let basebackup = Basebackup { @@ -120,6 +121,7 @@ where lsn: backup_lsn, prev_record_lsn: prev_lsn, full_backup, + replica, ctx, }; basebackup @@ -140,6 +142,7 @@ where lsn: Lsn, prev_record_lsn: Lsn, full_backup: bool, + replica: bool, ctx: &'a RequestContext, } @@ -372,6 +375,10 @@ where for (path, content) in aux_files { if path.starts_with("pg_replslot") { + // Do not create LR slots at standby because they are not used but prevent WAL truncation + if self.replica { + continue; + } let offs = pg_constants::REPL_SLOT_ON_DISK_OFFSETOF_RESTART_LSN; let restart_lsn = Lsn(u64::from_le_bytes( content[offs..offs + 8].try_into().unwrap(), diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 62b14cb83e..aed8a87851 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1080,6 +1080,7 @@ impl PageServerHandler { prev_lsn: Option, full_backup: bool, gzip: bool, + replica: bool, ctx: &RequestContext, ) -> Result<(), QueryError> where @@ -1132,6 +1133,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, + replica, ctx, ) .await @@ -1154,6 +1156,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, + replica, ctx, ) .await @@ -1170,6 +1173,7 @@ impl PageServerHandler { lsn, prev_lsn, full_backup, + replica, ctx, ) .await @@ -1326,24 +1330,27 @@ where .for_command(ComputeCommandKind::Basebackup) .inc(); - let (lsn, gzip) = match (params.get(2), params.get(3)) { - (None, _) => (None, false), - (Some(&"--gzip"), _) => (None, true), - (Some(lsn_str), gzip_str_opt) => { - let lsn = Lsn::from_str(lsn_str) - .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?; - let gzip = match gzip_str_opt { - Some(&"--gzip") => true, - None => false, - Some(third_param) => { + let mut lsn = None; + let mut replica = false; + let mut gzip = false; + for param in ¶ms[2..] 
{ + if param.starts_with("--") { + match *param { + "--gzip" => gzip = true, + "--replica" => replica = true, + _ => { return Err(QueryError::Other(anyhow::anyhow!( - "Parameter in position 3 unknown {third_param}", + "Unknown parameter {param}", ))) } - }; - (Some(lsn), gzip) + } + } else { + lsn = Some( + Lsn::from_str(param) + .with_context(|| format!("Failed to parse Lsn from {param}"))?, + ); } - }; + } let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); let res = async { @@ -1355,6 +1362,7 @@ where None, false, gzip, + replica, &ctx, ) .await?; @@ -1415,6 +1423,7 @@ where prev_lsn, true, false, + false, &ctx, ) .await?; From 88ff8a78032d4b58ac9d44efbd8cd4ae2be6040d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Oct 2024 14:25:32 -0400 Subject: [PATCH 124/239] feat(pageserver): support partial gc-compaction for lowest retain lsn (#9134) part of https://github.com/neondatabase/neon/issues/8921, https://github.com/neondatabase/neon/issues/9114 ## Summary of changes We start the partial compaction implementation with the image layer partial generation. The partial compaction API now takes a key range. We will only generate images for that key range for now, and remove layers fully included in the key range after compaction. --------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/tenant.rs | 264 +++++++++++++++++-- pageserver/src/tenant/timeline/compaction.rs | 229 +++++++++++----- 2 files changed, 414 insertions(+), 79 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 64e4eb46ce..6ac11b0ae1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -5138,6 +5138,7 @@ mod tests { use pageserver_api::keyspace::KeySpace; use pageserver_api::models::{CompactionAlgorithm, CompactionAlgorithmSettings}; use pageserver_api::value::Value; + use pageserver_compaction::helpers::overlaps_with; use rand::{thread_rng, Rng}; use storage_layer::PersistentLayerKey; use tests::storage_layer::ValuesReconstructState; @@ -7660,23 +7661,7 @@ mod tests { } // Check if old layers are removed / new layers have the expected LSN - let mut all_layers = tline.inspect_historic_layers().await.unwrap(); - all_layers.sort_by(|k1, k2| { - ( - k1.is_delta, - k1.key_range.start, - k1.key_range.end, - k1.lsn_range.start, - k1.lsn_range.end, - ) - .cmp(&( - k2.is_delta, - k2.key_range.start, - k2.key_range.end, - k2.lsn_range.start, - k2.lsn_range.end, - )) - }); + let all_layers = inspect_and_sort(&tline, None).await; assert_eq!( all_layers, vec![ @@ -9220,4 +9205,249 @@ mod tests { Ok(()) } + + async fn inspect_and_sort( + tline: &Arc, + filter: Option>, + ) -> Vec { + let mut all_layers = tline.inspect_historic_layers().await.unwrap(); + if let Some(filter) = filter { + all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter)); + } + all_layers.sort_by(|k1, k2| { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + }); + all_layers + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> { + let harness = TenantHarness::create("test_simple_partial_bottom_most_compaction").await?; + let (tenant, ctx) = harness.load().await; + + fn get_key(id: u32) -> Key { + // using aux key here b/c they are guaranteed to be inside 
`collect_keyspace`. + let mut key = Key::from_hex("620000000033333333444444445500000000").unwrap(); + key.field6 = id; + key + } + + // img layer at 0x10 + let img_layer = (0..10) + .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) + .collect_vec(); + + let delta1 = vec![ + ( + get_key(1), + Lsn(0x20), + Value::Image(Bytes::from("value 1@0x20")), + ), + ( + get_key(2), + Lsn(0x30), + Value::Image(Bytes::from("value 2@0x30")), + ), + ( + get_key(3), + Lsn(0x40), + Value::Image(Bytes::from("value 3@0x40")), + ), + ]; + let delta2 = vec![ + ( + get_key(5), + Lsn(0x20), + Value::Image(Bytes::from("value 5@0x20")), + ), + ( + get_key(6), + Lsn(0x20), + Value::Image(Bytes::from("value 6@0x20")), + ), + ]; + let delta3 = vec![ + ( + get_key(8), + Lsn(0x48), + Value::Image(Bytes::from("value 8@0x48")), + ), + ( + get_key(9), + Lsn(0x48), + Value::Image(Bytes::from("value 9@0x48")), + ), + ]; + + let tline = tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta1), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x20)..Lsn(0x48), delta2), + DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await?; + + { + // Update GC info + let mut guard = tline.gc_info.write().unwrap(); + *guard = GcInfo { + retain_lsns: vec![(Lsn(0x20), tline.timeline_id, MaybeOffloaded::No)], + cutoffs: GcCutoffs { + time: Lsn(0x30), + space: Lsn(0x30), + }, + leases: Default::default(), + within_ancestor_pitr: false, + }; + } + + let cancel = CancellationToken::new(); + + // Do a partial compaction on key range 0..4, we should generate a image layer; no other layers + // can be removed because they might be used for other key ranges. 
+ tline + .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + assert_eq!( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(1)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(5)..get_key(7), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + // Do a partial compaction on key range 4..10 + tline + .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + assert_eq!( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + // if (in the future) GC kicks in, this layer will be removed + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(1)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(5)..get_key(7), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. + tline + .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + assert_eq!( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false + }, + PersistentLayerKey { + key_range: get_key(1)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(5)..get_key(7), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true + } + ] + ); + + Ok(()) + } } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 70f93656cd..01c2803881 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1716,20 +1716,32 @@ impl Timeline { Ok(()) } - /// An experimental compaction building block that combines compaction with garbage collection. - /// - /// The current implementation picks all delta + image layers that are below or intersecting with - /// the GC horizon without considering retain_lsns. 
Then, it does a full compaction over all these delta
-    /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon,
-    /// and create delta layers with all deltas >= gc horizon.
     pub(crate) async fn compact_with_gc(
         self: &Arc<Self>,
         cancel: &CancellationToken,
         flags: EnumSet<CompactFlags>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        use std::collections::BTreeSet;
+        self.partial_compact_with_gc(None, cancel, flags, ctx).await
+    }

+    /// An experimental compaction building block that combines compaction with garbage collection.
+    ///
+    /// The current implementation picks all delta + image layers that are below or intersecting with
+    /// the GC horizon without considering retain_lsns. Then, it does a full compaction over all these delta
+    /// layers and image layers, which generates image layers on the gc horizon, drops deltas below the gc
+    /// horizon, and creates delta layers with all deltas >= gc horizon.
+    ///
+    /// If `compaction_key_range` is set, only the keys within that range are compacted (partial compaction).
+    /// This functionality is not complete yet: when the range is set, only image layers will be generated.
+    ///
+    pub(crate) async fn partial_compact_with_gc(
+        self: &Arc<Self>,
+        compaction_key_range: Option<Range<Key>>,
+        cancel: &CancellationToken,
+        flags: EnumSet<CompactFlags>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
         // Block other compaction/GC tasks from running for now. GC-compaction could run along
         // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc.
         // Note that we already acquired the compaction lock when the outer `compact` function gets called.
@@ -1750,8 +1762,13 @@ impl Timeline {
             .await?;

         let dry_run = flags.contains(CompactFlags::DryRun);
+        let partial_compaction = compaction_key_range.is_some();

-        info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+        if let Some(ref compaction_key_range) = compaction_key_range {
+            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end);
+        } else {
+            info!("running enhanced gc bottom-most compaction, dry_run={dry_run}");
+        }

         scopeguard::defer! {
             info!("done enhanced gc bottom-most compaction");
@@ -1763,7 +1780,7 @@ impl Timeline {
         // The layer selection has the following properties:
         // 1. If a layer is in the selection, all layers below it are in the selection.
         // 2. Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection.
-        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = {
+        let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction {
             let guard = self.layers.read().await;
             let layers = guard.layer_map()?;
             let gc_info = self.gc_info.read().unwrap();
@@ -1779,7 +1796,7 @@ impl Timeline {
                     retain_lsns_below_horizon.push(*lsn);
                 }
             }
-            let mut selected_layers = Vec::new();
+            let mut selected_layers: Vec<Layer> = Vec::new();
             drop(gc_info);
             // Pick all the layers intersect or below the gc_cutoff, get the largest LSN in the selected layers.
             let Some(max_layer_lsn) = layers
@@ -1804,8 +1821,52 @@ impl Timeline {
             }
             retain_lsns_below_horizon.sort();
             (selected_layers, gc_cutoff, retain_lsns_below_horizon)
+        } else {
+            // In case of partial compaction, we currently only support generating image layers, and therefore,
+            // we pick all layers that are below the lowest retain_lsn and do not intersect with it.
+ let guard = self.layers.read().await; + let layers = guard.layer_map()?; + let gc_info = self.gc_info.read().unwrap(); + let mut min_lsn = gc_info.cutoffs.select_min(); + for (lsn, _, _) in &gc_info.retain_lsns { + if lsn < &min_lsn { + min_lsn = *lsn; + } + } + for lsn in gc_info.leases.keys() { + if lsn < &min_lsn { + min_lsn = *lsn; + } + } + let mut selected_layers = Vec::new(); + drop(gc_info); + // |-------| |-------| |-------| + // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers + // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that + // | Delta | | Delta | | Delta | ...we can remove them after compaction + // |-------| |-------| |-------| + // Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers. + let Some(compaction_key_range) = compaction_key_range.as_ref() else { + unreachable!() + }; + for desc in layers.iter_historic_layers() { + if desc.get_lsn_range().end <= min_lsn + && overlaps_with(&desc.key_range, compaction_key_range) + { + selected_layers.push(guard.get_from_desc(&desc)); + } + } + if selected_layers.is_empty() { + info!("no layers to compact with gc"); + return Ok(()); + } + (selected_layers, min_lsn, Vec::new()) }; let lowest_retain_lsn = if self.ancestor_timeline.is_some() { + if partial_compaction { + warn!("partial compaction cannot run on child branches (for now)"); + return Ok(()); + } Lsn(self.ancestor_lsn.0 + 1) } else { let res = retain_lsns_below_horizon @@ -1833,23 +1894,18 @@ impl Timeline { self.check_compaction_space(&layer_selection).await?; - // Step 1: (In the future) construct a k-merge iterator over all layers. For now, simply collect all keys + LSNs. - // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let mut lsn_split_point = BTreeSet::new(); // TODO: use a better data structure (range tree / range set?) + // Generate statistics for the compaction for layer in &layer_selection { let desc = layer.layer_desc(); if desc.is_delta() { - // ignore single-key layer files - if desc.key_range.start.next() != desc.key_range.end { - let lsn_range = &desc.lsn_range; - lsn_split_point.insert(lsn_range.start); - lsn_split_point.insert(lsn_range.end); - } stat.visit_delta_layer(desc.file_size()); } else { stat.visit_image_layer(desc.file_size()); } } + + // Step 1: construct a k-merge iterator over all layers. + // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. let layer_names: Vec = layer_selection .iter() .map(|layer| layer.layer_desc().layer_name()) @@ -1900,7 +1956,10 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - Key::MIN, + compaction_key_range + .as_ref() + .map(|x| x.start) + .unwrap_or(Key::MIN), lowest_retain_lsn, self.get_compaction_target_size(), ctx, @@ -1961,55 +2020,71 @@ impl Timeline { } else { let last_key = last_key.as_mut().unwrap(); stat.on_unique_key_visited(); - let retention = self - .generate_key_retention( - *last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- retention - .pipe_to( - *last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; + let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { + !compaction_key_range.contains(last_key) + } else { + false + }; + if !skip_adding_key { + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, + ) + .await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. + retention + .pipe_to( + *last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; + } accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); } } + // TODO: move the below part to the loop body let last_key = last_key.expect("no keys produced during compaction"); - // TODO: move this part to the loop body stat.on_unique_key_visited(); - let retention = self - .generate_key_retention( - last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. - retention - .pipe_to( - last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; + + let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { + !compaction_key_range.contains(&last_key) + } else { + false + }; + if !skip_adding_key { + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + gc_cutoff, + &retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, + ) + .await?; + // Put the image into the image layer. Currently we have a single big layer for the compaction. + retention + .pipe_to( + last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; + } + // end: move the above part to the loop body let discard = |key: &PersistentLayerKey| { let key = key.clone(); @@ -2018,8 +2093,12 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { + let end_key = compaction_key_range + .as_ref() + .map(|x| x.end) + .unwrap_or(Key::MAX); writer - .finish_with_discard_fn(self, ctx, Key::MAX, discard) + .finish_with_discard_fn(self, ctx, end_key, discard) .await? } else { drop(writer); @@ -2038,6 +2117,10 @@ impl Timeline { Vec::new() }; + if partial_compaction && !produced_delta_layers.is_empty() { + bail!("implementation error: partial compaction should not be producing delta layers (for now)"); + } + let mut compact_to = Vec::new(); let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); @@ -2068,6 +2151,28 @@ impl Timeline { } let mut layer_selection = layer_selection; layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + if let Some(ref compaction_key_range) = compaction_key_range { + // Partial compaction might select more data than it processes, e.g., if + // the compaction_key_range only partially overlaps: + // + // [---compaction_key_range---] + // [---A----][----B----][----C----][----D----] + // + // A,B,C,D are all in the `layer_selection`. 
The created image layers contain + // whatever is needed from B, C, and from `----]` of A, and from `[--` of D. + // + // In contrast, `[--A-` and `--D----]` have not been processed, so, we must + // keep that data. + // + // The solution for now is to keep A and D completely. + // (layer_selection is what we'll remove from the layer map, so, + // retain what is _not_ fully covered by compaction_key_range). + layer_selection.retain(|x| { + let key_range = &x.layer_desc().key_range; + key_range.start >= compaction_key_range.start + && key_range.end <= compaction_key_range.end + }); + } info!( "gc-compaction statistics: {}", From 81f9aba0057fb6efefa71aaa57fe2b4ec93899ad Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 29 Oct 2024 15:16:23 -0400 Subject: [PATCH 125/239] fix(pagectl): layer parsing and image layer dump (#9571) This patch contains various improvements for the pagectl tool. ## Summary of changes * Rewrite layer name parsing: LayerName now supports all variants we use now. * Drop pagectl's own layer parsing function, use LayerName in the pageserver crate. * Support image layer dumping in the layer dump command using ImageLayer::dump, drop the original implementation. Signed-off-by: Alex Chi Z --- pageserver/ctl/src/layer_map_analyzer.rs | 41 ++----- pageserver/ctl/src/layers.rs | 70 ++++-------- .../src/tenant/storage_layer/delta_layer.rs | 4 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- .../src/tenant/storage_layer/layer_name.rs | 105 +++++++++--------- 5 files changed, 90 insertions(+), 132 deletions(-) diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index 451d2a1d69..11b8e98f57 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -2,7 +2,7 @@ //! //! Currently it only analyzes holes, which are regions within the layer range that the layer contains no updates for. In the future it might do more analysis (maybe key quantiles?) but it should never return sensitive data. 
-use anyhow::Result; +use anyhow::{anyhow, Result}; use camino::{Utf8Path, Utf8PathBuf}; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; @@ -11,13 +11,14 @@ use pageserver::virtual_file::api::IoMode; use std::cmp::Ordering; use std::collections::BinaryHeap; use std::ops::Range; +use std::str::FromStr; use std::{fs, str}; use pageserver::page_cache::{self, PAGE_SZ}; use pageserver::tenant::block_io::FileBlockReader; use pageserver::tenant::disk_btree::{DiskBtreeReader, VisitDirection}; use pageserver::tenant::storage_layer::delta_layer::{Summary, DELTA_KEY_SIZE}; -use pageserver::tenant::storage_layer::range_overlaps; +use pageserver::tenant::storage_layer::{range_overlaps, LayerName}; use pageserver::virtual_file::{self, VirtualFile}; use pageserver_api::key::{Key, KEY_SIZE}; @@ -74,35 +75,15 @@ impl LayerFile { } } -pub(crate) fn parse_filename(name: &str) -> Option { - let split: Vec<&str> = name.split("__").collect(); - if split.len() != 2 { - return None; - } - let keys: Vec<&str> = split[0].split('-').collect(); - let lsn_and_opt_generation: Vec<&str> = split[1].split('v').collect(); - let lsns: Vec<&str> = lsn_and_opt_generation[0].split('-').collect(); - let the_lsns: [&str; 2]; +pub(crate) fn parse_filename(name: &str) -> anyhow::Result { + let layer_name = + LayerName::from_str(name).map_err(|e| anyhow!("failed to parse layer name: {e}"))?; - /* - * Generations add a -vX-XXXXXX postfix, which causes issues when we try to - * parse 'vX' as an LSN. - */ - let is_delta = if lsns.len() == 1 || lsns[1].is_empty() { - the_lsns = [lsns[0], lsns[0]]; - false - } else { - the_lsns = [lsns[0], lsns[1]]; - true - }; - - let key_range = Key::from_hex(keys[0]).unwrap()..Key::from_hex(keys[1]).unwrap(); - let lsn_range = Lsn::from_hex(the_lsns[0]).unwrap()..Lsn::from_hex(the_lsns[1]).unwrap(); let holes = Vec::new(); - Some(LayerFile { - key_range, - lsn_range, - is_delta, + Ok(LayerFile { + key_range: layer_name.key_range().clone(), + lsn_range: layer_name.lsn_as_range(), + is_delta: layer_name.is_delta(), holes, }) } @@ -179,7 +160,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { for layer in fs::read_dir(timeline.path())? 
{ let layer = layer?; - if let Some(mut layer_file) = + if let Ok(mut layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) { if layer_file.is_delta { diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index 22627d72c8..6f543dcaa9 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -5,24 +5,12 @@ use camino::{Utf8Path, Utf8PathBuf}; use clap::Subcommand; use pageserver::context::{DownloadBehavior, RequestContext}; use pageserver::task_mgr::TaskKind; -use pageserver::tenant::block_io::BlockCursor; -use pageserver::tenant::disk_btree::DiskBtreeReader; -use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary}; use pageserver::tenant::storage_layer::{delta_layer, image_layer}; use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer}; use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME}; use pageserver::virtual_file::api::IoMode; use pageserver::{page_cache, virtual_file}; -use pageserver::{ - tenant::{ - block_io::FileBlockReader, disk_btree::VisitDirection, - storage_layer::delta_layer::DELTA_KEY_SIZE, - }, - virtual_file::VirtualFile, -}; -use pageserver_api::key::{Key, KEY_SIZE}; -use std::fs; -use utils::bin_ser::BeSer; +use std::fs::{self, File}; use utils::id::{TenantId, TimelineId}; use crate::layer_map_analyzer::parse_filename; @@ -59,44 +47,30 @@ pub(crate) enum LayerCmd { } async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { - let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); virtual_file::init( 10, virtual_file::api::IoEngineKind::StdFs, IoMode::preferred(), ); page_cache::init(100); - let file = VirtualFile::open(path, ctx).await?; - let file_id = page_cache::next_file_id(); - let block_reader = FileBlockReader::new(&file, file_id); - let summary_blk = block_reader.read_blk(0, ctx).await?; - let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; - let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - actual_summary.index_start_blk, - actual_summary.index_root_blk, - &block_reader, + let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); + let file = File::open(path)?; + let delta_layer = DeltaLayer::new_for_path(path, file)?; + delta_layer.dump(true, ctx).await?; + Ok(()) +} + +async fn read_image_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { + virtual_file::init( + 10, + virtual_file::api::IoEngineKind::StdFs, + IoMode::preferred(), ); - // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. - let mut all = vec![]; - tree_reader - .visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |key, value_offset| { - let curr = Key::from_slice(&key[..KEY_SIZE]); - all.push((curr, BlobRef(value_offset))); - true - }, - ctx, - ) - .await?; - let cursor = BlockCursor::new_fileblockreader(&block_reader); - for (k, v) in all { - let value = cursor.read_blob(v.pos(), ctx).await?; - println!("key:{} value_len:{}", k, value.len()); - assert!(k.is_i128_representable(), "invalid key: "); - } - // TODO(chi): special handling for last key? + page_cache::init(100); + let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); + let file = File::open(path)?; + let image_layer = ImageLayer::new_for_path(path, file)?; + image_layer.dump(true, ctx).await?; Ok(()) } @@ -133,8 +107,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { let mut idx = 0; for layer in fs::read_dir(timeline_path)? 
{
                 let layer = layer?;
-                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
-                {
+                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                     println!(
                         "[{:3}] key:{}-{}\n lsn:{}-{}\n delta:{}",
                         idx,
@@ -163,8 +136,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
             let mut idx = 0;
             for layer in fs::read_dir(timeline_path)? {
                 let layer = layer?;
-                if let Some(layer_file) = parse_filename(&layer.file_name().into_string().unwrap())
-                {
+                if let Ok(layer_file) = parse_filename(&layer.file_name().into_string().unwrap()) {
                     if *id == idx {
                         // TODO(chi): dedup code
                         println!(
@@ -180,7 +152,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                         if layer_file.is_delta {
                             read_delta_file(layer.path(), &ctx).await?;
                         } else {
-                            anyhow::bail!("not supported yet :(");
+                            read_image_file(layer.path(), &ctx).await?;
                         }

                         break;
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 10165b1d06..664c00a6b1 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -270,7 +270,7 @@ impl AsLayerDesc for DeltaLayer {
 }

 impl DeltaLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
         self.desc.dump();

         if !verbose {
@@ -1438,7 +1438,7 @@ impl DeltaLayerInner {
         offset
     }

-    pub(crate) fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
+    pub fn iter<'a>(&'a self, ctx: &'a RequestContext) -> DeltaLayerIterator<'a> {
         let block_reader = FileBlockReader::new(&self.file, self.file_id);
         let tree_reader =
             DiskBtreeReader::new(self.index_start_blk, self.index_root_blk, block_reader);
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index c0d183dc08..834d1931d0 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -231,7 +231,7 @@ impl AsLayerDesc for ImageLayer {
 }

 impl ImageLayer {
-    pub(crate) async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
+    pub async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
         self.desc.dump();

         if !verbose {
diff --git a/pageserver/src/tenant/storage_layer/layer_name.rs b/pageserver/src/tenant/storage_layer/layer_name.rs
index 2b98d74f9f..addf3b85d9 100644
--- a/pageserver/src/tenant/storage_layer/layer_name.rs
+++ b/pageserver/src/tenant/storage_layer/layer_name.rs
@@ -2,13 +2,11 @@
 //! Helper functions for dealing with filenames of the image and delta layer files.
 //!
 use pageserver_api::key::Key;
-use std::borrow::Cow;
 use std::cmp::Ordering;
 use std::fmt;
 use std::ops::Range;
 use std::str::FromStr;

-use regex::Regex;
 use utils::lsn::Lsn;

 use super::PersistentLayerDesc;
@@ -60,32 +58,31 @@ impl Ord for DeltaLayerName {

 /// Represents the region of the LSN-Key space covered by a DeltaLayer
 ///
 /// ```text
-/// <key start>-<key end>__<LSN start>-<LSN end>
+/// <key start>-<key end>__<LSN start>-<LSN end>-<generation>
 /// ```
 impl DeltaLayerName {
     /// Parse the part of a delta layer's file name that represents the LayerName. Returns None
     /// if the filename does not match the expected pattern.
    pub fn parse_str(fname: &str) -> Option<DeltaLayerName> {
-        let mut parts = fname.split("__");
-        let mut key_parts = parts.next()?.split('-');
-        let mut lsn_parts = parts.next()?.split('-');
-
-        let key_start_str = key_parts.next()?;
-        let key_end_str = key_parts.next()?;
-        let lsn_start_str = lsn_parts.next()?;
-        let lsn_end_str = lsn_parts.next()?;
-
-        if parts.next().is_some() || key_parts.next().is_some() || key_parts.next().is_some() {
-            return None;
-        }
-
-        if key_start_str.len() != 36
-            || key_end_str.len() != 36
-            || lsn_start_str.len() != 16
-            || lsn_end_str.len() != 16
+        let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
+        let (key_start_str, key_end_str) = key_parts.split_once('-')?;
+        let (lsn_start_str, lsn_end_generation_parts) = lsn_generation_parts.split_once('-')?;
+        let lsn_end_str = if let Some((lsn_end_str, maybe_generation)) =
+            lsn_end_generation_parts.split_once('-')
         {
-            return None;
-        }
+            if maybe_generation.starts_with("v") {
+                // vY-XXXXXXXX
+                lsn_end_str
+            } else if maybe_generation.len() == 8 {
+                // XXXXXXXX
+                lsn_end_str
+            } else {
+                // no idea what this is
+                return None;
+            }
+        } else {
+            lsn_end_generation_parts
+        };

         let key_start = Key::from_hex(key_start_str).ok()?;
         let key_end = Key::from_hex(key_end_str).ok()?;
@@ -173,25 +170,29 @@ impl ImageLayerName {

 /// Represents the part of the Key-LSN space covered by an ImageLayer
 ///
 /// ```text
-/// <key start>-<key end>__<LSN>
+/// <key start>-<key end>__<LSN>-<generation>
 /// ```
 impl ImageLayerName {
     /// Parse a string as the LayerName part of an image layer file name. Returns None if the
     /// filename does not match the expected pattern.
     pub fn parse_str(fname: &str) -> Option<ImageLayerName> {
-        let mut parts = fname.split("__");
-        let mut key_parts = parts.next()?.split('-');
-
-        let key_start_str = key_parts.next()?;
-        let key_end_str = key_parts.next()?;
-        let lsn_str = parts.next()?;
-        if parts.next().is_some() || key_parts.next().is_some() {
-            return None;
-        }
-
-        if key_start_str.len() != 36 || key_end_str.len() != 36 || lsn_str.len() != 16 {
-            return None;
-        }
+        let (key_parts, lsn_generation_parts) = fname.split_once("__")?;
+        let (key_start_str, key_end_str) = key_parts.split_once('-')?;
+        let lsn_str =
+            if let Some((lsn_str, maybe_generation)) = lsn_generation_parts.split_once('-') {
+                if maybe_generation.starts_with("v") {
+                    // vY-XXXXXXXX
+                    lsn_str
+                } else if maybe_generation.len() == 8 {
+                    // XXXXXXXX
+                    lsn_str
+                } else {
+                    // likely a delta layer
+                    return None;
+                }
+            } else {
+                lsn_generation_parts
+            };

         let key_start = Key::from_hex(key_start_str).ok()?;
         let key_end = Key::from_hex(key_end_str).ok()?;
@@ -258,6 +259,14 @@ impl LayerName {
         }
     }

+    /// Gets the LSN range encoded in the layer name.
+    pub fn lsn_as_range(&self) -> Range<Lsn> {
+        match &self {
+            LayerName::Image(layer) => layer.lsn_as_range(),
+            LayerName::Delta(layer) => layer.lsn_range.clone(),
+        }
+    }
+
     pub fn is_delta(&self) -> bool {
         matches!(self, LayerName::Delta(_))
     }
@@ -290,18 +299,8 @@ impl FromStr for LayerName {
     /// Self. When loading a physical layer filename, we drop any extra information
     /// not needed to build Self.
fn from_str(value: &str) -> Result { - let gen_suffix_regex = Regex::new("^(?.+)(?-v1-[0-9a-f]{8})$").unwrap(); - let file_name: Cow = match gen_suffix_regex.captures(value) { - Some(captures) => captures - .name("base") - .expect("Non-optional group") - .as_str() - .into(), - None => value.into(), - }; - - let delta = DeltaLayerName::parse_str(&file_name); - let image = ImageLayerName::parse_str(&file_name); + let delta = DeltaLayerName::parse_str(value); + let image = ImageLayerName::parse_str(value); let ok = match (delta, image) { (None, None) => { return Err(format!( @@ -367,11 +366,14 @@ mod test { lsn: Lsn::from_hex("00000000014FED58").unwrap(), }); let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-v1-00000001").unwrap(); - assert_eq!(parsed, expected,); + assert_eq!(parsed, expected); + + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-00000001").unwrap(); + assert_eq!(parsed, expected); // Omitting generation suffix is valid let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58").unwrap(); - assert_eq!(parsed, expected,); + assert_eq!(parsed, expected); } #[test] @@ -385,6 +387,9 @@ mod test { let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-v1-00000001").unwrap(); assert_eq!(parsed, expected); + let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481-00000001").unwrap(); + assert_eq!(parsed, expected); + // Omitting generation suffix is valid let parsed = LayerName::from_str("000000000000000000000000000000000000-000000067F00000001000004DF0000000006__00000000014FED58-000000000154C481").unwrap(); assert_eq!(parsed, expected); From b77b9bdc9fd1ef7b1b3d86ca20877e22fd8928f9 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 29 Oct 2024 15:13:06 -0500 Subject: [PATCH 126/239] Add tests for sql-exporter metrics Should help us keep non-working metrics from hitting staging or production. Co-authored-by: Heikki Linnakangas Fixes: https://github.com/neondatabase/neon/issues/8569 Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 12 + compute/Makefile | 3 +- compute/compute-node.Dockerfile | 5 +- compute/etc/sql_exporter.jsonnet | 4 +- poetry.lock | 278 +++++++----- pyproject.toml | 6 + test_runner/fixtures/paths.py | 11 +- test_runner/regress/test_compute_metrics.py | 448 +++++++++++++++++++- 8 files changed, 651 insertions(+), 116 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 818cc1b6db..93f1e48afa 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -57,6 +57,18 @@ RUN set -e \ zstd \ && rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* +# sql_exporter + +# Keep the version the same as in compute/compute-node.Dockerfile and +# test_runner/regress/test_compute_metrics.py. 
+ENV SQL_EXPORTER_VERSION=0.13.1 +RUN curl -fsSL \ + "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ + --output sql_exporter.tar.gz \ + && mkdir /tmp/sql_exporter \ + && tar xzvf sql_exporter.tar.gz -C /tmp/sql_exporter --strip-components=1 \ + && mv /tmp/sql_exporter/sql_exporter /usr/local/bin/sql_exporter + # protobuf-compiler (protoc) ENV PROTOC_VERSION=25.1 RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v${PROTOC_VERSION}/protoc-${PROTOC_VERSION}-linux-$(uname -m | sed 's/aarch64/aarch_64/g').zip" -o "protoc.zip" \ diff --git a/compute/Makefile b/compute/Makefile index 645880ce70..0036196160 100644 --- a/compute/Makefile +++ b/compute/Makefile @@ -22,6 +22,7 @@ sql_exporter.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector \ --tla-str collector_file=neon_collector.yml \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter' \ etc/sql_exporter.jsonnet sql_exporter_autoscaling.yml: $(jsonnet_files) @@ -29,7 +30,7 @@ sql_exporter_autoscaling.yml: $(jsonnet_files) --output-file etc/$@ \ --tla-str collector_name=neon_collector_autoscaling \ --tla-str collector_file=neon_collector_autoscaling.yml \ - --tla-str application_name=sql_exporter_autoscaling \ + --tla-str 'connection_string=postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=sql_exporter_autoscaling' \ etc/sql_exporter.jsonnet .PHONY: clean diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 85fb9c441d..7e38ef8221 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1298,7 +1298,10 @@ RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin l ######################################################################################### FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter -FROM burningalchemist/sql_exporter:0.13 AS sql-exporter + +# Keep the version the same as in build-tools.Dockerfile and +# test_runner/regress/test_compute_metrics.py. +FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter ######################################################################################### # diff --git a/compute/etc/sql_exporter.jsonnet b/compute/etc/sql_exporter.jsonnet index 3c36fd4f68..e957dfd86e 100644 --- a/compute/etc/sql_exporter.jsonnet +++ b/compute/etc/sql_exporter.jsonnet @@ -1,4 +1,4 @@ -function(collector_name, collector_file, application_name='sql_exporter') { +function(collector_name, collector_file, connection_string) { // Configuration for sql_exporter for autoscaling-agent // Global defaults. global: { @@ -23,7 +23,7 @@ function(collector_name, collector_file, application_name='sql_exporter') { target: { // Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL) // the schema gets dropped or replaced to match the driver expected DSN format. - data_source_name: std.format('postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable&application_name=%s', [application_name]), + data_source_name: connection_string, // Collectors (referenced by name) to execute on the target. // Glob patterns are supported (see for syntax). 
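
For reference, the rendering that `compute/Makefile` performs with `--tla-str` can also be reproduced from Python via the `_jsonnet` bindings this patch adds to `pyproject.toml`. The following is a minimal sketch, not part of the patch: it assumes the repository root as the working directory, that the template's imports resolve from `compute/jsonnet` and `compute/etc` (mirroring the new test fixtures' `JSONNET_PATH`), and it reuses the DSN from the Makefile target above.

```python
# Minimal sketch: render sql_exporter.yml with an explicit connection string,
# mirroring `make sql_exporter.yml`. Paths are relative to the repo root.
import json

import _jsonnet  # Python bindings for Jsonnet
import yaml

rendered = _jsonnet.evaluate_file(
    "compute/etc/sql_exporter.jsonnet",
    # Search path for imports inside the template (assumption, based on the
    # JSONNET_PATH used by test_runner/regress/test_compute_metrics.py).
    jpathdir=["compute/jsonnet", "compute/etc"],
    tla_vars={
        "collector_name": "neon_collector",
        "collector_file": "neon_collector.yml",
        "connection_string": "postgresql://cloud_admin@127.0.0.1:5432/postgres"
        "?sslmode=disable&application_name=sql_exporter",
    },
)

# Jsonnet evaluates to JSON; sql_exporter reads YAML, so convert before writing.
with open("sql_exporter.yml", "w", encoding="utf-8") as f:
    yaml.safe_dump(json.loads(rendered), f, default_flow_style=False)
```
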
diff --git a/poetry.lock b/poetry.lock index 36ea82a446..e06950cb52 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -1034,24 +1034,25 @@ test-randomorder = ["pytest-randomly"] [[package]] name = "docker" -version = "4.2.2" +version = "7.1.0" description = "A Python library for the Docker Engine API." optional = false -python-versions = ">=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*, !=3.4.*" +python-versions = ">=3.8" files = [ - {file = "docker-4.2.2-py2.py3-none-any.whl", hash = "sha256:03a46400c4080cb6f7aa997f881ddd84fef855499ece219d75fbdb53289c17ab"}, - {file = "docker-4.2.2.tar.gz", hash = "sha256:26eebadce7e298f55b76a88c4f8802476c5eaddbdbe38dbc6cce8781c47c9b54"}, + {file = "docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0"}, + {file = "docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c"}, ] [package.dependencies] -pypiwin32 = {version = "223", markers = "sys_platform == \"win32\" and python_version >= \"3.6\""} -requests = ">=2.14.2,<2.18.0 || >2.18.0" -six = ">=1.4.0" -websocket-client = ">=0.32.0" +pywin32 = {version = ">=304", markers = "sys_platform == \"win32\""} +requests = ">=2.26.0" +urllib3 = ">=1.26.0" [package.extras] -ssh = ["paramiko (>=2.4.2)"] -tls = ["cryptography (>=1.3.4)", "idna (>=2.0.0)", "pyOpenSSL (>=17.5.0)"] +dev = ["coverage (==7.2.7)", "pytest (==7.4.2)", "pytest-cov (==4.1.0)", "pytest-timeout (==2.1.0)", "ruff (==0.1.8)"] +docs = ["myst-parser (==0.18.0)", "sphinx (==5.1.1)"] +ssh = ["paramiko (>=2.4.3)"] +websockets = ["websocket-client (>=1.3.0)"] [[package]] name = "exceptiongroup" @@ -1416,6 +1417,16 @@ files = [ {file = "jsondiff-2.0.0.tar.gz", hash = "sha256:2795844ef075ec8a2b8d385c4d59f5ea48b08e7180fce3cb2787be0db00b1fb4"}, ] +[[package]] +name = "jsonnet" +version = "0.20.0" +description = "Python bindings for Jsonnet - The data templating language" +optional = false +python-versions = "*" +files = [ + {file = "jsonnet-0.20.0.tar.gz", hash = "sha256:7e770c7bf3a366b97b650a39430450f77612e74406731eb75c5bd59f3f104d4f"}, +] + [[package]] name = "jsonpatch" version = "1.32" @@ -2126,6 +2137,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2134,6 +2146,8 @@ 
files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2340,20 +2354,6 @@ files = [ [package.extras] diagrams = ["jinja2", "railroad-diagrams"] -[[package]] -name = "pypiwin32" -version = "223" -description = "" -optional = false -python-versions = "*" -files = [ - {file = "pypiwin32-223-py3-none-any.whl", hash = "sha256:67adf399debc1d5d14dffc1ab5acacb800da569754fafdc576b2a039485aa775"}, - {file = "pypiwin32-223.tar.gz", hash = "sha256:71be40c1fbd28594214ecaecb58e7aa8b708eabfa0125c8a109ebd51edbd776a"}, -] - -[package.dependencies] -pywin32 = ">=223" - [[package]] name = "pyrsistent" version = "0.18.1" @@ -2573,80 +2573,91 @@ files = [ [[package]] name = "pywin32" -version = "301" +version = "308" description = "Python for Window Extensions" optional = false python-versions = "*" files = [ - {file = "pywin32-301-cp35-cp35m-win32.whl", hash = "sha256:93367c96e3a76dfe5003d8291ae16454ca7d84bb24d721e0b74a07610b7be4a7"}, - {file = "pywin32-301-cp35-cp35m-win_amd64.whl", hash = "sha256:9635df6998a70282bd36e7ac2a5cef9ead1627b0a63b17c731312c7a0daebb72"}, - {file = "pywin32-301-cp36-cp36m-win32.whl", hash = "sha256:c866f04a182a8cb9b7855de065113bbd2e40524f570db73ef1ee99ff0a5cc2f0"}, - {file = "pywin32-301-cp36-cp36m-win_amd64.whl", hash = "sha256:dafa18e95bf2a92f298fe9c582b0e205aca45c55f989937c52c454ce65b93c78"}, - {file = "pywin32-301-cp37-cp37m-win32.whl", hash = "sha256:98f62a3f60aa64894a290fb7494bfa0bfa0a199e9e052e1ac293b2ad3cd2818b"}, - {file = "pywin32-301-cp37-cp37m-win_amd64.whl", hash = "sha256:fb3b4933e0382ba49305cc6cd3fb18525df7fd96aa434de19ce0878133bf8e4a"}, - {file = "pywin32-301-cp38-cp38-win32.whl", hash = "sha256:88981dd3cfb07432625b180f49bf4e179fb8cbb5704cd512e38dd63636af7a17"}, - {file = "pywin32-301-cp38-cp38-win_amd64.whl", hash = "sha256:8c9d33968aa7fcddf44e47750e18f3d034c3e443a707688a008a2e52bbef7e96"}, - {file = "pywin32-301-cp39-cp39-win32.whl", hash = "sha256:595d397df65f1b2e0beaca63a883ae6d8b6df1cdea85c16ae85f6d2e648133fe"}, - {file = "pywin32-301-cp39-cp39-win_amd64.whl", hash = "sha256:87604a4087434cd814ad8973bd47d6524bd1fa9e971ce428e76b62a5e0860fdf"}, + {file = "pywin32-308-cp310-cp310-win32.whl", hash = "sha256:796ff4426437896550d2981b9c2ac0ffd75238ad9ea2d3bfa67a1abd546d262e"}, + {file = "pywin32-308-cp310-cp310-win_amd64.whl", hash = 
"sha256:4fc888c59b3c0bef905ce7eb7e2106a07712015ea1c8234b703a088d46110e8e"}, + {file = "pywin32-308-cp310-cp310-win_arm64.whl", hash = "sha256:a5ab5381813b40f264fa3495b98af850098f814a25a63589a8e9eb12560f450c"}, + {file = "pywin32-308-cp311-cp311-win32.whl", hash = "sha256:5d8c8015b24a7d6855b1550d8e660d8daa09983c80e5daf89a273e5c6fb5095a"}, + {file = "pywin32-308-cp311-cp311-win_amd64.whl", hash = "sha256:575621b90f0dc2695fec346b2d6302faebd4f0f45c05ea29404cefe35d89442b"}, + {file = "pywin32-308-cp311-cp311-win_arm64.whl", hash = "sha256:100a5442b7332070983c4cd03f2e906a5648a5104b8a7f50175f7906efd16bb6"}, + {file = "pywin32-308-cp312-cp312-win32.whl", hash = "sha256:587f3e19696f4bf96fde9d8a57cec74a57021ad5f204c9e627e15c33ff568897"}, + {file = "pywin32-308-cp312-cp312-win_amd64.whl", hash = "sha256:00b3e11ef09ede56c6a43c71f2d31857cf7c54b0ab6e78ac659497abd2834f47"}, + {file = "pywin32-308-cp312-cp312-win_arm64.whl", hash = "sha256:9b4de86c8d909aed15b7011182c8cab38c8850de36e6afb1f0db22b8959e3091"}, + {file = "pywin32-308-cp313-cp313-win32.whl", hash = "sha256:1c44539a37a5b7b21d02ab34e6a4d314e0788f1690d65b48e9b0b89f31abbbed"}, + {file = "pywin32-308-cp313-cp313-win_amd64.whl", hash = "sha256:fd380990e792eaf6827fcb7e187b2b4b1cede0585e3d0c9e84201ec27b9905e4"}, + {file = "pywin32-308-cp313-cp313-win_arm64.whl", hash = "sha256:ef313c46d4c18dfb82a2431e3051ac8f112ccee1a34f29c263c583c568db63cd"}, + {file = "pywin32-308-cp37-cp37m-win32.whl", hash = "sha256:1f696ab352a2ddd63bd07430080dd598e6369152ea13a25ebcdd2f503a38f1ff"}, + {file = "pywin32-308-cp37-cp37m-win_amd64.whl", hash = "sha256:13dcb914ed4347019fbec6697a01a0aec61019c1046c2b905410d197856326a6"}, + {file = "pywin32-308-cp38-cp38-win32.whl", hash = "sha256:5794e764ebcabf4ff08c555b31bd348c9025929371763b2183172ff4708152f0"}, + {file = "pywin32-308-cp38-cp38-win_amd64.whl", hash = "sha256:3b92622e29d651c6b783e368ba7d6722b1634b8e70bd376fd7610fe1992e19de"}, + {file = "pywin32-308-cp39-cp39-win32.whl", hash = "sha256:7873ca4dc60ab3287919881a7d4f88baee4a6e639aa6962de25a98ba6b193341"}, + {file = "pywin32-308-cp39-cp39-win_amd64.whl", hash = "sha256:71b3322d949b4cc20776436a9c9ba0eeedcbc9c650daa536df63f0ff111bb920"}, ] [[package]] name = "pyyaml" -version = "6.0.1" +version = "6.0.2" description = "YAML parser and emitter for Python" optional = false -python-versions = ">=3.6" +python-versions = ">=3.8" files = [ - {file = "PyYAML-6.0.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d858aa552c999bc8a8d57426ed01e40bef403cd8ccdd0fc5f6f04a00414cac2a"}, - {file = "PyYAML-6.0.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:fd66fc5d0da6d9815ba2cebeb4205f95818ff4b79c3ebe268e75d961704af52f"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:69b023b2b4daa7548bcfbd4aa3da05b3a74b772db9e23b982788168117739938"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:81e0b275a9ecc9c0c0c07b4b90ba548307583c125f54d5b6946cfee6360c733d"}, - {file = "PyYAML-6.0.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ba336e390cd8e4d1739f42dfe9bb83a3cc2e80f567d8805e11b46f4a943f5515"}, - {file = "PyYAML-6.0.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:326c013efe8048858a6d312ddd31d56e468118ad4cdeda36c719bf5bb6192290"}, - {file = "PyYAML-6.0.1-cp310-cp310-win32.whl", hash = "sha256:bd4af7373a854424dabd882decdc5579653d7868b8fb26dc7d0e99f823aa5924"}, - {file = "PyYAML-6.0.1-cp310-cp310-win_amd64.whl", hash = 
"sha256:fd1592b3fdf65fff2ad0004b5e363300ef59ced41c2e6b3a99d4089fa8c5435d"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6965a7bc3cf88e5a1c3bd2e0b5c22f8d677dc88a455344035f03399034eb3007"}, - {file = "PyYAML-6.0.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f003ed9ad21d6a4713f0a9b5a7a0a79e08dd0f221aff4525a2be4c346ee60aab"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:42f8152b8dbc4fe7d96729ec2b99c7097d656dc1213a3229ca5383f973a5ed6d"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:062582fca9fabdd2c8b54a3ef1c978d786e0f6b3a1510e0ac93ef59e0ddae2bc"}, - {file = "PyYAML-6.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:d2b04aac4d386b172d5b9692e2d2da8de7bfb6c387fa4f801fbf6fb2e6ba4673"}, - {file = "PyYAML-6.0.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e7d73685e87afe9f3b36c799222440d6cf362062f78be1013661b00c5c6f678b"}, - {file = "PyYAML-6.0.1-cp311-cp311-win32.whl", hash = "sha256:1635fd110e8d85d55237ab316b5b011de701ea0f29d07611174a1b42f1444741"}, - {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, - {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, - {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, - {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, - {file = "PyYAML-6.0.1-cp312-cp312-win_amd64.whl", hash = "sha256:0d3304d8c0adc42be59c5f8a4d9e3d7379e6955ad754aa9d6ab7a398b59dd1df"}, - {file = "PyYAML-6.0.1-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:50550eb667afee136e9a77d6dc71ae76a44df8b3e51e41b77f6de2932bfe0f47"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1fe35611261b29bd1de0070f0b2f47cb6ff71fa6595c077e42bd0c419fa27b98"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:704219a11b772aea0d8ecd7058d0082713c3562b4e271b849ad7dc4a5c90c13c"}, - {file = "PyYAML-6.0.1-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:afd7e57eddb1a54f0f1a974bc4391af8bcce0b444685d936840f125cf046d5bd"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win32.whl", hash = "sha256:fca0e3a251908a499833aa292323f32437106001d436eca0e6e7833256674585"}, - {file = "PyYAML-6.0.1-cp36-cp36m-win_amd64.whl", hash = "sha256:f22ac1c3cac4dbc50079e965eba2c1058622631e526bd9afd45fedd49ba781fa"}, - {file = "PyYAML-6.0.1-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:b1275ad35a5d18c62a7220633c913e1b42d44b46ee12554e5fd39c70a243d6a3"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:18aeb1bf9a78867dc38b259769503436b7c72f7a1f1f4c93ff9a17de54319b27"}, - {file = "PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:596106435fa6ad000c2991a98fa58eeb8656ef2325d7e158344fb33864ed87e3"}, - {file = 
"PyYAML-6.0.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:baa90d3f661d43131ca170712d903e6295d1f7a0f595074f151c0aed377c9b9c"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win32.whl", hash = "sha256:9046c58c4395dff28dd494285c82ba00b546adfc7ef001486fbf0324bc174fba"}, - {file = "PyYAML-6.0.1-cp37-cp37m-win_amd64.whl", hash = "sha256:4fb147e7a67ef577a588a0e2c17b6db51dda102c71de36f8549b6816a96e1867"}, - {file = "PyYAML-6.0.1-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:1d4c7e777c441b20e32f52bd377e0c409713e8bb1386e1099c2415f26e479595"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a0cd17c15d3bb3fa06978b4e8958dcdc6e0174ccea823003a106c7d4d7899ac5"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28c119d996beec18c05208a8bd78cbe4007878c6dd15091efb73a30e90539696"}, - {file = "PyYAML-6.0.1-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7e07cbde391ba96ab58e532ff4803f79c4129397514e1413a7dc761ccd755735"}, - {file = "PyYAML-6.0.1-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:49a183be227561de579b4a36efbb21b3eab9651dd81b1858589f796549873dd6"}, - {file = "PyYAML-6.0.1-cp38-cp38-win32.whl", hash = "sha256:184c5108a2aca3c5b3d3bf9395d50893a7ab82a38004c8f61c258d4428e80206"}, - {file = "PyYAML-6.0.1-cp38-cp38-win_amd64.whl", hash = "sha256:1e2722cc9fbb45d9b87631ac70924c11d3a401b2d7f410cc0e3bbf249f2dca62"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9eb6caa9a297fc2c2fb8862bc5370d0303ddba53ba97e71f08023b6cd73d16a8"}, - {file = "PyYAML-6.0.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:c8098ddcc2a85b61647b2590f825f3db38891662cfc2fc776415143f599bb859"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5773183b6446b2c99bb77e77595dd486303b4faab2b086e7b17bc6bef28865f6"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:b786eecbdf8499b9ca1d697215862083bd6d2a99965554781d0d8d1ad31e13a0"}, - {file = "PyYAML-6.0.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bc1bf2925a1ecd43da378f4db9e4f799775d6367bdb94671027b73b393a7c42c"}, - {file = "PyYAML-6.0.1-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:04ac92ad1925b2cff1db0cfebffb6ffc43457495c9b3c39d3fcae417d7125dc5"}, - {file = "PyYAML-6.0.1-cp39-cp39-win32.whl", hash = "sha256:faca3bdcf85b2fc05d06ff3fbc1f83e1391b3e724afa3feba7d13eeab355484c"}, - {file = "PyYAML-6.0.1-cp39-cp39-win_amd64.whl", hash = "sha256:510c9deebc5c0225e8c96813043e62b680ba2f9c50a08d3724c7f28a747d1486"}, - {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0a9a2848a5b7feac301353437eb7d5957887edbf81d56e903999a75a3d743086"}, + {file = "PyYAML-6.0.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:29717114e51c84ddfba879543fb232a6ed60086602313ca38cce623c1d62cfbf"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8824b5a04a04a047e72eea5cec3bc266db09e35de6bdfe34c9436ac5ee27d237"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7c36280e6fb8385e520936c3cb3b8042851904eba0e58d277dca80a5cfed590b"}, + {file = "PyYAML-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:ec031d5d2feb36d1d1a24380e4db6d43695f3748343d99434e6f5f9156aaa2ed"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:936d68689298c36b53b29f23c6dbb74de12b4ac12ca6cfe0e047bedceea56180"}, + {file = "PyYAML-6.0.2-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:23502f431948090f597378482b4812b0caae32c22213aecf3b55325e049a6c68"}, + {file = "PyYAML-6.0.2-cp310-cp310-win32.whl", hash = "sha256:2e99c6826ffa974fe6e27cdb5ed0021786b03fc98e5ee3c5bfe1fd5015f42b99"}, + {file = "PyYAML-6.0.2-cp310-cp310-win_amd64.whl", hash = "sha256:a4d3091415f010369ae4ed1fc6b79def9416358877534caf6a0fdd2146c87a3e"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:cc1c1159b3d456576af7a3e4d1ba7e6924cb39de8f67111c735f6fc832082774"}, + {file = "PyYAML-6.0.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1e2120ef853f59c7419231f3bf4e7021f1b936f6ebd222406c3b60212205d2ee"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5d225db5a45f21e78dd9358e58a98702a0302f2659a3c6cd320564b75b86f47c"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:5ac9328ec4831237bec75defaf839f7d4564be1e6b25ac710bd1a96321cc8317"}, + {file = "PyYAML-6.0.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ad2a3decf9aaba3d29c8f537ac4b243e36bef957511b4766cb0057d32b0be85"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:ff3824dc5261f50c9b0dfb3be22b4567a6f938ccce4587b38952d85fd9e9afe4"}, + {file = "PyYAML-6.0.2-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:797b4f722ffa07cc8d62053e4cff1486fa6dc094105d13fea7b1de7d8bf71c9e"}, + {file = "PyYAML-6.0.2-cp311-cp311-win32.whl", hash = "sha256:11d8f3dd2b9c1207dcaf2ee0bbbfd5991f571186ec9cc78427ba5bd32afae4b5"}, + {file = "PyYAML-6.0.2-cp311-cp311-win_amd64.whl", hash = "sha256:e10ce637b18caea04431ce14fabcf5c64a1c61ec9c56b071a4b7ca131ca52d44"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:c70c95198c015b85feafc136515252a261a84561b7b1d51e3384e0655ddf25ab"}, + {file = "PyYAML-6.0.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ce826d6ef20b1bc864f0a68340c8b3287705cae2f8b4b1d932177dcc76721725"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1f71ea527786de97d1a0cc0eacd1defc0985dcf6b3f17bb77dcfc8c34bec4dc5"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:9b22676e8097e9e22e36d6b7bda33190d0d400f345f23d4065d48f4ca7ae0425"}, + {file = "PyYAML-6.0.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:80bab7bfc629882493af4aa31a4cfa43a4c57c83813253626916b8c7ada83476"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:0833f8694549e586547b576dcfaba4a6b55b9e96098b36cdc7ebefe667dfed48"}, + {file = "PyYAML-6.0.2-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8b9c7197f7cb2738065c481a0461e50ad02f18c78cd75775628afb4d7137fb3b"}, + {file = "PyYAML-6.0.2-cp312-cp312-win32.whl", hash = "sha256:ef6107725bd54b262d6dedcc2af448a266975032bc85ef0172c5f059da6325b4"}, + {file = "PyYAML-6.0.2-cp312-cp312-win_amd64.whl", hash = "sha256:7e7401d0de89a9a855c839bc697c079a4af81cf878373abd7dc625847d25cbd8"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:efdca5630322a10774e8e98e1af481aad470dd62c3170801852d752aa7a783ba"}, + {file = "PyYAML-6.0.2-cp313-cp313-macosx_11_0_arm64.whl", hash = 
"sha256:50187695423ffe49e2deacb8cd10510bc361faac997de9efef88badc3bb9e2d1"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0ffe8360bab4910ef1b9e87fb812d8bc0a308b0d0eef8c8f44e0254ab3b07133"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:17e311b6c678207928d649faa7cb0d7b4c26a0ba73d41e99c4fff6b6c3276484"}, + {file = "PyYAML-6.0.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:70b189594dbe54f75ab3a1acec5f1e3faa7e8cf2f1e08d9b561cb41b845f69d5"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:41e4e3953a79407c794916fa277a82531dd93aad34e29c2a514c2c0c5fe971cc"}, + {file = "PyYAML-6.0.2-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:68ccc6023a3400877818152ad9a1033e3db8625d899c72eacb5a668902e4d652"}, + {file = "PyYAML-6.0.2-cp313-cp313-win32.whl", hash = "sha256:bc2fa7c6b47d6bc618dd7fb02ef6fdedb1090ec036abab80d4681424b84c1183"}, + {file = "PyYAML-6.0.2-cp313-cp313-win_amd64.whl", hash = "sha256:8388ee1976c416731879ac16da0aff3f63b286ffdd57cdeb95f3f2e085687563"}, + {file = "PyYAML-6.0.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:24471b829b3bf607e04e88d79542a9d48bb037c2267d7927a874e6c205ca7e9a"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7fded462629cfa4b685c5416b949ebad6cec74af5e2d42905d41e257e0869f5"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d84a1718ee396f54f3a086ea0a66d8e552b2ab2017ef8b420e92edbc841c352d"}, + {file = "PyYAML-6.0.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9056c1ecd25795207ad294bcf39f2db3d845767be0ea6e6a34d856f006006083"}, + {file = "PyYAML-6.0.2-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:82d09873e40955485746739bcb8b4586983670466c23382c19cffecbf1fd8706"}, + {file = "PyYAML-6.0.2-cp38-cp38-win32.whl", hash = "sha256:43fa96a3ca0d6b1812e01ced1044a003533c47f6ee8aca31724f78e93ccc089a"}, + {file = "PyYAML-6.0.2-cp38-cp38-win_amd64.whl", hash = "sha256:01179a4a8559ab5de078078f37e5c1a30d76bb88519906844fd7bdea1b7729ff"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:688ba32a1cffef67fd2e9398a2efebaea461578b0923624778664cc1c914db5d"}, + {file = "PyYAML-6.0.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:a8786accb172bd8afb8be14490a16625cbc387036876ab6ba70912730faf8e1f"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d8e03406cac8513435335dbab54c0d385e4a49e4945d2909a581c83647ca0290"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f753120cb8181e736c57ef7636e83f31b9c0d1722c516f7e86cf15b7aa57ff12"}, + {file = "PyYAML-6.0.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b1fdb9dc17f5a7677423d508ab4f243a726dea51fa5e70992e59a7411c89d19"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:0b69e4ce7a131fe56b7e4d770c67429700908fc0752af059838b1cfb41960e4e"}, + {file = "PyYAML-6.0.2-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:a9f8c2e67970f13b16084e04f134610fd1d374bf477b17ec1599185cf611d725"}, + {file = "PyYAML-6.0.2-cp39-cp39-win32.whl", hash = "sha256:6395c297d42274772abc367baaa79683958044e5d3835486c16da75d2a694631"}, + {file = "PyYAML-6.0.2-cp39-cp39-win_amd64.whl", hash = "sha256:39693e1f8320ae4f43943590b49779ffb98acb81f788220ea932a6b6c51004d8"}, + {file = "pyyaml-6.0.2.tar.gz", hash 
= "sha256:d584d9ec91ad65861cc08d42e834324ef890a082e591037abe114850ff7bbc3e"}, ] [[package]] @@ -2901,6 +2912,58 @@ files = [ [package.dependencies] mpmath = ">=0.19" +[[package]] +name = "testcontainers" +version = "4.8.1" +description = "Python library for throwaway instances of anything that can run in a Docker container" +optional = false +python-versions = "<4.0,>=3.9" +files = [ + {file = "testcontainers-4.8.1-py3-none-any.whl", hash = "sha256:d8ae43e8fe34060fcd5c3f494e0b7652b7774beabe94568a2283d0881e94d489"}, + {file = "testcontainers-4.8.1.tar.gz", hash = "sha256:5ded4820b7227ad526857eb3caaafcabce1bbac05d22ad194849b136ffae3cb0"}, +] + +[package.dependencies] +docker = "*" +typing-extensions = "*" +urllib3 = "*" +wrapt = "*" + +[package.extras] +arangodb = ["python-arango (>=7.8,<8.0)"] +aws = ["boto3", "httpx"] +azurite = ["azure-storage-blob (>=12.19,<13.0)"] +chroma = ["chromadb-client"] +clickhouse = ["clickhouse-driver"] +cosmosdb = ["azure-cosmos"] +db2 = ["ibm_db_sa", "sqlalchemy"] +generic = ["httpx", "redis"] +google = ["google-cloud-datastore (>=2)", "google-cloud-pubsub (>=2)"] +influxdb = ["influxdb", "influxdb-client"] +k3s = ["kubernetes", "pyyaml"] +keycloak = ["python-keycloak"] +localstack = ["boto3"] +mailpit = ["cryptography"] +minio = ["minio"] +mongodb = ["pymongo"] +mssql = ["pymssql", "sqlalchemy"] +mysql = ["pymysql[rsa]", "sqlalchemy"] +nats = ["nats-py"] +neo4j = ["neo4j"] +opensearch = ["opensearch-py"] +oracle = ["oracledb", "sqlalchemy"] +oracle-free = ["oracledb", "sqlalchemy"] +qdrant = ["qdrant-client"] +rabbitmq = ["pika"] +redis = ["redis"] +registry = ["bcrypt"] +scylla = ["cassandra-driver (==3.29.1)"] +selenium = ["selenium"] +sftp = ["cryptography"] +test-module-import = ["httpx"] +trino = ["trino"] +weaviate = ["weaviate-client (>=4.5.4,<5.0.0)"] + [[package]] name = "toml" version = "0.10.2" @@ -2970,6 +3033,17 @@ files = [ {file = "types_pytest_lazy_fixture-0.6.3.3-py3-none-any.whl", hash = "sha256:a56a55649147ff960ff79d4b2c781a4f769351abc1876873f3116d0bd0c96353"}, ] +[[package]] +name = "types-pyyaml" +version = "6.0.12.20240917" +description = "Typing stubs for PyYAML" +optional = false +python-versions = ">=3.8" +files = [ + {file = "types-PyYAML-6.0.12.20240917.tar.gz", hash = "sha256:d1405a86f9576682234ef83bcb4e6fff7c9305c8b1fbad5e0bcd4f7dbdc9c587"}, + {file = "types_PyYAML-6.0.12.20240917-py3-none-any.whl", hash = "sha256:392b267f1c0fe6022952462bf5d6523f31e37f6cea49b14cee7ad634b6301570"}, +] + [[package]] name = "types-requests" version = "2.31.0.0" @@ -3044,22 +3118,6 @@ brotli = ["brotli (==1.0.9)", "brotli (>=1.0.9)", "brotlicffi (>=0.8.0)", "brotl secure = ["certifi", "cryptography (>=1.3.4)", "idna (>=2.0.0)", "ipaddress", "pyOpenSSL (>=0.14)", "urllib3-secure-extra"] socks = ["PySocks (>=1.5.6,!=1.5.7,<2.0)"] -[[package]] -name = "websocket-client" -version = "1.3.3" -description = "WebSocket client for Python with low level API options" -optional = false -python-versions = ">=3.7" -files = [ - {file = "websocket-client-1.3.3.tar.gz", hash = "sha256:d58c5f284d6a9bf8379dab423259fe8f85b70d5fa5d2916d5791a84594b122b1"}, - {file = "websocket_client-1.3.3-py3-none-any.whl", hash = "sha256:5d55652dc1d0b3c734f044337d929aaf83f4f9138816ec680c1aefefb4dc4877"}, -] - -[package.extras] -docs = ["Sphinx (>=3.4)", "sphinx-rtd-theme (>=0.5)"] -optional = ["python-socks", "wsaccel"] -test = ["websockets"] - [[package]] name = "websockets" version = "12.0" @@ -3184,6 +3242,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", 
hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3421,4 +3489,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "ad5c9ee7723359af22bbd7fa41538dcf78913c02e947a13a8f9a87eb3a59039e" +content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927" diff --git a/pyproject.toml b/pyproject.toml index faa5f9123c..3f21094ba4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -45,6 +45,10 @@ kafka-python = "^2.0.2" jwcrypto = "^1.5.6" h2 = "^4.1.0" types-jwcrypto = "^1.5.0.20240925" +pyyaml = "^6.0.2" +types-pyyaml = "^6.0.12.20240917" +testcontainers = "^4.8.1" +jsonnet = "^0.20.0" [tool.poetry.group.dev.dependencies] mypy = "==1.3.0" @@ -73,12 +77,14 @@ strict = true [[tool.mypy.overrides]] module = [ + "_jsonnet.*", "asyncpg.*", "pg8000.*", "allure.*", "allure_commons.*", "allure_pytest.*", "kafka.*", + "testcontainers.*", ] ignore_missing_imports = true diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index 65f8e432b0..d950f2356d 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -21,6 +21,8 @@ if TYPE_CHECKING: from typing import Optional +BASE_DIR = Path(__file__).parents[2] +COMPUTE_CONFIG_DIR = BASE_DIR / "compute" / "etc" 
DEFAULT_OUTPUT_DIR: str = "test_output" @@ -64,18 +66,17 @@ def get_test_repo_dir(request: FixtureRequest, top_output_dir: Path) -> Path: @pytest.fixture(scope="session") def base_dir() -> Iterator[Path]: # find the base directory (currently this is the git root) - base_dir = Path(__file__).parents[2] - log.info(f"base_dir is {base_dir}") + log.info(f"base_dir is {BASE_DIR}") - yield base_dir + yield BASE_DIR @pytest.fixture(scope="session") -def compute_config_dir(base_dir: Path) -> Iterator[Path]: +def compute_config_dir() -> Iterator[Path]: """ Retrieve the path to the compute configuration directory. """ - yield base_dir / "compute" / "etc" + yield COMPUTE_CONFIG_DIR @pytest.fixture(scope="function") diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 6c75765632..c5e3034591 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -1,9 +1,453 @@ from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import enum +import os +import shutil +from pathlib import Path +from typing import TYPE_CHECKING, cast + +# Docs are available at https://jsonnet.org/ref/bindings.html#python_api +import _jsonnet +import pytest +import requests +import yaml +from fixtures.log_helper import log +from fixtures.paths import BASE_DIR, COMPUTE_CONFIG_DIR + +if TYPE_CHECKING: + from types import TracebackType + from typing import Optional, TypedDict, Union + + from fixtures.neon_fixtures import NeonEnv + from fixtures.pg_version import PgVersion + from fixtures.port_distributor import PortDistributor + + class Metric(TypedDict): + metric_name: str + type: str + help: str + key_labels: Optional[list[str]] + values: Optional[list[str]] + query: Optional[str] + query_ref: Optional[str] + + class Collector(TypedDict): + collector_name: str + metrics: list[Metric] + queries: Optional[list[Query]] + + class Query(TypedDict): + query_name: str + query: str -def test_compute_metrics(neon_simple_env: NeonEnv): +JSONNET_IMPORT_CACHE: dict[str, bytes] = {} +JSONNET_PATH: list[Path] = [BASE_DIR / "compute" / "jsonnet", COMPUTE_CONFIG_DIR] + + +def __import_callback(dir: str, rel: str) -> tuple[str, bytes]: + """ + dir: The directory of the Jsonnet file which tried to import a file + rel: The actual import path from Jsonnet + """ + if not rel: + raise RuntimeError("Empty filename") + + full_path: Optional[str] = None + if os.path.isabs(rel): + full_path = rel + else: + for p in (dir, *JSONNET_PATH): + assert isinstance(p, (str, Path)), "for mypy" + full_path = os.path.join(p, rel) + + assert isinstance(full_path, str), "for mypy" + if not os.path.exists(full_path): + full_path = None + continue + + break + + if not full_path: + raise RuntimeError(f"Could not resolve import ({rel}) in {dir}") + + if os.path.isdir(full_path): + raise RuntimeError(f"Attempted to import directory: {full_path}") + + if full_path not in JSONNET_IMPORT_CACHE: + with open(full_path, encoding="utf-8") as f: + JSONNET_IMPORT_CACHE[full_path] = f.read().encode() + + return full_path, JSONNET_IMPORT_CACHE[full_path] + + +def jsonnet_evaluate_file( + jsonnet_file: Union[str, Path], + ext_vars: Optional[Union[str, dict[str, str]]] = None, + tla_vars: Optional[Union[str, dict[str, str]]] = None, +) -> str: + return cast( + "str", + _jsonnet.evaluate_file( + str(jsonnet_file), + ext_vars=ext_vars, + tla_vars=tla_vars, + import_callback=__import_callback, + ), + ) + + +def evaluate_collector(jsonnet_file: Path, pg_version: 
PgVersion) -> str: + return jsonnet_evaluate_file(jsonnet_file, ext_vars={"pg_version": str(pg_version)}) + + +def evaluate_config( + jsonnet_file: Path, collector_name: str, collector_file: Union[str, Path], connstr: str +) -> str: + return jsonnet_evaluate_file( + jsonnet_file, + tla_vars={ + "collector_name": collector_name, + "collector_file": str(collector_file), + "connection_string": connstr, + }, + ) + + +@enum.unique +class SqlExporterProcess(str, enum.Enum): + COMPUTE = "compute" + AUTOSCALING = "autoscaling" + + +@pytest.mark.parametrize( + "collector_name", + ["neon_collector", "neon_collector_autoscaling"], + ids=[SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], +) +def test_sql_exporter_metrics_smoke( + pg_version: PgVersion, + neon_simple_env: NeonEnv, + compute_config_dir: Path, + collector_name: str, +): + """ + This is a smoke test to ensure the metrics SQL queries for sql_exporter + work without errors. + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + # Extract all the SQL queries from the sql_exporter config files, and run + # them. + collector = cast( + "Collector", + yaml.safe_load( + jsonnet_evaluate_file( + str(compute_config_dir / f"{collector_name}.jsonnet"), + ext_vars={"pg_version": pg_version}, + ) + ), + ) + + for metric in collector["metrics"]: + query = metric.get("query") + if query is not None: + log.info("Checking query for metric %s in %s", metric["metric_name"], collector_name) + endpoint.safe_psql(query) + + queries = collector.get("queries") + if queries is not None: + # This variable is named q because mypy is too silly to understand it is + # different from the query above. + # + # query: Optional[str] + # q: Metric + for q in queries: + log.info("Checking query %s in %s", q["query_name"], collector_name) + endpoint.safe_psql(q["query"]) + + +class SqlExporterRunner: + def __init__(self, test_output_dir: Path, sql_exporter_port: int) -> None: + self._log_file_name = test_output_dir / "sql_exporter.stderr" + self._sql_exporter_port = sql_exporter_port + + log.info(f"Starting sql_exporter at http://localhost:{self._sql_exporter_port}") + + def start(self) -> None: + raise NotImplementedError() + + def stop(self) -> None: + raise NotImplementedError() + + def __enter__(self) -> SqlExporterRunner: + self.start() + + return self + + def __exit__( + self, + exc_type: Optional[type[BaseException]], + exc: Optional[BaseException], + tb: Optional[TracebackType], + ): + self.stop() + + +SQL_EXPORTER = shutil.which("sql_exporter") + +if SQL_EXPORTER is None: + from testcontainers.core.container import DockerContainer + from testcontainers.core.waiting_utils import wait_for_logs + from typing_extensions import override + + class SqlExporterContainer(DockerContainer): # type: ignore + def __init__( + self, logs_dir: Path, config_file: Path, collector_file: Path, port: int + ) -> None: + # NOTE: Keep the version the same as in + # compute/Dockerfile.compute-node and Dockerfile.build-tools. + # + # The "host" network mode allows sql_exporter to talk to the + # endpoint which is running on the host. 
+ super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host") + + self.__logs_dir = logs_dir + self.__port = port + + config_file_name = config_file.name + collector_file_name = collector_file.name + + self.with_command(f"-config.file=/etc/{config_file_name} -web.listen-address=:{port}") + + container_config_file = f"/etc/{config_file_name}" + container_collector_file = f"/etc/{collector_file_name}" + log.info( + "Mapping %s to %s in sql_exporter container", config_file, container_config_file + ) + log.info( + "Mapping %s to %s in sql_exporter container", + collector_file, + container_collector_file, + ) + + # NOTE: z allows Podman to work with SELinux. Please don't change it. + # Ideally this would be a ro (read-only) mount, but I couldn't seem to + # get it to work. + self.with_volume_mapping(str(config_file), container_config_file, "z") + self.with_volume_mapping(str(collector_file), container_collector_file, "z") + + @override + def start(self) -> SqlExporterContainer: + super().start() + + log.info("Waiting for sql_exporter to be ready") + wait_for_logs( + self, + rf'level=info msg="Listening on" address=\[::\]:{self.__port}', + timeout=5, + ) + + return self + + class SqlExporterContainerRunner(SqlExporterRunner): + def __init__( + self, + test_output_dir: Path, + config_file: Path, + collector_file: Path, + sql_exporter_port: int, + ) -> None: + super().__init__(test_output_dir, sql_exporter_port) + + self.__container = SqlExporterContainer( + test_output_dir, config_file, collector_file, sql_exporter_port + ) + + @override + def start(self) -> None: + self.__container.start() + + @override + def stop(self) -> None: + try: + # sql_exporter doesn't print anything to stdout + with open(self._log_file_name, "w", encoding="utf-8") as f: + f.write(self.__container.get_logs()[1].decode()) + except Exception: + log.exception("Failed to write sql_exporter logs") + + # Stop the container *after* getting the logs + self.__container.stop() + +else: + import subprocess + import time + from signal import Signals + + from typing_extensions import override + + if TYPE_CHECKING: + from collections.abc import Mapping + + class SqlExporterNativeRunner(SqlExporterRunner): + def __init__( + self, + test_output_dir: Path, + config_file: Path, + collector_file: Path, + sql_exporter_port: int, + ) -> None: + super().__init__(test_output_dir, sql_exporter_port) + + self.__config_file = config_file + self.__collector_file = collector_file + self.__proc: subprocess.Popen[str] + + @override + def start(self) -> None: + assert SQL_EXPORTER is not None + + log_file = open(self._log_file_name, "w", encoding="utf-8") + self.__proc = subprocess.Popen( + [ + os.path.realpath(SQL_EXPORTER), + f"-config.file={self.__config_file}", + f"-web.listen-address=:{self._sql_exporter_port}", + ], + # If PGSERVICEFILE is set, sql_exporter won't launch. 
+ env=cast("Mapping[str, str]", {}), + stderr=log_file, + bufsize=0, + text=True, + ) + + log.info("Waiting for sql_exporter to be ready") + + with open(self._log_file_name, encoding="utf-8") as f: + started = time.time() + while True: + if time.time() - started > 5: + self.__proc.kill() + raise RuntimeError("sql_exporter did not start up properly") + + line = f.readline() + if not line: + time.sleep(0.5) + continue + + if ( + f'level=info msg="Listening on" address=[::]:{self._sql_exporter_port}' + in line + ): + break + + @override + def stop(self) -> None: + self.__proc.send_signal(Signals.SIGINT) + self.__proc.wait() + + +@pytest.mark.parametrize( + "exporter", + [SqlExporterProcess.COMPUTE, SqlExporterProcess.AUTOSCALING], +) +def test_sql_exporter_metrics_e2e( + pg_version: PgVersion, + neon_simple_env: NeonEnv, + test_output_dir: Path, + compute_config_dir: Path, + exporter: SqlExporterProcess, + port_distributor: PortDistributor, +): + """ + This is a full E2E test of the sql_exporter setup to make sure it works + without error. + + If you use Podman instead of Docker, you may run into issues. If you run + rootful Podman, you may need to add a ~/.testcontainers.properties file + with the following content: + + ryuk.container.privileged=true + + If you are not running rootful Podman, set the following environment + variable: + + TESTCONTAINERS_RYUK_DISABLED=true + + Note that you will need the Podman socket to be running. On a systemd-based + system, that command will look something like: + + # Use `enable --now` to start the socket on login and immediately. + systemctl --user start podman.socket + + Whether you use the user service manager or the system service manager is + up to you, but may have implications on the above ryuk related steps. Note + that you may also need the docker(1) Podman frontend. I am unsure if the + docker Python package supports Podman natively. + """ + env = neon_simple_env + + endpoint = env.endpoints.create("main") + endpoint.respec(skip_pg_catalog_updates=False) + endpoint.start() + + if exporter == SqlExporterProcess.COMPUTE: + stem_suffix = "" + elif exporter == SqlExporterProcess.AUTOSCALING: + stem_suffix = "_autoscaling" + + # Write the collector file + collector_file = test_output_dir / f"neon_collector{stem_suffix}.yml" + with open(collector_file, "w", encoding="utf-8") as o: + collector = evaluate_collector( + compute_config_dir / f"neon_collector{stem_suffix}.jsonnet", pg_version + ) + o.write(collector) + + conn_options = endpoint.conn_options() + pg_host = conn_options["host"] + pg_port = conn_options["port"] + pg_user = conn_options["user"] + pg_dbname = conn_options["dbname"] + pg_application_name = f"sql_exporter{stem_suffix}" + connstr = f"postgresql://{pg_user}@{pg_host}:{pg_port}/{pg_dbname}?sslmode=disable&application_name={pg_application_name}" + + def escape_go_filepath_match_characters(s: str) -> str: + """ + Unfortunately sql_exporter doesn't use plain file paths, so we need to + escape special characters. pytest encodes the parameters of a test using + [ and ], so we need to escape them with backslashes. + See https://pkg.go.dev/path/filepath#Match. 
+ """ + return s.replace("[", r"\[").replace("]", r"\]") + + # Write the config file + config_file = test_output_dir / f"sql_exporter{stem_suffix}.yml" + with open(config_file, "w", encoding="utf-8") as o: + config = evaluate_config( + compute_config_dir / "sql_exporter.jsonnet", + collector_name=collector_file.stem, + collector_file=escape_go_filepath_match_characters(str(collector_file)) + if SQL_EXPORTER + else collector_file.name, + connstr=connstr, + ) + o.write(config) + + sql_exporter_port = port_distributor.get_port() + with (SqlExporterNativeRunner if SQL_EXPORTER else SqlExporterContainerRunner)( + test_output_dir, config_file, collector_file, sql_exporter_port + ) as _runner: + resp = requests.get(f"http://localhost:{sql_exporter_port}/metrics") + resp.raise_for_status() + + +def test_perf_counters(neon_simple_env: NeonEnv): """ Test compute metrics, exposed in the neon_backend_perf_counters and neon_perf_counters views From 8e2e9f0fed000c1204b84a8dc9702ba28046938b Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 29 Oct 2024 22:24:04 +0000 Subject: [PATCH 127/239] pageserver: generation-aware storage for TenantManifest (#9555) ## Problem When tenant manifest objects are written without a generation suffix, concurrently attached pageservers may stamp on each others writes of the manifest and cause undefined behavior. Closes: #9543 ## Summary of changes - Use download_generation_object helper when reading manifests, to search for the most recent generation - Use Tenant::generation as the generation suffix when writing manifests. --- pageserver/src/tenant.rs | 9 ++- .../src/tenant/remote_timeline_client.rs | 26 +++++--- .../tenant/remote_timeline_client/download.rs | 59 ++++++++++++++----- 3 files changed, 66 insertions(+), 28 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 6ac11b0ae1..90d9feeeb6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1352,14 +1352,15 @@ impl Tenant { ) .await?; let (offloaded_add, tenant_manifest) = - match remote_timeline_client::do_download_tenant_manifest( + match remote_timeline_client::download_tenant_manifest( remote_storage, &self.tenant_shard_id, + self.generation, &cancel, ) .await { - Ok((tenant_manifest, _generation)) => ( + Ok((tenant_manifest, _generation, _manifest_mtime)) => ( format!("{} offloaded", tenant_manifest.offloaded_timelines.len()), tenant_manifest, ), @@ -3130,8 +3131,6 @@ impl Tenant { } let tenant_manifest = self.build_tenant_manifest(); - // TODO: generation support - let generation = remote_timeline_client::TENANT_MANIFEST_GENERATION; for child_shard in child_shards { tracing::info!( "Uploading tenant manifest for child {}", @@ -3140,7 +3139,7 @@ impl Tenant { upload_tenant_manifest( &self.remote_storage, child_shard, - generation, + self.generation, &tenant_manifest, &self.cancel, ) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 19e762b9fa..03ec18c882 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -190,6 +190,7 @@ use chrono::{NaiveDateTime, Utc}; pub(crate) use download::download_initdb_tar_zst; use pageserver_api::models::TimelineArchivalState; use pageserver_api::shard::{ShardIndex, TenantShardId}; +use regex::Regex; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff::{ @@ -199,7 +200,7 @@ use utils::pausable_failpoint; use std::collections::{HashMap, VecDeque}; use 
std::sync::atomic::{AtomicU32, Ordering}; -use std::sync::{Arc, Mutex}; +use std::sync::{Arc, Mutex, OnceLock}; use std::time::Duration; use remote_storage::{ @@ -245,7 +246,7 @@ use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; use super::Generation; pub(crate) use download::{ - do_download_tenant_manifest, download_index_part, is_temp_download_file, + download_index_part, download_tenant_manifest, is_temp_download_file, list_remote_tenant_shards, list_remote_timelines, }; pub(crate) use index::LayerFileMetadata; @@ -274,12 +275,6 @@ pub(crate) const BUFFER_SIZE: usize = 32 * 1024; /// which we warn and skip. const DELETION_QUEUE_FLUSH_TIMEOUT: Duration = Duration::from_secs(10); -/// Hardcode a generation for the tenant manifest for now so that we don't -/// need to deal with generation-less manifests in the future. -/// -/// TODO: add proper generation support to all the places that use this. -pub(crate) const TENANT_MANIFEST_GENERATION: Generation = Generation::new(1); - pub enum MaybeDeletedIndexPart { IndexPart(IndexPart), Deleted(IndexPart), @@ -2239,6 +2234,12 @@ pub fn remote_tenant_manifest_path( RemotePath::from_string(&path).expect("Failed to construct path") } +/// Prefix to all generations' manifest objects in a tenant shard +pub fn remote_tenant_manifest_prefix(tenant_shard_id: &TenantShardId) -> RemotePath { + let path = format!("tenants/{tenant_shard_id}/tenant-manifest",); + RemotePath::from_string(&path).expect("Failed to construct path") +} + pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath { let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}"); RemotePath::from_string(&path).expect("Failed to construct path") @@ -2333,6 +2334,15 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } } +/// Given the key of a tenant manifest, parse out the generation number +pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { + static RE: OnceLock = OnceLock::new(); + let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap()); + re.captures(path.get_path().as_str()) + .and_then(|c| c.get(1)) + .and_then(|m| Generation::parse_suffix(m.as_str())) +} + #[cfg(test)] mod tests { use super::*; diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 8679c68a27..efcd20d1bf 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -20,7 +20,9 @@ use utils::backoff; use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; +use crate::span::{ + debug_assert_current_span_has_tenant_and_timeline_id, debug_assert_current_span_has_tenant_id, +}; use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; @@ -36,9 +38,10 @@ use utils::pausable_failpoint; use super::index::{IndexPart, LayerFileMetadata}; use super::manifest::TenantManifest; use super::{ - parse_remote_index_path, remote_index_path, remote_initdb_archive_path, - remote_initdb_preserved_archive_path, remote_tenant_manifest_path, remote_tenant_path, - FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH, + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_index_path, + remote_initdb_archive_path, remote_initdb_preserved_archive_path, 
remote_tenant_manifest_path, + remote_tenant_manifest_prefix, remote_tenant_path, FAILED_DOWNLOAD_WARN_THRESHOLD, + FAILED_REMOTE_OP_RETRIES, INITDB_PATH, }; /// @@ -365,32 +368,34 @@ async fn do_download_remote_path_retry_forever( .await } -pub async fn do_download_tenant_manifest( +async fn do_download_tenant_manifest( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, + _timeline_id: Option<&TimelineId>, + generation: Generation, cancel: &CancellationToken, -) -> Result<(TenantManifest, Generation), DownloadError> { - // TODO: generation support - let generation = super::TENANT_MANIFEST_GENERATION; +) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { let remote_path = remote_tenant_manifest_path(tenant_shard_id, generation); - let (manifest_bytes, _manifest_bytes_mtime) = + let (manifest_bytes, manifest_bytes_mtime) = do_download_remote_path_retry_forever(storage, &remote_path, cancel).await?; let tenant_manifest = TenantManifest::from_json_bytes(&manifest_bytes) .with_context(|| format!("deserialize tenant manifest file at {remote_path:?}")) .map_err(DownloadError::Other)?; - Ok((tenant_manifest, generation)) + Ok((tenant_manifest, generation, manifest_bytes_mtime)) } async fn do_download_index_part( storage: &GenericRemoteStorage, tenant_shard_id: &TenantShardId, - timeline_id: &TimelineId, + timeline_id: Option<&TimelineId>, index_generation: Generation, cancel: &CancellationToken, ) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + let timeline_id = + timeline_id.expect("A timeline ID is always provided when downloading an index"); let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation); let (index_part_bytes, index_part_mtime) = @@ -426,7 +431,7 @@ async fn do_download_index_part( pub(crate) async fn download_generation_object<'a, T, DF, DFF, PF>( storage: &'a GenericRemoteStorage, tenant_shard_id: &'a TenantShardId, - timeline_id: &'a TimelineId, + timeline_id: Option<&'a TimelineId>, my_generation: Generation, what: &str, prefix: RemotePath, @@ -438,7 +443,7 @@ where DF: Fn( &'a GenericRemoteStorage, &'a TenantShardId, - &'a TimelineId, + Option<&'a TimelineId>, Generation, &'a CancellationToken, ) -> DFF, @@ -446,7 +451,7 @@ where PF: Fn(RemotePath) -> Option, T: 'static, { - debug_assert_current_span_has_tenant_and_timeline_id(); + debug_assert_current_span_has_tenant_id(); if my_generation.is_none() { // Operating without generations: just fetch the generation-less path @@ -552,11 +557,13 @@ pub(crate) async fn download_index_part( my_generation: Generation, cancel: &CancellationToken, ) -> Result<(IndexPart, Generation, SystemTime), DownloadError> { + debug_assert_current_span_has_tenant_and_timeline_id(); + let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none()); download_generation_object( storage, tenant_shard_id, - timeline_id, + Some(timeline_id), my_generation, "index_part", index_prefix, @@ -567,6 +574,28 @@ pub(crate) async fn download_index_part( .await } +pub(crate) async fn download_tenant_manifest( + storage: &GenericRemoteStorage, + tenant_shard_id: &TenantShardId, + my_generation: Generation, + cancel: &CancellationToken, +) -> Result<(TenantManifest, Generation, SystemTime), DownloadError> { + let manifest_prefix = remote_tenant_manifest_prefix(tenant_shard_id); + + download_generation_object( + storage, + tenant_shard_id, + None, + my_generation, + "tenant-manifest", + manifest_prefix, + do_download_tenant_manifest, + parse_remote_tenant_manifest_path, + 
cancel, + ) + .await +} + pub(crate) async fn download_initdb_tar_zst( conf: &'static PageServerConf, storage: &GenericRemoteStorage, From 0c828c57e2f82302f8261e7e9b58cef7f9f31f50 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 29 Oct 2024 23:03:45 -0500 Subject: [PATCH 128/239] Remove non-gzipped basebackup code path In July of 2023, Bojan and Chi authored 92aee7e07f347a0cc125462705811963ab5c78e9. Our in production pageservers are most definitely at a version where they all support gzipped basebackups. --- compute_tools/src/compute.rs | 32 ++++---------------------------- 1 file changed, 4 insertions(+), 28 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index c9dd4dcfc5..d3e42fe618 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,7 +1,6 @@ use std::collections::HashMap; use std::env; use std::fs; -use std::io::BufRead; use std::os::unix::fs::{symlink, PermissionsExt}; use std::path::Path; use std::process::{Command, Stdio}; @@ -365,8 +364,7 @@ impl ComputeNode { let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; let basebackup_cmd = match lsn { - // HACK We don't use compression on first start (Lsn(0)) because there's no API for it - Lsn(0) => format!("basebackup {} {}", spec.tenant_id, spec.timeline_id), + Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id), _ => format!( "basebackup {} {} {} --gzip", spec.tenant_id, spec.timeline_id, lsn @@ -375,38 +373,16 @@ impl ComputeNode { let copyreader = client.copy_out(basebackup_cmd.as_str())?; let mut measured_reader = MeasuredReader::new(copyreader); - - // Check the magic number to see if it's a gzip or not. Even though - // we might explicitly ask for gzip, an old pageserver with no implementation - // of gzip compression might send us uncompressed data. After some time - // passes we can assume all pageservers know how to compress and we can - // delete this check. - // - // If the data is not gzip, it will be tar. It will not be mistakenly - // recognized as gzip because tar starts with an ascii encoding of a filename, - // and 0x1f and 0x8b are unlikely first characters for any filename. Moreover, - // we send the "global" directory first from the pageserver, so it definitely - // won't be recognized as gzip. let mut bufreader = std::io::BufReader::new(&mut measured_reader); - let gzip = { - let peek = bufreader.fill_buf().unwrap(); - peek[0] == 0x1f && peek[1] == 0x8b - }; // Read the archive directly from the `CopyOutReader` // // Set `ignore_zeros` so that unpack() reads all the Copy data and // doesn't stop at the end-of-archive marker. Otherwise, if the server // sends an Error after finishing the tarball, we will not notice it. - if gzip { - let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; - } else { - let mut ar = tar::Archive::new(&mut bufreader); - ar.set_ignore_zeros(true); - ar.unpack(&self.pgdata)?; - }; + let mut ar = tar::Archive::new(flate2::read::GzDecoder::new(&mut bufreader)); + ar.set_ignore_zeros(true); + ar.unpack(&self.pgdata)?; // Report metrics let mut state = self.state.lock().unwrap(); From 745061ddf862395894b34a9aa2e8c698d26cacd7 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 30 Oct 2024 11:07:02 +0100 Subject: [PATCH 129/239] chore(compute): Bump pg_mooncake to the latest version (#9576) ## Problem There were some critical breaking changes made in the upstream since Oct 29th morning. 
## Summary of changes Point it to the topmost commit in the `neon` branch at the time of writing this https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ https://github.com/Mooncake-Labs/pg_mooncake/commit/c495cd17d6018a6fd170b3f47c645a89b23917fc --- compute/compute-node.Dockerfile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 7e38ef8221..c2333eda08 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1131,14 +1131,17 @@ FROM rust-extensions-build AS pg-mooncake-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -ENV PG_MOONCAKE_VERSION=882175dbba07ba2e6e59b1088d61bf325b910b9e +# The topmost commit in the `neon` branch at the time of writing this +# https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ +# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d +ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ 'v14') \ echo "pg_mooncake is not supported on Postgres ${PG_VERSION}" && exit 0;; \ esac && \ - git clone --depth 1 --branch neon https://github.com/kelvich/pg_mooncake.git pg_mooncake-src && \ + git clone --depth 1 --branch neon https://github.com/Mooncake-Labs/pg_mooncake.git pg_mooncake-src && \ cd pg_mooncake-src && \ git checkout "${PG_MOONCAKE_VERSION}" && \ git submodule update --init --depth 1 --recursive && \ From 96e35e11a6e092429015d78120f8d12dcc542077 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 30 Oct 2024 12:46:39 +0100 Subject: [PATCH 130/239] postgres_ffi: add WAL generator for tests/benchmarks (#9503) ## Problem We don't have a convenient way to generate WAL records for benchmarks and tests. ## Summary of changes Adds a WAL generator, exposed as an iterator. It currently only generates logical messages (noops), but will be extended to write actual table rows later. Some existing code for WAL generation has been replaced with this generator, to reduce duplication. --- libs/postgres_ffi/src/lib.rs | 1 + libs/postgres_ffi/src/wal_generator.rs | 203 +++++++++++++ libs/postgres_ffi/src/xlog_utils.rs | 76 +---- libs/utils/src/lsn.rs | 2 +- safekeeper/src/json_ctrl.rs | 3 +- .../tests/walproposer_sim/simulation.rs | 3 +- .../tests/walproposer_sim/walproposer_disk.rs | 270 +----------------- 7 files changed, 235 insertions(+), 323 deletions(-) create mode 100644 libs/postgres_ffi/src/wal_generator.rs diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs index 6b219488ac..0239b56d9c 100644 --- a/libs/postgres_ffi/src/lib.rs +++ b/libs/postgres_ffi/src/lib.rs @@ -36,6 +36,7 @@ macro_rules! 
postgres_ffi { pub mod controlfile_utils; pub mod nonrelfile_utils; pub mod wal_craft_test_export; + pub mod wal_generator; pub mod waldecoder_handler; pub mod xlog_utils; diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs new file mode 100644 index 0000000000..97968c269b --- /dev/null +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -0,0 +1,203 @@ +use std::ffi::CStr; + +use bytes::{Bytes, BytesMut}; +use crc32c::crc32c_append; +use utils::lsn::Lsn; + +use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; +use super::xlog_utils::{ + XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE, + XLP_FIRST_IS_CONTRECORD, +}; +use super::XLogRecord; +use crate::pg_constants::{ + RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, + XLR_BLOCK_ID_DATA_SHORT, +}; +use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; + +/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical +/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields +/// encoded bytes for a single WAL record, including internal page headers if it spans pages. +/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment +/// boundaries if desired. Not optimized for performance. +/// +/// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this +/// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`). +/// +/// A WAL is split into 16 MB segments. Each segment is split into 8 KB pages, with headers. +/// Records are arbitrary length, 8-byte aligned, and may span pages. The layout is e.g.: +/// +/// | Segment 1 | Segment 2 | Segment 3 | +/// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 | +/// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 | +/// +/// TODO: support generating actual tables and rows. +#[derive(Default)] +pub struct WalGenerator { + /// Current LSN to append the next record at. + /// + /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should + /// ensure that the LSN is on a valid record boundary (i.e. we can't start appending in the + /// middle on an existing record or header, or beyond the end of the existing WAL). + pub lsn: Lsn, + /// The starting LSN of the previous record. Used in WAL record headers. The Safekeeper doesn't + /// care about this, unlike Postgres, but we include it for completeness. + pub prev_lsn: Lsn, +} + +impl WalGenerator { + // For now, hardcode the message payload. + // TODO: support specifying the payload size. + const PREFIX: &CStr = c"prefix"; + const MESSAGE: &[u8] = b"message"; + + // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them. + const SYS_ID: u64 = 0; + const TIMELINE_ID: u32 = 1; + const DB_ID: u32 = 0; + + /// Creates a new WAL generator, which emits logical message records (noops). + pub fn new() -> Self { + Self::default() + } + + /// Encodes a logical message (basically a noop), with the given prefix and message. 
+ pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes { + let prefix = prefix.to_bytes_with_nul(); + let header = XlLogicalMessage { + db_id: Self::DB_ID, + transactional: 0, + prefix_size: prefix.len() as u64, + message_size: message.len() as u64, + }; + [&header.encode(), prefix, message].concat().into() + } + + /// Encode a WAL record with the given payload data (e.g. a logical message). + pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes { + // Prefix data with block ID and length. + let data_header = Bytes::from(match data.len() { + 0 => vec![], + 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8], + 256.. => { + let len_bytes = (data.len() as u32).to_le_bytes(); + [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() + } + }); + + // Construct the WAL record header. + let mut header = XLogRecord { + xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32, + xl_xid: 0, + xl_prev: prev_lsn.into(), + xl_info: info, + xl_rmid: rmid, + __bindgen_padding_0: [0; 2], + xl_crc: 0, // see below + }; + + // Compute the CRC checksum for the data, and the header up to the CRC field. + let mut crc = 0; + crc = crc32c_append(crc, &data_header); + crc = crc32c_append(crc, &data); + crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + // Encode the final header and record. + let header = header.encode().unwrap(); + + [header, data_header, data].concat().into() + } + + /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record + /// is to be appended. + fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes { + // Fast path: record fits in current page, and the page already has a header. + if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 { + return record; + } + + let mut pages = BytesMut::new(); + let mut remaining = record.clone(); // Bytes::clone() is cheap + while !remaining.is_empty() { + // At new page boundary, inject page header. + if lsn.block_offset() == 0 { + let mut page_header = XLogPageHeaderData { + xlp_magic: XLOG_PAGE_MAGIC as u16, + xlp_info: XLP_BKP_REMOVABLE, + xlp_tli: Self::TIMELINE_ID, + xlp_pageaddr: lsn.0, + xlp_rem_len: 0, + __bindgen_padding_0: [0; 4], + }; + // If the record was split across page boundaries, mark as continuation. + if remaining.len() < record.len() { + page_header.xlp_rem_len = remaining.len() as u32; + page_header.xlp_info |= XLP_FIRST_IS_CONTRECORD; + } + // At start of segment, use a long page header. + let page_header = if lsn.segment_offset(WAL_SEGMENT_SIZE) == 0 { + page_header.xlp_info |= XLP_LONG_HEADER; + XLogLongPageHeaderData { + std: page_header, + xlp_sysid: Self::SYS_ID, + xlp_seg_size: WAL_SEGMENT_SIZE as u32, + xlp_xlog_blcksz: XLOG_BLCKSZ as u32, + } + .encode() + .unwrap() + } else { + page_header.encode().unwrap() + }; + pages.extend_from_slice(&page_header); + lsn += page_header.len() as u64; + } + + // Append the record up to the next page boundary, if any. + let page_free = lsn.remaining_in_block() as usize; + let chunk = remaining.split_to(std::cmp::min(page_free, remaining.len())); + pages.extend_from_slice(&chunk); + lsn += chunk.len() as u64; + } + pages.freeze() + } + + /// Records must be 8-byte aligned. Take an encoded record (including any injected page + /// boundaries), starting at the given LSN, and add any necessary padding at the end. 
+ fn pad_record(record: Bytes, mut lsn: Lsn) -> Bytes { + lsn += record.len() as u64; + let padding = lsn.calc_padding(8u64) as usize; + if padding == 0 { + return record; + } + [record, Bytes::from(vec![0; padding])].concat().into() + } + + /// Generates a record with an arbitrary payload at the current LSN, then increments the LSN. + pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes { + let record = Self::encode_record(data, rmid, info, self.prev_lsn); + let record = Self::encode_pages(record, self.lsn); + let record = Self::pad_record(record, self.lsn); + self.prev_lsn = self.lsn; + self.lsn += record.len() as u64; + record + } + + /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages. + pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes { + let data = Self::encode_logical_message(prefix, message); + self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) + } +} + +/// Generate WAL records as an iterator. +impl Iterator for WalGenerator { + type Item = (Lsn, Bytes); + + fn next(&mut self) -> Option { + let lsn = self.lsn; + let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE); + Some((lsn, record)) + } +} diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index a636bd2a97..78a965174f 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -7,15 +7,14 @@ // have been named the same as the corresponding PostgreSQL functions instead. // -use crc32c::crc32c_append; - use super::super::waldecoder::WalStreamDecoder; use super::bindings::{ CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; +use super::wal_generator::WalGenerator; use super::PG_MAJORVERSION; -use crate::pg_constants; +use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE}; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -26,7 +25,7 @@ use bytes::{Buf, Bytes}; use log::*; use serde::Serialize; -use std::ffi::OsStr; +use std::ffi::{CString, OsStr}; use std::fs::File; use std::io::prelude::*; use std::io::ErrorKind; @@ -39,6 +38,7 @@ use utils::bin_ser::SerializeError; use utils::lsn::Lsn; pub const XLOG_FNAME_LEN: usize = 24; +pub const XLP_BKP_REMOVABLE: u16 = 0x0004; pub const XLP_FIRST_IS_CONTRECORD: u16 = 0x0001; pub const XLP_REM_LEN_OFFS: usize = 2 + 2 + 4 + 8; pub const XLOG_RECORD_CRC_OFFS: usize = 4 + 4 + 8 + 1 + 1 + 2; @@ -489,64 +489,16 @@ impl XlLogicalMessage { /// Create new WAL record for non-transactional logical message. /// Used for creating artificial WAL for tests, as LogicalMessage /// record is basically no-op. -/// -/// NOTE: This leaves the xl_prev field zero. The safekeeper and -/// pageserver tolerate that, but PostgreSQL does not. 
-pub fn encode_logical_message(prefix: &str, message: &str) -> Vec { - let mut prefix_bytes: Vec = Vec::with_capacity(prefix.len() + 1); - prefix_bytes.write_all(prefix.as_bytes()).unwrap(); - prefix_bytes.push(0); - - let message_bytes = message.as_bytes(); - - let logical_message = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as u64, - message_size: message_bytes.len() as u64, - }; - - let mainrdata = logical_message.encode(); - let mainrdata_len: usize = mainrdata.len() + prefix_bytes.len() + message_bytes.len(); - // only short mainrdata is supported for now - assert!(mainrdata_len <= 255); - let mainrdata_len = mainrdata_len as u8; - - let mut data: Vec = vec![pg_constants::XLR_BLOCK_ID_DATA_SHORT, mainrdata_len]; - data.extend_from_slice(&mainrdata); - data.extend_from_slice(&prefix_bytes); - data.extend_from_slice(message_bytes); - - let total_len = XLOG_SIZE_OF_XLOG_RECORD + data.len(); - - let mut header = XLogRecord { - xl_tot_len: total_len as u32, - xl_xid: 0, - xl_prev: 0, - xl_info: 0, - xl_rmid: 21, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: 0, // crc will be calculated later - }; - - let header_bytes = header.encode().expect("failed to encode header"); - let crc = crc32c_append(0, &data); - let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut wal: Vec = Vec::new(); - wal.extend_from_slice(&header.encode().expect("failed to encode header")); - wal.extend_from_slice(&data); - - // WAL start position must be aligned at 8 bytes, - // this will add padding for the next WAL record. - const PADDING: usize = 8; - let padding_rem = wal.len() % PADDING; - if padding_rem != 0 { - wal.resize(wal.len() + PADDING - padding_rem, 0); - } - - wal +pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes { + // This function can take untrusted input, so discard any NUL bytes in the prefix string. + let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs"); + let message = message.as_bytes(); + WalGenerator::encode_record( + WalGenerator::encode_logical_message(&prefix, message), + RM_LOGICALMSG_ID, + XLOG_LOGICAL_MESSAGE, + Lsn(0), + ) } #[cfg(test)] diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 3ec2c130bd..524f3604a1 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -12,7 +12,7 @@ use crate::seqwait::MonotonicCounter; pub const XLOG_BLCKSZ: u32 = 8192; /// A Postgres LSN (Log Sequence Number), also known as an XLogRecPtr -#[derive(Clone, Copy, Eq, Ord, PartialEq, PartialOrd, Hash)] +#[derive(Clone, Copy, Default, Eq, Ord, PartialEq, PartialOrd, Hash)] pub struct Lsn(pub u64); impl Serialize for Lsn { diff --git a/safekeeper/src/json_ctrl.rs b/safekeeper/src/json_ctrl.rs index 7fe924a08e..0573ea81e7 100644 --- a/safekeeper/src/json_ctrl.rs +++ b/safekeeper/src/json_ctrl.rs @@ -7,7 +7,6 @@ //! 
use anyhow::Context; -use bytes::Bytes; use postgres_backend::QueryError; use serde::{Deserialize, Serialize}; use tokio::io::{AsyncRead, AsyncWrite}; @@ -176,7 +175,7 @@ pub async fn append_logical_message( truncate_lsn: msg.truncate_lsn, proposer_uuid: [0u8; 16], }, - wal_data: Bytes::from(wal_data), + wal_data, }); let response = tli.process_msg(&append_request).await?; diff --git a/safekeeper/tests/walproposer_sim/simulation.rs b/safekeeper/tests/walproposer_sim/simulation.rs index 0d7aaf517b..fabf450eef 100644 --- a/safekeeper/tests/walproposer_sim/simulation.rs +++ b/safekeeper/tests/walproposer_sim/simulation.rs @@ -151,8 +151,7 @@ impl WalProposer { for _ in 0..cnt { self.disk .lock() - .insert_logical_message("prefix", b"message") - .expect("failed to generate logical message"); + .insert_logical_message(c"prefix", b"message"); } let end_lsn = self.disk.lock().flush_rec_ptr(); diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index 123cd6bad6..f70cd65dfc 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,24 +1,7 @@ -use std::{ffi::CString, sync::Arc}; +use std::{ffi::CStr, sync::Arc}; -use byteorder::{LittleEndian, WriteBytesExt}; -use crc32c::crc32c_append; use parking_lot::{Mutex, MutexGuard}; -use postgres_ffi::{ - pg_constants::{ - RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE, XLP_LONG_HEADER, XLR_BLOCK_ID_DATA_LONG, - XLR_BLOCK_ID_DATA_SHORT, - }, - v16::{ - wal_craft_test_export::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}, - xlog_utils::{ - XLogSegNoOffsetToRecPtr, XlLogicalMessage, XLOG_RECORD_CRC_OFFS, - XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD, - XLP_FIRST_IS_CONTRECORD, - }, - XLogRecord, - }, - WAL_SEGMENT_SIZE, XLOG_BLCKSZ, -}; +use postgres_ffi::v16::wal_generator::WalGenerator; use utils::lsn::Lsn; use super::block_storage::BlockStorage; @@ -35,6 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), + wal_generator: WalGenerator::new(), }), }) } @@ -51,6 +35,8 @@ pub struct State { prev_lsn: Lsn, // actual WAL storage disk: BlockStorage, + // WAL record generator + wal_generator: WalGenerator, } impl State { @@ -66,6 +52,9 @@ impl State { /// Update the internal available LSN to the given value. pub fn reset_to(&mut self, lsn: Lsn) { self.internal_available_lsn = lsn; + self.prev_lsn = Lsn(0); // Safekeeper doesn't care if this is omitted + self.wal_generator.lsn = self.internal_available_lsn; + self.wal_generator.prev_lsn = self.prev_lsn; } /// Get current LSN. @@ -73,242 +62,11 @@ impl State { self.internal_available_lsn } - /// Generate a new WAL record at the current LSN. 
- pub fn insert_logical_message(&mut self, prefix: &str, msg: &[u8]) -> anyhow::Result<()> { - let prefix_cstr = CString::new(prefix)?; - let prefix_bytes = prefix_cstr.as_bytes_with_nul(); - - let lm = XlLogicalMessage { - db_id: 0, - transactional: 0, - prefix_size: prefix_bytes.len() as ::std::os::raw::c_ulong, - message_size: msg.len() as ::std::os::raw::c_ulong, - }; - - let record_bytes = lm.encode(); - let rdatas: Vec<&[u8]> = vec![&record_bytes, prefix_bytes, msg]; - insert_wal_record(self, rdatas, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) - } -} - -fn insert_wal_record( - state: &mut State, - rdatas: Vec<&[u8]>, - rmid: u8, - info: u8, -) -> anyhow::Result<()> { - // bytes right after the header, in the same rdata block - let mut scratch = Vec::new(); - let mainrdata_len: usize = rdatas.iter().map(|rdata| rdata.len()).sum(); - - if mainrdata_len > 0 { - if mainrdata_len > 255 { - scratch.push(XLR_BLOCK_ID_DATA_LONG); - // TODO: verify endiness - let _ = scratch.write_u32::(mainrdata_len as u32); - } else { - scratch.push(XLR_BLOCK_ID_DATA_SHORT); - scratch.push(mainrdata_len as u8); - } - } - - let total_len: u32 = (XLOG_SIZE_OF_XLOG_RECORD + scratch.len() + mainrdata_len) as u32; - let size = maxalign(total_len); - assert!(size as usize > XLOG_SIZE_OF_XLOG_RECORD); - - let start_bytepos = recptr_to_bytepos(state.internal_available_lsn); - let end_bytepos = start_bytepos + size as u64; - - let start_recptr = bytepos_to_recptr(start_bytepos); - let end_recptr = bytepos_to_recptr(end_bytepos); - - assert!(recptr_to_bytepos(start_recptr) == start_bytepos); - assert!(recptr_to_bytepos(end_recptr) == end_bytepos); - - let mut crc = crc32c_append(0, &scratch); - for rdata in &rdatas { - crc = crc32c_append(crc, rdata); - } - - let mut header = XLogRecord { - xl_tot_len: total_len, - xl_xid: 0, - xl_prev: state.prev_lsn.0, - xl_info: info, - xl_rmid: rmid, - __bindgen_padding_0: [0u8; 2usize], - xl_crc: crc, - }; - - // now we have the header and can finish the crc - let header_bytes = header.encode()?; - let crc = crc32c_append(crc, &header_bytes[0..XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - let mut header_bytes = header.encode()?.to_vec(); - assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_RECORD); - - header_bytes.extend_from_slice(&scratch); - - // finish rdatas - let mut rdatas = rdatas; - rdatas.insert(0, &header_bytes); - - write_walrecord_to_disk(state, total_len as u64, rdatas, start_recptr, end_recptr)?; - - state.internal_available_lsn = end_recptr; - state.prev_lsn = start_recptr; - Ok(()) -} - -fn write_walrecord_to_disk( - state: &mut State, - total_len: u64, - rdatas: Vec<&[u8]>, - start: Lsn, - end: Lsn, -) -> anyhow::Result<()> { - let mut curr_ptr = start; - let mut freespace = insert_freespace(curr_ptr); - let mut written: usize = 0; - - assert!(freespace >= size_of::()); - - for mut rdata in rdatas { - while rdata.len() >= freespace { - assert!( - curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD - || freespace == 0 - ); - - state.write(curr_ptr.0, &rdata[..freespace]); - rdata = &rdata[freespace..]; - written += freespace; - curr_ptr = Lsn(curr_ptr.0 + freespace as u64); - - let mut new_page = XLogPageHeaderData { - xlp_magic: XLOG_PAGE_MAGIC as u16, - xlp_info: XLP_BKP_REMOVABLE, - xlp_tli: 1, - xlp_pageaddr: curr_ptr.0, - xlp_rem_len: (total_len - written as u64) as u32, - ..Default::default() // Put 0 in padding fields. 
- }; - if new_page.xlp_rem_len > 0 { - new_page.xlp_info |= XLP_FIRST_IS_CONTRECORD; - } - - if curr_ptr.segment_offset(WAL_SEGMENT_SIZE) == 0 { - new_page.xlp_info |= XLP_LONG_HEADER; - let long_page = XLogLongPageHeaderData { - std: new_page, - xlp_sysid: 0, - xlp_seg_size: WAL_SEGMENT_SIZE as u32, - xlp_xlog_blcksz: XLOG_BLCKSZ as u32, - }; - let header_bytes = long_page.encode()?; - assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_LONG_PHD); - state.write(curr_ptr.0, &header_bytes); - curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); - } else { - let header_bytes = new_page.encode()?; - assert!(header_bytes.len() == XLOG_SIZE_OF_XLOG_SHORT_PHD); - state.write(curr_ptr.0, &header_bytes); - curr_ptr = Lsn(curr_ptr.0 + header_bytes.len() as u64); - } - freespace = insert_freespace(curr_ptr); - } - - assert!( - curr_ptr.segment_offset(WAL_SEGMENT_SIZE) >= XLOG_SIZE_OF_XLOG_SHORT_PHD - || rdata.is_empty() - ); - state.write(curr_ptr.0, rdata); - curr_ptr = Lsn(curr_ptr.0 + rdata.len() as u64); - written += rdata.len(); - freespace -= rdata.len(); - } - - assert!(written == total_len as usize); - curr_ptr.0 = maxalign(curr_ptr.0); - assert!(curr_ptr == end); - Ok(()) -} - -fn maxalign(size: T) -> T -where - T: std::ops::BitAnd - + std::ops::Add - + std::ops::Not - + From, -{ - (size + T::from(7)) & !T::from(7) -} - -fn insert_freespace(ptr: Lsn) -> usize { - if ptr.block_offset() == 0 { - 0 - } else { - (XLOG_BLCKSZ as u64 - ptr.block_offset()) as usize - } -} - -const XLP_BKP_REMOVABLE: u16 = 0x0004; -const USABLE_BYTES_IN_PAGE: u64 = (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; -const USABLE_BYTES_IN_SEGMENT: u64 = ((WAL_SEGMENT_SIZE / XLOG_BLCKSZ) as u64 - * USABLE_BYTES_IN_PAGE) - - (XLOG_SIZE_OF_XLOG_RECORD - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; - -fn bytepos_to_recptr(bytepos: u64) -> Lsn { - let fullsegs = bytepos / USABLE_BYTES_IN_SEGMENT; - let mut bytesleft = bytepos % USABLE_BYTES_IN_SEGMENT; - - let seg_offset = if bytesleft < (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 { - // fits on first page of segment - bytesleft + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - } else { - // account for the first page on segment with long header - bytesleft -= (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64; - let fullpages = bytesleft / USABLE_BYTES_IN_PAGE; - bytesleft %= USABLE_BYTES_IN_PAGE; - - XLOG_BLCKSZ as u64 - + fullpages * XLOG_BLCKSZ as u64 - + bytesleft - + XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - }; - - Lsn(XLogSegNoOffsetToRecPtr( - fullsegs, - seg_offset as u32, - WAL_SEGMENT_SIZE, - )) -} - -fn recptr_to_bytepos(ptr: Lsn) -> u64 { - let fullsegs = ptr.segment_number(WAL_SEGMENT_SIZE); - let offset = ptr.segment_offset(WAL_SEGMENT_SIZE) as u64; - - let fullpages = offset / XLOG_BLCKSZ as u64; - let offset = offset % XLOG_BLCKSZ as u64; - - if fullpages == 0 { - fullsegs * USABLE_BYTES_IN_SEGMENT - + if offset > 0 { - assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); - offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - } else { - 0 - } - } else { - fullsegs * USABLE_BYTES_IN_SEGMENT - + (XLOG_BLCKSZ - XLOG_SIZE_OF_XLOG_SHORT_PHD) as u64 - + (fullpages - 1) * USABLE_BYTES_IN_PAGE - + if offset > 0 { - assert!(offset >= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64); - offset - XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 - } else { - 0 - } + /// Inserts a logical record in the WAL at the current LSN. 
+ pub fn insert_logical_message(&mut self, prefix: &CStr, msg: &[u8]) { + let record = self.wal_generator.generate_logical_message(prefix, msg); + self.disk.write(self.internal_available_lsn.into(), &record); + self.prev_lsn = self.internal_available_lsn; + self.internal_available_lsn += record.len() as u64; } } From 8af9412eb211093a2d43afe5036552f3271aadf4 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 30 Oct 2024 09:58:29 -0500 Subject: [PATCH 131/239] Collect compute backpressure throttling time This will tell us how much time the compute has spent throttled if pageserver/safekeeper cannot keep up with WAL generation. Signed-off-by: Tristan Partin --- compute/etc/neon_collector.jsonnet | 1 + .../compute_backpressure_throttling_ms.libsonnet | 10 ++++++++++ .../compute_backpressure_throttling_ms.sql | 1 + 3 files changed, 12 insertions(+) create mode 100644 compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet create mode 100644 compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 8b43ebe7a3..e73fb132ee 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,6 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet new file mode 100644 index 0000000000..b25bb73d0f --- /dev/null +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet @@ -0,0 +1,10 @@ +{ + metric_name: 'compute_backpressure_throttling_ms', + type: 'gauge', + help: 'Time compute has spent throttled', + key_labels: null, + values: [ + 'throttled', + ], + query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql', +} diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql new file mode 100644 index 0000000000..1fa62d38a4 --- /dev/null +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql @@ -0,0 +1 @@ +SELECT neon.backpressure_throttling_time() AS throttled; From d0a02f36494e83df2e6ba942dbe8673e24e33848 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Oct 2024 17:04:57 +0100 Subject: [PATCH 132/239] Disallow archived timelines to be detached or reparented (#9578) Disallow a request for timeline ancestor detach if either the to be detached timeline, or any of the to be reparented timelines are offloaded or archived. In theory we could support timelines that are archived but not offloaded, but archived timelines are at the risk of being offloaded, so we treat them like offloaded timelines. As for offloaded timelines, any code to "support" them would amount to unoffloading them, at which point we can just demand to have the timelines be unarchived. 
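To make the admission rule above concrete, here is a minimal, illustrative sketch (the enum and
function names are hypothetical, not the actual pageserver types, which instead consult
`Timeline::is_archived()` and the offloaded-timelines map): detach of an ancestor is allowed only
if the detached timeline and every timeline that would be reparented are neither archived nor
offloaded.

```rust
/// Hypothetical, simplified state for illustration only.
#[derive(Clone, Copy, PartialEq, Eq)]
enum TimelineState {
    Active,
    Archived,
    Offloaded,
}

/// Detach is admitted only when nothing involved is archived or offloaded;
/// archived timelines are treated like offloaded ones because they may be
/// offloaded at any moment.
fn detach_is_allowed(detached: TimelineState, reparented: &[TimelineState]) -> bool {
    detached == TimelineState::Active
        && reparented.iter().all(|s| *s == TimelineState::Active)
}
```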
Part of #8088 --- .../src/tenant/timeline/detach_ancestor.rs | 45 ++++++++++++++++++- .../regress/test_timeline_detach_ancestor.py | 37 +++++++++++++-- 2 files changed, 78 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index 641faada25..b4c0ab0329 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -29,6 +29,9 @@ pub(crate) enum Error { #[error("shutting down, please retry later")] ShuttingDown, + #[error("archived: {}", .0)] + Archived(TimelineId), + #[error(transparent)] NotFound(crate::tenant::GetTimelineError), @@ -79,8 +82,9 @@ impl From for ApiError { fn from(value: Error) -> Self { match value { Error::NoAncestor => ApiError::Conflict(value.to_string()), - Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)), + Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::ShuttingDown => ApiError::ShuttingDown, + Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")), Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => { ApiError::ResourceUnavailable(value.to_string().into()) } @@ -201,12 +205,18 @@ pub(super) async fn prepare( })); }; + if detached.is_archived() != Some(false) { + return Err(Archived(detached.timeline_id)); + } + if !ancestor_lsn.is_valid() { // rare case, probably wouldn't even load tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing"); return Err(NoAncestor); } + check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?; + if ancestor.ancestor_timeline.is_some() { // non-technical requirement; we could flatten N ancestors just as easily but we chose // not to, at least initially @@ -950,3 +960,36 @@ where } }) } + +fn check_no_archived_children_of_ancestor( + tenant: &Tenant, + detached: &Arc, + ancestor: &Arc, + ancestor_lsn: Lsn, +) -> Result<(), Error> { + let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) { + if timeline.is_archived() == Some(true) { + return Err(Error::Archived(timeline.timeline_id)); + } + } + for timeline_offloaded in timelines_offloaded.values() { + if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) { + continue; + } + // This forbids the detach ancestor feature if flattened timelines are present, + // even if the ancestor_lsn is from after the branchpoint of the detached timeline. + // But as per current design, we don't record the ancestor_lsn of flattened timelines. + // This is a bit unfortunate, but as of writing this we don't support flattening + // anyway. Maybe we can evolve the data model in the future. 
+ if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn { + let is_earlier = retain_lsn <= ancestor_lsn; + if !is_earlier { + continue; + } + } + return Err(Error::Archived(timeline_offloaded.timeline_id)); + } + Ok(()) +} diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 0c8554bb54..d467c59e62 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -9,7 +9,7 @@ from queue import Empty, Queue from threading import Barrier import pytest -from fixtures.common_types import Lsn, TimelineId +from fixtures.common_types import Lsn, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( LogCursor, @@ -634,7 +634,13 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard shards = 2 if sharded else 1 neon_env_builder.num_pageservers = shards - env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + env = neon_env_builder.init_start( + initial_tenant_shard_count=shards if sharded else None, + initial_tenant_conf={ + # turn off gc, we want to do manual offloading here. + "gc_period": "0s", + }, + ) pageservers = dict((int(p.id), p) for p in env.pageservers) @@ -656,7 +662,9 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard client.detach_ancestor(env.initial_tenant, env.initial_timeline) assert info.value.status_code == 409 - _ = env.create_branch("first_branch") + early_branch = env.create_branch("early_branch") + + first_branch = env.create_branch("first_branch") second_branch = env.create_branch("second_branch", ancestor_branch_name="first_branch") @@ -665,6 +673,29 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard client.detach_ancestor(env.initial_tenant, second_branch) assert info.value.status_code == 400 + client.timeline_archival_config( + env.initial_tenant, second_branch, TimelineArchivalState.ARCHIVED + ) + + client.timeline_archival_config( + env.initial_tenant, early_branch, TimelineArchivalState.ARCHIVED + ) + + with pytest.raises(PageserverApiException, match=f".*archived: {early_branch}") as info: + client.detach_ancestor(env.initial_tenant, first_branch) + assert info.value.status_code == 400 + + if not sharded: + client.timeline_offload(env.initial_tenant, early_branch) + + client.timeline_archival_config( + env.initial_tenant, first_branch, TimelineArchivalState.ARCHIVED + ) + + with pytest.raises(PageserverApiException, match=f".*archived: {first_branch}") as info: + client.detach_ancestor(env.initial_tenant, first_branch) + assert info.value.status_code == 400 + def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): """ From bcfe013094a962a62c217fb41e7d02c01361505f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Oct 2024 18:44:29 +0100 Subject: [PATCH 133/239] Don't keep around the timeline's remote_client (#9583) Constructing a remote client is no big deal. Yes, it means an extra download from S3 but it's not that expensive. This simplifies code paths and scenarios to test. This unifies timelines that have been recently offloaded with timelines that have been offloaded in an earlier invocation of the process. 
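As a rough illustration of the simplification (all names below are stand-ins for the real tenant and remote-storage types), deletion of an offloaded timeline can simply build a remote client at the point where it needs one, so recently offloaded timelines and timelines offloaded by an earlier process run share the same path:

```rust
// Illustrative sketch only: no cached client on the offloaded-timeline struct;
// construct one on demand when deletion (or other remote work) requires it.
use std::sync::Arc;

struct RemoteClient; // stand-in for the real remote storage client

struct OffloadedTimeline {
    timeline_id: u128,
    // note: no `remote_client` field to keep in sync anymore
}

fn build_remote_client(_timeline_id: u128) -> Arc<RemoteClient> {
    // cheap to construct; the cost is deferred to the extra remote round-trip
    Arc::new(RemoteClient)
}

fn delete_offloaded(offloaded: &OffloadedTimeline) {
    let _client = build_remote_client(offloaded.timeline_id);
    // ... remote deletion would proceed with `_client` here
}

fn main() {
    delete_offloaded(&OffloadedTimeline { timeline_id: 42 });
}
```

The trade-off is one extra remote download when an offloaded timeline is eventually deleted, in exchange for one fewer field to keep consistent and one fewer scenario to test.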
Part of #8088 --- pageserver/src/tenant.rs | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 90d9feeeb6..8237f4662c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -521,13 +521,6 @@ pub struct OffloadedTimeline { /// Present for future flattening deliberations. pub archived_at: NaiveDateTime, - /// Lazily constructed remote client for the timeline - /// - /// If we offload a timeline, we keep around the remote client - /// for the duration of the process. If we find it through the - /// manifest, we don't construct it up until it's needed (deletion). - pub remote_client: Option>, - /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. pub delete_progress: TimelineDeleteProgress, @@ -554,7 +547,6 @@ impl OffloadedTimeline { ancestor_retain_lsn, archived_at, - remote_client: Some(timeline.remote_client.clone()), delete_progress: timeline.delete_progress.clone(), }) } @@ -571,7 +563,6 @@ impl OffloadedTimeline { ancestor_timeline_id, ancestor_retain_lsn, archived_at, - remote_client: None, delete_progress: TimelineDeleteProgress::default(), } } @@ -636,7 +627,7 @@ impl TimelineOrOffloaded { fn maybe_remote_client(&self) -> Option> { match self { TimelineOrOffloaded::Timeline(timeline) => Some(timeline.remote_client.clone()), - TimelineOrOffloaded::Offloaded(offloaded) => offloaded.remote_client.clone(), + TimelineOrOffloaded::Offloaded(_offloaded) => None, } } } From 8d70f88b3704f32e5abdb7e9580ff3bbc9c796b7 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 30 Oct 2024 14:13:11 -0400 Subject: [PATCH 134/239] refactor(pageserver): use JSON field encoding for consumption metrics cache (#9470) In https://github.com/neondatabase/neon/issues/9032, I would like to eventually add a `generation` field to the consumption metrics cache. The current encoding is not backward compatible and it is hard to add another field into the cache. Therefore, this patch refactors the format to store "field -> value", and it's easier to maintain backward/forward compatibility with this new format. ## Summary of changes * Add `NewRawMetric` as the new format. * Add upgrade path. When opening the disk cache, the codepath first inspects the `version` field, and decide how to decode. * Refactor metrics generation code and tests. * Add tests on upgrade / compatibility with the old format. 
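The bullet points above are easier to follow with a concrete picture of the two on-disk shapes. Below is a minimal sketch using plain `serde_json`; the field names `version`, `metrics`, `key`, `kind` and `value` come from this patch, while the key/kind payloads are schematic placeholders rather than the exact serialized forms:

```rust
use serde_json::{json, Value};

/// Old format: a bare JSON array of positional tuples (no `version` field).
/// New format: an object with `version: 2` and self-describing metric objects,
/// which leaves room to add fields (e.g. a future `generation`) compatibly.
fn is_v2_metrics(doc: &Value) -> bool {
    doc.get("version").and_then(Value::as_u64) == Some(2)
}

fn main() {
    let v1 = json!([
        [ { "metric": "written_size" }, [ { "absolute": "..." }, 42 ] ]
    ]);

    let v2 = json!({
        "version": 2,
        "metrics": [
            { "key": { "metric": "written_size" },
              "kind": { "absolute": "..." },
              "value": 42 }
        ]
    });

    assert!(!is_v2_metrics(&v1));
    assert!(is_v2_metrics(&v2));
}
```

On read, the disk cache branches on exactly this kind of check: when the document carries `version: 2` it is parsed as the new root, otherwise it falls back to the old tuple layout and converts each entry into the per-field shape.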
--------- Signed-off-by: Alex Chi Z --- pageserver/src/consumption_metrics.rs | 60 ++++++++- .../src/consumption_metrics/disk_cache.rs | 38 +++++- pageserver/src/consumption_metrics/metrics.rs | 94 +++++++++----- .../src/consumption_metrics/metrics/tests.rs | 67 +++++++--- pageserver/src/consumption_metrics/upload.rs | 116 +++++++++++++++--- 5 files changed, 300 insertions(+), 75 deletions(-) diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 0c7630edca..7e8c00c293 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -14,6 +14,7 @@ use itertools::Itertools as _; use pageserver_api::models::TenantState; use remote_storage::{GenericRemoteStorage, RemoteStorageConfig}; use reqwest::Url; +use serde::{Deserialize, Serialize}; use std::collections::HashMap; use std::sync::Arc; use std::time::{Duration, SystemTime}; @@ -35,12 +36,62 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// upload attempts. type RawMetric = (MetricsKey, (EventType, u64)); +/// The new serializable metrics format +#[derive(Serialize, Deserialize)] +struct NewMetricsRoot { + version: usize, + metrics: Vec, +} + +impl NewMetricsRoot { + pub fn is_v2_metrics(json_value: &serde_json::Value) -> bool { + if let Some(ver) = json_value.get("version") { + if let Some(2) = ver.as_u64() { + return true; + } + } + false + } +} + +/// The new serializable metrics format +#[derive(Serialize)] +struct NewMetricsRefRoot<'a> { + version: usize, + metrics: &'a [NewRawMetric], +} + +impl<'a> NewMetricsRefRoot<'a> { + fn new(metrics: &'a [NewRawMetric]) -> Self { + Self { + version: 2, + metrics, + } + } +} + +/// The new serializable metrics format +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +struct NewRawMetric { + key: MetricsKey, + kind: EventType, + value: u64, + // TODO: add generation field and check against generations +} + +impl NewRawMetric { + #[cfg(test)] + fn to_kv_pair(&self) -> (MetricsKey, NewRawMetric) { + (self.key, self.clone()) + } +} + /// Caches the [`RawMetric`]s /// /// In practice, during startup, last sent values are stored here to be used in calculating new /// ones. After successful uploading, the cached values are updated to cache. This used to be used /// for deduplication, but that is no longer needed. 
-type Cache = HashMap; +type Cache = HashMap; pub async fn run( conf: &'static PageServerConf, @@ -231,11 +282,14 @@ async fn restore_and_reschedule( // collect_all_metrics let earlier_metric_at = found_some .iter() - .map(|(_, (et, _))| et.recorded_at()) + .map(|item| item.kind.recorded_at()) .copied() .next(); - let cached = found_some.into_iter().collect::(); + let cached = found_some + .into_iter() + .map(|item| (item.key, item)) + .collect::(); (cached, earlier_metric_at) } diff --git a/pageserver/src/consumption_metrics/disk_cache.rs b/pageserver/src/consumption_metrics/disk_cache.rs index 387bf7a0f9..54a505a134 100644 --- a/pageserver/src/consumption_metrics/disk_cache.rs +++ b/pageserver/src/consumption_metrics/disk_cache.rs @@ -2,11 +2,33 @@ use anyhow::Context; use camino::{Utf8Path, Utf8PathBuf}; use std::sync::Arc; -use super::RawMetric; +use crate::consumption_metrics::NewMetricsRefRoot; + +use super::{NewMetricsRoot, NewRawMetric, RawMetric}; + +pub(super) fn read_metrics_from_serde_value( + json_value: serde_json::Value, +) -> anyhow::Result> { + if NewMetricsRoot::is_v2_metrics(&json_value) { + let root = serde_json::from_value::(json_value)?; + Ok(root.metrics) + } else { + let all_metrics = serde_json::from_value::>(json_value)?; + let all_metrics = all_metrics + .into_iter() + .map(|(key, (event_type, value))| NewRawMetric { + key, + kind: event_type, + value, + }) + .collect(); + Ok(all_metrics) + } +} pub(super) async fn read_metrics_from_disk( path: Arc, -) -> anyhow::Result> { +) -> anyhow::Result> { // do not add context to each error, callsite will log with full path let span = tracing::Span::current(); tokio::task::spawn_blocking(move || { @@ -20,7 +42,8 @@ pub(super) async fn read_metrics_from_disk( let mut file = std::fs::File::open(&*path)?; let reader = std::io::BufReader::new(&mut file); - anyhow::Ok(serde_json::from_reader::<_, Vec>(reader)?) + let json_value = serde_json::from_reader::<_, serde_json::Value>(reader)?; + read_metrics_from_serde_value(json_value) }) .await .context("read metrics join error") @@ -63,7 +86,7 @@ fn scan_and_delete_with_same_prefix(path: &Utf8Path) -> std::io::Result<()> { } pub(super) async fn flush_metrics_to_disk( - current_metrics: &Arc>, + current_metrics: &Arc>, path: &Arc, ) -> anyhow::Result<()> { use std::io::Write; @@ -93,8 +116,11 @@ pub(super) async fn flush_metrics_to_disk( // write out all of the raw metrics, to be read out later on restart as cached values { let mut writer = std::io::BufWriter::new(&mut tempfile); - serde_json::to_writer(&mut writer, &*current_metrics) - .context("serialize metrics")?; + serde_json::to_writer( + &mut writer, + &NewMetricsRefRoot::new(current_metrics.as_ref()), + ) + .context("serialize metrics")?; writer .into_inner() .map_err(|_| anyhow::anyhow!("flushing metrics failed"))?; diff --git a/pageserver/src/consumption_metrics/metrics.rs b/pageserver/src/consumption_metrics/metrics.rs index 7ba2d04c4f..07fac09f6f 100644 --- a/pageserver/src/consumption_metrics/metrics.rs +++ b/pageserver/src/consumption_metrics/metrics.rs @@ -9,7 +9,7 @@ use utils::{ lsn::Lsn, }; -use super::{Cache, RawMetric}; +use super::{Cache, NewRawMetric}; /// Name of the metric, used by `MetricsKey` factory methods and `deserialize_cached_events` /// instead of static str. 
@@ -64,11 +64,21 @@ impl MetricsKey { struct AbsoluteValueFactory(MetricsKey); impl AbsoluteValueFactory { - const fn at(self, time: DateTime, val: u64) -> RawMetric { + #[cfg(test)] + const fn at_old_format(self, time: DateTime, val: u64) -> super::RawMetric { let key = self.0; (key, (EventType::Absolute { time }, val)) } + const fn at(self, time: DateTime, val: u64) -> NewRawMetric { + let key = self.0; + NewRawMetric { + key, + kind: EventType::Absolute { time }, + value: val, + } + } + fn key(&self) -> &MetricsKey { &self.0 } @@ -84,7 +94,28 @@ impl IncrementalValueFactory { prev_end: DateTime, up_to: DateTime, val: u64, - ) -> RawMetric { + ) -> NewRawMetric { + let key = self.0; + // cannot assert prev_end < up_to because these are realtime clock based + let when = EventType::Incremental { + start_time: prev_end, + stop_time: up_to, + }; + NewRawMetric { + key, + kind: when, + value: val, + } + } + + #[allow(clippy::wrong_self_convention)] + #[cfg(test)] + const fn from_until_old_format( + self, + prev_end: DateTime, + up_to: DateTime, + val: u64, + ) -> super::RawMetric { let key = self.0; // cannot assert prev_end < up_to because these are realtime clock based let when = EventType::Incremental { @@ -185,7 +216,7 @@ pub(super) async fn collect_all_metrics( tenant_manager: &Arc, cached_metrics: &Cache, ctx: &RequestContext, -) -> Vec { +) -> Vec { use pageserver_api::models::TenantState; let started_at = std::time::Instant::now(); @@ -220,11 +251,11 @@ pub(super) async fn collect_all_metrics( res } -async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec +async fn collect(tenants: S, cache: &Cache, ctx: &RequestContext) -> Vec where S: futures::stream::Stream)>, { - let mut current_metrics: Vec = Vec::new(); + let mut current_metrics: Vec = Vec::new(); let mut tenants = std::pin::pin!(tenants); @@ -291,7 +322,7 @@ impl TenantSnapshot { tenant_id: TenantId, now: DateTime, cached: &Cache, - metrics: &mut Vec, + metrics: &mut Vec, ) { let remote_size = MetricsKey::remote_storage_size(tenant_id).at(now, self.remote_size); @@ -302,9 +333,9 @@ impl TenantSnapshot { let mut synthetic_size = self.synthetic_size; if synthetic_size == 0 { - if let Some((_, value)) = cached.get(factory.key()) { - // use the latest value from previous session - synthetic_size = *value; + if let Some(item) = cached.get(factory.key()) { + // use the latest value from previous session, TODO: check generation number + synthetic_size = item.value; } } @@ -381,37 +412,36 @@ impl TimelineSnapshot { tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, - metrics: &mut Vec, + metrics: &mut Vec, cache: &Cache, ) { let timeline_written_size = u64::from(self.last_record_lsn); let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id); - let last_stop_time = cache - .get(written_size_delta_key.key()) - .map(|(until, _val)| { - until - .incremental_timerange() - .expect("never create EventType::Absolute for written_size_delta") - .end - }); + let last_stop_time = cache.get(written_size_delta_key.key()).map(|item| { + item.kind + .incremental_timerange() + .expect("never create EventType::Absolute for written_size_delta") + .end + }); - let (key, written_size_now) = + let written_size_now = MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size); // by default, use the last sent written_size as the basis for // calculating the delta. if we don't yet have one, use the load time value. 
- let prev = cache - .get(&key) - .map(|(prev_at, prev)| { + let prev: (DateTime, u64) = cache + .get(&written_size_now.key) + .map(|item| { // use the prev time from our last incremental update, or default to latest // absolute update on the first round. - let prev_at = prev_at + let prev_at = item + .kind .absolute_time() .expect("never create EventType::Incremental for written_size"); let prev_at = last_stop_time.unwrap_or(prev_at); - (*prev_at, *prev) + (*prev_at, item.value) }) .unwrap_or_else(|| { // if we don't have a previous point of comparison, compare to the load time @@ -422,24 +452,28 @@ impl TimelineSnapshot { let up_to = now; - if let Some(delta) = written_size_now.1.checked_sub(prev.1) { + if let Some(delta) = written_size_now.value.checked_sub(prev.1) { let key_value = written_size_delta_key.from_until(prev.0, up_to, delta); // written_size_delta metrics.push(key_value); // written_size - metrics.push((key, written_size_now)); + metrics.push(written_size_now); } else { // the cached value was ahead of us, report zero until we've caught up metrics.push(written_size_delta_key.from_until(prev.0, up_to, 0)); // the cached value was ahead of us, report the same until we've caught up - metrics.push((key, (written_size_now.0, prev.1))); + metrics.push(NewRawMetric { + key: written_size_now.key, + kind: written_size_now.kind, + value: prev.1, + }); } { let factory = MetricsKey::timeline_logical_size(tenant_id, timeline_id); let current_or_previous = self .current_exact_logical_size - .or_else(|| cache.get(factory.key()).map(|(_, val)| *val)); + .or_else(|| cache.get(factory.key()).map(|item| item.value)); if let Some(size) = current_or_previous { metrics.push(factory.at(now, size)); @@ -452,4 +486,4 @@ impl TimelineSnapshot { mod tests; #[cfg(test)] -pub(crate) use tests::metric_examples; +pub(crate) use tests::{metric_examples, metric_examples_old}; diff --git a/pageserver/src/consumption_metrics/metrics/tests.rs b/pageserver/src/consumption_metrics/metrics/tests.rs index f9cbcea565..3ed7b44123 100644 --- a/pageserver/src/consumption_metrics/metrics/tests.rs +++ b/pageserver/src/consumption_metrics/metrics/tests.rs @@ -1,3 +1,5 @@ +use crate::consumption_metrics::RawMetric; + use super::*; use std::collections::HashMap; @@ -50,9 +52,9 @@ fn startup_collected_timeline_metrics_second_round() { let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); let mut metrics = Vec::new(); - let cache = HashMap::from([ - MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0) - ]); + let cache = HashMap::from([MetricsKey::written_size(tenant_id, timeline_id) + .at(before, disk_consistent_lsn.0) + .to_kv_pair()]); let snap = TimelineSnapshot { loaded_at: (disk_consistent_lsn, init), @@ -89,9 +91,13 @@ fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { let mut metrics = Vec::new(); let cache = HashMap::from([ // at t=before was the last time the last_record_lsn changed - MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0), + MetricsKey::written_size(tenant_id, timeline_id) + .at(before, disk_consistent_lsn.0) + .to_kv_pair(), // end time of this event is used for the next ones - MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, just_before, 0), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until(before, just_before, 0) + .to_kv_pair(), ]); let snap = TimelineSnapshot { @@ -138,13 +144,17 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { }; let mut cache = HashMap::from([ 
- MetricsKey::written_size(tenant_id, timeline_id).at(before_restart, 100), - MetricsKey::written_size_delta(tenant_id, timeline_id).from_until( - way_before, - before_restart, - // not taken into account, but the timestamps are important - 999_999_999, - ), + MetricsKey::written_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair(), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until( + way_before, + before_restart, + // not taken into account, but the timestamps are important + 999_999_999, + ) + .to_kv_pair(), ]); let mut metrics = Vec::new(); @@ -163,7 +173,7 @@ fn post_restart_written_sizes_with_rolled_back_last_record_lsn() { ); // now if we cache these metrics, and re-run while "still in recovery" - cache.extend(metrics.drain(..)); + cache.extend(metrics.drain(..).map(|x| x.to_kv_pair())); // "still in recovery", because our snapshot did not change snap.to_metrics(tenant_id, timeline_id, later, &mut metrics, &cache); @@ -194,14 +204,14 @@ fn post_restart_current_exact_logical_size_uses_cached() { current_exact_logical_size: None, }; - let cache = HashMap::from([ - MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(before_restart, 100) - ]); + let cache = HashMap::from([MetricsKey::timeline_logical_size(tenant_id, timeline_id) + .at(before_restart, 100) + .to_kv_pair()]); let mut metrics = Vec::new(); snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); - metrics.retain(|(key, _)| key.metric == Name::LogicalSize); + metrics.retain(|item| item.key.metric == Name::LogicalSize); assert_eq!( metrics, @@ -224,7 +234,9 @@ fn post_restart_synthetic_size_uses_cached_if_available() { let before_restart = DateTime::::from(now - std::time::Duration::from_secs(5 * 60)); let now = DateTime::::from(now); - let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id).at(before_restart, 1000)]); + let cached = HashMap::from([MetricsKey::synthetic_size(tenant_id) + .at(before_restart, 1000) + .to_kv_pair()]); let mut metrics = Vec::new(); ts.to_metrics(tenant_id, now, &cached, &mut metrics); @@ -278,12 +290,29 @@ fn time_backwards() -> [std::time::SystemTime; N] { times } -pub(crate) const fn metric_examples( +pub(crate) const fn metric_examples_old( tenant_id: TenantId, timeline_id: TimelineId, now: DateTime, before: DateTime, ) -> [RawMetric; 6] { + [ + MetricsKey::written_size(tenant_id, timeline_id).at_old_format(now, 0), + MetricsKey::written_size_delta(tenant_id, timeline_id) + .from_until_old_format(before, now, 0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at_old_format(now, 0), + MetricsKey::remote_storage_size(tenant_id).at_old_format(now, 0), + MetricsKey::resident_size(tenant_id).at_old_format(now, 0), + MetricsKey::synthetic_size(tenant_id).at_old_format(now, 1), + ] +} + +pub(crate) const fn metric_examples( + tenant_id: TenantId, + timeline_id: TimelineId, + now: DateTime, + before: DateTime, +) -> [NewRawMetric; 6] { [ MetricsKey::written_size(tenant_id, timeline_id).at(now, 0), MetricsKey::written_size_delta(tenant_id, timeline_id).from_until(before, now, 0), diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 1eb25d337b..1cb4e917c0 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -7,7 +7,7 @@ use tokio::io::AsyncWriteExt; use tokio_util::sync::CancellationToken; use tracing::Instrument; -use super::{metrics::Name, Cache, MetricsKey, RawMetric}; +use super::{metrics::Name, Cache, 
MetricsKey, NewRawMetric, RawMetric}; use utils::id::{TenantId, TimelineId}; /// How the metrics from pageserver are identified. @@ -24,7 +24,7 @@ pub(super) async fn upload_metrics_http( client: &reqwest::Client, metric_collection_endpoint: &reqwest::Url, cancel: &CancellationToken, - metrics: &[RawMetric], + metrics: &[NewRawMetric], cached_metrics: &mut Cache, idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { @@ -53,8 +53,8 @@ pub(super) async fn upload_metrics_http( match res { Ok(()) => { - for (curr_key, curr_val) in chunk { - cached_metrics.insert(*curr_key, *curr_val); + for item in chunk { + cached_metrics.insert(item.key, item.clone()); } uploaded += chunk.len(); } @@ -86,7 +86,7 @@ pub(super) async fn upload_metrics_bucket( client: &GenericRemoteStorage, cancel: &CancellationToken, node_id: &str, - metrics: &[RawMetric], + metrics: &[NewRawMetric], idempotency_keys: &[IdempotencyKey<'_>], ) -> anyhow::Result<()> { if metrics.is_empty() { @@ -140,16 +140,16 @@ pub(super) async fn upload_metrics_bucket( /// across different metrics sinks), and must have the same length as input. fn serialize_in_chunks<'a>( chunk_size: usize, - input: &'a [RawMetric], + input: &'a [NewRawMetric], idempotency_keys: &'a [IdempotencyKey<'a>], -) -> impl ExactSizeIterator> + 'a +) -> impl ExactSizeIterator> + 'a { use bytes::BufMut; assert_eq!(input.len(), idempotency_keys.len()); struct Iter<'a> { - inner: std::slice::Chunks<'a, RawMetric>, + inner: std::slice::Chunks<'a, NewRawMetric>, idempotency_keys: std::slice::Iter<'a, IdempotencyKey<'a>>, chunk_size: usize, @@ -160,7 +160,7 @@ fn serialize_in_chunks<'a>( } impl<'a> Iterator for Iter<'a> { - type Item = Result<(&'a [RawMetric], bytes::Bytes), serde_json::Error>; + type Item = Result<(&'a [NewRawMetric], bytes::Bytes), serde_json::Error>; fn next(&mut self) -> Option { let chunk = self.inner.next()?; @@ -269,6 +269,58 @@ impl RawMetricExt for RawMetric { } } +impl RawMetricExt for NewRawMetric { + fn as_event(&self, key: &IdempotencyKey<'_>) -> Event { + let MetricsKey { + metric, + tenant_id, + timeline_id, + } = self.key; + + let kind = self.kind; + let value = self.value; + + Event { + kind, + metric, + idempotency_key: key.to_string(), + value, + extra: Ids { + tenant_id, + timeline_id, + }, + } + } + + fn update_in_place(&self, event: &mut Event, key: &IdempotencyKey<'_>) { + use std::fmt::Write; + + let MetricsKey { + metric, + tenant_id, + timeline_id, + } = self.key; + + let kind = self.kind; + let value = self.value; + + *event = Event { + kind, + metric, + idempotency_key: { + event.idempotency_key.clear(); + write!(event.idempotency_key, "{key}").unwrap(); + std::mem::take(&mut event.idempotency_key) + }, + value, + extra: Ids { + tenant_id, + timeline_id, + }, + }; + } +} + pub(crate) trait KeyGen<'a> { fn generate(&self) -> IdempotencyKey<'a>; } @@ -381,6 +433,10 @@ async fn upload( #[cfg(test)] mod tests { + use crate::consumption_metrics::{ + disk_cache::read_metrics_from_serde_value, NewMetricsRefRoot, + }; + use super::*; use chrono::{DateTime, Utc}; use once_cell::sync::Lazy; @@ -473,23 +529,49 @@ mod tests { let idempotency_key = consumption_metrics::IdempotencyKey::for_tests(*SAMPLES_NOW, "1", 0); let examples = examples.into_iter().zip(metric_samples()); - for ((line, expected), (key, (kind, value))) in examples { + for ((line, expected), item) in examples { let e = consumption_metrics::Event { - kind, - metric: key.metric, + kind: item.kind, + metric: item.key.metric, idempotency_key: 
idempotency_key.to_string(), - value, + value: item.value, extra: Ids { - tenant_id: key.tenant_id, - timeline_id: key.timeline_id, + tenant_id: item.key.tenant_id, + timeline_id: item.key.timeline_id, }, }; let actual = serde_json::to_string(&e).unwrap(); - assert_eq!(expected, actual, "example for {kind:?} from line {line}"); + assert_eq!( + expected, actual, + "example for {:?} from line {line}", + item.kind + ); } } - fn metric_samples() -> [RawMetric; 6] { + #[test] + fn disk_format_upgrade() { + let old_samples_json = serde_json::to_value(metric_samples_old()).unwrap(); + let new_samples = + serde_json::to_value(NewMetricsRefRoot::new(metric_samples().as_ref())).unwrap(); + let upgraded_samples = read_metrics_from_serde_value(old_samples_json).unwrap(); + let new_samples = read_metrics_from_serde_value(new_samples).unwrap(); + assert_eq!(upgraded_samples, new_samples); + } + + fn metric_samples_old() -> [RawMetric; 6] { + let tenant_id = TenantId::from_array([0; 16]); + let timeline_id = TimelineId::from_array([0xff; 16]); + + let before = DateTime::parse_from_rfc3339("2023-09-14T00:00:00.123456789Z") + .unwrap() + .into(); + let [now, before] = [*SAMPLES_NOW, before]; + + super::super::metrics::metric_examples_old(tenant_id, timeline_id, now, before) + } + + fn metric_samples() -> [NewRawMetric; 6] { let tenant_id = TenantId::from_array([0; 16]); let timeline_id = TimelineId::from_array([0xff; 16]); From 65b69392ea156ff04a3b4fc1609ba7b990ddbe27 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 30 Oct 2024 19:37:09 +0100 Subject: [PATCH 135/239] Disallow offloaded children during timeline deletion (#9582) If we delete a timeline that has childen, those children will have their data corrupted. Therefore, extend the already existing safety check to offloaded timelines as well. Part of #8088 --- pageserver/src/tenant/timeline/delete.rs | 39 ++++++++++---------- pageserver/src/tenant/timeline/offload.rs | 4 +- test_runner/regress/test_timeline_archive.py | 7 ++++ 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 2c6161da15..b0c4fa2bc9 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -214,7 +214,8 @@ impl DeleteTimelineFlow { ) -> Result<(), DeleteTimelineError> { super::debug_assert_current_span_has_tenant_and_timeline_id(); - let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; + let allow_offloaded_children = false; + let (timeline, mut guard) = Self::prepare(tenant, timeline_id, allow_offloaded_children)?; guard.mark_in_progress()?; @@ -340,6 +341,7 @@ impl DeleteTimelineFlow { pub(super) fn prepare( tenant: &Tenant, timeline_id: TimelineId, + allow_offloaded_children: bool, ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { // Note the interaction between this guard and deletion guard. // Here we attempt to lock deletion guard when we're holding a lock on timelines. 
@@ -352,30 +354,27 @@ impl DeleteTimelineFlow { // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` let timelines = tenant.timelines.lock().unwrap(); + let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); let timeline = match timelines.get(&timeline_id) { Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), - None => { - let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); - match offloaded_timelines.get(&timeline_id) { - Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), - None => return Err(DeleteTimelineError::NotFound), - } - } + None => match timelines_offloaded.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + }, }; - // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will break child branches - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() == Some(timeline_id) { - Some(*id) - } else { - None - } - }) - .collect(); + // Ensure that there are no child timelines, because we are about to remove files, + // which will break child branches + let mut children = Vec::new(); + if !allow_offloaded_children { + children.extend(timelines_offloaded.iter().filter_map(|(id, entry)| { + (entry.ancestor_timeline_id == Some(timeline_id)).then_some(*id) + })); + } + children.extend(timelines.iter().filter_map(|(id, entry)| { + (entry.get_ancestor_timeline_id() == Some(timeline_id)).then_some(*id) + })); if !children.is_empty() { return Err(DeleteTimelineError::HasChildren(children)); diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 305c139b54..5b196cf8a7 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -12,7 +12,9 @@ pub(crate) async fn offload_timeline( debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); - let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; + let allow_offloaded_children = true; + let (timeline, guard) = + DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { tracing::error!("timeline already offloaded, but given timeline object"); diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 77efd7b749..3e9812c38a 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -213,6 +213,13 @@ def test_timeline_offloading(neon_env_builder: NeonEnvBuilder, manual_offload: b wait_until(30, 1, leaf_offloaded) wait_until(30, 1, parent_offloaded) + # Offloaded child timelines should still prevent deletion + with pytest.raises( + PageserverApiException, + match=f".* timeline which has child timelines: \\[{leaf_timeline_id}\\]", + ): + ps_http.timeline_delete(tenant_id, parent_timeline_id) + ps_http.timeline_archival_config( tenant_id, grandparent_timeline_id, From 411c3aa0d62a4d8a2e18b43dc03b677bf1969d66 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 31 Oct 2024 10:47:43 +0000 Subject: [PATCH 136/239] pageserver: lift decoding and interpreting of wal into wal_decoder (#9524) ## Problem Decoding and ingestion are still 
coupled in `pageserver::WalIngest`. ## Summary of changes A new type is added to `wal_decoder::models`, InterpretedWalRecord. This type contains everything that the pageserver requires in order to ingest a WAL record. The highlights are the `metadata_record` which is an optional special record type to be handled and `blocks` which stores key, value pairs to be persisted to storage. This type is produced by `wal_decoder::models::InterpretedWalRecord::from_bytes` from a raw PG wal record. The rest of this commit separates decoding and interpretation of the PG WAL record from its application in `WalIngest::ingest_record`. Related: https://github.com/neondatabase/neon/issues/9335 Epic: https://github.com/neondatabase/neon/issues/9329 --- libs/wal_decoder/src/decoder.rs | 969 +++++++++++++ libs/wal_decoder/src/models.rs | 44 + pageserver/src/import_datadir.rs | 23 +- .../walreceiver/walreceiver_connection.rs | 16 +- pageserver/src/walingest.rs | 1212 ++--------------- 5 files changed, 1182 insertions(+), 1082 deletions(-) diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 8b13789179..780fce3d69 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -1 +1,970 @@ +//! This module contains logic for decoding and interpreting +//! raw bytes which represent a raw Postgres WAL record. +use crate::models::*; +use bytes::{Buf, Bytes, BytesMut}; +use pageserver_api::key::rel_block_to_key; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::{RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::value::Value; +use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; +use postgres_ffi::walrecord::*; +use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; +use utils::lsn::Lsn; + +impl InterpretedWalRecord { + /// Decode and interpreted raw bytes which represent one Postgres WAL record. + /// Data blocks which do not match the provided shard identity are filtered out. + /// Shard 0 is a special case since it tracks all relation sizes. We only give it + /// the keys that are being written as that is enough for updating relation sizes. + pub fn from_bytes_filtered( + buf: Bytes, + shard: &ShardIdentity, + lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result { + let mut decoded = DecodedWALRecord::default(); + decode_wal_record(buf, &mut decoded, pg_version)?; + + let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) { + FlushUncommittedRecords::Yes + } else { + FlushUncommittedRecords::No + }; + + let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?; + + let mut blocks = Vec::default(); + for blk in decoded.blocks.iter() { + let rel = RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum, + }; + + let key = rel_block_to_key(rel, blk.blkno); + + if !key.is_valid_key_on_write_path() { + anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key); + } + + let key_is_local = shard.is_key_local(&key); + + tracing::debug!( + lsn=%lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); + + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. 
+ blocks.push((key.to_compact(), None)); + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. + // + let value = if blk.apply_image + && blk.has_image + && decoded.xl_rmid == pg_constants::RM_XLOG_ID + && (decoded.xl_info == pg_constants::XLOG_FPI + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + // compression of WAL is not yet supported: fall back to storing the original WAL record + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version) + // do not materialize null pages because them most likely be soon replaced with real data + && blk.bimg_len != 0 + { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. + // + if !page_is_new(&image) { + page_set_lsn(&mut image, lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + blocks.push((key.to_compact(), Some(value))); + } + + Ok(InterpretedWalRecord { + metadata_record, + blocks, + lsn, + flush_uncommitted, + xid: decoded.xl_xid, + }) + } +} + +impl MetadataRecord { + fn from_decoded( + decoded: &DecodedWALRecord, + lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result> { + // Note: this doesn't actually copy the bytes since + // the [`Bytes`] type implements it via a level of indirection. + let mut buf = decoded.record.clone(); + buf.advance(decoded.main_data_offset); + + match decoded.xl_rmid { + pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { + Self::decode_heapam_record(&mut buf, decoded, pg_version) + } + pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version), + // Handle other special record types + pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded), + pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version), + pg_constants::RM_TBLSPC_ID => { + tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); + Ok(None) + } + pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version), + pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn), + pg_constants::RM_MULTIXACT_ID => { + Self::decode_multixact_record(&mut buf, decoded, pg_version) + } + pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded), + // This is an odd duck. It needs to go to all shards. + // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY + // in WalIngest::new), we have to send the whole DecodedWalRecord::record to + // the pageserver and decode it there. + // + // Alternatively, one can make the checkpoint part of the subscription protocol + // to the pageserver. 
This should work fine, but can be done at a later point. + pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn), + pg_constants::RM_LOGICALMSG_ID => { + Self::decode_logical_message_record(&mut buf, decoded) + } + pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded), + pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded), + _unexpected => { + // TODO: consider failing here instead of blindly doing something without + // understanding the protocol + Ok(None) + } + } + } + + fn decode_heapam_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // Handle VM bit updates that are implicitly part of heap records. + + // First, look at the record to determine which VM bits need + // to be cleared. If either of these variables is set, we + // need to clear the corresponding bits in the visibility map. + let mut new_heap_blkno: Option = None; + let mut old_heap_blkno: Option = None; + let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + + match pg_version { + 14 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v14::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v14::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v14::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v14::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v14::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v14::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 15 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v15::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v15::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v15::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v15::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v15::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v15::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 16 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v16::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v16::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v16::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v16::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v16::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v16::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + 17 => { + if decoded.xl_rmid == pg_constants::RM_HEAP_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + if info == pg_constants::XLOG_HEAP_INSERT { + let xlrec = v17::XlHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_DELETE { + let xlrec = v17::XlHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_UPDATE + || info == pg_constants::XLOG_HEAP_HOT_UPDATE + { + let xlrec = v17::XlHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. + if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. 
+ new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP_LOCK { + let xlrec = v17::XlHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { + let xlrec = v17::XlHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { + let xlrec = v17::XlHeapLockUpdated::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + } else { + anyhow::bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); + } + } + _ => {} + } + + if new_heap_blkno.is_some() || old_heap_blkno.is_some() { + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, + spcnode: decoded.blocks[0].rnode_spcnode, + dbnode: decoded.blocks[0].rnode_dbnode, + relnode: decoded.blocks[0].rnode_relnode, + }; + + Ok(Some(MetadataRecord::Heapam(HeapamRecord::ClearVmBits( + ClearVmBits { + new_heap_blkno, + old_heap_blkno, + vm_rel, + flags, + }, + )))) + } else { + Ok(None) + } + } + + fn decode_neonmgr_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // Handle VM bit updates that are implicitly part of heap records. + + // First, look at the record to determine which VM bits need + // to be cleared. If either of these variables is set, we + // need to clear the corresponding bits in the visibility map. + let mut new_heap_blkno: Option = None; + let mut old_heap_blkno: Option = None; + let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; + + assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); + + match pg_version { + 16 | 17 => { + let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; + + match info { + pg_constants::XLOG_NEON_HEAP_INSERT => { + let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); + assert_eq!(0, buf.remaining()); + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_DELETE => { + let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); + if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_UPDATE + | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { + let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); + // the size of tuple data is inferred from the size of the record. + // we can't validate the remaining number of bytes without parsing + // the tuple data. 
+ if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); + } + if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { + // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a + // non-HOT update where the new tuple goes to different page than + // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is + // set. + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { + let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); + + let offset_array_len = + if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { + // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set + 0 + } else { + size_of::() * xlrec.ntuples as usize + }; + assert_eq!(offset_array_len, buf.remaining()); + + if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { + new_heap_blkno = Some(decoded.blocks[0].blkno); + } + } + pg_constants::XLOG_NEON_HEAP_LOCK => { + let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); + if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { + old_heap_blkno = Some(decoded.blocks[0].blkno); + flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; + } + } + info => anyhow::bail!("Unknown WAL record type for Neon RMGR: {}", info), + } + } + _ => anyhow::bail!( + "Neon RMGR has no known compatibility with PostgreSQL version {}", + pg_version + ), + } + + if new_heap_blkno.is_some() || old_heap_blkno.is_some() { + let vm_rel = RelTag { + forknum: VISIBILITYMAP_FORKNUM, + spcnode: decoded.blocks[0].rnode_spcnode, + dbnode: decoded.blocks[0].rnode_dbnode, + relnode: decoded.blocks[0].rnode_relnode, + }; + + Ok(Some(MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits( + ClearVmBits { + new_heap_blkno, + old_heap_blkno, + vm_rel, + flags, + }, + )))) + } else { + Ok(None) + } + } + + fn decode_smgr_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_SMGR_CREATE { + let create = XlSmgrCreate::decode(buf); + let rel = RelTag { + spcnode: create.rnode.spcnode, + dbnode: create.rnode.dbnode, + relnode: create.rnode.relnode, + forknum: create.forknum, + }; + + return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Create(SmgrCreate { + rel, + })))); + } else if info == pg_constants::XLOG_SMGR_TRUNCATE { + let truncate = XlSmgrTruncate::decode(buf); + return Ok(Some(MetadataRecord::Smgr(SmgrRecord::Truncate(truncate)))); + } + + Ok(None) + } + + fn decode_dbase_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + // TODO: Refactor this to avoid the duplication between postgres versions. 
+ + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + tracing::debug!(%info, %pg_version, "handle RM_DBASE_ID"); + + if pg_version == 14 { + if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { + let createdb = XlCreateDatabase::decode(buf); + tracing::debug!("XLOG_DBASE_CREATE v14"); + + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } else if pg_version == 15 { + if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } else if pg_version == 16 { + if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. + tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } else if pg_version == 17 { + if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { + tracing::debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { + // The XLOG record was renamed between v14 and v15, + // but the record format is the same. + // So we can reuse XlCreateDatabase here. 
+ tracing::debug!("XLOG_DBASE_CREATE_FILE_COPY"); + + let createdb = XlCreateDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Create(DbaseCreate { + db_id: createdb.db_id, + tablespace_id: createdb.tablespace_id, + src_db_id: createdb.src_db_id, + src_tablespace_id: createdb.src_tablespace_id, + })); + + return Ok(Some(record)); + } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { + let dropdb = XlDropDatabase::decode(buf); + let record = MetadataRecord::Dbase(DbaseRecord::Drop(DbaseDrop { + db_id: dropdb.db_id, + tablespace_ids: dropdb.tablespace_ids, + })); + + return Ok(Some(record)); + } + } + + Ok(None) + } + + fn decode_clog_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; + + if info == pg_constants::CLOG_ZEROPAGE { + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; + + Ok(Some(MetadataRecord::Clog(ClogRecord::ZeroPage( + ClogZeroPage { segno, rpageno }, + )))) + } else { + assert!(info == pg_constants::CLOG_TRUNCATE); + let xlrec = XlClogTruncate::decode(buf, pg_version); + + Ok(Some(MetadataRecord::Clog(ClogRecord::Truncate( + ClogTruncate { + pageno: xlrec.pageno, + oldest_xid: xlrec.oldest_xid, + oldest_xid_db: xlrec.oldest_xid_db, + }, + )))) + } + } + + fn decode_xact_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + lsn: Lsn, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; + let origin_id = decoded.origin_id; + let xl_xid = decoded.xl_xid; + + if info == pg_constants::XLOG_XACT_COMMIT { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::Commit(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + })))); + } else if info == pg_constants::XLOG_XACT_ABORT { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::Abort(XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + })))); + } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::CommitPrepared( + XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }, + )))); + } else if info == pg_constants::XLOG_XACT_ABORT_PREPARED { + let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); + return Ok(Some(MetadataRecord::Xact(XactRecord::AbortPrepared( + XactCommon { + parsed, + origin_id, + xl_xid, + lsn, + }, + )))); + } else if info == pg_constants::XLOG_XACT_PREPARE { + return Ok(Some(MetadataRecord::Xact(XactRecord::Prepare( + XactPrepare { + xl_xid: decoded.xl_xid, + data: Bytes::copy_from_slice(&buf[..]), + }, + )))); + } + + Ok(None) + } + + fn decode_multixact_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + pg_version: u32, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + + if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE + || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE + { + let pageno = if pg_version < 17 { + buf.get_u32_le() + } else { + buf.get_u64_le() as u32 + }; + let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; + let rpageno = pageno % 
pg_constants::SLRU_PAGES_PER_SEGMENT; + + let slru_kind = match info { + pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets, + pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers, + _ => unreachable!(), + }; + + return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::ZeroPage( + MultiXactZeroPage { + slru_kind, + segno, + rpageno, + }, + )))); + } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { + let xlrec = XlMultiXactCreate::decode(buf); + return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Create( + xlrec, + )))); + } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { + let xlrec = XlMultiXactTruncate::decode(buf); + return Ok(Some(MetadataRecord::MultiXact(MultiXactRecord::Truncate( + xlrec, + )))); + } + + Ok(None) + } + + fn decode_relmap_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let update = XlRelmapUpdate::decode(buf); + + let mut buf = decoded.record.clone(); + buf.advance(decoded.main_data_offset); + // skip xl_relmap_update + buf.advance(12); + + Ok(Some(MetadataRecord::Relmap(RelmapRecord::Update( + RelmapUpdate { + update, + buf: Bytes::copy_from_slice(&buf[..]), + }, + )))) + } + + fn decode_xlog_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + lsn: Lsn, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + Ok(Some(MetadataRecord::Xlog(XlogRecord::Raw(RawXlogRecord { + info, + lsn, + buf: buf.clone(), + })))) + } + + fn decode_logical_message_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_LOGICAL_MESSAGE { + let xlrec = XlLogicalMessage::decode(buf); + let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; + + #[cfg(feature = "testing")] + if prefix == "neon-test" { + return Ok(Some(MetadataRecord::LogicalMessage( + LogicalMessageRecord::Failpoint, + ))); + } + + if let Some(path) = prefix.strip_prefix("neon-file:") { + let buf_size = xlrec.prefix_size + xlrec.message_size; + let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]); + return Ok(Some(MetadataRecord::LogicalMessage( + LogicalMessageRecord::Put(PutLogicalMessage { + path: path.to_string(), + buf, + }), + ))); + } + } + + Ok(None) + } + + fn decode_standby_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_RUNNING_XACTS { + let xlrec = XlRunningXacts::decode(buf); + return Ok(Some(MetadataRecord::Standby(StandbyRecord::RunningXacts( + StandbyRunningXacts { + oldest_running_xid: xlrec.oldest_running_xid, + }, + )))); + } + + Ok(None) + } + + fn decode_replorigin_record( + buf: &mut Bytes, + decoded: &DecodedWALRecord, + ) -> anyhow::Result> { + let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; + if info == pg_constants::XLOG_REPLORIGIN_SET { + let xlrec = XlReploriginSet::decode(buf); + return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Set( + xlrec, + )))); + } else if info == pg_constants::XLOG_REPLORIGIN_DROP { + let xlrec = XlReploriginDrop::decode(buf); + return Ok(Some(MetadataRecord::Replorigin(ReploriginRecord::Drop( + xlrec, + )))); + } + + Ok(None) + } +} diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 58f8e1b2da..92b66fcefd 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -25,7 
+25,9 @@
 //! |--> write to KV store within the pageserver
 
 use bytes::Bytes;
+use pageserver_api::key::CompactKey;
 use pageserver_api::reltag::{RelTag, SlruKind};
+use pageserver_api::value::Value;
 use postgres_ffi::walrecord::{
     XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet,
     XlSmgrTruncate, XlXactParsedRecord,
@@ -33,6 +35,48 @@ use postgres_ffi::walrecord::{
 use postgres_ffi::{Oid, TransactionId};
 use utils::lsn::Lsn;
 
+pub enum FlushUncommittedRecords {
+    Yes,
+    No,
+}
+
+/// An interpreted Postgres WAL record, ready to be handled by the pageserver
+pub struct InterpretedWalRecord {
+    /// Optional metadata record - may cause writes to metadata keys
+    /// in the storage engine
+    pub metadata_record: Option<MetadataRecord>,
+    /// Images or deltas for blocks modified in the original WAL record.
+    /// The [`Value`] is optional to avoid sending superfluous data to
+    /// shard 0 for relation size tracking.
+    pub blocks: Vec<(CompactKey, Option<Value>)>,
+    /// Byte offset within WAL for the end of the original PG WAL record
+    pub lsn: Lsn,
+    /// Whether to flush all uncommitted modifications to the storage engine
+    /// before ingesting this record. This is currently only used for legacy PG
+    /// database creations which read pages from a template database. Such WAL
+    /// records require reading data blocks while ingesting, hence the need to flush.
+    pub flush_uncommitted: FlushUncommittedRecords,
+    /// Transaction id of the original PG WAL record
+    pub xid: TransactionId,
+}
+
+/// The interpreted part of the Postgres WAL record which requires metadata
+/// writes to the underlying storage engine.
+pub enum MetadataRecord {
+    Heapam(HeapamRecord),
+    Neonrmgr(NeonrmgrRecord),
+    Smgr(SmgrRecord),
+    Dbase(DbaseRecord),
+    Clog(ClogRecord),
+    Xact(XactRecord),
+    MultiXact(MultiXactRecord),
+    Relmap(RelmapRecord),
+    Xlog(XlogRecord),
+    LogicalMessage(LogicalMessageRecord),
+    Standby(StandbyRecord),
+    Replorigin(ReploriginRecord),
+}
+
 pub enum HeapamRecord {
     ClearVmBits(ClearVmBits),
 }
diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs
index 530c91c4da..06c4553e1c 100644
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -12,6 +12,7 @@ use pageserver_api::key::rel_block_to_key;
 use tokio::io::{AsyncRead, AsyncReadExt};
 use tokio_tar::Archive;
 use tracing::*;
+use wal_decoder::models::InterpretedWalRecord;
 use walkdir::WalkDir;
 
 use crate::context::RequestContext;
@@ -23,7 +24,6 @@ use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::pg_constants;
 use postgres_ffi::relfile_utils::*;
 use postgres_ffi::waldecoder::WalStreamDecoder;
-use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord};
 use postgres_ffi::ControlFileData;
 use postgres_ffi::DBState_DB_SHUTDOWNED;
 use postgres_ffi::Oid;
@@ -312,11 +312,15 @@ async fn import_wal(
     let mut modification = tline.begin_modification(last_lsn);
     while last_lsn <= endpoint {
         if let Some((lsn, recdata)) = waldecoder.poll_decode()?
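For orientation, here is a minimal sketch of how a consumer might inspect the `InterpretedWalRecord` defined in `libs/wal_decoder/src/models.rs` above. The `summarize` helper is hypothetical and not part of the patch; it only assumes the struct fields shown in the diff and the `wal_decoder::models` path that the import changes below use.

```rust
// Hypothetical helper, not part of the patch: summarizes what an interpreted
// record asks the pageserver to do, based only on the fields shown above.
use wal_decoder::models::InterpretedWalRecord;

/// Returns (block entries carrying a payload for this shard,
///          block entries kept only for shard 0 relation size tracking,
///          whether a metadata write is needed).
fn summarize(rec: &InterpretedWalRecord) -> (usize, usize, bool) {
    let with_payload = rec.blocks.iter().filter(|(_, value)| value.is_some()).count();
    let size_tracking_only = rec.blocks.len() - with_payload;
    let needs_metadata_write = rec.metadata_record.is_some();
    (with_payload, size_tracking_only, needs_metadata_write)
}
```

Entries whose `Value` is `None` exist only so that shard 0 can observe block numbers that may implicitly extend a relation, which is the case the `observe_decoded_block` path handles later in this patch.
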
{ - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + tline.get_shard_identity(), + lsn, + tline.pg_version, + )?; walingest - .ingest_record(decoded, lsn, &mut modification, ctx) + .ingest_record(interpreted, &mut modification, ctx) .await?; WAL_INGEST.records_committed.inc(); @@ -453,10 +457,15 @@ pub async fn import_wal_from_tar( let mut modification = tline.begin_modification(last_lsn); while last_lsn <= end_lsn { if let Some((lsn, recdata)) = waldecoder.poll_decode()? { - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, tline.pg_version)?; + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + tline.get_shard_identity(), + lsn, + tline.pg_version, + )?; + walingest - .ingest_record(decoded, lsn, &mut modification, ctx) + .ingest_record(interpreted, &mut modification, ctx) .await?; modification.commit(ctx).await?; last_lsn = lsn; diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index 739fadbc6b..eb19fb691f 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -22,6 +22,7 @@ use tokio::{select, sync::watch, time}; use tokio_postgres::{replication::ReplicationStream, Client}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, trace, warn, Instrument}; +use wal_decoder::models::{FlushUncommittedRecords, InterpretedWalRecord}; use super::TaskStateUpdate; use crate::{ @@ -35,7 +36,6 @@ use crate::{ use postgres_backend::is_expected_io_error; use postgres_connection::PgConnectionConfig; use postgres_ffi::waldecoder::WalStreamDecoder; -use postgres_ffi::walrecord::{decode_wal_record, DecodedWALRecord}; use utils::{id::NodeId, lsn::Lsn}; use utils::{pageserver_feedback::PageserverFeedback, sync::gate::GateError}; @@ -339,11 +339,15 @@ pub(super) async fn handle_walreceiver_connection( return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } - // Deserialize WAL record - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, modification.tline.pg_version)?; + // Deserialize and interpret WAL record + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + modification.tline.get_shard_identity(), + lsn, + modification.tline.pg_version, + )?; - if decoded.is_dbase_create_copy(timeline.pg_version) + if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 { // Special case: legacy PG database creations operate by reading pages from a 'template' database: @@ -360,7 +364,7 @@ pub(super) async fn handle_walreceiver_connection( // Ingest the records without immediately committing them. let ingested = walingest - .ingest_record(decoded, lsn, &mut modification, &ctx) + .ingest_record(interpreted, &mut modification, &ctx) .await .with_context(|| format!("could not ingest record at {lsn}"))?; if !ingested { diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 27b3f93845..84353970b7 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -3,17 +3,17 @@ //! //! The pipeline for ingesting WAL looks like this: //! -//! WAL receiver -> WalIngest -> Repository +//! WAL receiver -> [`wal_decoder`] -> WalIngest -> Repository //! -//! 
The WAL receiver receives a stream of WAL from the WAL safekeepers, -//! and decodes it to individual WAL records. It feeds the WAL records -//! to WalIngest, which parses them and stores them in the Repository. +//! The WAL receiver receives a stream of WAL from the WAL safekeepers. +//! Records get decoded and interpreted in the [`wal_decoder`] module +//! and then stored to the Repository by WalIngest. //! //! The neon Repository can store page versions in two formats: as -//! page images, or a WAL records. WalIngest::ingest_record() extracts -//! page images out of some WAL records, but most it stores as WAL +//! page images, or a WAL records. [`wal_decoder::models::InterpretedWalRecord::from_bytes_filtered`] +//! extracts page images out of some WAL records, but mostly it's WAL //! records. If a WAL record modifies multiple pages, WalIngest -//! will call Repository::put_wal_record or put_page_image functions +//! will call Repository::put_rel_wal_record or put_rel_page_image functions //! separately for each modified page. //! //! To reconstruct a page using a WAL record, the Repository calls the @@ -28,14 +28,15 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; +use pageserver_api::key::Key; use pageserver_api::shard::ShardIdentity; +use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::walrecord::*; use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; -use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn}; use wal_decoder::models::*; use anyhow::{bail, Context, Result}; -use bytes::{Buf, Bytes, BytesMut}; +use bytes::{Buf, Bytes}; use tracing::*; use utils::failpoint_support; use utils::rate_limit::RateLimit; @@ -50,10 +51,10 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::value::Value; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::TransactionId; -use postgres_ffi::BLCKSZ; use utils::bin_ser::SerializeError; use utils::lsn::Lsn; @@ -140,257 +141,161 @@ impl WalIngest { }) } - /// - /// Decode a PostgreSQL WAL record and store it in the repository, in the given timeline. + /// Ingest an interpreted PostgreSQL WAL record by doing writes to the underlying key value + /// storage of a given timeline. /// /// This function updates `lsn` field of `DatadirModification` /// - /// Helper function to parse a WAL record and call the Timeline's PUT functions for all the - /// relations/pages that the record affects. - /// /// This function returns `true` if the record was ingested, and `false` if it was filtered out pub async fn ingest_record( &mut self, - decoded: DecodedWALRecord, - lsn: Lsn, + interpreted: InterpretedWalRecord, modification: &mut DatadirModification<'_>, ctx: &RequestContext, ) -> anyhow::Result { WAL_INGEST.records_received.inc(); - let pg_version = modification.tline.pg_version; let prev_len = modification.len(); - modification.set_lsn(lsn)?; + modification.set_lsn(interpreted.lsn)?; - if decoded.is_dbase_create_copy(pg_version) { + if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) { // Records of this type should always be preceded by a commit(), as they // rely on reading data pages back from the Timeline. 
assert!(!modification.has_dirty_data_pages()); } - let mut buf = decoded.record.clone(); - buf.advance(decoded.main_data_offset); - assert!(!self.checkpoint_modified); - if decoded.xl_xid != pg_constants::INVALID_TRANSACTION_ID - && self.checkpoint.update_next_xid(decoded.xl_xid) + if interpreted.xid != pg_constants::INVALID_TRANSACTION_ID + && self.checkpoint.update_next_xid(interpreted.xid) { self.checkpoint_modified = true; } failpoint_support::sleep_millis_async!("wal-ingest-record-sleep"); - match decoded.xl_rmid { - pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => { - // Heap AM records need some special handling, because they modify VM pages - // without registering them with the standard mechanism. - let maybe_heapam_record = - Self::decode_heapam_record(&mut buf, &decoded, pg_version)?; - if let Some(heapam_record) = maybe_heapam_record { - match heapam_record { - HeapamRecord::ClearVmBits(clear_vm_bits) => { - self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) - .await?; - } - } - } - } - pg_constants::RM_NEON_ID => { - let maybe_nenonrmgr_record = - Self::decode_neonmgr_record(&mut buf, &decoded, pg_version)?; - if let Some(neonrmgr_record) = maybe_nenonrmgr_record { - match neonrmgr_record { - NeonrmgrRecord::ClearVmBits(clear_vm_bits) => { - self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) - .await?; - } - } - } - } - // Handle other special record types - pg_constants::RM_SMGR_ID => { - let maybe_smgr_record = - Self::decode_smgr_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(smgr_record) = maybe_smgr_record { - match smgr_record { - SmgrRecord::Create(create) => { - self.ingest_xlog_smgr_create(create, modification, ctx) - .await?; - } - SmgrRecord::Truncate(truncate) => { - self.ingest_xlog_smgr_truncate(truncate, modification, ctx) - .await?; - } - } - } - } - pg_constants::RM_DBASE_ID => { - let maybe_dbase_record = - Self::decode_dbase_record(&mut buf, &decoded, pg_version).unwrap(); - - if let Some(dbase_record) = maybe_dbase_record { - match dbase_record { - DbaseRecord::Create(create) => { - self.ingest_xlog_dbase_create(create, modification, ctx) - .await?; - } - DbaseRecord::Drop(drop) => { - self.ingest_xlog_dbase_drop(drop, modification, ctx).await?; - } - } - } - } - pg_constants::RM_TBLSPC_ID => { - trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet"); - } - pg_constants::RM_CLOG_ID => { - // [`Self::decode_clog_record`] may never fail and always returns. - // It has this interface to match all the other decoding methods. 
- let clog_record = Self::decode_clog_record(&mut buf, &decoded, pg_version) - .unwrap() - .unwrap(); - - match clog_record { - ClogRecord::ZeroPage(zero_page) => { - self.ingest_clog_zero_page(zero_page, modification, ctx) - .await?; - } - ClogRecord::Truncate(truncate) => { - self.ingest_clog_truncate(truncate, modification, ctx) - .await?; - } - } - } - pg_constants::RM_XACT_ID => { - let maybe_xact_record = - Self::decode_xact_record(&mut buf, &decoded, lsn, pg_version).unwrap(); - if let Some(xact_record) = maybe_xact_record { - self.ingest_xact_record(xact_record, modification, ctx) + match interpreted.metadata_record { + Some(MetadataRecord::Heapam(rec)) => match rec { + HeapamRecord::ClearVmBits(clear_vm_bits) => { + self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) .await?; } - } - pg_constants::RM_MULTIXACT_ID => { - let maybe_multixact_record = - Self::decode_multixact_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(multixact_record) = maybe_multixact_record { - match multixact_record { - MultiXactRecord::ZeroPage(zero_page) => { - self.ingest_multixact_zero_page(zero_page, modification, ctx) - .await?; - } - MultiXactRecord::Create(create) => { - self.ingest_multixact_create(modification, &create)?; - } - MultiXactRecord::Truncate(truncate) => { - self.ingest_multixact_truncate(modification, &truncate, ctx) - .await?; - } - } - } - } - pg_constants::RM_RELMAP_ID => { - let relmap_record = Self::decode_relmap_record(&mut buf, &decoded, pg_version) - .unwrap() - .unwrap(); - match relmap_record { - RelmapRecord::Update(update) => { - self.ingest_relmap_update(update, modification, ctx).await?; - } - } - } - // This is an odd duck. It needs to go to all shards. - // Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY - // in WalIngest::new), we have to send the whole DecodedWalRecord::record to - // the pageserver and decode it there. - // - // Alternatively, one can make the checkpoint part of the subscription protocol - // to the pageserver. This should work fine, but can be done at a later point. - pg_constants::RM_XLOG_ID => { - let xlog_record = Self::decode_xlog_record(&mut buf, &decoded, lsn, pg_version) - .unwrap() - .unwrap(); - - match xlog_record { - XlogRecord::Raw(raw) => { - self.ingest_raw_xlog_record(raw, modification, ctx).await?; - } - } - } - pg_constants::RM_LOGICALMSG_ID => { - let maybe_logical_message_record = - Self::decode_logical_message_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(logical_message_record) = maybe_logical_message_record { - match logical_message_record { - LogicalMessageRecord::Put(put) => { - self.ingest_logical_message_put(put, modification, ctx) - .await?; - } - #[cfg(feature = "testing")] - LogicalMessageRecord::Failpoint => { - // This is a convenient way to make the WAL ingestion pause at - // particular point in the WAL. For more fine-grained control, - // we could peek into the message and only pause if it contains - // a particular string, for example, but this is enough for now. 
- failpoint_support::sleep_millis_async!( - "pageserver-wal-ingest-logical-message-sleep" - ); - } - } - } - } - pg_constants::RM_STANDBY_ID => { - let maybe_standby_record = - Self::decode_standby_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(standby_record) = maybe_standby_record { - self.ingest_standby_record(standby_record).unwrap(); - } - } - pg_constants::RM_REPLORIGIN_ID => { - let maybe_replorigin_record = - Self::decode_replorigin_record(&mut buf, &decoded, pg_version).unwrap(); - if let Some(replorigin_record) = maybe_replorigin_record { - self.ingest_replorigin_record(replorigin_record, modification) + }, + Some(MetadataRecord::Neonrmgr(rec)) => match rec { + NeonrmgrRecord::ClearVmBits(clear_vm_bits) => { + self.ingest_clear_vm_bits(clear_vm_bits, modification, ctx) .await?; } + }, + Some(MetadataRecord::Smgr(rec)) => match rec { + SmgrRecord::Create(create) => { + self.ingest_xlog_smgr_create(create, modification, ctx) + .await?; + } + SmgrRecord::Truncate(truncate) => { + self.ingest_xlog_smgr_truncate(truncate, modification, ctx) + .await?; + } + }, + Some(MetadataRecord::Dbase(rec)) => match rec { + DbaseRecord::Create(create) => { + self.ingest_xlog_dbase_create(create, modification, ctx) + .await?; + } + DbaseRecord::Drop(drop) => { + self.ingest_xlog_dbase_drop(drop, modification, ctx).await?; + } + }, + Some(MetadataRecord::Clog(rec)) => match rec { + ClogRecord::ZeroPage(zero_page) => { + self.ingest_clog_zero_page(zero_page, modification, ctx) + .await?; + } + ClogRecord::Truncate(truncate) => { + self.ingest_clog_truncate(truncate, modification, ctx) + .await?; + } + }, + Some(MetadataRecord::Xact(rec)) => { + self.ingest_xact_record(rec, modification, ctx).await?; } - _x => { - // TODO: should probably log & fail here instead of blindly - // doing something without understanding the protocol + Some(MetadataRecord::MultiXact(rec)) => match rec { + MultiXactRecord::ZeroPage(zero_page) => { + self.ingest_multixact_zero_page(zero_page, modification, ctx) + .await?; + } + MultiXactRecord::Create(create) => { + self.ingest_multixact_create(modification, &create)?; + } + MultiXactRecord::Truncate(truncate) => { + self.ingest_multixact_truncate(modification, &truncate, ctx) + .await?; + } + }, + Some(MetadataRecord::Relmap(rec)) => match rec { + RelmapRecord::Update(update) => { + self.ingest_relmap_update(update, modification, ctx).await?; + } + }, + Some(MetadataRecord::Xlog(rec)) => match rec { + XlogRecord::Raw(raw) => { + self.ingest_raw_xlog_record(raw, modification, ctx).await?; + } + }, + Some(MetadataRecord::LogicalMessage(rec)) => match rec { + LogicalMessageRecord::Put(put) => { + self.ingest_logical_message_put(put, modification, ctx) + .await?; + } + #[cfg(feature = "testing")] + LogicalMessageRecord::Failpoint => { + // This is a convenient way to make the WAL ingestion pause at + // particular point in the WAL. For more fine-grained control, + // we could peek into the message and only pause if it contains + // a particular string, for example, but this is enough for now. + failpoint_support::sleep_millis_async!( + "pageserver-wal-ingest-logical-message-sleep" + ); + } + }, + Some(MetadataRecord::Standby(rec)) => { + self.ingest_standby_record(rec).unwrap(); + } + Some(MetadataRecord::Replorigin(rec)) => { + self.ingest_replorigin_record(rec, modification).await?; + } + None => { + // There are two cases through which we end up here: + // 1. The resource manager for the original PG WAL record + // is [`pg_constants::RM_TBLSPC_ID`]. 
This is not a supported + // record type within Neon. + // 2. The resource manager id was unknown to + // [`wal_decoder::decoder::MetadataRecord::from_decoded`]. + // TODO(vlad): Tighten this up more once we build confidence + // that case (2) does not happen in the field. } } - // Iterate through all the blocks that the record modifies, and - // "put" a separate copy of the record for each block. - for blk in decoded.blocks.iter() { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - - let key = rel_block_to_key(rel, blk.blkno); - let key_is_local = self.shard.is_key_local(&key); - - tracing::debug!( - lsn=%lsn, - key=%key, - "ingest: shard decision {} (checkpoint={})", - if !key_is_local { "drop" } else { "keep" }, - self.checkpoint_modified - ); - - if !key_is_local { - if self.shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe - // its blkno in case it implicitly extends a relation. - self.observe_decoded_block(modification, blk, ctx).await?; + // Iterate through all the key value pairs provided in the interpreted block + // and update the modification currently in-flight to include them. + for (compact_key, maybe_value) in interpreted.blocks.into_iter() { + let (rel, blk) = Key::from_compact(compact_key).to_rel_block()?; + match maybe_value { + Some(Value::Image(img)) => { + self.put_rel_page_image(modification, rel, blk, img, ctx) + .await?; + } + Some(Value::WalRecord(rec)) => { + self.put_rel_wal_record(modification, rel, blk, rec, ctx) + .await?; + } + None => { + // Shard 0 tracks relation sizes. We will observe + // its blkno in case it implicitly extends a relation. + assert!(self.shard.is_shard_zero()); + self.observe_decoded_block(modification, rel, blk, ctx) + .await?; } - - continue; } - self.ingest_decoded_block(modification, lsn, &decoded, blk, ctx) - .await?; } // If checkpoint data was updated, store the new version in the repository @@ -433,82 +338,11 @@ impl WalIngest { async fn observe_decoded_block( &mut self, modification: &mut DatadirModification<'_>, - blk: &DecodedBkpBlock, + rel: RelTag, + blkno: BlockNumber, ctx: &RequestContext, ) -> Result<(), PageReconstructError> { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - self.handle_rel_extend(modification, rel, blk.blkno, ctx) - .await - } - - async fn ingest_decoded_block( - &mut self, - modification: &mut DatadirModification<'_>, - lsn: Lsn, - decoded: &DecodedWALRecord, - blk: &DecodedBkpBlock, - ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - - // - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. 
- // - if blk.apply_image - && blk.has_image - && decoded.xl_rmid == pg_constants::RM_XLOG_ID - && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) - // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, modification.tline.pg_version) - // do not materialize null pages because them most likely be soon replaced with real data - && blk.bimg_len != 0 - { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. - // - if !page_is_new(&image) { - page_set_lsn(&mut image, lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - self.put_rel_page_image(modification, rel, blk.blkno, image.freeze(), ctx) - .await?; - } else { - let rec = NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }; - self.put_rel_wal_record(modification, rel, blk.blkno, rec, ctx) - .await?; - } - Ok(()) + self.handle_rel_extend(modification, rel, blkno, ctx).await } async fn ingest_clear_vm_bits( @@ -599,413 +433,6 @@ impl WalIngest { Ok(()) } - fn decode_heapam_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - // Handle VM bit updates that are implicitly part of heap records. - - // First, look at the record to determine which VM bits need - // to be cleared. If either of these variables is set, we - // need to clear the corresponding bits in the visibility map. - let mut new_heap_blkno: Option = None; - let mut old_heap_blkno: Option = None; - let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - - match pg_version { - 14 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v14::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v14::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v14::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v14::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v14::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v14::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - 15 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v15::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v15::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v15::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v15::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v15::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v15::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - 16 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v16::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v16::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v16::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v16::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v16::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v16::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - 17 => { - if decoded.xl_rmid == pg_constants::RM_HEAP_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - if info == pg_constants::XLOG_HEAP_INSERT { - let xlrec = v17::XlHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_DELETE { - let xlrec = v17::XlHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_UPDATE - || info == pg_constants::XLOG_HEAP_HOT_UPDATE - { - let xlrec = v17::XlHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. - if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. 
- new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP_LOCK { - let xlrec = v17::XlHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else if decoded.xl_rmid == pg_constants::RM_HEAP2_ID { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - if info == pg_constants::XLOG_HEAP2_MULTI_INSERT { - let xlrec = v17::XlHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } else if info == pg_constants::XLOG_HEAP2_LOCK_UPDATED { - let xlrec = v17::XlHeapLockUpdated::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - } else { - bail!("Unknown RMGR {} for Heap decoding", decoded.xl_rmid); - } - } - _ => {} - } - - if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_rel = RelTag { - forknum: VISIBILITYMAP_FORKNUM, - spcnode: decoded.blocks[0].rnode_spcnode, - dbnode: decoded.blocks[0].rnode_dbnode, - relnode: decoded.blocks[0].rnode_relnode, - }; - - Ok(Some(HeapamRecord::ClearVmBits(ClearVmBits { - new_heap_blkno, - old_heap_blkno, - vm_rel, - flags, - }))) - } else { - Ok(None) - } - } - - fn decode_neonmgr_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - // Handle VM bit updates that are implicitly part of heap records. - - // First, look at the record to determine which VM bits need - // to be cleared. If either of these variables is set, we - // need to clear the corresponding bits in the visibility map. - let mut new_heap_blkno: Option = None; - let mut old_heap_blkno: Option = None; - let mut flags = pg_constants::VISIBILITYMAP_VALID_BITS; - - assert_eq!(decoded.xl_rmid, pg_constants::RM_NEON_ID); - - match pg_version { - 16 | 17 => { - let info = decoded.xl_info & pg_constants::XLOG_HEAP_OPMASK; - - match info { - pg_constants::XLOG_NEON_HEAP_INSERT => { - let xlrec = v17::rm_neon::XlNeonHeapInsert::decode(buf); - assert_eq!(0, buf.remaining()); - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_DELETE => { - let xlrec = v17::rm_neon::XlNeonHeapDelete::decode(buf); - if (xlrec.flags & pg_constants::XLH_DELETE_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_UPDATE - | pg_constants::XLOG_NEON_HEAP_HOT_UPDATE => { - let xlrec = v17::rm_neon::XlNeonHeapUpdate::decode(buf); - // the size of tuple data is inferred from the size of the record. - // we can't validate the remaining number of bytes without parsing - // the tuple data. 
- if (xlrec.flags & pg_constants::XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks.last().unwrap().blkno); - } - if (xlrec.flags & pg_constants::XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED) != 0 { - // PostgreSQL only uses XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED on a - // non-HOT update where the new tuple goes to different page than - // the old one. Otherwise, only XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED is - // set. - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_MULTI_INSERT => { - let xlrec = v17::rm_neon::XlNeonHeapMultiInsert::decode(buf); - - let offset_array_len = - if decoded.xl_info & pg_constants::XLOG_HEAP_INIT_PAGE > 0 { - // the offsets array is omitted if XLOG_HEAP_INIT_PAGE is set - 0 - } else { - size_of::() * xlrec.ntuples as usize - }; - assert_eq!(offset_array_len, buf.remaining()); - - if (xlrec.flags & pg_constants::XLH_INSERT_ALL_VISIBLE_CLEARED) != 0 { - new_heap_blkno = Some(decoded.blocks[0].blkno); - } - } - pg_constants::XLOG_NEON_HEAP_LOCK => { - let xlrec = v17::rm_neon::XlNeonHeapLock::decode(buf); - if (xlrec.flags & pg_constants::XLH_LOCK_ALL_FROZEN_CLEARED) != 0 { - old_heap_blkno = Some(decoded.blocks[0].blkno); - flags = pg_constants::VISIBILITYMAP_ALL_FROZEN; - } - } - info => bail!("Unknown WAL record type for Neon RMGR: {}", info), - } - } - _ => bail!( - "Neon RMGR has no known compatibility with PostgreSQL version {}", - pg_version - ), - } - - if new_heap_blkno.is_some() || old_heap_blkno.is_some() { - let vm_rel = RelTag { - forknum: VISIBILITYMAP_FORKNUM, - spcnode: decoded.blocks[0].rnode_spcnode, - dbnode: decoded.blocks[0].rnode_dbnode, - relnode: decoded.blocks[0].rnode_relnode, - }; - - Ok(Some(NeonrmgrRecord::ClearVmBits(ClearVmBits { - new_heap_blkno, - old_heap_blkno, - vm_rel, - flags, - }))) - } else { - Ok(None) - } - } - /// Subroutine of ingest_record(), to handle an XLOG_DBASE_CREATE record. async fn ingest_xlog_dbase_create( &mut self, @@ -1122,125 +549,6 @@ impl WalIngest { Ok(()) } - fn decode_dbase_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - // TODO: Refactor this to avoid the duplication between postgres versions. - - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - debug!(%info, %pg_version, "handle RM_DBASE_ID"); - - if pg_version == 14 { - if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE { - let createdb = XlCreateDatabase::decode(buf); - debug!("XLOG_DBASE_CREATE v14"); - - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } else if pg_version == 15 { - if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. 
- debug!("XLOG_DBASE_CREATE_FILE_COPY"); - - let createdb = XlCreateDatabase::decode(buf); - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } else if pg_version == 16 { - if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - - let createdb = XlCreateDatabase::decode(buf); - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } else if pg_version == 17 { - if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_WAL_LOG { - debug!("XLOG_DBASE_CREATE_WAL_LOG: noop"); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_CREATE_FILE_COPY { - // The XLOG record was renamed between v14 and v15, - // but the record format is the same. - // So we can reuse XlCreateDatabase here. - debug!("XLOG_DBASE_CREATE_FILE_COPY"); - - let createdb = XlCreateDatabase::decode(buf); - let record = DbaseRecord::Create(DbaseCreate { - db_id: createdb.db_id, - tablespace_id: createdb.tablespace_id, - src_db_id: createdb.src_db_id, - src_tablespace_id: createdb.src_tablespace_id, - }); - - return Ok(Some(record)); - } else if info == postgres_ffi::v17::bindings::XLOG_DBASE_DROP { - let dropdb = XlDropDatabase::decode(buf); - let record = DbaseRecord::Drop(DbaseDrop { - db_id: dropdb.db_id, - tablespace_ids: dropdb.tablespace_ids, - }); - - return Ok(Some(record)); - } - } - - Ok(None) - } - async fn ingest_xlog_smgr_create( &mut self, create: SmgrCreate, @@ -1252,30 +560,6 @@ impl WalIngest { Ok(()) } - fn decode_smgr_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_SMGR_CREATE { - let create = XlSmgrCreate::decode(buf); - let rel = RelTag { - spcnode: create.rnode.spcnode, - dbnode: create.rnode.dbnode, - relnode: create.rnode.relnode, - forknum: create.forknum, - }; - - return Ok(Some(SmgrRecord::Create(SmgrCreate { rel }))); - } else if info == pg_constants::XLOG_SMGR_TRUNCATE { - let truncate = XlSmgrTruncate::decode(buf); - return Ok(Some(SmgrRecord::Truncate(truncate))); - } - - Ok(None) - } - /// Subroutine of ingest_record(), to handle an XLOG_SMGR_TRUNCATE record. /// /// This is the same logic as in PostgreSQL's smgr_redo() function. 
@@ -1535,59 +819,6 @@ impl WalIngest { Ok(()) } - // TODO(vlad): Standardise interface for `decode_...` - fn decode_xact_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - lsn: Lsn, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK; - let origin_id = decoded.origin_id; - let xl_xid = decoded.xl_xid; - - if info == pg_constants::XLOG_XACT_COMMIT { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::Commit(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_ABORT { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::Abort(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::CommitPrepared(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_ABORT_PREPARED { - let parsed = XlXactParsedRecord::decode(buf, decoded.xl_xid, decoded.xl_info); - return Ok(Some(XactRecord::AbortPrepared(XactCommon { - parsed, - origin_id, - xl_xid, - lsn, - }))); - } else if info == pg_constants::XLOG_XACT_PREPARE { - return Ok(Some(XactRecord::Prepare(XactPrepare { - xl_xid: decoded.xl_xid, - data: Bytes::copy_from_slice(&buf[..]), - }))); - } - - Ok(None) - } - async fn ingest_clog_truncate( &mut self, truncate: ClogTruncate, @@ -1681,35 +912,6 @@ impl WalIngest { .await } - fn decode_clog_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK; - - if info == pg_constants::CLOG_ZEROPAGE { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - Ok(Some(ClogRecord::ZeroPage(ClogZeroPage { segno, rpageno }))) - } else { - assert!(info == pg_constants::CLOG_TRUNCATE); - let xlrec = XlClogTruncate::decode(buf, pg_version); - - Ok(Some(ClogRecord::Truncate(ClogTruncate { - pageno: xlrec.pageno, - oldest_xid: xlrec.oldest_xid, - oldest_xid_db: xlrec.oldest_xid_db, - }))) - } - } - fn ingest_multixact_create( &mut self, modification: &mut DatadirModification, @@ -1880,46 +1082,6 @@ impl WalIngest { .await } - fn decode_multixact_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - - if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE - || info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE - { - let pageno = if pg_version < 17 { - buf.get_u32_le() - } else { - buf.get_u64_le() as u32 - }; - let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT; - let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT; - - let slru_kind = match info { - pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE => SlruKind::MultiXactOffsets, - pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE => SlruKind::MultiXactMembers, - _ => unreachable!(), - }; - - return Ok(Some(MultiXactRecord::ZeroPage(MultiXactZeroPage { - slru_kind, - segno, - rpageno, - }))); - } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID { - let xlrec = XlMultiXactCreate::decode(buf); - return Ok(Some(MultiXactRecord::Create(xlrec))); - } 
else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID { - let xlrec = XlMultiXactTruncate::decode(buf); - return Ok(Some(MultiXactRecord::Truncate(xlrec))); - } - - Ok(None) - } - async fn ingest_relmap_update( &mut self, update: RelmapUpdate, @@ -1933,24 +1095,6 @@ impl WalIngest { .await } - fn decode_relmap_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let update = XlRelmapUpdate::decode(buf); - - let mut buf = decoded.record.clone(); - buf.advance(decoded.main_data_offset); - // skip xl_relmap_update - buf.advance(12); - - Ok(Some(RelmapRecord::Update(RelmapUpdate { - update, - buf: Bytes::copy_from_slice(&buf[..]), - }))) - } - async fn ingest_raw_xlog_record( &mut self, raw_record: RawXlogRecord, @@ -2051,20 +1195,6 @@ impl WalIngest { Ok(()) } - fn decode_xlog_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - lsn: Lsn, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - Ok(Some(XlogRecord::Raw(RawXlogRecord { - info, - lsn, - buf: buf.clone(), - }))) - } - async fn ingest_logical_message_put( &mut self, put: PutLogicalMessage, @@ -2075,50 +1205,6 @@ impl WalIngest { modification.put_file(path.as_str(), &buf, ctx).await } - fn decode_logical_message_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_LOGICAL_MESSAGE { - let xlrec = XlLogicalMessage::decode(buf); - let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?; - - #[cfg(feature = "testing")] - if prefix == "neon-test" { - return Ok(Some(LogicalMessageRecord::Failpoint)); - } - - if let Some(path) = prefix.strip_prefix("neon-file:") { - let buf_size = xlrec.prefix_size + xlrec.message_size; - let buf = Bytes::copy_from_slice(&buf[xlrec.prefix_size..buf_size]); - return Ok(Some(LogicalMessageRecord::Put(PutLogicalMessage { - path: path.to_string(), - buf, - }))); - } - } - - Ok(None) - } - - fn decode_standby_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_RUNNING_XACTS { - let xlrec = XlRunningXacts::decode(buf); - return Ok(Some(StandbyRecord::RunningXacts(StandbyRunningXacts { - oldest_running_xid: xlrec.oldest_running_xid, - }))); - } - - Ok(None) - } - fn ingest_standby_record(&mut self, record: StandbyRecord) -> Result<()> { match record { StandbyRecord::RunningXacts(running_xacts) => { @@ -2133,23 +1219,6 @@ impl WalIngest { Ok(()) } - fn decode_replorigin_record( - buf: &mut Bytes, - decoded: &DecodedWALRecord, - _pg_version: u32, - ) -> anyhow::Result> { - let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK; - if info == pg_constants::XLOG_REPLORIGIN_SET { - let xlrec = XlReploriginSet::decode(buf); - return Ok(Some(ReploriginRecord::Set(xlrec))); - } else if info == pg_constants::XLOG_REPLORIGIN_DROP { - let xlrec = XlReploriginDrop::decode(buf); - return Ok(Some(ReploriginRecord::Drop(xlrec))); - } - - Ok(None) - } - async fn ingest_replorigin_record( &mut self, record: ReploriginRecord, @@ -3010,7 +2079,6 @@ mod tests { async fn test_ingest_real_wal() { use crate::tenant::harness::*; use postgres_ffi::waldecoder::WalStreamDecoder; - use postgres_ffi::walrecord::decode_wal_record; use postgres_ffi::WAL_SEGMENT_SIZE; // Define test data path and constants. 
@@ -3082,10 +2150,16 @@ mod tests { for chunk in bytes[xlogoff..].chunks(50) { decoder.feed_bytes(chunk); while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() { - let mut decoded = DecodedWALRecord::default(); - decode_wal_record(recdata, &mut decoded, modification.tline.pg_version).unwrap(); + let interpreted = InterpretedWalRecord::from_bytes_filtered( + recdata, + modification.tline.get_shard_identity(), + lsn, + modification.tline.pg_version, + ) + .unwrap(); + walingest - .ingest_record(decoded, lsn, &mut modification, &ctx) + .ingest_record(interpreted, &mut modification, &ctx) .instrument(span.clone()) .await .unwrap(); From f9d8256d559cd767aab4c04106ef732aca8a2811 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 31 Oct 2024 11:51:58 +0100 Subject: [PATCH 137/239] pageserver: don't return option from `DeletionQueue::new` (#9588) `DeletionQueue::new()` always returns deletion workers, so the returned `Option` is redundant. --- pageserver/src/bin/pageserver.rs | 4 +--- pageserver/src/deletion_queue.rs | 13 ++++--------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index c6659345f9..782122139e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -398,9 +398,7 @@ fn start_pageserver( ControllerUpcallClient::new(conf, &shutdown_pageserver), conf, ); - if let Some(deletion_workers) = deletion_workers { - deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); - } + deletion_workers.spawn_with(BACKGROUND_RUNTIME.handle()); // Up to this point no significant I/O has been done: this should have been fast. Record // duration prior to starting I/O intensive phase of startup. diff --git a/pageserver/src/deletion_queue.rs b/pageserver/src/deletion_queue.rs index 7733bdb640..37fa300467 100644 --- a/pageserver/src/deletion_queue.rs +++ b/pageserver/src/deletion_queue.rs @@ -618,13 +618,11 @@ impl DeletionQueue { /// Caller may use the returned object to construct clients with new_client. /// Caller should tokio::spawn the background() members of the two worker objects returned: /// we don't spawn those inside new() so that the caller can use their runtime/spans of choice. - /// - /// If remote_storage is None, then the returned workers will also be None. 
pub fn new( remote_storage: GenericRemoteStorage, controller_upcall_client: Option, conf: &'static PageServerConf, - ) -> (Self, Option>) + ) -> (Self, DeletionQueueWorkers) where C: ControlPlaneGenerationsApi + Send + Sync, { @@ -656,7 +654,7 @@ impl DeletionQueue { }, cancel: cancel.clone(), }, - Some(DeletionQueueWorkers { + DeletionQueueWorkers { frontend: ListWriter::new(conf, rx, backend_tx, cancel.clone()), backend: Validator::new( conf, @@ -667,7 +665,7 @@ impl DeletionQueue { cancel.clone(), ), executor: Deleter::new(remote_storage, executor_rx, cancel.clone()), - }), + }, ) } @@ -742,9 +740,7 @@ mod test { ); tracing::debug!("Spawning worker for new queue queue"); - let worker_join = workers - .unwrap() - .spawn_with(&tokio::runtime::Handle::current()); + let worker_join = workers.spawn_with(&tokio::runtime::Handle::current()); let old_worker_join = std::mem::replace(&mut self.worker_join, worker_join); let old_deletion_queue = std::mem::replace(&mut self.deletion_queue, deletion_queue); @@ -855,7 +851,6 @@ mod test { harness.conf, ); - let worker = worker.unwrap(); let worker_join = worker.spawn_with(&tokio::runtime::Handle::current()); Ok(TestSetup { From e96398a552ccd68acac58666665936f5c8cbe431 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Thu, 31 Oct 2024 13:05:24 +0000 Subject: [PATCH 138/239] Add support of extensions for v17 (part 4) (#9568) - pg_jsonschema 0.3.3 - pg_graphql 1.5.9 - rum 65e0a752 - pg_tiktoken a5bc447e update support of extensions for v14-v16: - pg_jsonschema 0.3.1 -> 0.3.3 - pg_graphql 1.5.7 -> 1.5.9 - rum 6ab37053 -> 65e0a752 - pg_tiktoken e64e55aa -> a5bc447e --- compute/compute-node.Dockerfile | 78 +++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 28 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index c2333eda08..e4c6589c60 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -431,14 +431,11 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ COPY compute/patches/rum.patch /rum.patch -# maybe version-specific -# support for v17 is unknown -# last release 1.3.13 - Sep 19, 2022 -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/postgrespro/rum/archive/refs/tags/1.3.13.tar.gz -O rum.tar.gz && \ - echo "6ab370532c965568df6210bd844ac6ba649f53055e48243525b0b7e5c4d69a7d rum.tar.gz" | sha256sum --check && \ +# supports v17 since https://github.com/postgrespro/rum/commit/cb1edffc57736cd2a4455f8d0feab0d69928da25 +# doesn't use releases since 1.3.13 - Sep 19, 2022 +# use latest commit from the master branch +RUN wget https://github.com/postgrespro/rum/archive/cb1edffc57736cd2a4455f8d0feab0d69928da25.tar.gz -O rum.tar.gz && \ + echo "65e0a752e99f4c3226400c9b899f997049e93503db8bf5c8072efa136d32fd83 rum.tar.gz" | sha256sum --check && \ mkdir rum-src && cd rum-src && tar xzf ../rum.tar.gz --strip-components=1 -C . 
&& \ patch -p1 < /rum.patch && \ make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config USE_PGXS=1 && \ @@ -959,21 +956,31 @@ RUN apt-get install -y protobuf-compiler && \ # ######################################################################################### -FROM rust-extensions-build AS pg-jsonschema-pg-build +FROM rust-extensions-build-pgrx12 AS pg-jsonschema-pg-build ARG PG_VERSION - -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_jsonschema does not yet have a release that supports pg17" && exit 0;; \ +# version 0.3.3 supports v17 +# last release v0.3.3 - Oct 16, 2024 +# +# there were no breaking changes +# so we can use the same version for all postgres versions +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PG_JSONSCHEMA_VERSION=0.3.3 \ + export PG_JSONSCHEMA_CHECKSUM=40c2cffab4187e0233cb8c3bde013be92218c282f95f4469c5282f6b30d64eac \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v0.3.1.tar.gz -O pg_jsonschema.tar.gz && \ - echo "61df3db1ed83cf24f6aa39c826f8818bfa4f0bd33b587fd6b2b1747985642297 pg_jsonschema.tar.gz" | sha256sum --check && \ + wget https://github.com/supabase/pg_jsonschema/archive/refs/tags/v${PG_JSONSCHEMA_VERSION}.tar.gz -O pg_jsonschema.tar.gz && \ + echo "${PG_JSONSCHEMA_CHECKSUM} pg_jsonschema.tar.gz" | sha256sum --check && \ mkdir pg_jsonschema-src && cd pg_jsonschema-src && tar xzf ../pg_jsonschema.tar.gz --strip-components=1 -C . && \ # see commit 252b3685a27a0f4c31a0f91e983c6314838e89e8 # `unsafe-postgres` feature allows to build pgx extensions # against postgres forks that decided to change their ABI name (like us). # With that we can build extensions without forking them and using stock # pgx. As this feature is new few manual version bumps were required. 
- sed -i 's/pgrx = "0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_jsonschema.control @@ -984,16 +991,27 @@ RUN case "${PG_VERSION}" in "v17") \ # ######################################################################################### -FROM rust-extensions-build AS pg-graphql-pg-build +FROM rust-extensions-build-pgrx12 AS pg-graphql-pg-build ARG PG_VERSION -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_graphql does not yet have a release that supports pg17 as of now" && exit 0;; \ +# version 1.5.9 supports v17 +# last release v1.5.9 - Oct 16, 2024 +# +# there were no breaking changes +# so we can use the same version for all postgres versions +RUN case "${PG_VERSION}" in \ + "v14" | "v15" | "v16" | "v17") \ + export PG_GRAPHQL_VERSION=1.5.9 \ + export PG_GRAPHQL_CHECKSUM=cf768385a41278be1333472204fc0328118644ae443182cf52f7b9b23277e497 \ + ;; \ + *) \ + echo "unexpected PostgreSQL version" && exit 1 \ + ;; \ esac && \ - wget https://github.com/supabase/pg_graphql/archive/refs/tags/v1.5.7.tar.gz -O pg_graphql.tar.gz && \ - echo "2b3e567a5b31019cb97ae0e33263c1bcc28580be5a444ac4c8ece5c4be2aea41 pg_graphql.tar.gz" | sha256sum --check && \ + wget https://github.com/supabase/pg_graphql/archive/refs/tags/v${PG_GRAPHQL_VERSION}.tar.gz -O pg_graphql.tar.gz && \ + echo "${PG_GRAPHQL_CHECKSUM} pg_graphql.tar.gz" | sha256sum --check && \ mkdir pg_graphql-src && cd pg_graphql-src && tar xzf ../pg_graphql.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "=0.12.6"/pgrx = { version = "0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release && \ # it's needed to enable extension because it uses untrusted C language sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_graphql.control && \ @@ -1006,15 +1024,13 @@ RUN case "${PG_VERSION}" in "v17") \ # ######################################################################################### -FROM rust-extensions-build AS pg-tiktoken-pg-build +FROM rust-extensions-build-pgrx12 AS pg-tiktoken-pg-build ARG PG_VERSION -# 26806147b17b60763039c6a6878884c41a262318 made on 26/09/2023 -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_tiktoken does not have versions, nor support for pg17" && exit 0;; \ - esac && \ - wget https://github.com/kelvich/pg_tiktoken/archive/26806147b17b60763039c6a6878884c41a262318.tar.gz -O pg_tiktoken.tar.gz && \ - echo "e64e55aaa38c259512d3e27c572da22c4637418cf124caba904cd50944e5004e pg_tiktoken.tar.gz" | sha256sum --check && \ +# doesn't use releases +# 9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7 - on Oct 29, 2024 +RUN wget https://github.com/kelvich/pg_tiktoken/archive/9118dd4549b7d8c0bbc98e04322499f7bf2fa6f7.tar.gz -O pg_tiktoken.tar.gz && \ + echo "a5bc447e7920ee149d3c064b8b9f0086c0e83939499753178f7d35788416f628 pg_tiktoken.tar.gz" | sha256sum --check && \ mkdir pg_tiktoken-src && cd pg_tiktoken-src && tar xzf ../pg_tiktoken.tar.gz --strip-components=1 -C . 
&& \ # TODO update pgrx version in the pg_tiktoken repo and remove this line sed -i 's/pgrx = { version = "=0.10.2",/pgrx = { version = "0.11.3",/g' Cargo.toml && \ @@ -1032,6 +1048,8 @@ RUN case "${PG_VERSION}" in "v17") \ FROM rust-extensions-build AS pg-pgx-ulid-build ARG PG_VERSION +# doesn't support v17 yet +# https://github.com/pksunkara/pgx_ulid/pull/52 RUN case "${PG_VERSION}" in "v17") \ echo "pgx_ulid does not support pg17 as of the latest version (0.1.5)" && exit 0;; \ esac && \ @@ -1052,6 +1070,10 @@ RUN case "${PG_VERSION}" in "v17") \ FROM rust-extensions-build AS pg-session-jwt-build ARG PG_VERSION +# TODO use versioned releases +# add v17 support +# NOTE: local_proxy depends on the version of pg_session_jwt +# Do not update without approve from proxy team RUN case "${PG_VERSION}" in "v17") \ echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ esac && \ From 51fda118f608271ce8f35662ccff4484d45778da Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 31 Oct 2024 14:34:50 +0100 Subject: [PATCH 139/239] increase lifetime of AWS session token to 12 hours (#9590) ## Problem clickbench regression causes clickbench to run >9 hours and the AWS session token is expired before the run completes ## Summary of changes extend lifetime of session token for this job to 12 hours --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index 69b8bc5d70..abc58733b3 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -683,7 +683,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 18000 # 5 hours + role-duration-seconds: 43200 # 12 hours - name: Download Neon artifact uses: ./.github/actions/download From 552088ac1635f88a65914f6fc540fd343db66e57 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 31 Oct 2024 14:44:59 +0000 Subject: [PATCH 140/239] pageserver: fix spurious error logs in timeline lifecycle (#9589) ## Problem The final part of https://github.com/neondatabase/neon/issues/9543 will be a chaos test that creates/deletes/archives/offloads timelines while restarting pageservers and migrating tenants. Developing that test showed up a few places where we log errors during normal shutdown. ## Summary of changes - UninitializedTimeline's drop should log at info severity: this is a normal code path when some part of timeline creation encounters a cancellation `?` path. - When offloading and finding a `RemoteTimelineClient` in a non-initialized state, this is not an error and should not be logged as such. - The `offload_timeline` function returned an anyhow error, so callers couldn't gracefully pick out cancellation errors from real errors: update this to have a structured error type and use it throughout. 
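
For readers skimming the diff, the shape of the error-structuring change is roughly the following sketch (the real `OffloadError` added below has more variants, and the HTTP mapping lives in `routes.rs`). The point is that callers can now match on cancellation and surface it as a retryable condition instead of logging it as an internal error:

```rust
// Rough sketch only; see the hunks below for the actual enum and mapping.
#[derive(thiserror::Error, Debug)]
enum OffloadError {
    #[error("Cancelled")]
    Cancelled,
    #[error(transparent)]
    Other(anyhow::Error),
}

fn http_status(err: &OffloadError) -> u16 {
    match err {
        // Shutdown races are expected: ask the client to retry, don't log an error.
        OffloadError::Cancelled => 503,
        OffloadError::Other(_) => 500,
    }
}
```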
--- pageserver/src/http/routes.rs | 9 +++- pageserver/src/tenant.rs | 8 ++- pageserver/src/tenant/tasks.rs | 1 + pageserver/src/tenant/timeline.rs | 13 +++++ pageserver/src/tenant/timeline/offload.rs | 56 ++++++++++++++------- pageserver/src/tenant/timeline/uninit.rs | 4 +- test_runner/regress/test_broken_timeline.py | 2 - test_runner/regress/test_import.py | 1 - test_runner/regress/test_tenant_delete.py | 2 - 9 files changed, 68 insertions(+), 28 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 2d8f4309ca..ef8efd3f27 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -80,6 +80,7 @@ use crate::tenant::size::ModelInputs; use crate::tenant::storage_layer::LayerAccessStatsReset; use crate::tenant::storage_layer::LayerName; use crate::tenant::timeline::offload::offload_timeline; +use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; @@ -2004,7 +2005,12 @@ async fn timeline_offload_handler( } offload_timeline(&tenant, &timeline) .await - .map_err(ApiError::InternalServerError)?; + .map_err(|e| { + match e { + OffloadError::Cancelled => ApiError::ResourceUnavailable("Timeline shutting down".into()), + _ => ApiError::InternalServerError(anyhow!(e)) + } + })?; json_response(StatusCode::OK, ()) } @@ -2060,6 +2066,7 @@ async fn timeline_checkpoint_handler( .map_err(|e| match e { CompactionError::ShuttingDown => ApiError::ShuttingDown, + CompactionError::Offload(e) => ApiError::InternalServerError(anyhow::anyhow!(e)), CompactionError::Other(e) => ApiError::InternalServerError(e) } )?; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8237f4662c..68f8f7e13c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2529,6 +2529,11 @@ impl Tenant { .await .inspect_err(|e| match e { timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Offload(_) => { + // Failures to offload timelines do not trip the circuit breaker, because + // they do not do lots of writes the way compaction itself does: it is cheap + // to retry, and it would be bad to stop all compaction because of an issue with offloading. 
+ } timeline::CompactionError::Other(e) => { self.compaction_circuit_breaker .lock() @@ -2544,8 +2549,7 @@ impl Tenant { if pending_task_left == Some(false) && *can_offload { offload_timeline(self, timeline) .instrument(info_span!("offload_timeline", %timeline_id)) - .await - .map_err(timeline::CompactionError::Other)?; + .await?; } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 547739e773..16dac10dca 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -279,6 +279,7 @@ fn log_compaction_error( let decision = match e { ShuttingDown => None, + Offload(_) => Some(LooksLike::Error), _ if task_cancelled => Some(LooksLike::Info), Other(e) => { let root_cause = e.root_cause(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index d765a7c987..12919866a3 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -20,6 +20,7 @@ use chrono::{DateTime, Utc}; use enumset::EnumSet; use fail::fail_point; use handle::ShardTimelineId; +use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ key::{ @@ -4475,11 +4476,23 @@ impl Drop for Timeline { pub(crate) enum CompactionError { #[error("The timeline or pageserver is shutting down")] ShuttingDown, + /// Compaction tried to offload a timeline and failed + #[error("Failed to offload timeline: {0}")] + Offload(OffloadError), /// Compaction cannot be done right now; page reconstruction and so on. #[error(transparent)] Other(anyhow::Error), } +impl From for CompactionError { + fn from(e: OffloadError) -> Self { + match e { + OffloadError::Cancelled => Self::ShuttingDown, + _ => Self::Offload(e), + } + } +} + impl CompactionError { pub fn is_cancelled(&self) -> bool { matches!(self, CompactionError::ShuttingDown) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 5b196cf8a7..c77c240000 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -3,18 +3,40 @@ use std::sync::Arc; use super::delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}; use super::Timeline; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; -use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; +use crate::tenant::{OffloadedTimeline, Tenant, TenantManifestError, TimelineOrOffloaded}; + +#[derive(thiserror::Error, Debug)] +pub(crate) enum OffloadError { + #[error("Cancelled")] + Cancelled, + #[error("Timeline is not archived")] + NotArchived, + #[error(transparent)] + RemoteStorage(anyhow::Error), + #[error("Unexpected offload error: {0}")] + Other(anyhow::Error), +} + +impl From for OffloadError { + fn from(e: TenantManifestError) -> Self { + match e { + TenantManifestError::Cancelled => Self::Cancelled, + TenantManifestError::RemoteStorage(e) => Self::RemoteStorage(e), + } + } +} pub(crate) async fn offload_timeline( tenant: &Tenant, timeline: &Arc, -) -> anyhow::Result<()> { +) -> Result<(), OffloadError> { debug_assert_current_span_has_tenant_and_timeline_id(); tracing::info!("offloading archived timeline"); let allow_offloaded_children = true; let (timeline, guard) = - DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children)?; + DeleteTimelineFlow::prepare(tenant, timeline.timeline_id, allow_offloaded_children) + .map_err(|e| OffloadError::Other(anyhow::anyhow!(e)))?; let TimelineOrOffloaded::Timeline(timeline) = timeline else { 
tracing::error!("timeline already offloaded, but given timeline object"); @@ -26,14 +48,15 @@ pub(crate) async fn offload_timeline( Some(true) => (), Some(false) => { tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); - anyhow::bail!("timeline isn't archived"); + return Err(OffloadError::NotArchived); } None => { - tracing::warn!( + // This is legal: calls to this function can race with the timeline shutting down + tracing::info!( ?is_archived, - "tried offloading a timeline where manifest is not yet available" + "tried offloading a timeline whose remote storage is not initialized" ); - anyhow::bail!("timeline manifest hasn't been loaded yet"); + return Err(OffloadError::Cancelled); } } @@ -44,9 +67,11 @@ pub(crate) async fn offload_timeline( // to make deletions possible while offloading is in progress let conf = &tenant.conf; - delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline) + .await + .map_err(OffloadError::Other)?; - remove_timeline_from_tenant(tenant, &timeline, &guard).await?; + remove_timeline_from_tenant(tenant, &timeline, &guard); { let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); @@ -65,21 +90,18 @@ pub(crate) async fn offload_timeline( // at the next restart attach it again. // For that to happen, we'd need to make the manifest reflect our *intended* state, // not our actual state of offloaded timelines. - tenant - .store_tenant_manifest() - .await - .map_err(|e| anyhow::anyhow!(e))?; + tenant.store_tenant_manifest().await?; Ok(()) } /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn remove_timeline_from_tenant( +fn remove_timeline_from_tenant( tenant: &Tenant, timeline: &Timeline, _: &DeletionGuard, // using it as a witness -) -> anyhow::Result<()> { +) { // Remove the timeline from the map. let mut timelines = tenant.timelines.lock().unwrap(); let children_exist = timelines @@ -95,8 +117,4 @@ async fn remove_timeline_from_tenant( timelines .remove(&timeline.timeline_id) .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); - - drop(timelines); - - Ok(()) } diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index c398289a5c..a93bdde3f8 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -141,7 +141,9 @@ impl Drop for UninitializedTimeline<'_> { fn drop(&mut self) { if let Some((_, create_guard)) = self.raw_timeline.take() { let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered(); - error!("Timeline got dropped without initializing, cleaning its files"); + // This is unusual, but can happen harmlessly if the pageserver is stopped while + // creating a timeline. 
+ info!("Timeline got dropped without initializing, cleaning its files"); cleanup_timeline_directory(create_guard); } } diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 99e0e23b4a..124e62999a 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -103,7 +103,6 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) env.pageserver.allowed_errors.extend( [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ] ) @@ -145,7 +144,6 @@ def test_timeline_init_break_before_checkpoint_recreate( env.pageserver.allowed_errors.extend( [ ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ".*Failed to load index_part from remote storage, failed creation?.*", ] ) diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index e367db33ff..743fa72aba 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -91,7 +91,6 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build [ ".*Failed to import basebackup.*", ".*unexpected non-zero bytes after the tar archive.*", - ".*Timeline got dropped without initializing, cleaning its files.*", ".*InternalServerError.*timeline not found.*", ".*InternalServerError.*Tenant .* not found.*", ".*InternalServerError.*Timeline .* not found.*", diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index f486327445..47df3ead70 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -146,8 +146,6 @@ def test_long_timeline_create_cancelled_by_tenant_delete(neon_env_builder: NeonE env.pageserver.allowed_errors.extend( [ - # happens with the cancellation bailing flushing loop earlier, leaving disk_consistent_lsn at zero - ".*Timeline got dropped without initializing, cleaning its files", # the response hit_pausable_failpoint_and_later_fail f".*Error processing HTTP request: InternalServerError\\(new timeline {env.initial_tenant}/{env.initial_timeline} has invalid disk_consistent_lsn", ] From 897cffb9d86389c990e65eba1f3ddac1213b0363 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 31 Oct 2024 14:57:55 +0000 Subject: [PATCH 141/239] auth_broker: fix local_proxy conn count (#9593) our current metrics for http pool opened connections is always negative :D oops --- proxy/src/serverless/http_conn_pool.rs | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 934a50c14f..b92ae31310 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -294,6 +294,11 @@ pub(crate) fn poll_http2_client( conn_id, aux: aux.clone(), }); + Metrics::get() + .proxy + .http_pool_opened_connections + .get_metric() + .inc(); Arc::downgrade(&pool) } @@ -306,7 +311,7 @@ pub(crate) fn poll_http2_client( let res = connection.await; match res { Ok(()) => info!("connection closed"), - Err(e) => error!(%session_id, "connection error: {}", e), + Err(e) => error!(%session_id, "connection error: {e:?}"), } // remove from connection pool From 9761b6a64e80a4e8bce4b00afce5c2c4f6b825bd Mon Sep 17 00:00:00 2001 From: Conrad Ludgate 
Date: Thu, 31 Oct 2024 15:50:41 +0000 Subject: [PATCH 142/239] update pg_session_jwt to use pgrx 0.12 for pg17 (#9595) Updates the extension to use pgrx 0.12. No changes to the extensions have been made, the only difference is the pgrx version. --- compute/compute-node.Dockerfile | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index e4c6589c60..30126de56c 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1067,20 +1067,16 @@ RUN case "${PG_VERSION}" in "v17") \ # ######################################################################################### -FROM rust-extensions-build AS pg-session-jwt-build +FROM rust-extensions-build-pgrx12 AS pg-session-jwt-build ARG PG_VERSION -# TODO use versioned releases -# add v17 support # NOTE: local_proxy depends on the version of pg_session_jwt # Do not update without approve from proxy team -RUN case "${PG_VERSION}" in "v17") \ - echo "pg_session_jwt does not yet have a release that supports pg17" && exit 0;; \ - esac && \ - wget https://github.com/neondatabase/pg_session_jwt/archive/e1310b08ba51377a19e0559e4d1194883b9b2ba2.tar.gz -O pg_session_jwt.tar.gz && \ - echo "837932a077888d5545fd54b0abcc79e5f8e37017c2769a930afc2f5c94df6f4e pg_session_jwt.tar.gz" | sha256sum --check && \ +# Make sure the version is reflected in proxy/src/serverless/local_conn_pool.rs +RUN wget https://github.com/neondatabase/pg_session_jwt/archive/refs/tags/v0.1.2-v17.tar.gz -O pg_session_jwt.tar.gz && \ + echo "c8ecbed9cb8c6441bce5134a176002b043018adf9d05a08e457dda233090a86e pg_session_jwt.tar.gz" | sha256sum --check && \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . && \ - sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ + sed -i 's/pgrx = "0.12.6"/pgrx = { version = "=0.12.6", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release ######################################################################################### From e589c2e5ecce03b169ef24d8639e42beeec48837 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 31 Oct 2024 18:29:16 +0000 Subject: [PATCH 143/239] storage_controller: allow deployment infra to use infra token (#9596) ## Problem We wish for the deployment orchestrator to use infra scoped tokens, but storcon endpoints it's using require admin scoped tokens. ## Summary of Changes Switch over all endpoints that are used by the deployment orchestrator to use an infra scoped token. This causes no breakage during mixed version scenarios because admin scoped tokens allow access to all endpoints. The deployment orchestrator can cut over to the infra token after this commit touches down in prod. Once this commit is released we should also update the tests code to use infra scoped tokens where appropriate. Currently it would fail on the [compat tests](https://github.com/neondatabase/neon/blob/9761b6a64e80a4e8bce4b00afce5c2c4f6b825bd/test_runner/regress/test_storage_controller.py#L69-L71). 
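
The mixed-version argument can be illustrated with a self-contained sketch (hypothetical helper, not the controller's actual `check_permissions` code): because an admin token is accepted everywhere, relaxing an endpoint's requirement from `Admin` to `Infra` cannot break callers that still present admin tokens, while infra tokens gain access only to the relaxed endpoints.

```rust
// Self-contained illustration; the storage controller's real scope check may differ in detail.
#[derive(PartialEq, Clone, Copy, Debug)]
enum Scope {
    Admin,
    Infra,
}

fn scope_allows(required: Scope, token: Scope) -> bool {
    // Admin is treated as a superset of every other scope.
    token == Scope::Admin || token == required
}

fn main() {
    assert!(scope_allows(Scope::Infra, Scope::Admin)); // old admin token keeps working
    assert!(scope_allows(Scope::Infra, Scope::Infra)); // new infra token works on relaxed endpoints
    assert!(!scope_allows(Scope::Admin, Scope::Infra)); // infra token still can't reach admin-only endpoints
}
```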
--- storage_controller/src/http.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index face3d2c2d..f6ea1aedc6 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -658,7 +658,7 @@ async fn handle_node_register(req: Request) -> Result, ApiE } async fn handle_node_list(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -737,7 +737,7 @@ async fn handle_node_configure(req: Request) -> Result, Api } async fn handle_node_status(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -786,7 +786,7 @@ async fn handle_get_leader(req: Request) -> Result, ApiErro } async fn handle_node_drain(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -804,7 +804,7 @@ async fn handle_node_drain(req: Request) -> Result, ApiErro } async fn handle_cancel_node_drain(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -822,7 +822,7 @@ async fn handle_cancel_node_drain(req: Request) -> Result, } async fn handle_node_fill(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { @@ -840,7 +840,7 @@ async fn handle_node_fill(req: Request) -> Result, ApiError } async fn handle_cancel_node_fill(req: Request) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::Infra)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { From 2d1366c8ee217da3c09a1d3d68a3cfc7e98f500a Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Fri, 1 Nov 2024 11:22:38 +0000 Subject: [PATCH 144/239] fix pre-commit hook with python stubs (#9602) fix #9601 --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 3f21094ba4..92580ee156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ exclude = [ check_untyped_defs = true # Help mypy find imports when running against list of individual files. # Without this line it would behave differently when executed on the entire project. -mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner" +mypy_path = "$MYPY_CONFIG_FILE_DIR:$MYPY_CONFIG_FILE_DIR/test_runner:$MYPY_CONFIG_FILE_DIR/test_runner/stubs" disallow_incomplete_defs = false disallow_untyped_calls = false From 4c2c8d67081ee7856246e92df68dc13b1009c1a6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 1 Nov 2024 12:25:04 +0100 Subject: [PATCH 145/239] test_runner: fix `tenant_get_shards` with one pageserver (#9603) ## Problem `tenant_get_shards()` does not work with a sharded tenant on 1 pageserver, as it assumes an unsharded tenant in this case. This special case appears to have been added to handle e.g. `test_emergency_mode`, where the storage controller is stopped. This breaks e.g. 
the sharded ingest benchmark in #9591 when run with a single shard. ## Summary of changes Correctly look up shards even with a single pageserver, but add a special case that assumes an unsharded tenant if the storage controller is stopped and the caller provides an explicit pageserver, in order to accomodate `test_emergency_mode`. --- test_runner/fixtures/neon_fixtures.py | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 1b9bc873f4..e4d6e6da5d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1397,7 +1397,7 @@ def neon_simple_env( pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ - Simple Neon environment, with no authentication and no safekeepers. + Simple Neon environment, with 1 safekeeper and 1 pageserver. No authentication, no fsync. This fixture will use RemoteStorageKind.LOCAL_FS with pageserver. """ @@ -4701,6 +4701,7 @@ def tenant_get_shards( If the caller provides `pageserver_id`, it will be used for all shards, even if the shard is indicated by storage controller to be on some other pageserver. + If the storage controller is not running, assume an unsharded tenant. Caller should over the response to apply their per-pageserver action to each shard @@ -4710,17 +4711,17 @@ def tenant_get_shards( else: override_pageserver = None - if len(env.pageservers) > 1: - return [ - ( - TenantShardId.parse(s["shard_id"]), - override_pageserver or env.get_pageserver(s["node_id"]), - ) - for s in env.storage_controller.locate(tenant_id) - ] - else: - # Assume an unsharded tenant - return [(TenantShardId(tenant_id, 0, 0), override_pageserver or env.pageserver)] + if not env.storage_controller.running and override_pageserver is not None: + log.warning(f"storage controller not running, assuming unsharded tenant {tenant_id}") + return [(TenantShardId(tenant_id, 0, 0), override_pageserver)] + + return [ + ( + TenantShardId.parse(s["shard_id"]), + override_pageserver or env.get_pageserver(s["node_id"]), + ) + for s in env.storage_controller.locate(tenant_id) + ] def wait_replica_caughtup(primary: Endpoint, secondary: Endpoint): From 8b3bcf71eee04dc24398b37b82e5d8e528c2d0c4 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Fri, 1 Nov 2024 12:46:02 +0100 Subject: [PATCH 146/239] revert higher token expiration (#9605) ## Problem The IAM role associated with our github action runner supports a max token expiration which is lower than the value we tried. ## Summary of changes Since we believe to have understood the performance regression we (by ensuring availability zone affinity of compute and pageserver) the job should again run in lower than 5 hours and we revert this change instead of increasing the max session token expiration in the IAM role which would reduce our security. 
--- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index abc58733b3..69b8bc5d70 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -683,7 +683,7 @@ jobs: with: aws-region: eu-central-1 role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} - role-duration-seconds: 43200 # 12 hours + role-duration-seconds: 18000 # 5 hours - name: Download Neon artifact uses: ./.github/actions/download From 123816e99ac3b150aecfc71002f53cf0b1e64bf0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 1 Nov 2024 13:47:03 +0100 Subject: [PATCH 147/239] safekeeper: log slow WalAcceptor sends (#9564) ## Problem We don't have any observability into full WalAcceptor queues per timeline. ## Summary of changes Logs a message when a WalAcceptor send has blocked for 5 seconds, and another message when the send completes. This implies that the log frequency is at most once every 5 seconds per timeline, so we don't need further throttling. --- safekeeper/src/receive_wal.rs | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index f97e127a17..2410e22f45 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -26,10 +26,11 @@ use std::net::SocketAddr; use std::sync::Arc; use tokio::io::AsyncRead; use tokio::io::AsyncWrite; +use tokio::sync::mpsc::error::SendTimeoutError; use tokio::sync::mpsc::{channel, Receiver, Sender}; use tokio::task; use tokio::task::JoinHandle; -use tokio::time::{Duration, MissedTickBehavior}; +use tokio::time::{Duration, Instant, MissedTickBehavior}; use tracing::*; use utils::id::TenantTimelineId; use utils::lsn::Lsn; @@ -384,9 +385,29 @@ async fn read_network_loop( msg_tx: Sender, mut next_msg: ProposerAcceptorMessage, ) -> Result<(), CopyStreamHandlerEnd> { + /// Threshold for logging slow WalAcceptor sends. + const SLOW_THRESHOLD: Duration = Duration::from_secs(5); + loop { - if msg_tx.send(next_msg).await.is_err() { - return Ok(()); // chan closed, WalAcceptor terminated + let started = Instant::now(); + match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await { + Ok(()) => {} + // Slow send, log a message and keep trying. Log context has timeline ID. + Err(SendTimeoutError::Timeout(next_msg)) => { + warn!( + "slow WalAcceptor send blocked for {:.3}s", + Instant::now().duration_since(started).as_secs_f64() + ); + if msg_tx.send(next_msg).await.is_err() { + return Ok(()); // WalAcceptor terminated + } + warn!( + "slow WalAcceptor send completed after {:.3}s", + Instant::now().duration_since(started).as_secs_f64() + ) + } + // WalAcceptor terminated. + Err(SendTimeoutError::Closed(_)) => return Ok(()), } next_msg = read_message(pgb_reader).await?; } From 3c16bd6e0bbc5e39111188cfca571b5033d3a377 Mon Sep 17 00:00:00 2001 From: John Spray Date: Fri, 1 Nov 2024 16:47:20 +0000 Subject: [PATCH 148/239] storcon: skip non-active projects in chaos injection (#9606) ## Problem We may sometimes use scheduling modes like `Pause` to pin a tenant in its current location for operational reasons. It is undesirable for the chaos task to make any changes to such projects. ## Summary of changes - Add a check for scheduling mode - Add a log line when we do choose to do a chaos action for a tenant: this will help us understand which operations originate from the chaos task. 
--- storage_controller/src/service/chaos_injector.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/storage_controller/src/service/chaos_injector.rs b/storage_controller/src/service/chaos_injector.rs index 99961d691c..0e551beaa7 100644 --- a/storage_controller/src/service/chaos_injector.rs +++ b/storage_controller/src/service/chaos_injector.rs @@ -1,5 +1,6 @@ use std::{sync::Arc, time::Duration}; +use pageserver_api::controller_api::ShardSchedulingPolicy; use rand::seq::SliceRandom; use rand::thread_rng; use tokio_util::sync::CancellationToken; @@ -47,6 +48,16 @@ impl ChaosInjector { .get_mut(victim) .expect("Held lock between choosing ID and this get"); + if !matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) { + // Skip non-active scheduling policies, so that a shard with a policy like Pause can + // be pinned without being disrupted by us. + tracing::info!( + "Skipping shard {victim}: scheduling policy is {:?}", + shard.get_scheduling_policy() + ); + continue; + } + // Pick a secondary to promote let Some(new_location) = shard .intent @@ -63,6 +74,8 @@ impl ChaosInjector { continue; }; + tracing::info!("Injecting chaos: migrate {victim} {old_location}->{new_location}"); + shard.intent.demote_attached(scheduler, old_location); shard.intent.promote_attached(scheduler, new_location); self.service.maybe_reconcile_shard(shard, nodes); From 8ac523d2ee8e29ecc6891f6bd661fcf51b2147ba Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 1 Nov 2024 20:31:29 +0200 Subject: [PATCH 149/239] Do not assign page LSN to new (uninitialized) page in ClearVisibilityMapFlags redo handler (#9287) ## Problem https://neondb.slack.com/archives/C04DGM6SMTM/p1727872045252899 See https://github.com/neondatabase/neon/issues/9240 ## Summary of changes Add `!page_is_new` check before assigning page lsn. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- pageserver/src/walredo/apply_neon.rs | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 7aaa357318..d712d8bf5e 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -67,7 +67,10 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); - postgres_ffi::page_set_lsn(page, lsn); + // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it. 
+ if !postgres_ffi::page_is_new(page) { + postgres_ffi::page_set_lsn(page, lsn); + } } // Repeat for 'old_heap_blkno', if any @@ -81,7 +84,10 @@ pub(crate) fn apply_in_neon( let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; map[map_byte as usize] &= !(flags << map_offset); - postgres_ffi::page_set_lsn(page, lsn); + // The page should never be empty, but we're checking it anyway as a precaution, so that if it is empty for some reason anyway, we don't make matters worse by setting the LSN on it. + if !postgres_ffi::page_is_new(page) { + postgres_ffi::page_set_lsn(page, lsn); + } } } // Non-relational WAL records are handled here, with custom code that has the From 0058eb09df13ba13ead20a8a34ceefa4a3580f23 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Sat, 2 Nov 2024 17:42:10 +0100 Subject: [PATCH 150/239] test_runner/performance: add sharded ingest benchmark (#9591) Adds a Python benchmark for sharded ingestion. This ingests 7 GB of WAL (100M rows) into a Safekeeper and fans out to 10 shards running on 10 different pageservers. The ingest volume and duration is recorded. --- .../performance/test_sharded_ingest.py | 71 +++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 test_runner/performance/test_sharded_ingest.py diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py new file mode 100644 index 0000000000..77e8f2cf17 --- /dev/null +++ b/test_runner/performance/test_sharded_ingest.py @@ -0,0 +1,71 @@ +from __future__ import annotations + +from contextlib import closing + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn, TenantShardId +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + tenant_get_shards, + wait_for_last_flush_lsn, +) + + +@pytest.mark.timeout(600) +@pytest.mark.parametrize("shard_count", [1, 8, 32]) +def test_sharded_ingest( + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + shard_count: int, +): + """ + Benchmarks sharded ingestion throughput, by ingesting a large amount of WAL into a Safekeeper + and fanning out to a large number of shards on dedicated Pageservers. Comparing the base case + (shard_count=1) to the sharded case indicates the overhead of sharding. + """ + + ROW_COUNT = 100_000_000 # about 7 GB of WAL + + neon_env_builder.num_pageservers = shard_count + env = neon_env_builder.init_start() + + # Create a sharded tenant and timeline, and migrate it to the respective pageservers. Ensure + # the storage controller doesn't mess with shard placements. + # + # TODO: there should be a way to disable storage controller background reconciliations. + # Currently, disabling reconciliation also disables foreground operations. + tenant_id, timeline_id = env.create_tenant(shard_count=shard_count) + + for shard_number in range(0, shard_count): + tenant_shard_id = TenantShardId(tenant_id, shard_number, shard_count) + pageserver_id = shard_number + 1 + env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_id) + + shards = tenant_get_shards(env, tenant_id) + env.storage_controller.reconcile_until_idle() + assert tenant_get_shards(env, tenant_id) == shards, "shards moved" + + # Start the endpoint. + endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Ingest data and measure WAL volume and duration. 
+ with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + log.info("Ingesting data") + cur.execute("set statement_timeout = 0") + cur.execute("create table huge (i int, j int)") + + with zenbenchmark.record_duration("pageserver_ingest"): + with zenbenchmark.record_duration("wal_ingest"): + cur.execute(f"insert into huge values (generate_series(1, {ROW_COUNT}), 0)") + + wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + + assert tenant_get_shards(env, tenant_id) == shards, "shards moved" From 4534f5cdc663e81f9ccd233d0cc2b2db733658c6 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 4 Nov 2024 09:11:52 +0000 Subject: [PATCH 151/239] pageserver: make local timeline deletion infallible (#9594) ## Problem In https://github.com/neondatabase/neon/pull/9589, timeline offload code is modified to return an explicit error type rather than propagating anyhow::Error. One of the 'Other' cases there is I/O errors from local timeline deletion, which shouldn't need to exist, because our policy is not to try and continue running if the local disk gives us errors. ## Summary of changes - Make `delete_local_timeline_directory` and use `.fatal_err(` on I/O errors --------- Co-authored-by: Erik Grinaker --- pageserver/src/tenant/timeline/delete.rs | 40 +++++++++-------------- pageserver/src/tenant/timeline/offload.rs | 4 +-- 2 files changed, 17 insertions(+), 27 deletions(-) diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index b0c4fa2bc9..5a4c2d9da3 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -18,6 +18,7 @@ use crate::{ CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, TimelineOrOffloaded, }, + virtual_file::MaybeFatalIo, }; use super::{Timeline, TimelineResources}; @@ -62,10 +63,10 @@ pub(super) async fn delete_local_timeline_directory( conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline: &Timeline, -) -> anyhow::Result<()> { +) { // Always ensure the lock order is compaction -> gc. let compaction_lock = timeline.compaction_lock.lock(); - let compaction_lock = crate::timed( + let _compaction_lock = crate::timed( compaction_lock, "acquires compaction lock", std::time::Duration::from_secs(5), @@ -73,7 +74,7 @@ pub(super) async fn delete_local_timeline_directory( .await; let gc_lock = timeline.gc_lock.lock(); - let gc_lock = crate::timed( + let _gc_lock = crate::timed( gc_lock, "acquires gc lock", std::time::Duration::from_secs(5), @@ -85,24 +86,15 @@ pub(super) async fn delete_local_timeline_directory( let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id); - fail::fail_point!("timeline-delete-before-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? - }); - // NB: This need not be atomic because the deleted flag in the IndexPart // will be observed during tenant/timeline load. The deletion will be resumed there. // - // Note that here we do not bail out on std::io::ErrorKind::NotFound. - // This can happen if we're called a second time, e.g., - // because of a previous failure/cancellation at/after - // failpoint timeline-delete-after-rm. 
- // - // ErrorKind::NotFound can also happen if we race with tenant detach, because, + // ErrorKind::NotFound can happen e.g. if we race with tenant detach, because, // no locks are shared. tokio::fs::remove_dir_all(local_timeline_directory) .await .or_else(fs_ext::ignore_not_found) - .context("remove local timeline directory")?; + .fatal_err("removing timeline directory"); // Make sure previous deletions are ordered before mark removal. // Otherwise there is no guarantee that they reach the disk before mark deletion. @@ -113,17 +105,9 @@ pub(super) async fn delete_local_timeline_directory( let timeline_path = conf.timelines_path(&tenant_shard_id); crashsafe::fsync_async(timeline_path) .await - .context("fsync_pre_mark_remove")?; + .fatal_err("fsync after removing timeline directory"); info!("finished deleting layer files, releasing locks"); - drop(gc_lock); - drop(compaction_lock); - - fail::fail_point!("timeline-delete-after-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? - }); - - Ok(()) } /// Removes remote layers and an index file after them. @@ -440,12 +424,20 @@ impl DeleteTimelineFlow { timeline: &TimelineOrOffloaded, remote_client: Arc, ) -> Result<(), DeleteTimelineError> { + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? + }); + // Offloaded timelines have no local state // TODO: once we persist offloaded information, delete the timeline from there, too if let TimelineOrOffloaded::Timeline(timeline) = timeline { - delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await; } + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? 
+ }); + delete_remote_layers_and_index(&remote_client).await?; pausable_failpoint!("in_progress_delete"); diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index c77c240000..cccf24e303 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -67,9 +67,7 @@ pub(crate) async fn offload_timeline( // to make deletions possible while offloading is in progress let conf = &tenant.conf; - delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline) - .await - .map_err(OffloadError::Other)?; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await; remove_timeline_from_tenant(tenant, &timeline, &guard); From d5de63c6b86816b61f5997bff503f0eb20d5fa72 Mon Sep 17 00:00:00 2001 From: Matthias van de Meent Date: Mon, 4 Nov 2024 13:10:32 +0100 Subject: [PATCH 152/239] Fix a time zone issue in a PG17 test case (#9618) The commit was cherry-picked and thus shouldn't cause issues once we merge the release tag for PostgreSQL 17.1 --- vendor/postgres-v17 | 2 +- vendor/revisions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 68b5038f27..9ad2f3c5c3 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 68b5038f27e493bde6ae552fe066f10cbdfe6a14 +Subproject commit 9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb diff --git a/vendor/revisions.json b/vendor/revisions.json index 896a75814e..18bde18359 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,7 +1,7 @@ { "v17": [ "17.0", - "68b5038f27e493bde6ae552fe066f10cbdfe6a14" + "9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb" ], "v16": [ "16.4", From 3dcdbcc34dbf64a296a78a8252c0b42d7137cc3c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 4 Nov 2024 13:29:13 +0000 Subject: [PATCH 153/239] remove aws-lc-rs dep and fix storage_broker tls (#9613) It seems the ecosystem is not so keen on moving to aws-lc-rs as it's build setup is more complicated than ring (requiring cmake). Eventually I expect the ecosystem should pivot to https://github.com/ctz/graviola/tree/main/rustls-graviola as it stabilises (it has a very simply build step and license), but for now let's try not have a headache of juggling two crypto libs. I also noticed that tonic will just fail with tls without a default provider, so I added some defensive code for that. 
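
A sketch of the "defensive code" referred to above (placement assumed; the real change is the small addition to `storage_broker`): with rustls 0.23, building a TLS config can fail at runtime if no process-wide `CryptoProvider` has been selected, for example when feature flags leave the default ambiguous. Installing the `ring` provider up front, and ignoring the result if one is already installed, avoids that:

```rust
// Assumes the `ring` feature of rustls 0.23 is enabled, as the workspace now does.
fn ensure_default_crypto_provider() {
    // install_default() errs only if a provider is already installed, which is fine.
    let _ = rustls::crypto::ring::default_provider().install_default();
}
```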
--- Cargo.lock | 150 +++--------------- Cargo.toml | 4 +- libs/postgres_backend/tests/simple_select.rs | 6 +- proxy/src/bin/pg_sni_router.rs | 17 +- proxy/src/compute.rs | 18 +-- proxy/src/config.rs | 6 +- proxy/src/proxy/tests/mod.rs | 28 ++-- storage_broker/Cargo.toml | 1 + storage_broker/src/lib.rs | 6 + .../src/scan_safekeeper_metadata.rs | 6 +- workspace_hack/Cargo.toml | 9 +- 11 files changed, 72 insertions(+), 179 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c5af247e8b..44ef6d960c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -310,33 +310,6 @@ dependencies = [ "zeroize", ] -[[package]] -name = "aws-lc-rs" -version = "1.9.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2f95446d919226d587817a7d21379e6eb099b97b45110a7f272a444ca5c54070" -dependencies = [ - "aws-lc-sys", - "mirai-annotations", - "paste", - "zeroize", -] - -[[package]] -name = "aws-lc-sys" -version = "0.21.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b3ddc4a5b231dd6958b140ff3151b6412b3f4321fab354f399eec8f14b06df62" -dependencies = [ - "bindgen 0.69.5", - "cc", - "cmake", - "dunce", - "fs_extra", - "libc", - "paste", -] - [[package]] name = "aws-runtime" version = "1.4.3" @@ -942,29 +915,6 @@ dependencies = [ "serde", ] -[[package]] -name = "bindgen" -version = "0.69.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088" -dependencies = [ - "bitflags 2.4.1", - "cexpr", - "clang-sys", - "itertools 0.10.5", - "lazy_static", - "lazycell", - "log", - "prettyplease", - "proc-macro2", - "quote", - "regex", - "rustc-hash", - "shlex", - "syn 2.0.52", - "which", -] - [[package]] name = "bindgen" version = "0.70.1" @@ -974,7 +924,7 @@ dependencies = [ "bitflags 2.4.1", "cexpr", "clang-sys", - "itertools 0.10.5", + "itertools 0.12.1", "log", "prettyplease", "proc-macro2", @@ -1220,15 +1170,6 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" -[[package]] -name = "cmake" -version = "0.1.51" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb1e43aa7fd152b1f968787f7dbcdeb306d1867ff373c69955211876c053f91a" -dependencies = [ - "cc", -] - [[package]] name = "colorchoice" version = "1.0.0" @@ -1329,9 +1270,9 @@ dependencies = [ [[package]] name = "const-oid" -version = "0.9.5" +version = "0.9.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "28c122c3980598d243d63d9a704629a2d748d101f278052ff068be5a4423ab6f" +checksum = "c2459377285ad874054d797f3ccebf984978aa39129f6eafde5cdc8315b612f8" [[package]] name = "const-random" @@ -1815,12 +1756,6 @@ dependencies = [ "syn 2.0.52", ] -[[package]] -name = "dunce" -version = "1.0.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" - [[package]] name = "dyn-clone" version = "1.0.14" @@ -2125,12 +2060,6 @@ dependencies = [ "tokio-util", ] -[[package]] -name = "fs_extra" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42703706b716c37f96a77aea830392ad231f44c9e9a67872fa5548707e11b11c" - [[package]] name = "fsevent-sys" version = "4.1.0" @@ -2484,15 +2413,6 @@ dependencies = [ "digest", ] -[[package]] -name = "home" -version = "0.5.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" -dependencies = [ - "windows-sys 0.52.0", -] - [[package]] name = "hostname" version = "0.4.0" @@ -2988,12 +2908,6 @@ dependencies = [ "spin", ] -[[package]] -name = "lazycell" -version = "1.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" - [[package]] name = "libc" version = "0.2.150" @@ -3224,12 +3138,6 @@ dependencies = [ "windows-sys 0.48.0", ] -[[package]] -name = "mirai-annotations" -version = "1.12.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c9be0862c1b3f26a88803c4a49de6889c10e608b3ee9344e6ef5b45fb37ad3d1" - [[package]] name = "multimap" version = "0.8.3" @@ -4147,7 +4055,7 @@ dependencies = [ "bytes", "once_cell", "pq_proto", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-pemfile 2.1.1", "serde", "thiserror", @@ -4176,7 +4084,7 @@ name = "postgres_ffi" version = "0.1.0" dependencies = [ "anyhow", - "bindgen 0.70.1", + "bindgen", "bytes", "crc32c", "env_logger", @@ -4314,7 +4222,7 @@ checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", "heck 0.5.0", - "itertools 0.10.5", + "itertools 0.12.1", "log", "multimap", "once_cell", @@ -4334,7 +4242,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e9552f850d5f0964a4e4d0bf306459ac29323ddfbae05e35a7c0d35cb0803cc5" dependencies = [ "anyhow", - "itertools 0.10.5", + "itertools 0.12.1", "proc-macro2", "quote", "syn 2.0.52", @@ -4422,7 +4330,7 @@ dependencies = [ "rsa", "rstest", "rustc-hash", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-native-certs 0.8.0", "rustls-pemfile 2.1.1", "scopeguard", @@ -5106,23 +5014,22 @@ dependencies = [ "log", "ring", "rustls-pki-types", - "rustls-webpki 0.102.2", + "rustls-webpki 0.102.8", "subtle", "zeroize", ] [[package]] name = "rustls" -version = "0.23.7" +version = "0.23.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ebbbdb961df0ad3f2652da8f3fdc4b36122f568f968f45ad3316f26c025c677b" +checksum = "eee87ff5d9b36712a58574e12e9f0ea80f915a5b0ac518d322b24a465617925e" dependencies = [ - "aws-lc-rs", "log", "once_cell", "ring", "rustls-pki-types", - "rustls-webpki 0.102.2", + "rustls-webpki 0.102.8", "subtle", "zeroize", ] @@ -5202,11 +5109,10 @@ dependencies = [ [[package]] name = "rustls-webpki" -version = "0.102.2" +version = "0.102.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "faaa0a62740bedb9b2ef5afa303da42764c012f743917351dc9a237ea1663610" +checksum = "64ca1bc8749bd4cf37b5ce386cc146580777b4e8572c7b97baf22c83f444bee9" dependencies = [ - "aws-lc-rs", "ring", "rustls-pki-types", "untrusted", @@ -5823,6 +5729,7 @@ dependencies = [ "once_cell", "parking_lot 0.12.1", "prost", + "rustls 0.23.16", "tokio", "tonic", "tonic-build", @@ -5905,7 +5812,7 @@ dependencies = [ "postgres_ffi", "remote_storage", "reqwest 0.12.4", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-native-certs 0.8.0", "serde", "serde_json", @@ -6338,7 +6245,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "04fb792ccd6bbcd4bba408eb8a292f70fc4a3589e5d793626f45190e6454b6ab" dependencies = [ "ring", - "rustls 0.23.7", + "rustls 0.23.16", "tokio", "tokio-postgres", "tokio-rustls 0.26.0", @@ -6372,7 +6279,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" 
dependencies = [ - "rustls 0.23.7", + "rustls 0.23.16", "rustls-pki-types", "tokio", ] @@ -6781,7 +6688,7 @@ dependencies = [ "base64 0.22.1", "log", "once_cell", - "rustls 0.23.7", + "rustls 0.23.16", "rustls-pki-types", "url", "webpki-roots 0.26.1", @@ -6985,7 +6892,7 @@ name = "walproposer" version = "0.1.0" dependencies = [ "anyhow", - "bindgen 0.70.1", + "bindgen", "postgres_ffi", "utils", ] @@ -7160,18 +7067,6 @@ dependencies = [ "rustls-pki-types", ] -[[package]] -name = "which" -version = "4.4.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" -dependencies = [ - "either", - "home", - "once_cell", - "rustix", -] - [[package]] name = "whoami" version = "1.5.1" @@ -7431,7 +7326,7 @@ dependencies = [ "hyper-util", "indexmap 1.9.3", "indexmap 2.0.1", - "itertools 0.10.5", + "itertools 0.12.1", "lazy_static", "libc", "log", @@ -7452,8 +7347,7 @@ dependencies = [ "regex-automata 0.4.3", "regex-syntax 0.8.2", "reqwest 0.12.4", - "rustls 0.23.7", - "rustls-webpki 0.102.2", + "rustls 0.23.16", "scopeguard", "serde", "serde_json", diff --git a/Cargo.toml b/Cargo.toml index 7f9a766ff9..e5f7719e7f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -143,7 +143,7 @@ reqwest-retry = "0.5" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" -rustls = "0.23" +rustls = { version = "0.23.16", default-features = false } rustls-pemfile = "2" scopeguard = "1.1" sysinfo = "0.29.2" @@ -174,7 +174,7 @@ tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } tokio-io-timeout = "1.2.0" tokio-postgres-rustls = "0.12.0" -tokio-rustls = "0.26" +tokio-rustls = { version = "0.26.0", default-features = false, features = ["tls12", "ring"]} tokio-stream = "0.1" tokio-tar = "0.3" tokio-util = { version = "0.7.10", features = ["io", "rt"] } diff --git a/libs/postgres_backend/tests/simple_select.rs b/libs/postgres_backend/tests/simple_select.rs index 9d3031d699..3fcfbf4a03 100644 --- a/libs/postgres_backend/tests/simple_select.rs +++ b/libs/postgres_backend/tests/simple_select.rs @@ -2,7 +2,7 @@ use once_cell::sync::Lazy; use postgres_backend::{AuthType, Handler, PostgresBackend, QueryError}; use pq_proto::{BeMessage, RowDescriptor}; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use std::io::Cursor; use std::sync::Arc; use tokio::io::{AsyncRead, AsyncWrite}; @@ -94,7 +94,7 @@ async fn simple_select_ssl() { let (client_sock, server_sock) = make_tcp_pair().await; let server_cfg = - rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("aws_lc_rs should support the default protocol versions") .with_no_client_auth() @@ -110,7 +110,7 @@ async fn simple_select_ssl() { }); let client_cfg = - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() .expect("aws_lc_rs should support the default protocol versions") .with_root_certificates({ diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 025053d3cb..71783ee452 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -15,7 +15,7 @@ use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, 
ThreadPoolMetrics}; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use rustls::pki_types::PrivateKeyDer; use tokio::io::{AsyncRead, AsyncWrite}; use tokio::net::TcpListener; @@ -105,14 +105,13 @@ async fn main() -> anyhow::Result<()> { let first_cert = cert_chain.first().context("missing certificate")?; let tls_server_end_point = TlsServerEndPoint::new(first_cert)?; - let tls_config = rustls::ServerConfig::builder_with_provider(Arc::new( - aws_lc_rs::default_provider(), - )) - .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("aws_lc_rs should support TLS1.2 and TLS1.3")? - .with_no_client_auth() - .with_single_cert(cert_chain, key)? - .into(); + let tls_config = + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) + .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) + .context("ring should support TLS1.2 and TLS1.3")? + .with_no_client_auth() + .with_single_cert(cert_chain, key)? + .into(); (tls_config, tls_server_end_point) } diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b97942ee5d..65b6dd215b 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -8,7 +8,7 @@ use itertools::Itertools; use once_cell::sync::OnceCell; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; @@ -266,12 +266,12 @@ impl ConnCfg { } } +type RustlsStream = >::Stream; + pub(crate) struct PostgresConnection { /// Socket connected to a compute node. - pub(crate) stream: tokio_postgres::maybe_tls_stream::MaybeTlsStream< - tokio::net::TcpStream, - tokio_postgres_rustls::RustlsStream, - >, + pub(crate) stream: + tokio_postgres::maybe_tls_stream::MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. @@ -298,9 +298,9 @@ impl ConnCfg { let client_config = if allow_self_signed_compute { // Allow all certificates for creating the connection let verifier = Arc::new(AcceptEverythingVerifier); - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("aws_lc_rs should support the default protocol versions") + .expect("ring should support the default protocol versions") .dangerous() .with_custom_certificate_verifier(verifier) } else { @@ -308,9 +308,9 @@ impl ConnCfg { .get_or_try_init(load_certs) .map_err(ConnectionError::TlsCertificateError)? 
.clone(); - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .expect("aws_lc_rs should support the default protocol versions") + .expect("ring should support the default protocol versions") .with_root_certificates(root_store) }; let client_config = client_config.with_no_client_auth(); diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 5183f22fa3..2870e100b7 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -7,7 +7,7 @@ use anyhow::{bail, ensure, Context, Ok}; use clap::ValueEnum; use itertools::Itertools; use remote_storage::RemoteStorageConfig; -use rustls::crypto::aws_lc_rs::{self, sign}; +use rustls::crypto::ring::{self, sign}; use rustls::pki_types::{CertificateDer, PrivateKeyDer}; use sha2::{Digest, Sha256}; use tracing::{error, info}; @@ -127,9 +127,9 @@ pub fn configure_tls( // allow TLS 1.2 to be compatible with older client libraries let mut config = - rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12]) - .context("aws_lc_rs should support TLS1.2 and TLS1.3")? + .context("ring should support TLS1.2 and TLS1.3")? .with_no_client_auth() .with_cert_resolver(cert_resolver.clone()); diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index fe62fee204..abb0599d08 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -9,11 +9,12 @@ use async_trait::async_trait; use http::StatusCode; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use rustls::pki_types; +use tokio::io::DuplexStream; use tokio_postgres::config::SslMode; use tokio_postgres::tls::{MakeTlsConnect, NoTls}; -use tokio_postgres_rustls::{MakeRustlsConnect, RustlsStream}; +use tokio_postgres_rustls::MakeRustlsConnect; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -69,19 +70,12 @@ struct ClientConfig<'a> { hostname: &'a str, } +type TlsConnect = >::TlsConnect; + impl ClientConfig<'_> { - fn make_tls_connect( - self, - ) -> anyhow::Result< - impl tokio_postgres::tls::TlsConnect< - S, - Error = impl std::fmt::Debug + use, - Future = impl Send + use, - Stream = RustlsStream, - > + use, - > { + fn make_tls_connect(self) -> anyhow::Result> { let mut mk = MakeRustlsConnect::new(self.config); - let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; + let tls = MakeTlsConnect::::make_tls_connect(&mut mk, self.hostname)?; Ok(tls) } } @@ -95,9 +89,9 @@ fn generate_tls_config<'a>( let tls_config = { let config = - rustls::ServerConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ServerConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .context("aws_lc_rs should support the default protocol versions")? + .context("ring should support the default protocol versions")? .with_no_client_auth() .with_single_cert(vec![cert.clone()], key.clone_key())? 
.into(); @@ -116,9 +110,9 @@ fn generate_tls_config<'a>( let client_config = { let config = - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .context("aws_lc_rs should support the default protocol versions")? + .context("ring should support the default protocol versions")? .with_root_certificates({ let mut store = rustls::RootCertStore::empty(); store.add(ca)?; diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 2d19472c36..17d4aed63b 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -28,6 +28,7 @@ tokio = { workspace = true, features = ["rt-multi-thread"] } tracing.workspace = true metrics.workspace = true utils.workspace = true +rustls.workspace = true workspace_hack.workspace = true diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index bc632a39f7..3ac40f6e14 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -52,6 +52,12 @@ where // If schema starts with https, start encrypted connection; do plain text // otherwise. if let Some("https") = tonic_endpoint.uri().scheme_str() { + // if there's no default provider and both ring+aws-lc-rs are enabled + // this the tls settings on tonic will not work. + // erroring is ok. + rustls::crypto::ring::default_provider() + .install_default() + .ok(); let tls = ClientTlsConfig::new(); tonic_endpoint = tonic_endpoint.tls_config(tls)?; } diff --git a/storage_scrubber/src/scan_safekeeper_metadata.rs b/storage_scrubber/src/scan_safekeeper_metadata.rs index 6c312d0036..403b4590a8 100644 --- a/storage_scrubber/src/scan_safekeeper_metadata.rs +++ b/storage_scrubber/src/scan_safekeeper_metadata.rs @@ -6,7 +6,7 @@ use once_cell::sync::OnceCell; use pageserver_api::shard::TenantShardId; use postgres_ffi::{XLogFileName, PG_TLI}; use remote_storage::GenericRemoteStorage; -use rustls::crypto::aws_lc_rs; +use rustls::crypto::ring; use serde::Serialize; use tokio_postgres::types::PgLsn; use tracing::{debug, error, info}; @@ -256,9 +256,9 @@ async fn load_timelines_from_db( // Use rustls (Neon requires TLS) let root_store = TLS_ROOTS.get_or_try_init(load_certs)?.clone(); let client_config = - rustls::ClientConfig::builder_with_provider(Arc::new(aws_lc_rs::default_provider())) + rustls::ClientConfig::builder_with_provider(Arc::new(ring::default_provider())) .with_safe_default_protocol_versions() - .context("aws_lc_rs should support the default protocol versions")? + .context("ring should support the default protocol versions")? 
.with_root_certificates(root_store) .with_no_client_auth(); let tls_connector = tokio_postgres_rustls::MakeRustlsConnect::new(client_config); diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 28c51b8ac1..8d83d9d9e2 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -47,7 +47,7 @@ hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["full"] hyper-util = { version = "0.1", features = ["client-legacy", "server-auto", "service"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.10" } +itertools = { version = "0.12" } lazy_static = { version = "1", default-features = false, features = ["spin_no_std"] } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } @@ -65,8 +65,7 @@ regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } -rustls = { version = "0.23", features = ["ring"] } -rustls-webpki = { version = "0.102", default-features = false, features = ["aws_lc_rs", "ring", "std"] } +rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } serde_json = { version = "1", features = ["alloc", "raw_value"] } @@ -80,7 +79,7 @@ tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } -tokio-rustls = { version = "0.26", features = ["ring"] } +tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } toml_edit = { version = "0.22", features = ["serde"] } @@ -106,7 +105,7 @@ half = { version = "2", default-features = false, features = ["num-traits"] } hashbrown = { version = "0.14", features = ["raw"] } indexmap-dff4ba8e3ae991db = { package = "indexmap", version = "1", default-features = false, features = ["std"] } indexmap-f595c2ba2a3f28df = { package = "indexmap", version = "2", features = ["serde"] } -itertools = { version = "0.10" } +itertools = { version = "0.12" } libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } From 8ad1dbce7252d1ed91543ddda6cd4c7c8ade414d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 4 Nov 2024 14:04:56 +0000 Subject: [PATCH 154/239] [proxy]: parse proxy protocol TLVs with aws/azure support (#9610) AWS/azure private link shares extra information in the "TLV" values of the proxy protocol v2 header. This code doesn't action on it, but it parses it as appropriate. 
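
For reference, the TLVs that follow the address block in a proxy protocol
v2 header are framed as a 1-byte type, a 2-byte big-endian length, and
`length` bytes of value; AWS uses type 0xEA for the VPC endpoint ID and
Azure uses 0xEE for the private endpoint link ID. A minimal,
self-contained sketch of that framing (hypothetical `parse_tlvs` helper
over plain slices, not the proxy's actual API):

```rust
/// Split a raw TLV region into (type, value) pairs; stops at the first
/// truncated entry instead of erroring, which is fine for a sketch.
fn parse_tlvs(mut rest: &[u8]) -> Vec<(u8, &[u8])> {
    let mut tlvs = Vec::new();
    while rest.len() >= 3 {
        let kind = rest[0];
        let len = u16::from_be_bytes([rest[1], rest[2]]) as usize;
        let Some(value) = rest.get(3..3 + len) else { break };
        tlvs.push((kind, value));
        rest = &rest[3 + len..];
    }
    tlvs
}

fn main() {
    // Example AWS TLV: type 0xEA, length 5, value = subtype 0x01 + "vpce".
    let raw = [0xEA, 0x00, 0x05, 0x01, b'v', b'p', b'c', b'e'];
    for (kind, value) in parse_tlvs(&raw) {
        println!("tlv type=0x{kind:02X} value={value:?}");
    }
}
```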
--- Cargo.lock | 2 + proxy/Cargo.toml | 2 + proxy/src/bin/pg_sni_router.rs | 6 +- proxy/src/console_redirect_proxy.rs | 6 +- proxy/src/context/mod.rs | 26 ++-- proxy/src/context/parquet.rs | 2 +- proxy/src/protocol2.rs | 186 ++++++++++++++++++++++++---- proxy/src/proxy/mod.rs | 10 +- proxy/src/serverless/mod.rs | 56 ++++++--- 9 files changed, 236 insertions(+), 60 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44ef6d960c..484769bd16 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4341,6 +4341,8 @@ dependencies = [ "smallvec", "smol_str", "socket2", + "strum", + "strum_macros", "subtle", "thiserror", "tikv-jemalloc-ctl", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index e25d2fcbab..2580b1cf8a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -74,6 +74,8 @@ sha2 = { workspace = true, features = ["asm", "oid"] } smol_str.workspace = true smallvec.workspace = true socket2.workspace = true +strum.workspace = true +strum_macros.workspace = true subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true diff --git a/proxy/src/bin/pg_sni_router.rs b/proxy/src/bin/pg_sni_router.rs index 71783ee452..ef5b5e8509 100644 --- a/proxy/src/bin/pg_sni_router.rs +++ b/proxy/src/bin/pg_sni_router.rs @@ -13,6 +13,7 @@ use itertools::Itertools; use proxy::config::TlsServerEndPoint; use proxy::context::RequestMonitoring; use proxy::metrics::{Metrics, ThreadPoolMetrics}; +use proxy::protocol2::ConnectionInfo; use proxy::proxy::{copy_bidirectional_client_compute, run_until_cancelled, ErrorSource}; use proxy::stream::{PqStream, Stream}; use rustls::crypto::ring; @@ -178,7 +179,10 @@ async fn task_main( info!(%peer_addr, "serving"); let ctx = RequestMonitoring::new( session_id, - peer_addr.ip(), + ConnectionInfo { + addr: peer_addr, + extra: None, + }, proxy::metrics::Protocol::SniRouter, "sni", ); diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 81d1d70958..243ef07854 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -11,7 +11,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestMonitoring; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::read_proxy_protocol; +use crate::protocol2::{read_proxy_protocol, ConnectionInfo}; use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::proxy::passthrough::ProxyPassthrough; @@ -65,8 +65,8 @@ pub async fn task_main( error!("proxy protocol header not supported"); return; } - Ok((socket, Some(addr))) => (socket, addr.ip()), - Ok((socket, None)) => (socket, peer_addr.ip()), + Ok((socket, Some(info))) => (socket, info), + Ok((socket, None)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), }; match socket.inner.set_nodelay(true) { diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index ca3b808a1b..2a6c9c5969 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -19,6 +19,7 @@ use crate::intern::{BranchIdInt, ProjectIdInt}; use crate::metrics::{ ConnectOutcome, InvalidEndpointsGroup, LatencyTimer, Metrics, Protocol, Waiting, }; +use crate::protocol2::ConnectionInfo; use crate::types::{DbName, EndpointId, RoleName}; pub mod parquet; @@ -40,7 +41,7 @@ pub struct RequestMonitoring( ); struct RequestMonitoringInner { - pub(crate) peer_addr: IpAddr, + pub(crate) conn_info: ConnectionInfo, pub(crate) session_id: Uuid, pub(crate) protocol: 
Protocol, first_packet: chrono::DateTime, @@ -84,7 +85,7 @@ impl Clone for RequestMonitoring { fn clone(&self) -> Self { let inner = self.0.try_lock().expect("should not deadlock"); let new = RequestMonitoringInner { - peer_addr: inner.peer_addr, + conn_info: inner.conn_info.clone(), session_id: inner.session_id, protocol: inner.protocol, first_packet: inner.first_packet, @@ -117,7 +118,7 @@ impl Clone for RequestMonitoring { impl RequestMonitoring { pub fn new( session_id: Uuid, - peer_addr: IpAddr, + conn_info: ConnectionInfo, protocol: Protocol, region: &'static str, ) -> Self { @@ -125,13 +126,13 @@ impl RequestMonitoring { "connect_request", %protocol, ?session_id, - %peer_addr, + %conn_info, ep = tracing::field::Empty, role = tracing::field::Empty, ); let inner = RequestMonitoringInner { - peer_addr, + conn_info, session_id, protocol, first_packet: Utc::now(), @@ -162,7 +163,11 @@ impl RequestMonitoring { #[cfg(test)] pub(crate) fn test() -> Self { - RequestMonitoring::new(Uuid::now_v7(), [127, 0, 0, 1].into(), Protocol::Tcp, "test") + use std::net::SocketAddr; + let ip = IpAddr::from([127, 0, 0, 1]); + let addr = SocketAddr::new(ip, 5432); + let conn_info = ConnectionInfo { addr, extra: None }; + RequestMonitoring::new(Uuid::now_v7(), conn_info, Protocol::Tcp, "test") } pub(crate) fn console_application_name(&self) -> String { @@ -286,7 +291,12 @@ impl RequestMonitoring { } pub(crate) fn peer_addr(&self) -> IpAddr { - self.0.try_lock().expect("should not deadlock").peer_addr + self.0 + .try_lock() + .expect("should not deadlock") + .conn_info + .addr + .ip() } pub(crate) fn cold_start_info(&self) -> ColdStartInfo { @@ -362,7 +372,7 @@ impl RequestMonitoringInner { } fn has_private_peer_addr(&self) -> bool { - match self.peer_addr { + match self.conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), IpAddr::V6(_) => false, } diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index 3432ac5ff6..adbb74c8e5 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -121,7 +121,7 @@ impl From<&RequestMonitoringInner> for RequestData { fn from(value: &RequestMonitoringInner) -> Self { Self { session_id: value.session_id, - peer_addr: value.peer_addr.to_string(), + peer_addr: value.conn_info.addr.ip().to_string(), timestamp: value.first_packet.naive_utc(), username: value.user.as_deref().map(String::from), application_name: value.application.as_deref().map(String::from), diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index ef2391cdd8..d1084ca2ff 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -1,12 +1,15 @@ //! Proxy Protocol V2 implementation +//! Compatible with +use core::fmt; use std::io; -use std::net::SocketAddr; +use std::net::{Ipv4Addr, Ipv6Addr, SocketAddr}; use std::pin::Pin; use std::task::{Context, Poll}; -use bytes::BytesMut; +use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; +use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; pin_project! 
{ @@ -58,9 +61,35 @@ const HEADER: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; +#[derive(PartialEq, Eq, Clone, Debug)] +pub struct ConnectionInfo { + pub addr: SocketAddr, + pub extra: Option, +} + +impl fmt::Display for ConnectionInfo { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.extra { + None => self.addr.ip().fmt(f), + Some(ConnectionInfoExtra::Aws { vpce_id }) => { + write!(f, "vpce_id[{vpce_id:?}]:addr[{}]", self.addr.ip()) + } + Some(ConnectionInfoExtra::Azure { link_id }) => { + write!(f, "link_id[{link_id}]:addr[{}]", self.addr.ip()) + } + } + } +} + +#[derive(PartialEq, Eq, Clone, Debug)] +pub enum ConnectionInfoExtra { + Aws { vpce_id: Bytes }, + Azure { link_id: u32 }, +} + pub(crate) async fn read_proxy_protocol( mut read: T, -) -> std::io::Result<(ChainRW, Option)> { +) -> std::io::Result<(ChainRW, Option)> { let mut buf = BytesMut::with_capacity(128); while buf.len() < 16 { let bytes_read = read.read_buf(&mut buf).await?; @@ -164,22 +193,107 @@ pub(crate) async fn read_proxy_protocol( // - destination layer 3 address in network byte order // - source layer 4 address if any, in network byte order (port) // - destination layer 4 address if any, in network byte order (port) - let addresses = buf.split_to(remaining_length as usize); - let socket = match address_length { + let mut header = buf.split_to(usize::from(remaining_length)); + let mut addr = header.split_to(usize::from(address_length)); + let socket = match addr.len() { 12 => { - let src_addr: [u8; 4] = addresses[0..4].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[8..10].try_into().unwrap()); + let src_addr = Ipv4Addr::from_bits(addr.get_u32()); + let _dst_addr = Ipv4Addr::from_bits(addr.get_u32()); + let src_port = addr.get_u16(); + let _dst_port = addr.get_u16(); Some(SocketAddr::from((src_addr, src_port))) } 36 => { - let src_addr: [u8; 16] = addresses[0..16].try_into().unwrap(); - let src_port = u16::from_be_bytes(addresses[32..34].try_into().unwrap()); + let src_addr = Ipv6Addr::from_bits(addr.get_u128()); + let _dst_addr = Ipv6Addr::from_bits(addr.get_u128()); + let src_port = addr.get_u16(); + let _dst_port = addr.get_u16(); Some(SocketAddr::from((src_addr, src_port))) } _ => None, }; - Ok((ChainRW { inner: read, buf }, socket)) + let mut extra = None; + + while let Some(mut tlv) = read_tlv(&mut header) { + match Pp2Kind::from_repr(tlv.kind) { + Some(Pp2Kind::Aws) => { + if tlv.value.is_empty() { + tracing::warn!("invalid aws tlv: no subtype"); + } + let subtype = tlv.value.get_u8(); + match Pp2AwsType::from_repr(subtype) { + Some(Pp2AwsType::VpceId) => { + extra = Some(ConnectionInfoExtra::Aws { vpce_id: tlv.value }); + } + None => { + tracing::warn!("unknown aws tlv: subtype={subtype}"); + } + } + } + Some(Pp2Kind::Azure) => { + if tlv.value.is_empty() { + tracing::warn!("invalid azure tlv: no subtype"); + } + let subtype = tlv.value.get_u8(); + match Pp2AzureType::from_repr(subtype) { + Some(Pp2AzureType::PrivateEndpointLinkId) => { + if tlv.value.len() != 4 { + tracing::warn!("invalid azure link_id: {:?}", tlv.value); + } + extra = Some(ConnectionInfoExtra::Azure { + link_id: tlv.value.get_u32_le(), + }); + } + None => { + tracing::warn!("unknown azure tlv: subtype={subtype}"); + } + } + } + Some(kind) => { + tracing::debug!("unused tlv[{kind:?}]: {:?}", tlv.value); + } + None => { + tracing::debug!("unknown tlv: {tlv:?}"); + } + } + } + + let conn_info = socket.map(|addr| ConnectionInfo { addr, extra }); 
+ + Ok((ChainRW { inner: read, buf }, conn_info)) +} + +#[derive(FromRepr, Debug, Copy, Clone)] +#[repr(u8)] +enum Pp2Kind { + // The following are defined by https://www.haproxy.org/download/3.1/doc/proxy-protocol.txt + // we don't use these but it would be interesting to know what's available + Alpn = 0x01, + Authority = 0x02, + Crc32C = 0x03, + Noop = 0x04, + UniqueId = 0x05, + Ssl = 0x20, + NetNs = 0x30, + + /// + Aws = 0xEA, + + /// + Azure = 0xEE, +} + +#[derive(FromRepr, Debug, Copy, Clone)] +#[repr(u8)] +enum Pp2AwsType { + VpceId = 0x01, +} + +#[derive(FromRepr, Debug, Copy, Clone)] +#[repr(u8)] +enum Pp2AzureType { + PrivateEndpointLinkId = 0x01, } impl AsyncRead for ChainRW { @@ -216,6 +330,25 @@ impl ChainRW { } } +#[derive(Debug)] +struct Tlv { + kind: u8, + value: Bytes, +} + +fn read_tlv(b: &mut BytesMut) -> Option { + if b.len() < 3 { + return None; + } + let kind = b.get_u8(); + let len = usize::from(b.get_u16()); + if b.len() < len { + return None; + } + let value = b.split_to(len).freeze(); + Some(Tlv { kind, value }) +} + #[cfg(test)] mod tests { use tokio::io::AsyncReadExt; @@ -242,7 +375,7 @@ mod tests { let extra_data = [0x55; 256]; - let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); @@ -250,7 +383,9 @@ mod tests { read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!(addr, Some(([127, 0, 0, 1], 65535).into())); + + let info = info.unwrap(); + assert_eq!(info.addr, ([127, 0, 0, 1], 65535).into()); } #[tokio::test] @@ -273,7 +408,7 @@ mod tests { let extra_data = [0x55; 256]; - let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); @@ -281,9 +416,11 @@ mod tests { read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); + + let info = info.unwrap(); assert_eq!( - addr, - Some(([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into()) + info.addr, + ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() ); } @@ -291,30 +428,31 @@ mod tests { async fn test_invalid() { let data = [0x55; 256]; - let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); + let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(addr, None); + assert_eq!(info, None); } #[tokio::test] async fn test_short() { let data = [0x55; 10]; - let (mut read, addr) = read_proxy_protocol(data.as_slice()).await.unwrap(); + let (mut read, info) = read_proxy_protocol(data.as_slice()).await.unwrap(); let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(addr, None); + assert_eq!(info, None); } #[tokio::test] async fn test_large_tlv() { let tlv = vec![0x55; 32768]; - let len = (12 + tlv.len() as u16).to_be_bytes(); + let tlv_len = (tlv.len() as u16).to_be_bytes(); + let len = (12 + 3 + tlv.len() as u16).to_be_bytes(); let header = super::HEADER // Proxy command, Inet << 4 | Stream @@ -330,11 +468,13 @@ mod tests { // dst port .chain([1, 1].as_slice()) // TLV + .chain([255].as_slice()) + .chain(tlv_len.as_slice()) .chain(tlv.as_slice()); let extra_data = [0xaa; 256]; - let (mut read, addr) = read_proxy_protocol(header.chain(extra_data.as_slice())) + let (mut read, info) = 
read_proxy_protocol(header.chain(extra_data.as_slice())) .await .unwrap(); @@ -342,6 +482,8 @@ mod tests { read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, extra_data); - assert_eq!(addr, Some(([55, 56, 57, 58], 65535).into())); + + let info = info.unwrap(); + assert_eq!(info.addr, ([55, 56, 57, 58], 65535).into()); } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 2970d93393..922646d889 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -28,7 +28,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestMonitoring; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::read_proxy_protocol; +use crate::protocol2::{read_proxy_protocol, ConnectionInfo}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; @@ -87,7 +87,7 @@ pub async fn task_main( let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { - let (socket, peer_addr) = match read_proxy_protocol(socket).await { + let (socket, conn_info) = match read_proxy_protocol(socket).await { Err(e) => { warn!("per-client task finished with an error: {e:#}"); return; @@ -100,8 +100,8 @@ pub async fn task_main( warn!("proxy protocol header not supported"); return; } - Ok((socket, Some(addr))) => (socket, addr.ip()), - Ok((socket, None)) => (socket, peer_addr.ip()), + Ok((socket, Some(info))) => (socket, info), + Ok((socket, None)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), }; match socket.inner.set_nodelay(true) { @@ -114,7 +114,7 @@ pub async fn task_main( let ctx = RequestMonitoring::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Tcp, &config.region, ); diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index edbb0347d3..4b60ddf60f 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -44,10 +44,10 @@ use tracing::{info, warn, Instrument}; use utils::http::error::ApiError; use crate::cancellation::CancellationHandlerMain; -use crate::config::ProxyConfig; +use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestMonitoring; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW}; +use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectionInfo}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -180,7 +180,7 @@ pub async fn task_main( peer_addr, )) .await; - let Some((conn, peer_addr)) = startup_result else { + let Some((conn, conn_info)) = startup_result else { return; }; @@ -192,7 +192,7 @@ pub async fn task_main( endpoint_rate_limiter, conn_token, conn, - peer_addr, + conn_info, session_id, )) .await; @@ -240,7 +240,7 @@ async fn connection_startup( session_id: uuid::Uuid, conn: TcpStream, peer_addr: SocketAddr, -) -> Option<(AsyncRW, IpAddr)> { +) -> Option<(AsyncRW, ConnectionInfo)> { // handle PROXY protocol let (conn, peer) = match read_proxy_protocol(conn).await { Ok(c) => c, @@ -250,17 +250,32 @@ async fn connection_startup( } }; - let peer_addr = peer.unwrap_or(peer_addr).ip(); - let has_private_peer_addr = match peer_addr { + let conn_info = match peer { + None if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + tracing::warn!("missing required proxy protocol header"); + return None; + } + Some(_) if config.proxy_protocol_v2 == 
ProxyProtocolV2::Rejected => { + tracing::warn!("proxy protocol header not supported"); + return None; + } + Some(info) => info, + None => ConnectionInfo { + addr: peer_addr, + extra: None, + }, + }; + + let has_private_peer_addr = match conn_info.addr.ip() { IpAddr::V4(ip) => ip.is_private(), IpAddr::V6(_) => false, }; - info!(?session_id, %peer_addr, "accepted new TCP connection"); + info!(?session_id, %conn_info, "accepted new TCP connection"); // try upgrade to TLS, but with a timeout. let conn = match timeout(config.handshake_timeout, tls_acceptor.accept(conn)).await { Ok(Ok(conn)) => { - info!(?session_id, %peer_addr, "accepted new TLS connection"); + info!(?session_id, %conn_info, "accepted new TLS connection"); conn } // The handshake failed @@ -268,7 +283,7 @@ async fn connection_startup( if !has_private_peer_addr { Metrics::get().proxy.tls_handshake_failures.inc(); } - warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}"); return None; } // The handshake timed out @@ -276,12 +291,12 @@ async fn connection_startup( if !has_private_peer_addr { Metrics::get().proxy.tls_handshake_failures.inc(); } - warn!(?session_id, %peer_addr, "failed to accept TLS connection: {e:?}"); + warn!(?session_id, %conn_info, "failed to accept TLS connection: {e:?}"); return None; } }; - Some((conn, peer_addr)) + Some((conn, conn_info)) } /// Handles HTTP connection @@ -297,7 +312,7 @@ async fn connection_handler( endpoint_rate_limiter: Arc, cancellation_token: CancellationToken, conn: AsyncRW, - peer_addr: IpAddr, + conn_info: ConnectionInfo, session_id: uuid::Uuid, ) { let session_id = AtomicTake::new(session_id); @@ -306,6 +321,7 @@ async fn connection_handler( let http_cancellation_token = CancellationToken::new(); let _cancel_connection = http_cancellation_token.clone().drop_guard(); + let conn_info2 = conn_info.clone(); let server = Builder::new(TokioExecutor::new()); let conn = server.serve_connection_with_upgrades( hyper_util::rt::TokioIo::new(conn), @@ -340,7 +356,7 @@ async fn connection_handler( connections.clone(), cancellation_handler.clone(), session_id, - peer_addr, + conn_info2.clone(), http_request_token, endpoint_rate_limiter.clone(), ) @@ -365,7 +381,7 @@ async fn connection_handler( // On cancellation, trigger the HTTP connection handler to shut down. let res = match select(pin!(cancellation_token.cancelled()), pin!(conn)).await { Either::Left((_cancelled, mut conn)) => { - tracing::debug!(%peer_addr, "cancelling connection"); + tracing::debug!(%conn_info, "cancelling connection"); conn.as_mut().graceful_shutdown(); conn.await } @@ -373,8 +389,8 @@ async fn connection_handler( }; match res { - Ok(()) => tracing::info!(%peer_addr, "HTTP connection closed"), - Err(e) => tracing::warn!(%peer_addr, "HTTP connection error {e}"), + Ok(()) => tracing::info!(%conn_info, "HTTP connection closed"), + Err(e) => tracing::warn!(%conn_info, "HTTP connection error {e}"), } } @@ -386,7 +402,7 @@ async fn request_handler( ws_connections: TaskTracker, cancellation_handler: Arc, session_id: uuid::Uuid, - peer_addr: IpAddr, + conn_info: ConnectionInfo, // used to cancel in-flight HTTP requests. 
not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, @@ -404,7 +420,7 @@ async fn request_handler( { let ctx = RequestMonitoring::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Ws, &config.region, ); @@ -439,7 +455,7 @@ async fn request_handler( } else if request.uri().path() == "/sql" && *request.method() == Method::POST { let ctx = RequestMonitoring::new( session_id, - peer_addr, + conn_info, crate::metrics::Protocol::Http, &config.region, ); From 0d5a512825705cce3f2c30707e27b3e437a9bc91 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 4 Nov 2024 16:22:46 +0100 Subject: [PATCH 155/239] safekeeper: add walreceiver metrics (#9450) ## Problem We don't have any observability for Safekeeper WAL receiver queues. ## Summary of changes Adds a few WAL receiver metrics: * `safekeeper_wal_receivers`: gauge of currently connected WAL receivers. * `safekeeper_wal_receiver_queue_depth`: histogram of queue depths per receiver, sampled every 5 seconds. * `safekeeper_wal_receiver_queue_depth_total`: gauge of total queued messages across all receivers. * `safekeeper_wal_receiver_queue_size_total`: gauge of total queued message sizes across all receivers. There are already metrics for ingested WAL volume: `written_wal_bytes` counter per timeline, and `safekeeper_write_wal_bytes` per-request histogram. --- libs/metrics/src/lib.rs | 81 +++++++++++++++++++++++++++++++++++ safekeeper/src/metrics.rs | 52 +++++++++++++++++++--- safekeeper/src/receive_wal.rs | 46 +++++++++++++++++++- safekeeper/src/safekeeper.rs | 64 +++++++++++++++++++++++++++ 4 files changed, 234 insertions(+), 9 deletions(-) diff --git a/libs/metrics/src/lib.rs b/libs/metrics/src/lib.rs index 64e56cb691..0f6c2a0937 100644 --- a/libs/metrics/src/lib.rs +++ b/libs/metrics/src/lib.rs @@ -110,6 +110,23 @@ static MAXRSS_KB: Lazy = Lazy::new(|| { pub const DISK_FSYNC_SECONDS_BUCKETS: &[f64] = &[0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 30.0]; +/// Constructs histogram buckets that are powers of two starting at 1 (i.e. 2^0), covering the end +/// points. For example, passing start=5,end=20 yields 4,8,16,32 as does start=4,end=32. 
+pub fn pow2_buckets(start: usize, end: usize) -> Vec { + assert_ne!(start, 0); + assert!(start <= end); + let start = match start.checked_next_power_of_two() { + Some(n) if n == start => n, // start already power of two + Some(n) => n >> 1, // power of two below start + None => panic!("start too large"), + }; + let end = end.checked_next_power_of_two().expect("end too large"); + std::iter::successors(Some(start), |n| n.checked_mul(2)) + .take_while(|n| n <= &end) + .map(|n| n as f64) + .collect() +} + pub struct BuildInfo { pub revision: &'static str, pub build_tag: &'static str, @@ -595,3 +612,67 @@ where self.dec.collect_into(metadata, labels, name, &mut enc.0) } } + +#[cfg(test)] +mod tests { + use super::*; + + const POW2_BUCKETS_MAX: usize = 1 << (usize::BITS - 1); + + #[test] + fn pow2_buckets_cases() { + assert_eq!(pow2_buckets(1, 1), vec![1.0]); + assert_eq!(pow2_buckets(1, 2), vec![1.0, 2.0]); + assert_eq!(pow2_buckets(1, 3), vec![1.0, 2.0, 4.0]); + assert_eq!(pow2_buckets(1, 4), vec![1.0, 2.0, 4.0]); + assert_eq!(pow2_buckets(1, 5), vec![1.0, 2.0, 4.0, 8.0]); + assert_eq!(pow2_buckets(1, 6), vec![1.0, 2.0, 4.0, 8.0]); + assert_eq!(pow2_buckets(1, 7), vec![1.0, 2.0, 4.0, 8.0]); + assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]); + assert_eq!( + pow2_buckets(1, 200), + vec![1.0, 2.0, 4.0, 8.0, 16.0, 32.0, 64.0, 128.0, 256.0] + ); + + assert_eq!(pow2_buckets(1, 8), vec![1.0, 2.0, 4.0, 8.0]); + assert_eq!(pow2_buckets(2, 8), vec![2.0, 4.0, 8.0]); + assert_eq!(pow2_buckets(3, 8), vec![2.0, 4.0, 8.0]); + assert_eq!(pow2_buckets(4, 8), vec![4.0, 8.0]); + assert_eq!(pow2_buckets(5, 8), vec![4.0, 8.0]); + assert_eq!(pow2_buckets(6, 8), vec![4.0, 8.0]); + assert_eq!(pow2_buckets(7, 8), vec![4.0, 8.0]); + assert_eq!(pow2_buckets(8, 8), vec![8.0]); + assert_eq!(pow2_buckets(20, 200), vec![16.0, 32.0, 64.0, 128.0, 256.0]); + + // Largest valid values. 
+ assert_eq!( + pow2_buckets(1, POW2_BUCKETS_MAX).len(), + usize::BITS as usize + ); + assert_eq!(pow2_buckets(POW2_BUCKETS_MAX, POW2_BUCKETS_MAX).len(), 1); + } + + #[test] + #[should_panic] + fn pow2_buckets_zero_start() { + pow2_buckets(0, 1); + } + + #[test] + #[should_panic] + fn pow2_buckets_end_lt_start() { + pow2_buckets(2, 1); + } + + #[test] + #[should_panic] + fn pow2_buckets_end_overflow_min() { + pow2_buckets(1, POW2_BUCKETS_MAX + 1); + } + + #[test] + #[should_panic] + fn pow2_buckets_end_overflow_max() { + pow2_buckets(1, usize::MAX); + } +} diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index e8fdddcdc1..bb56e923f8 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -5,23 +5,23 @@ use std::{ time::{Instant, SystemTime}, }; -use ::metrics::{register_histogram, GaugeVec, Histogram, IntGauge, DISK_FSYNC_SECONDS_BUCKETS}; use anyhow::Result; use futures::Future; use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, + pow2_buckets, proto::MetricFamily, - register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, - HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_histogram, register_histogram_vec, register_int_counter, register_int_counter_pair, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, GaugeVec, + Histogram, HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, + IntGauge, IntGaugeVec, DISK_FSYNC_SECONDS_BUCKETS, }; use once_cell::sync::Lazy; - use postgres_ffi::XLogSegNo; -use utils::pageserver_feedback::PageserverFeedback; -use utils::{id::TenantTimelineId, lsn::Lsn}; +use utils::{id::TenantTimelineId, lsn::Lsn, pageserver_feedback::PageserverFeedback}; use crate::{ + receive_wal::MSG_QUEUE_SIZE, state::{TimelineMemState, TimelinePersistentState}, GlobalTimelines, }; @@ -204,6 +204,44 @@ pub static WAL_BACKUP_TASKS: Lazy = Lazy::new(|| { ) .expect("Failed to register safekeeper_wal_backup_tasks_finished_total counter") }); +pub static WAL_RECEIVERS: Lazy = Lazy::new(|| { + register_int_gauge!( + "safekeeper_wal_receivers", + "Number of currently connected WAL receivers (i.e. connected computes)" + ) + .expect("Failed to register safekeeper_wal_receivers") +}); +pub static WAL_RECEIVER_QUEUE_DEPTH: Lazy = Lazy::new(|| { + // Use powers of two buckets, but add a bucket at 0 and the max queue size to track empty and + // full queues respectively. + let mut buckets = pow2_buckets(1, MSG_QUEUE_SIZE); + buckets.insert(0, 0.0); + buckets.insert(buckets.len() - 1, (MSG_QUEUE_SIZE - 1) as f64); + assert!(buckets.len() <= 12, "too many histogram buckets"); + + register_histogram!( + "safekeeper_wal_receiver_queue_depth", + "Number of queued messages per WAL receiver (sampled every 5 seconds)", + buckets + ) + .expect("Failed to register safekeeper_wal_receiver_queue_depth histogram") +}); +pub static WAL_RECEIVER_QUEUE_DEPTH_TOTAL: Lazy = Lazy::new(|| { + register_int_gauge!( + "safekeeper_wal_receiver_queue_depth_total", + "Total number of queued messages across all WAL receivers", + ) + .expect("Failed to register safekeeper_wal_receiver_queue_depth_total gauge") +}); +// TODO: consider adding a per-receiver queue_size histogram. This will require wrapping the Tokio +// MPSC channel to update counters on send, receive, and drop, while forwarding all other methods. 
+pub static WAL_RECEIVER_QUEUE_SIZE_TOTAL: Lazy = Lazy::new(|| { + register_int_gauge!( + "safekeeper_wal_receiver_queue_size_total", + "Total memory byte size of queued messages across all WAL receivers", + ) + .expect("Failed to register safekeeper_wal_receiver_queue_size_total gauge") +}); // Metrics collected on operations on the storage repository. #[derive(strum_macros::EnumString, strum_macros::Display, strum_macros::IntoStaticStr)] diff --git a/safekeeper/src/receive_wal.rs b/safekeeper/src/receive_wal.rs index 2410e22f45..a0a96c6e99 100644 --- a/safekeeper/src/receive_wal.rs +++ b/safekeeper/src/receive_wal.rs @@ -3,6 +3,10 @@ //! sends replies back. use crate::handler::SafekeeperPostgresHandler; +use crate::metrics::{ + WAL_RECEIVERS, WAL_RECEIVER_QUEUE_DEPTH, WAL_RECEIVER_QUEUE_DEPTH_TOTAL, + WAL_RECEIVER_QUEUE_SIZE_TOTAL, +}; use crate::safekeeper::AcceptorProposerMessage; use crate::safekeeper::ProposerAcceptorMessage; use crate::safekeeper::ServerInfo; @@ -86,6 +90,7 @@ impl WalReceivers { }; self.update_num(&shared); + WAL_RECEIVERS.inc(); WalReceiverGuard { id: pos, @@ -144,6 +149,7 @@ impl WalReceivers { let mut shared = self.mutex.lock(); shared.slots[id] = None; self.update_num(&shared); + WAL_RECEIVERS.dec(); } /// Broadcast pageserver feedback to connected walproposers. @@ -390,6 +396,7 @@ async fn read_network_loop( loop { let started = Instant::now(); + let size = next_msg.size(); match msg_tx.send_timeout(next_msg, SLOW_THRESHOLD).await { Ok(()) => {} // Slow send, log a message and keep trying. Log context has timeline ID. @@ -409,6 +416,11 @@ async fn read_network_loop( // WalAcceptor terminated. Err(SendTimeoutError::Closed(_)) => return Ok(()), } + + // Update metrics. Will be decremented in WalAcceptor. + WAL_RECEIVER_QUEUE_DEPTH_TOTAL.inc(); + WAL_RECEIVER_QUEUE_SIZE_TOTAL.add(size as i64); + next_msg = read_message(pgb_reader).await?; } } @@ -466,6 +478,12 @@ async fn network_write( /// walproposer, even when it's writing a steady stream of messages. const FLUSH_INTERVAL: Duration = Duration::from_secs(1); +/// The metrics computation interval. +/// +/// The Prometheus poll interval is 60 seconds at the time of writing. We sample the queue depth +/// every 5 seconds, for 12 samples per poll. This will give a count of up to 12x active timelines. +const METRICS_INTERVAL: Duration = Duration::from_secs(5); + /// Encapsulates a task which takes messages from msg_rx, processes and pushes /// replies to reply_tx. /// @@ -512,12 +530,15 @@ impl WalAcceptor { async fn run(&mut self) -> anyhow::Result<()> { let walreceiver_guard = self.tli.get_walreceivers().register(self.conn_id); - // Periodically flush the WAL. + // Periodically flush the WAL and compute metrics. let mut flush_ticker = tokio::time::interval(FLUSH_INTERVAL); flush_ticker.set_missed_tick_behavior(MissedTickBehavior::Delay); flush_ticker.tick().await; // skip the initial, immediate tick - // Tracks unflushed appends. + let mut metrics_ticker = tokio::time::interval(METRICS_INTERVAL); + metrics_ticker.set_missed_tick_behavior(MissedTickBehavior::Skip); + + // Tracks whether we have unflushed appends. let mut dirty = false; loop { @@ -529,6 +550,10 @@ impl WalAcceptor { break; }; + // Update gauge metrics. + WAL_RECEIVER_QUEUE_DEPTH_TOTAL.dec(); + WAL_RECEIVER_QUEUE_SIZE_TOTAL.sub(msg.size() as i64); + // Update walreceiver state in shmem for reporting. 
if let ProposerAcceptorMessage::Elected(_) = &msg { walreceiver_guard.get().status = WalReceiverStatus::Streaming; @@ -565,6 +590,12 @@ impl WalAcceptor { .process_msg(&ProposerAcceptorMessage::FlushWAL) .await? } + + // Update histogram metrics periodically. + _ = metrics_ticker.tick() => { + WAL_RECEIVER_QUEUE_DEPTH.observe(self.msg_rx.len() as f64); + None // no reply + } }; // Send reply, if any. @@ -585,3 +616,14 @@ impl WalAcceptor { Ok(()) } } + +/// On drop, drain msg_rx and update metrics to avoid leaks. +impl Drop for WalAcceptor { + fn drop(&mut self) { + self.msg_rx.close(); // prevent further sends + while let Ok(msg) = self.msg_rx.try_recv() { + WAL_RECEIVER_QUEUE_DEPTH_TOTAL.dec(); + WAL_RECEIVER_QUEUE_SIZE_TOTAL.sub(msg.size() as i64); + } + } +} diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index b3e006ab05..cf41d7a0ab 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -422,6 +422,70 @@ impl ProposerAcceptorMessage { _ => bail!("unknown proposer-acceptor message tag: {}", tag), } } + + /// The memory size of the message, including byte slices. + pub fn size(&self) -> usize { + const BASE_SIZE: usize = std::mem::size_of::(); + + // For most types, the size is just the base enum size including the nested structs. Some + // types also contain byte slices; add them. + // + // We explicitly list all fields, to draw attention here when new fields are added. + let mut size = BASE_SIZE; + size += match self { + Self::Greeting(ProposerGreeting { + protocol_version: _, + pg_version: _, + proposer_id: _, + system_id: _, + timeline_id: _, + tenant_id: _, + tli: _, + wal_seg_size: _, + }) => 0, + + Self::VoteRequest(VoteRequest { term: _ }) => 0, + + Self::Elected(ProposerElected { + term: _, + start_streaming_at: _, + term_history: _, + timeline_start_lsn: _, + }) => 0, + + Self::AppendRequest(AppendRequest { + h: + AppendRequestHeader { + term: _, + term_start_lsn: _, + begin_lsn: _, + end_lsn: _, + commit_lsn: _, + truncate_lsn: _, + proposer_uuid: _, + }, + wal_data, + }) => wal_data.len(), + + Self::NoFlushAppendRequest(AppendRequest { + h: + AppendRequestHeader { + term: _, + term_start_lsn: _, + begin_lsn: _, + end_lsn: _, + commit_lsn: _, + truncate_lsn: _, + proposer_uuid: _, + }, + wal_data, + }) => wal_data.len(), + + Self::FlushWAL => 0, + }; + + size + } } /// Acceptor -> Proposer messages From 06113e94e6e0108882dd04820a37a83ac68bf59a Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Mon, 4 Nov 2024 17:42:08 +0100 Subject: [PATCH 156/239] fix(test_regress): always use storcon virtual pageserver API to set tenant config (#9622) Problem ------- Tests that directly call the Pageserver Management API to set tenant config are flaky if the Pageserver is managed by Storcon because Storcon is the source of truth and may (theoretically) reconcile a tenant at any time. Solution -------- Switch all users of `set_tenant_config`/`patch_tenant_config_client_side` to use the `env.storage_controller.pageserver_api()` Future Work ----------- Prevent regressions from creeping in. And generally clean up up tenant configuration. Maybe we can avoid the Pageserver having a default tenant config at all and put the default into Storcon instead? 
* => https://github.com/neondatabase/neon/issues/9621 Refs ---- fixes https://github.com/neondatabase/neon/issues/9522 --- test_runner/fixtures/pageserver/http.py | 11 ++++++++++ .../regress/test_attach_tenant_config.py | 20 +++++++++++-------- .../regress/test_disk_usage_eviction.py | 15 ++++++++------ .../regress/test_ingestion_layer_size.py | 2 +- .../regress/test_layers_from_future.py | 2 +- .../test_pageserver_crash_consistency.py | 4 +++- .../test_pageserver_getpage_throttle.py | 6 +++--- test_runner/regress/test_s3_restore.py | 4 +++- .../regress/test_timeline_detach_ancestor.py | 2 +- 9 files changed, 44 insertions(+), 22 deletions(-) diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 175a1870d4..57a5d6875e 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -404,6 +404,12 @@ class PageserverHttpClient(requests.Session, MetricsGetter): return res.json() def set_tenant_config(self, tenant_id: Union[TenantId, TenantShardId], config: dict[str, Any]): + """ + Only use this via storage_controller.pageserver_api(). + + Storcon is the authority on tenant config - changes you make directly + against pageserver may be reconciled away at any time. + """ assert "tenant_id" not in config.keys() res = self.put( f"http://localhost:{self.port}/v1/tenant/config", @@ -417,6 +423,11 @@ class PageserverHttpClient(requests.Session, MetricsGetter): inserts: Optional[dict[str, Any]] = None, removes: Optional[list[str]] = None, ): + """ + Only use this via storage_controller.pageserver_api(). + + See `set_tenant_config` for more information. + """ current = self.tenant_config(tenant_id).tenant_specific_overrides if inserts is not None: current.update(inserts) diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 83d003a5cc..64de7626f4 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -176,17 +176,21 @@ def test_fully_custom_config(positive_env: NeonEnv): "lsn_lease_length_for_ts": "5s", } - ps_http = env.pageserver.http_client() + vps_http = env.storage_controller.pageserver_api() - initial_tenant_config = ps_http.tenant_config(env.initial_tenant) - assert initial_tenant_config.tenant_specific_overrides == {} + initial_tenant_config = vps_http.tenant_config(env.initial_tenant) + assert [ + (key, val) + for key, val in initial_tenant_config.tenant_specific_overrides.items() + if val is not None + ] == [] assert set(initial_tenant_config.effective_config.keys()) == set( fully_custom_config.keys() ), "ensure we cover all config options" (tenant_id, _) = env.create_tenant() - ps_http.set_tenant_config(tenant_id, fully_custom_config) - our_tenant_config = ps_http.tenant_config(tenant_id) + vps_http.set_tenant_config(tenant_id, fully_custom_config) + our_tenant_config = vps_http.tenant_config(tenant_id) assert our_tenant_config.tenant_specific_overrides == fully_custom_config assert set(our_tenant_config.effective_config.keys()) == set( fully_custom_config.keys() @@ -199,10 +203,10 @@ def test_fully_custom_config(positive_env: NeonEnv): == {k: True for k in fully_custom_config.keys()} ), "ensure our custom config has different values than the default config for all config options, so we know we overrode everything" - ps_http.tenant_detach(tenant_id) + env.pageserver.tenant_detach(tenant_id) env.pageserver.tenant_attach(tenant_id, config=fully_custom_config) - assert 
ps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config - assert set(ps_http.tenant_config(tenant_id).effective_config.keys()) == set( + assert vps_http.tenant_config(tenant_id).tenant_specific_overrides == fully_custom_config + assert set(vps_http.tenant_config(tenant_id).effective_config.keys()) == set( fully_custom_config.keys() ), "ensure we cover all config options" diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 72866766de..c8d3b2ff3e 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -38,21 +38,24 @@ def test_min_resident_size_override_handling( neon_env_builder: NeonEnvBuilder, config_level_override: int ): env = neon_env_builder.init_start() + vps_http = env.storage_controller.pageserver_api() ps_http = env.pageserver.http_client() def assert_config(tenant_id, expect_override, expect_effective): + # talk to actual pageserver to _get_ the config, workaround for + # https://github.com/neondatabase/neon/issues/9621 config = ps_http.tenant_config(tenant_id) assert config.tenant_specific_overrides.get("min_resident_size_override") == expect_override assert config.effective_config.get("min_resident_size_override") == expect_effective def assert_overrides(tenant_id, default_tenant_conf_value): - ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200}) + vps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 200}) assert_config(tenant_id, 200, 200) - ps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0}) + vps_http.set_tenant_config(tenant_id, {"min_resident_size_override": 0}) assert_config(tenant_id, 0, 0) - ps_http.set_tenant_config(tenant_id, {}) + vps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, default_tenant_conf_value) if config_level_override is not None: @@ -72,7 +75,7 @@ def test_min_resident_size_override_handling( # Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig. 
tenant_id, _ = env.create_tenant(conf={"min_resident_size_override": "100"}) assert_config(tenant_id, 100, 100) - ps_http.set_tenant_config(tenant_id, {}) + vps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, config_level_override) @@ -457,10 +460,10 @@ def test_pageserver_respects_overridden_resident_size( assert ( du_by_timeline[large_tenant] > min_resident_size ), "ensure the larger tenant will get a haircut" - ps_http.patch_tenant_config_client_side( + env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side( small_tenant[0], {"min_resident_size_override": min_resident_size} ) - ps_http.patch_tenant_config_client_side( + env.neon_env.storage_controller.pageserver_api().patch_tenant_config_client_side( large_tenant[0], {"min_resident_size_override": min_resident_size} ) diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 2edbf4d6d3..646dac8e6e 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -81,7 +81,7 @@ def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, bui print_layer_size_histogram(post_ingest) # since all we have are L0s, we should be getting nice L1s and images out of them now - ps_http.patch_tenant_config_client_side( + env.storage_controller.pageserver_api().patch_tenant_config_client_side( env.initial_tenant, { "compaction_threshold": 1, diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 2536ec1b3c..309e0f3015 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -127,7 +127,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): ), "sanity check for what above loop is supposed to do" # create the image layer from the future - ps_http.patch_tenant_config_client_side( + env.storage_controller.pageserver_api().patch_tenant_config_client_side( tenant_id, {"image_creation_threshold": image_creation_threshold}, None ) assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1 diff --git a/test_runner/regress/test_pageserver_crash_consistency.py b/test_runner/regress/test_pageserver_crash_consistency.py index ac46d3e62a..fcae7983f4 100644 --- a/test_runner/regress/test_pageserver_crash_consistency.py +++ b/test_runner/regress/test_pageserver_crash_consistency.py @@ -46,7 +46,9 @@ def test_local_only_layers_after_crash(neon_env_builder: NeonEnvBuilder, pg_bin: for sk in env.safekeepers: sk.stop() - pageserver_http.patch_tenant_config_client_side(tenant_id, {"compaction_threshold": 3}) + env.storage_controller.pageserver_api().patch_tenant_config_client_side( + tenant_id, {"compaction_threshold": 3} + ) # hit the exit failpoint with pytest.raises(ConnectionError, match="Remote end closed connection without response"): pageserver_http.timeline_checkpoint(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 6811d09cff..f1aad85fe9 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -146,13 +146,13 @@ def test_throttle_fair_config_is_settable_but_ignored_in_mgmt_api(neon_env_build To be removed after https://github.com/neondatabase/neon/pull/8539 is rolled out. 
""" env = neon_env_builder.init_start() - ps_http = env.pageserver.http_client() + vps_http = env.storage_controller.pageserver_api() # with_fair config should still be settable - ps_http.set_tenant_config( + vps_http.set_tenant_config( env.initial_tenant, {"timeline_get_throttle": throttle_config_with_field_fair_set}, ) - conf = ps_http.tenant_config(env.initial_tenant) + conf = vps_http.tenant_config(env.initial_tenant) assert_throttle_config_with_field_fair_set(conf.effective_config["timeline_get_throttle"]) assert_throttle_config_with_field_fair_set( conf.tenant_specific_overrides["timeline_get_throttle"] diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index bedc9b5865..7a9e6d62b2 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -52,7 +52,9 @@ def test_tenant_s3_restore( tenant_id = env.initial_tenant # now lets create the small layers - ps_http.set_tenant_config(tenant_id, many_small_layers_tenant_config()) + env.storage_controller.pageserver_api().set_tenant_config( + tenant_id, many_small_layers_tenant_config() + ) # Default tenant and the one we created assert ps_http.get_metric_value("pageserver_tenant_manager_slots", {"mode": "attached"}) == 1 diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index d467c59e62..ed47f9432b 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -511,7 +511,7 @@ def test_compaction_induced_by_detaches_in_history( assert len(delta_layers(branch_timeline_id)) == 5 - client.patch_tenant_config_client_side( + env.storage_controller.pageserver_api().patch_tenant_config_client_side( env.initial_tenant, {"compaction_threshold": 5}, None ) From 81d1bb19410a1c9a5c6bc673059ea5d27103a5be Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 4 Nov 2024 17:28:10 +0000 Subject: [PATCH 157/239] quieten aws_config logs (#9626) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit logs during aws authentication are soooo noisy in staging 🙃 --- proxy/src/logging.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 11921867e4..74d2b9a1d0 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -18,6 +18,7 @@ pub async fn init() -> anyhow::Result { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() + .add_directive("aws_config=info".parse().unwrap()) .add_directive("azure_core::policies::transport=off".parse().unwrap()); let fmt_layer = tracing_subscriber::fmt::layer() From 59879985b4dea9eae57c99e1c07dbbab684be3f4 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 4 Nov 2024 19:56:40 +0100 Subject: [PATCH 158/239] proxy: Wrap JWT errors in separate AuthError variant (#9625) * Also rename `AuthFailed` variant to `PasswordFailed`. * Before this all JWT errors end up in `AuthError::AuthFailed()`, expects a username and also causes cache invalidation. 
--- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/hacks.rs | 2 +- proxy/src/auth/backend/mod.rs | 4 ++-- proxy/src/auth/mod.rs | 20 +++++++++++++------- proxy/src/serverless/backend.rs | 10 ++++------ 5 files changed, 21 insertions(+), 17 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index de32a06e9e..6d26c99832 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -51,7 +51,7 @@ pub(super) async fn authenticate( sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*creds.user)); + return Err(auth::AuthError::password_failed(&*creds.user)); } }; diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs index 28bdacd769..1411d908a5 100644 --- a/proxy/src/auth/backend/hacks.rs +++ b/proxy/src/auth/backend/hacks.rs @@ -46,7 +46,7 @@ pub(crate) async fn authenticate_cleartext( sasl::Outcome::Success(key) => key, sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*info.user)); + return Err(auth::AuthError::password_failed(&*info.user)); } }; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 17334b9cbb..0eb68e6412 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -349,7 +349,7 @@ async fn auth_quirks( { Ok(keys) => Ok(keys), Err(e) => { - if e.is_auth_failed() { + if e.is_password_failed() { // The password could have been changed, so we invalidate the cache. cached_entry.invalidate(); } @@ -376,7 +376,7 @@ async fn authenticate_with_secret( crate::sasl::Outcome::Success(key) => key, crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - return Err(auth::AuthError::auth_failed(&*info.user)); + return Err(auth::AuthError::password_failed(&*info.user)); } }; diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 7a373dd825..2bd7a2da3d 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -21,6 +21,7 @@ pub(crate) use flow::*; use thiserror::Error; use tokio::time::error::Elapsed; +use crate::auth::backend::jwt::JwtError; use crate::control_plane; use crate::error::{ReportableError, UserFacingError}; @@ -55,7 +56,7 @@ pub(crate) enum AuthError { MissingEndpointName, #[error("password authentication failed for user '{0}'")] - AuthFailed(Box), + PasswordFailed(Box), /// Errors produced by e.g. [`crate::stream::PqStream`]. 
#[error(transparent)] @@ -76,6 +77,9 @@ pub(crate) enum AuthError { #[error("Disconnected due to inactivity after {0}.")] ConfirmationTimeout(humantime::Duration), + + #[error(transparent)] + Jwt(#[from] JwtError), } impl AuthError { @@ -83,8 +87,8 @@ impl AuthError { AuthError::BadAuthMethod(name.into()) } - pub(crate) fn auth_failed(user: impl Into>) -> Self { - AuthError::AuthFailed(user.into()) + pub(crate) fn password_failed(user: impl Into>) -> Self { + AuthError::PasswordFailed(user.into()) } pub(crate) fn ip_address_not_allowed(ip: IpAddr) -> Self { @@ -95,8 +99,8 @@ impl AuthError { AuthError::TooManyConnections } - pub(crate) fn is_auth_failed(&self) -> bool { - matches!(self, AuthError::AuthFailed(_)) + pub(crate) fn is_password_failed(&self) -> bool { + matches!(self, AuthError::PasswordFailed(_)) } pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { @@ -114,7 +118,7 @@ impl UserFacingError for AuthError { Self::Web(e) => e.to_string_client(), Self::GetAuthInfo(e) => e.to_string_client(), Self::Sasl(e) => e.to_string_client(), - Self::AuthFailed(_) => self.to_string(), + Self::PasswordFailed(_) => self.to_string(), Self::BadAuthMethod(_) => self.to_string(), Self::MalformedPassword(_) => self.to_string(), Self::MissingEndpointName => self.to_string(), @@ -123,6 +127,7 @@ impl UserFacingError for AuthError { Self::TooManyConnections => self.to_string(), Self::UserTimeout(_) => self.to_string(), Self::ConfirmationTimeout(_) => self.to_string(), + Self::Jwt(_) => self.to_string(), } } } @@ -133,7 +138,7 @@ impl ReportableError for AuthError { Self::Web(e) => e.get_error_kind(), Self::GetAuthInfo(e) => e.get_error_kind(), Self::Sasl(e) => e.get_error_kind(), - Self::AuthFailed(_) => crate::error::ErrorKind::User, + Self::PasswordFailed(_) => crate::error::ErrorKind::User, Self::BadAuthMethod(_) => crate::error::ErrorKind::User, Self::MalformedPassword(_) => crate::error::ErrorKind::User, Self::MissingEndpointName => crate::error::ErrorKind::User, @@ -142,6 +147,7 @@ impl ReportableError for AuthError { Self::TooManyConnections => crate::error::ErrorKind::RateLimit, Self::UserTimeout(_) => crate::error::ErrorKind::User, Self::ConfirmationTimeout(_) => crate::error::ErrorKind::User, + Self::Jwt(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 07e0e30148..c89e0f0232 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -81,7 +81,7 @@ impl PoolingBackend { None => { // If we don't have an authentication secret, for the http flow we can just return an error. 
info!("authentication info not found"); - return Err(AuthError::auth_failed(&*user_info.user)); + return Err(AuthError::password_failed(&*user_info.user)); } }; let ep = EndpointIdInt::from(&user_info.endpoint); @@ -99,7 +99,7 @@ impl PoolingBackend { } crate::sasl::Outcome::Failure(reason) => { info!("auth backend failed with an error: {reason}"); - Err(AuthError::auth_failed(&*user_info.user)) + Err(AuthError::password_failed(&*user_info.user)) } }; res.map(|key| ComputeCredentials { @@ -126,8 +126,7 @@ impl PoolingBackend { &**console, &jwt, ) - .await - .map_err(|e| AuthError::auth_failed(e.to_string()))?; + .await?; Ok(ComputeCredentials { info: user_info.clone(), @@ -146,8 +145,7 @@ impl PoolingBackend { &StaticAuthRules, &jwt, ) - .await - .map_err(|e| AuthError::auth_failed(e.to_string()))?; + .await?; Ok(ComputeCredentials { info: user_info.clone(), From 1085fe57d308b017ed0d079a4890f34c7e9cd52a Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 4 Nov 2024 20:19:26 +0100 Subject: [PATCH 159/239] proxy: Rewrite ControlPlaneEvent as enum (#9627) --- proxy/src/cache/endpoints.rs | 110 ++++++++++++++++++----------------- 1 file changed, 58 insertions(+), 52 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 12c33169bf..400c76291e 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,7 +1,7 @@ use std::convert::Infallible; +use std::future::pending; use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::Arc; -use std::time::Duration; use dashmap::DashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; @@ -19,25 +19,38 @@ use crate::rate_limiter::GlobalRateLimiter; use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::types::EndpointId; +#[allow(clippy::enum_variant_names)] #[derive(Deserialize, Debug, Clone)] -pub(crate) struct ControlPlaneEventKey { - endpoint_created: Option, - branch_created: Option, - project_created: Option, +#[serde(tag = "type", rename_all(deserialize = "snake_case"))] +enum ControlPlaneEvent { + EndpointCreated { endpoint_created: EndpointCreated }, + BranchCreated { branch_created: BranchCreated }, + ProjectCreated { project_created: ProjectCreated }, } + #[derive(Deserialize, Debug, Clone)] struct EndpointCreated { endpoint_id: String, } + #[derive(Deserialize, Debug, Clone)] struct BranchCreated { branch_id: String, } + #[derive(Deserialize, Debug, Clone)] struct ProjectCreated { project_id: String, } +impl TryFrom<&Value> for ControlPlaneEvent { + type Error = anyhow::Error; + fn try_from(value: &Value) -> Result { + let json = String::from_redis_value(value)?; + Ok(serde_json::from_str(&json)?) + } +} + pub struct EndpointsCache { config: EndpointCacheConfig, endpoints: DashSet, @@ -60,6 +73,7 @@ impl EndpointsCache { ready: AtomicBool::new(false), } } + pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { return true; @@ -74,6 +88,7 @@ impl EndpointsCache { } !rejected } + fn should_reject(&self, endpoint: &EndpointId) -> bool { if endpoint.is_endpoint() { !self.endpoints.contains(&EndpointIdInt::from(endpoint)) @@ -87,33 +102,28 @@ impl EndpointsCache { .contains(&ProjectIdInt::from(&endpoint.as_project())) } } - fn insert_event(&self, key: ControlPlaneEventKey) { - // Do not do normalization here, we expect the events to be normalized. 
- if let Some(endpoint_created) = key.endpoint_created { - self.endpoints - .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::EndpointCreated); - } - if let Some(branch_created) = key.branch_created { - self.branches - .insert(BranchIdInt::from(&branch_created.branch_id.into())); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::BranchCreated); - } - if let Some(project_created) = key.project_created { - self.projects - .insert(ProjectIdInt::from(&project_created.project_id.into())); - Metrics::get() - .proxy - .redis_events_count - .inc(RedisEventsCount::ProjectCreated); - } + + fn insert_event(&self, event: ControlPlaneEvent) { + let counter = match event { + ControlPlaneEvent::EndpointCreated { endpoint_created } => { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + RedisEventsCount::EndpointCreated + } + ControlPlaneEvent::BranchCreated { branch_created } => { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + RedisEventsCount::BranchCreated + } + ControlPlaneEvent::ProjectCreated { project_created } => { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + RedisEventsCount::ProjectCreated + } + }; + Metrics::get().proxy.redis_events_count.inc(counter); } + pub async fn do_read( &self, mut con: ConnectionWithCredentialsProvider, @@ -131,12 +141,13 @@ impl EndpointsCache { } if cancellation_token.is_cancelled() { info!("cancellation token is cancelled, exiting"); - tokio::time::sleep(Duration::from_secs(60 * 60 * 24 * 7)).await; - // 1 week. + // Maintenance tasks run forever. Sleep forever when canceled. + pending::<()>().await; } tokio::time::sleep(self.config.retry_interval).await; } } + async fn read_from_stream( &self, con: &mut ConnectionWithCredentialsProvider, @@ -162,10 +173,7 @@ impl EndpointsCache { ) .await } - fn parse_key_value(value: &Value) -> anyhow::Result { - let s: String = FromRedisValue::from_redis_value(value)?; - Ok(serde_json::from_str(&s)?) 
- } + async fn batch_read( &self, conn: &mut ConnectionWithCredentialsProvider, @@ -196,27 +204,25 @@ impl EndpointsCache { anyhow::bail!("Cannot read from redis stream {}", self.config.stream_name); } - let res = res.keys.pop().expect("Checked length above"); - let len = res.ids.len(); - for x in res.ids { + let key = res.keys.pop().expect("Checked length above"); + let len = key.ids.len(); + for stream_id in key.ids { total += 1; - for (_, v) in x.map { - let key = match Self::parse_key_value(&v) { - Ok(x) => x, - Err(e) => { + for value in stream_id.map.values() { + match value.try_into() { + Ok(event) => self.insert_event(event), + Err(err) => { Metrics::get().proxy.redis_errors_total.inc(RedisErrors { channel: &self.config.stream_name, }); - tracing::error!("error parsing value {v:?}: {e:?}"); - continue; + tracing::error!("error parsing value {value:?}: {err:?}"); } }; - self.insert_event(key); } if total.is_power_of_two() { tracing::debug!("endpoints read {}", total); } - *last_id = x.id; + *last_id = stream_id.id; } if return_when_finish && len <= self.config.default_batch_size { break; @@ -229,11 +235,11 @@ impl EndpointsCache { #[cfg(test)] mod tests { - use super::ControlPlaneEventKey; + use super::ControlPlaneEvent; #[test] - fn test() { - let s = "{\"branch_created\":null,\"endpoint_created\":{\"endpoint_id\":\"ep-rapid-thunder-w0qqw2q9\"},\"project_created\":null,\"type\":\"endpoint_created\"}"; - serde_json::from_str::(s).unwrap(); + fn test_parse_control_plane_event() { + let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#; + serde_json::from_str::(s).unwrap(); } } From ee68bbf6f5053d5ef7beb2fa711974866f66ca48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 4 Nov 2024 21:01:18 +0100 Subject: [PATCH 160/239] Add tenant config option to allow timeline_offloading (#9598) Allow us to enable timeline offloading for single tenants without having to enable it for the entire pageserver. Part of #8088. 
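As a sketch of the intended precedence (the structs below are simplified stand-ins for the pageserver's real config types; the actual code reads the per-tenant value from the tenant's config overrides): the per-tenant override can only widen what the pageserver-global flag allows, and leaving it unset falls back to the global setting.

```rust
// Sketch only: field names mirror the change, the types are simplified.
struct PageserverConf {
    timeline_offloading: bool, // pageserver-global flag
}

struct TenantConfOverrides {
    timeline_offloading: Option<bool>, // per-tenant override; None = not set
}

// Offloading is allowed if either the global flag or the tenant override says so.
fn offloading_allowed(global: &PageserverConf, tenant: &TenantConfOverrides) -> bool {
    global.timeline_offloading || tenant.timeline_offloading.unwrap_or_default()
}

fn main() {
    let global_off = PageserverConf { timeline_offloading: false };
    let opted_in = TenantConfOverrides { timeline_offloading: Some(true) };
    let unset = TenantConfOverrides { timeline_offloading: None };

    // A single tenant can opt in without flipping the pageserver-wide flag.
    assert!(offloading_allowed(&global_off, &opted_in));
    assert!(!offloading_allowed(&global_off, &unset));
}
```

This keeps the pageserver-wide flag as the default for everyone while letting individual tenants enable offloading early.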
--- control_plane/src/pageserver.rs | 10 ++++++++++ libs/pageserver_api/src/config.rs | 5 +++++ libs/pageserver_api/src/models.rs | 1 + pageserver/src/tenant.rs | 10 +++++++++- pageserver/src/tenant/config.rs | 8 ++++++++ test_runner/regress/test_attach_tenant_config.py | 1 + 6 files changed, 34 insertions(+), 1 deletion(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index 8df0a714ec..db54965eb5 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -403,6 +403,11 @@ impl PageServerNode { lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") .map(|x| x.to_string()), + timeline_offloading: settings + .remove("timeline_offloading") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'timeline_offloading' as bool")?, }; if !settings.is_empty() { bail!("Unrecognized tenant settings: {settings:?}") @@ -498,6 +503,11 @@ impl PageServerNode { lsn_lease_length_for_ts: settings .remove("lsn_lease_length_for_ts") .map(|x| x.to_string()), + timeline_offloading: settings + .remove("timeline_offloading") + .map(|x| x.parse::()) + .transpose() + .context("Failed to parse 'timeline_offloading' as bool")?, } }; diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 6b2d6cf625..00cc426c3c 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -259,6 +259,10 @@ pub struct TenantConfigToml { /// Layers needed to reconstruct pages at LSN will not be GC-ed during this interval. #[serde(with = "humantime_serde")] pub lsn_lease_length_for_ts: Duration, + + /// Enable auto-offloading of timelines. + /// (either this flag or the pageserver-global one need to be set) + pub timeline_offloading: bool, } pub mod defaults { @@ -471,6 +475,7 @@ impl Default for TenantConfigToml { image_layer_creation_check_threshold: DEFAULT_IMAGE_LAYER_CREATION_CHECK_THRESHOLD, lsn_lease_length: LsnLease::DEFAULT_LENGTH, lsn_lease_length_for_ts: LsnLease::DEFAULT_LENGTH_FOR_TS, + timeline_offloading: false, } } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 0a4992aea4..0dfa1ba817 100644 --- a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -310,6 +310,7 @@ pub struct TenantConfig { pub image_layer_creation_check_threshold: Option, pub lsn_lease_length: Option, pub lsn_lease_length_for_ts: Option, + pub timeline_offloading: Option, } /// The policy for the aux file storage. 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 68f8f7e13c..d45c99a41b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2499,8 +2499,15 @@ impl Tenant { .iter() .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) }; + let config_allows_offload = self.conf.timeline_offloading + || self + .tenant_conf + .load() + .tenant_conf + .timeline_offloading + .unwrap_or_default(); let can_offload = - can_offload && has_no_unoffloaded_children && self.conf.timeline_offloading; + can_offload && has_no_unoffloaded_children && config_allows_offload; if (is_active, can_offload) == (false, false) { None } else { @@ -4902,6 +4909,7 @@ pub(crate) mod harness { ), lsn_lease_length: Some(tenant_conf.lsn_lease_length), lsn_lease_length_for_ts: Some(tenant_conf.lsn_lease_length_for_ts), + timeline_offloading: Some(tenant_conf.timeline_offloading), } } } diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs index ce686c89ef..4d6176bfd9 100644 --- a/pageserver/src/tenant/config.rs +++ b/pageserver/src/tenant/config.rs @@ -349,6 +349,10 @@ pub struct TenantConfOpt { #[serde(with = "humantime_serde")] #[serde(default)] pub lsn_lease_length_for_ts: Option, + + #[serde(skip_serializing_if = "Option::is_none")] + #[serde(default)] + pub timeline_offloading: Option, } impl TenantConfOpt { @@ -411,6 +415,9 @@ impl TenantConfOpt { lsn_lease_length_for_ts: self .lsn_lease_length_for_ts .unwrap_or(global_conf.lsn_lease_length_for_ts), + timeline_offloading: self + .lazy_slru_download + .unwrap_or(global_conf.timeline_offloading), } } } @@ -464,6 +471,7 @@ impl From for models::TenantConfig { image_layer_creation_check_threshold: value.image_layer_creation_check_threshold, lsn_lease_length: value.lsn_lease_length.map(humantime), lsn_lease_length_for_ts: value.lsn_lease_length_for_ts.map(humantime), + timeline_offloading: value.timeline_offloading, } } } diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 64de7626f4..7d19ba3b5d 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -174,6 +174,7 @@ def test_fully_custom_config(positive_env: NeonEnv): "image_layer_creation_check_threshold": 1, "lsn_lease_length": "1m", "lsn_lease_length_for_ts": "5s", + "timeline_offloading": True, } vps_http = env.storage_controller.pageserver_api() From 34812a6aaba2e2193b617cbb69ee4568aeca8274 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 4 Nov 2024 15:52:01 -0600 Subject: [PATCH 161/239] Improve some typing related to performance testing for LR Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_api.py | 11 +++++--- .../performance/test_logical_replication.py | 27 ++++++++++++------- 2 files changed, 25 insertions(+), 13 deletions(-) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 5934baccff..89c1f324b4 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -1,7 +1,7 @@ from __future__ import annotations import time -from typing import TYPE_CHECKING, cast +from typing import TYPE_CHECKING, cast, final import requests @@ -261,17 +261,22 @@ class NeonAPI: time.sleep(0.5) +@final class NeonApiEndpoint: def __init__(self, neon_api: NeonAPI, pg_version: PgVersion, project_id: Optional[str]): self.neon_api = neon_api + self.project_id: str + self.endpoint_id: str + self.connstr: str + if project_id is None: project = 
neon_api.create_project(pg_version) - neon_api.wait_for_operation_to_finish(project["project"]["id"]) + neon_api.wait_for_operation_to_finish(cast("str", project["project"]["id"])) self.project_id = project["project"]["id"] self.endpoint_id = project["endpoints"][0]["id"] self.connstr = project["connection_uris"][0]["connection_uri"] self.pgbench_env = connection_parameters_to_env( - project["connection_uris"][0]["connection_parameters"] + cast("dict[str, str]", project["connection_uris"][0]["connection_parameters"]) ) self.is_new = True else: diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 8b2a296bdd..e62485905e 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,7 +1,7 @@ from __future__ import annotations import time -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, cast import psycopg2 import psycopg2.extras @@ -12,13 +12,16 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import logical_replication_sync if TYPE_CHECKING: + from subprocess import Popen + from typing import AnyStr + from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonApiEndpoint - from fixtures.neon_fixtures import NeonEnv, PgBin + from fixtures.neon_fixtures import NeonEnv, PgBin, VanillaPostgres @pytest.mark.timeout(1000) -def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg): +def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg: VanillaPostgres): env = neon_simple_env endpoint = env.endpoints.create_start("main") @@ -47,24 +50,28 @@ def test_logical_replication(neon_simple_env: NeonEnv, pg_bin: PgBin, vanilla_pg logical_replication_sync(vanilla_pg, endpoint) log.info(f"Sync with master took {time.time() - start} seconds") - sum_master = endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] - sum_replica = vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] + sum_master = cast("int", endpoint.safe_psql("select sum(abalance) from pgbench_accounts")[0][0]) + sum_replica = cast( + "int", vanilla_pg.safe_psql("select sum(abalance) from pgbench_accounts")[0][0] + ) assert sum_master == sum_replica -def check_pgbench_still_running(pgbench, label=""): +def check_pgbench_still_running(pgbench: Popen[AnyStr], label: str = ""): rc = pgbench.poll() if rc is not None: raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") -def measure_logical_replication_lag(sub_cur, pub_cur, timeout_sec=600): +def measure_logical_replication_lag( + sub_cur: psycopg2.cursor, pub_cur: psycopg2.cursor, timeout_sec: float = 600 +): start = time.time() pub_cur.execute("SELECT pg_current_wal_flush_lsn()") - pub_lsn = Lsn(pub_cur.fetchall()[0][0]) + pub_lsn = Lsn(cast("str", pub_cur.fetchall()[0][0])) while (time.time() - start) < timeout_sec: sub_cur.execute("SELECT latest_end_lsn FROM pg_catalog.pg_stat_subscription") - res = sub_cur.fetchall()[0][0] + res = cast("str", sub_cur.fetchall()[0][0]) if res: log.info(f"subscriber_lsn={res}") sub_lsn = Lsn(res) @@ -286,7 +293,7 @@ def test_snap_files( conn.autocommit = True with conn.cursor() as cur: cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") - is_super = cur.fetchall()[0][0] + is_super = cast("bool", cur.fetchall()[0][0]) assert is_super, "This benchmark won't work if we don't have superuser" pg_bin.run_capture(["pgbench", "-i", 
"-I", "dtGvp", "-s100"], env=env) From 1e16221f82b94fed3c37ce9e95583bb52deaa5d0 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 4 Nov 2024 18:21:59 -0600 Subject: [PATCH 162/239] Update psycopg2 to latest version for complete PG 17 support Update the types to match. Changes the cursor import to match the C bindings[0]. Link: https://github.com/python/typeshed/issues/12578 [0] Signed-off-by: Tristan Partin --- poetry.lock | 155 +++++++++--------- pyproject.toml | 4 +- scripts/download_basebackup.py | 4 +- .../performance/test_logical_replication.py | 5 +- 4 files changed, 80 insertions(+), 88 deletions(-) diff --git a/poetry.lock b/poetry.lock index e06950cb52..d869761e8e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.4 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.3 and should not be changed by hand. [[package]] name = "aiohappyeyeballs" @@ -2106,83 +2106,78 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "psycopg2-binary" -version = "2.9.9" +version = "2.9.10" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"}, - {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = 
"sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, - {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, - {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = 
"sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, - {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = 
"sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, - {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, - {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, + {file = "psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:0ea8e3d0ae83564f2fc554955d327fa081d065c8ca5cc6d2abb643e2c9c1200f"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-macosx_14_0_arm64.whl", hash = "sha256:3e9c76f0ac6f92ecfc79516a8034a544926430f7b080ec5a0537bca389ee0906"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2ad26b467a405c798aaa1458ba09d7e2b6e5f96b1ce0ac15d82fd9f95dc38a92"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:270934a475a0e4b6925b5f804e3809dd5f90f8613621d062848dd82f9cd62007"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:48b338f08d93e7be4ab2b5f1dbe69dc5e9ef07170fe1f86514422076d9c010d0"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7f4152f8f76d2023aac16285576a9ecd2b11a9895373a1f10fd9db54b3ff06b4"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:32581b3020c72d7a421009ee1c6bf4a131ef5f0a968fab2e2de0c9d2bb4577f1"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_i686.whl", hash = 
"sha256:2ce3e21dc3437b1d960521eca599d57408a695a0d3c26797ea0f72e834c7ffe5"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:e984839e75e0b60cfe75e351db53d6db750b00de45644c5d1f7ee5d1f34a1ce5"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:3c4745a90b78e51d9ba06e2088a2fe0c693ae19cc8cb051ccda44e8df8a6eb53"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-win32.whl", hash = "sha256:e5720a5d25e3b99cd0dc5c8a440570469ff82659bb09431c1439b92caf184d3b"}, + {file = "psycopg2_binary-2.9.10-cp310-cp310-win_amd64.whl", hash = "sha256:3c18f74eb4386bf35e92ab2354a12c17e5eb4d9798e4c0ad3a00783eae7cd9f1"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:04392983d0bb89a8717772a193cfaac58871321e3ec69514e1c4e0d4957b5aff"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-macosx_14_0_arm64.whl", hash = "sha256:1a6784f0ce3fec4edc64e985865c17778514325074adf5ad8f80636cd029ef7c"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b5f86c56eeb91dc3135b3fd8a95dc7ae14c538a2f3ad77a19645cf55bab1799c"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2b3d2491d4d78b6b14f76881905c7a8a8abcf974aad4a8a0b065273a0ed7a2cb"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2286791ececda3a723d1910441c793be44625d86d1a4e79942751197f4d30341"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:512d29bb12608891e349af6a0cccedce51677725a921c07dba6342beaf576f9a"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:5a507320c58903967ef7384355a4da7ff3f28132d679aeb23572753cbf2ec10b"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:6d4fa1079cab9018f4d0bd2db307beaa612b0d13ba73b5c6304b9fe2fb441ff7"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:851485a42dbb0bdc1edcdabdb8557c09c9655dfa2ca0460ff210522e073e319e"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:35958ec9e46432d9076286dda67942ed6d968b9c3a6a2fd62b48939d1d78bf68"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-win32.whl", hash = "sha256:ecced182e935529727401b24d76634a357c71c9275b356efafd8a2a91ec07392"}, + {file = "psycopg2_binary-2.9.10-cp311-cp311-win_amd64.whl", hash = "sha256:ee0e8c683a7ff25d23b55b11161c2663d4b099770f6085ff0a20d4505778d6b4"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:880845dfe1f85d9d5f7c412efea7a08946a46894537e4e5d091732eb1d34d9a0"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-macosx_14_0_arm64.whl", hash = "sha256:9440fa522a79356aaa482aa4ba500b65f28e5d0e63b801abf6aa152a29bd842a"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e3923c1d9870c49a2d44f795df0c889a22380d36ef92440ff618ec315757e539"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7b2c956c028ea5de47ff3a8d6b3cc3330ab45cf0b7c3da35a2d6ff8420896526"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f758ed67cab30b9a8d2833609513ce4d3bd027641673d4ebc9c067e4d208eec1"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:8cd9b4f2cfab88ed4a9106192de509464b75a906462fb846b936eabe45c2063e"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:6dc08420625b5a20b53551c50deae6e231e6371194fa0651dbe0fb206452ae1f"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:d7cd730dfa7c36dbe8724426bf5612798734bff2d3c3857f36f2733f5bfc7c00"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:155e69561d54d02b3c3209545fb08938e27889ff5a10c19de8d23eb5a41be8a5"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:c3cc28a6fd5a4a26224007712e79b81dbaee2ffb90ff406256158ec4d7b52b47"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-win32.whl", hash = "sha256:ec8a77f521a17506a24a5f626cb2aee7850f9b69a0afe704586f63a464f3cd64"}, + {file = "psycopg2_binary-2.9.10-cp312-cp312-win_amd64.whl", hash = "sha256:18c5ee682b9c6dd3696dad6e54cc7ff3a1a9020df6a5c0f861ef8bfd338c3ca0"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1"}, + {file = "psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-macosx_12_0_x86_64.whl", hash = "sha256:eb09aa7f9cecb45027683bb55aebaaf45a0df8bf6de68801a6afdc7947bb09d4"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b73d6d7f0ccdad7bc43e6d34273f70d587ef62f824d7261c4ae9b8b1b6af90e8"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ce5ab4bf46a211a8e924d307c1b1fcda82368586a19d0a24f8ae166f5c784864"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:056470c3dc57904bbf63d6f534988bafc4e970ffd50f6271fc4ee7daad9498a5"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:73aa0e31fa4bb82578f3a6c74a73c273367727de397a7a0f07bd83cbea696baa"}, + {file = 
"psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:8de718c0e1c4b982a54b41779667242bc630b2197948405b7bd8ce16bcecac92"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:5c370b1e4975df846b0277b4deba86419ca77dbc25047f535b0bb03d1a544d44"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:ffe8ed017e4ed70f68b7b371d84b7d4a790368db9203dfc2d222febd3a9c8863"}, + {file = "psycopg2_binary-2.9.10-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:8aecc5e80c63f7459a1a2ab2c64df952051df196294d9f739933a9f6687e86b3"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:7a813c8bdbaaaab1f078014b9b0b13f5de757e2b5d9be6403639b298a04d218b"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d00924255d7fc916ef66e4bf22f354a940c67179ad3fd7067d7a0a9c84d2fbfc"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7559bce4b505762d737172556a4e6ea8a9998ecac1e39b5233465093e8cee697"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e8b58f0a96e7a1e341fc894f62c1177a7c83febebb5ff9123b579418fdc8a481"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b269105e59ac96aba877c1707c600ae55711d9dcd3fc4b5012e4af68e30c648"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:79625966e176dc97ddabc142351e0409e28acf4660b88d1cf6adb876d20c490d"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:8aabf1c1a04584c168984ac678a668094d831f152859d06e055288fa515e4d30"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:19721ac03892001ee8fdd11507e6a2e01f4e37014def96379411ca99d78aeb2c"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:7f5d859928e635fa3ce3477704acee0f667b3a3d3e4bb109f2b18d4005f38287"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-win32.whl", hash = "sha256:3216ccf953b3f267691c90c6fe742e45d890d8272326b4a8b20850a03d05b7b8"}, + {file = "psycopg2_binary-2.9.10-cp39-cp39-win_amd64.whl", hash = "sha256:30e34c4e97964805f715206c7b789d54a78b70f3ff19fbe590104b71c45600e5"}, ] [[package]] @@ -3013,13 +3008,13 @@ files = [ [[package]] name = "types-psycopg2" -version = "2.9.21.10" +version = "2.9.21.20241019" description = "Typing stubs for psycopg2" optional = false -python-versions = "*" +python-versions = ">=3.8" files = [ - {file = "types-psycopg2-2.9.21.10.tar.gz", hash = "sha256:c2600892312ae1c34e12f145749795d93dc4eac3ef7dbf8a9c1bfd45385e80d7"}, - {file = "types_psycopg2-2.9.21.10-py3-none-any.whl", hash = "sha256:918224a0731a3650832e46633e720703b5beef7693a064e777d9748654fcf5e5"}, + {file = "types-psycopg2-2.9.21.20241019.tar.gz", hash = "sha256:bca89b988d2ebd19bcd08b177d22a877ea8b841decb10ed130afcf39404612fa"}, + {file = "types_psycopg2-2.9.21.20241019-py3-none-any.whl", hash = "sha256:44d091e67732d16a941baae48cd7b53bf91911bc36888652447cf1ef0c1fb3f6"}, ] [[package]] @@ -3489,4 +3484,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "13bfc7479aacfe051abb92252b8ddc2e0c429f4607b2d9d8c4b353d2f75c1927" +content-hash = "c656496f9fbb7c29b2df3143c1d72c95b5e121cb6340134c0b8d070f54a08508" diff --git a/pyproject.toml b/pyproject.toml index 92580ee156..9ea42bf46f 100644 --- a/pyproject.toml +++ 
b/pyproject.toml @@ -6,7 +6,7 @@ package-mode = false [tool.poetry.dependencies] python = "^3.9" pytest = "^7.4.4" -psycopg2-binary = "^2.9.9" +psycopg2-binary = "^2.9.10" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} requests = "^2.32.3" @@ -15,7 +15,7 @@ asyncpg = "^0.29.0" aiopg = "^1.4.0" Jinja2 = "^3.1.4" types-requests = "^2.31.0.0" -types-psycopg2 = "^2.9.21.10" +types-psycopg2 = "^2.9.21.20241019" boto3 = "^1.34.11" boto3-stubs = {extras = ["s3"], version = "^1.26.16"} moto = {extras = ["server"], version = "^5.0.6"} diff --git a/scripts/download_basebackup.py b/scripts/download_basebackup.py index f00ee87eb7..e23e4f99c3 100755 --- a/scripts/download_basebackup.py +++ b/scripts/download_basebackup.py @@ -23,9 +23,7 @@ def main(args: argparse.Namespace): psconn: PgConnection = psycopg2.connect(pageserver_connstr) psconn.autocommit = True - output = open(output_path, "wb") - - with psconn.cursor() as pscur: + with open(output_path, "wb", encoding="utf-8") as output, psconn.cursor() as pscur: pscur.copy_expert(f"basebackup {tenant_id} {timeline_id} {lsn}", output) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index e62485905e..91d7e3446e 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -18,6 +18,7 @@ if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonApiEndpoint from fixtures.neon_fixtures import NeonEnv, PgBin, VanillaPostgres + from psycopg2.extensions import cursor @pytest.mark.timeout(1000) @@ -63,9 +64,7 @@ def check_pgbench_still_running(pgbench: Popen[AnyStr], label: str = ""): raise RuntimeError(f"{label} pgbench terminated early with return code {rc}") -def measure_logical_replication_lag( - sub_cur: psycopg2.cursor, pub_cur: psycopg2.cursor, timeout_sec: float = 600 -): +def measure_logical_replication_lag(sub_cur: cursor, pub_cur: cursor, timeout_sec: float = 600): start = time.time() pub_cur.execute("SELECT pg_current_wal_flush_lsn()") pub_lsn = Lsn(cast("str", pub_cur.fetchall()[0][0])) From 8840f3858c829ccfd855c278dde6fdff6c60ff77 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 5 Nov 2024 13:16:55 +0100 Subject: [PATCH 163/239] pageserver: return 503 during tenant shutdown (#9635) ## Problem Tenant operations may return `409 Conflict` if the tenant is shutting down. This status code is not retried by the control plane, causing user-facing errors during pageserver restarts. Operations should instead return `503 Service Unavailable`, which may be retried for idempotent operations. ## Summary of changes Convert `GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping)` to `ApiError::ShuttingDown` rather than `ApiError::Conflict`. This error is returned by `Tenant::wait_to_become_active` in most (all?) tenant/timeline-related HTTP routes. 
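A minimal sketch of the resulting mapping, using simplified stand-ins for the pageserver's real types (the actual `TenantState::Stopping` carries fields and `ApiError` is rendered into an HTTP response elsewhere):

```rust
// Simplified stand-ins, not the real pageserver definitions.
#[derive(Debug)]
enum TenantState {
    Stopping,
    Broken,
}

#[derive(Debug)]
enum GetActiveTenantError {
    WillNotBecomeActive(TenantState),
    Cancelled,
}

#[derive(Debug, PartialEq)]
enum ApiError {
    Conflict(String), // rendered as 409
    ShuttingDown,     // rendered as 503 Service Unavailable
}

fn to_api_error(e: GetActiveTenantError) -> ApiError {
    match e {
        // New: a Stopping tenant is a transient condition, ask callers to retry.
        GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping) => {
            ApiError::ShuttingDown
        }
        // Other non-activating states keep the 409 behaviour.
        GetActiveTenantError::WillNotBecomeActive(state) => {
            ApiError::Conflict(format!("tenant will not become active: {state:?}"))
        }
        GetActiveTenantError::Cancelled => ApiError::ShuttingDown,
    }
}

fn main() {
    use GetActiveTenantError::*;
    assert_eq!(
        to_api_error(WillNotBecomeActive(TenantState::Stopping)),
        ApiError::ShuttingDown
    );
    assert!(matches!(
        to_api_error(WillNotBecomeActive(TenantState::Broken)),
        ApiError::Conflict(_)
    ));
    assert_eq!(to_api_error(Cancelled), ApiError::ShuttingDown);
}
```

Because 503 is retryable for idempotent operations, callers such as the control plane can retry transparently instead of surfacing an error to users during pageserver restarts.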
--- pageserver/src/http/routes.rs | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ef8efd3f27..72eb3e7ade 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -37,6 +37,7 @@ use pageserver_api::models::TenantShardLocation; use pageserver_api::models::TenantShardSplitRequest; use pageserver_api::models::TenantShardSplitResponse; use pageserver_api::models::TenantSorting; +use pageserver_api::models::TenantState; use pageserver_api::models::TimelineArchivalConfigRequest; use pageserver_api::models::TimelineCreateRequestMode; use pageserver_api::models::TimelinesInfoAndOffloaded; @@ -295,6 +296,9 @@ impl From for ApiError { GetActiveTenantError::Broken(reason) => { ApiError::InternalServerError(anyhow!("tenant is broken: {}", reason)) } + GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => { + ApiError::ShuttingDown + } GetActiveTenantError::WillNotBecomeActive(_) => ApiError::Conflict(format!("{}", e)), GetActiveTenantError::Cancelled => ApiError::ShuttingDown, GetActiveTenantError::NotFound(gte) => gte.into(), From 70ae8c16da2578b6282c1825391bdf4508fa2feb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 5 Nov 2024 14:02:49 +0100 Subject: [PATCH 164/239] Construct models::TenantConfig only once (#9630) Since 5f83c9290b482dc90006c400dfc68e85a17af785/#1504 we've had duplication in construction of models::TenantConfig, where both constructs contained the same code. This PR removes one of the two locations to avoid the duplication. --- control_plane/src/pageserver.rs | 115 ++++---------------------------- 1 file changed, 14 insertions(+), 101 deletions(-) diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index db54965eb5..eab76e14c3 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -334,17 +334,20 @@ impl PageServerNode { checkpoint_distance: settings .remove("checkpoint_distance") .map(|x| x.parse::()) - .transpose()?, + .transpose() + .context("Failed to parse 'checkpoint_distance' as an integer")?, checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), compaction_target_size: settings .remove("compaction_target_size") .map(|x| x.parse::()) - .transpose()?, + .transpose() + .context("Failed to parse 'compaction_target_size' as an integer")?, compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), compaction_threshold: settings .remove("compaction_threshold") .map(|x| x.parse::()) - .transpose()?, + .transpose() + .context("Failed to parse 'compaction_threshold' as an integer")?, compaction_algorithm: settings .remove("compaction_algorithm") .map(serde_json::from_str) @@ -353,16 +356,19 @@ impl PageServerNode { gc_horizon: settings .remove("gc_horizon") .map(|x| x.parse::()) - .transpose()?, + .transpose() + .context("Failed to parse 'gc_horizon' as an integer")?, gc_period: settings.remove("gc_period").map(|x| x.to_string()), image_creation_threshold: settings .remove("image_creation_threshold") .map(|x| x.parse::()) - .transpose()?, + .transpose() + .context("Failed to parse 'image_creation_threshold' as non zero integer")?, image_layer_creation_check_threshold: settings .remove("image_layer_creation_check_threshold") .map(|x| x.parse::()) - .transpose()?, + .transpose() + .context("Failed to parse 'image_creation_check_threshold' as integer")?, pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), 
walreceiver_connect_timeout: settings .remove("walreceiver_connect_timeout") @@ -419,102 +425,9 @@ impl PageServerNode { pub async fn tenant_config( &self, tenant_id: TenantId, - mut settings: HashMap<&str, &str>, + settings: HashMap<&str, &str>, ) -> anyhow::Result<()> { - let config = { - // Braces to make the diff easier to read - models::TenantConfig { - checkpoint_distance: settings - .remove("checkpoint_distance") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'checkpoint_distance' as an integer")?, - checkpoint_timeout: settings.remove("checkpoint_timeout").map(|x| x.to_string()), - compaction_target_size: settings - .remove("compaction_target_size") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'compaction_target_size' as an integer")?, - compaction_period: settings.remove("compaction_period").map(|x| x.to_string()), - compaction_threshold: settings - .remove("compaction_threshold") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'compaction_threshold' as an integer")?, - compaction_algorithm: settings - .remove("compactin_algorithm") - .map(serde_json::from_str) - .transpose() - .context("Failed to parse 'compaction_algorithm' json")?, - gc_horizon: settings - .remove("gc_horizon") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'gc_horizon' as an integer")?, - gc_period: settings.remove("gc_period").map(|x| x.to_string()), - image_creation_threshold: settings - .remove("image_creation_threshold") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'image_creation_threshold' as non zero integer")?, - image_layer_creation_check_threshold: settings - .remove("image_layer_creation_check_threshold") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'image_creation_check_threshold' as integer")?, - - pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()), - walreceiver_connect_timeout: settings - .remove("walreceiver_connect_timeout") - .map(|x| x.to_string()), - lagging_wal_timeout: settings - .remove("lagging_wal_timeout") - .map(|x| x.to_string()), - max_lsn_wal_lag: settings - .remove("max_lsn_wal_lag") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'max_lsn_wal_lag' as non zero integer")?, - eviction_policy: settings - .remove("eviction_policy") - .map(serde_json::from_str) - .transpose() - .context("Failed to parse 'eviction_policy' json")?, - min_resident_size_override: settings - .remove("min_resident_size_override") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'min_resident_size_override' as an integer")?, - evictions_low_residence_duration_metric_threshold: settings - .remove("evictions_low_residence_duration_metric_threshold") - .map(|x| x.to_string()), - heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()), - lazy_slru_download: settings - .remove("lazy_slru_download") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'lazy_slru_download' as bool")?, - timeline_get_throttle: settings - .remove("timeline_get_throttle") - .map(serde_json::from_str) - .transpose() - .context("parse `timeline_get_throttle` from json")?, - lsn_lease_length: settings.remove("lsn_lease_length").map(|x| x.to_string()), - lsn_lease_length_for_ts: settings - .remove("lsn_lease_length_for_ts") - .map(|x| x.to_string()), - timeline_offloading: settings - .remove("timeline_offloading") - .map(|x| x.parse::()) - .transpose() - .context("Failed to parse 'timeline_offloading' as bool")?, - } - }; - - if 
!settings.is_empty() { - bail!("Unrecognized tenant settings: {settings:?}") - } - + let config = Self::parse_config(settings)?; self.http_client .tenant_config(&models::TenantConfigRequest { tenant_id, config }) .await?; From e30f5fb922c44f47edddf4c0833f9a5fe95f33d0 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 5 Nov 2024 13:32:50 +0000 Subject: [PATCH 165/239] scrubber: remove AWS region assumption, tolerate negative max_project_size (#9636) ## Problem First issues noticed when trying to run scrubber find-garbage on Azure: - Azure staging contains projects with -1 set for max_project_size: apparently the control plane treats this as a signed field. - Scrubber code assumed that listing projects should filter to aws-$REGION. This is no longer needed (per comment in the code) because we know hit region-local APIs. This PR doesn't make it work all the way (`init_remote` still assumes S3), but these are necessary precursors. ## Summary of changes - Change max-project_size from unsigned to signed - Remove region filtering in favor of simply using the right region's API (which we already do) --- storage_scrubber/src/cloud_admin_api.rs | 6 +++--- storage_scrubber/src/garbage.rs | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index 7b82a0b116..c9a62cd256 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -147,7 +147,7 @@ pub struct ProjectData { pub created_at: DateTime, pub updated_at: DateTime, pub pg_version: u32, - pub max_project_size: u64, + pub max_project_size: i64, pub remote_storage_size: u64, pub resident_size: u64, pub synthetic_storage_size: u64, @@ -261,7 +261,7 @@ impl CloudAdminApiClient { } } - pub async fn list_projects(&self, region_id: String) -> Result, Error> { + pub async fn list_projects(&self) -> Result, Error> { let _permit = self .request_limiter .acquire() @@ -318,7 +318,7 @@ impl CloudAdminApiClient { pagination_offset += response.data.len(); - result.extend(response.data.drain(..).filter(|t| t.region_id == region_id)); + result.append(&mut response.data); if pagination_offset >= response.total.unwrap_or(0) { break; diff --git a/storage_scrubber/src/garbage.rs b/storage_scrubber/src/garbage.rs index a0040ada08..863dbf960d 100644 --- a/storage_scrubber/src/garbage.rs +++ b/storage_scrubber/src/garbage.rs @@ -160,9 +160,7 @@ async fn find_garbage_inner( // Build a set of console-known tenants, for quickly eliminating known-active tenants without having // to issue O(N) console API requests. let console_projects: HashMap = cloud_admin_api_client - // FIXME: we can't just assume that all console's region ids are aws-. This hack - // will go away when we are talking to Control Plane APIs, which are per-region. - .list_projects(format!("aws-{}", bucket_config.region)) + .list_projects() .await? .into_iter() .map(|t| (t.tenant, t)) From 2f1a56c8f9c6666fb1972636be1f6c92801d2772 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Tue, 5 Nov 2024 17:33:41 +0200 Subject: [PATCH 166/239] proxy: Unify local and remote conn pool client structures (#9604) Unify client, EndpointConnPool and DbUserConnPool for remote and local conn. - Use new ClientDataEnum for additional client data. - Add ClientInnerCommon client structure. 
- Remove Client and EndpointConnPool code from local_conn_pool.rs --- proxy/src/serverless/backend.rs | 6 +- proxy/src/serverless/conn_pool.rs | 61 ++-- proxy/src/serverless/conn_pool_lib.rs | 289 +++++++++++----- proxy/src/serverless/http_conn_pool.rs | 20 +- proxy/src/serverless/local_conn_pool.rs | 418 ++++++------------------ proxy/src/serverless/sql_over_http.rs | 7 +- 6 files changed, 341 insertions(+), 460 deletions(-) diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index c89e0f0232..c2b0de1876 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -14,7 +14,7 @@ use tracing::{debug, info}; use super::conn_pool::poll_client; use super::conn_pool_lib::{Client, ConnInfo, GlobalConnPool}; use super::http_conn_pool::{self, poll_http2_client, Send}; -use super::local_conn_pool::{self, LocalClient, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; +use super::local_conn_pool::{self, LocalConnPool, EXT_NAME, EXT_SCHEMA, EXT_VERSION}; use crate::auth::backend::local::StaticAuthRules; use crate::auth::backend::{ComputeCredentials, ComputeUserInfo}; use crate::auth::{self, check_peer_addr_is_in_list, AuthError}; @@ -205,7 +205,7 @@ impl PoolingBackend { conn_info: ConnInfo, ) -> Result, HttpConnError> { info!("pool: looking for an existing connection"); - if let Some(client) = self.http_conn_pool.get(ctx, &conn_info) { + if let Ok(Some(client)) = self.http_conn_pool.get(ctx, &conn_info) { return Ok(client); } @@ -248,7 +248,7 @@ impl PoolingBackend { &self, ctx: &RequestMonitoring, conn_info: ConnInfo, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? { return Ok(client); } diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index 7fa3357b5b..1845603bf7 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -18,7 +18,9 @@ use { std::{sync::atomic, time::Duration}, }; -use super::conn_pool_lib::{Client, ClientInnerExt, ConnInfo, GlobalConnPool}; +use super::conn_pool_lib::{ + Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, GlobalConnPool, +}; use crate::context::RequestMonitoring; use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::Metrics; @@ -152,53 +154,30 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInnerRemote { + let inner = ClientInnerCommon { inner: client, - session: tx, - cancel, aux, conn_id, + data: ClientDataEnum::Remote(ClientDataRemote { + session: tx, + cancel, + }), }; + Client::new(inner, conn_info, pool_clone) } -pub(crate) struct ClientInnerRemote { - inner: C, +pub(crate) struct ClientDataRemote { session: tokio::sync::watch::Sender, cancel: CancellationToken, - aux: MetricsAuxInfo, - conn_id: uuid::Uuid, } -impl ClientInnerRemote { - pub(crate) fn inner_mut(&mut self) -> &mut C { - &mut self.inner - } - - pub(crate) fn inner(&self) -> &C { - &self.inner - } - - pub(crate) fn session(&mut self) -> &mut tokio::sync::watch::Sender { +impl ClientDataRemote { + pub fn session(&mut self) -> &mut tokio::sync::watch::Sender { &mut self.session } - pub(crate) fn aux(&self) -> &MetricsAuxInfo { - &self.aux - } - - pub(crate) fn get_conn_id(&self) -> uuid::Uuid { - self.conn_id - } - - pub(crate) fn is_closed(&self) -> bool { - self.inner.is_closed() - } -} - -impl Drop for ClientInnerRemote { - fn drop(&mut self) { - // on client drop, tell the conn to shut down + pub fn cancel(&mut self) { self.cancel.cancel(); } } 
@@ -228,15 +207,13 @@ mod tests { } } - fn create_inner() -> ClientInnerRemote { + fn create_inner() -> ClientInnerCommon { create_inner_with(MockClient::new(false)) } - fn create_inner_with(client: MockClient) -> ClientInnerRemote { - ClientInnerRemote { + fn create_inner_with(client: MockClient) -> ClientInnerCommon { + ClientInnerCommon { inner: client, - session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), - cancel: CancellationToken::new(), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), @@ -244,6 +221,10 @@ mod tests { cold_start_info: crate::control_plane::messages::ColdStartInfo::Warm, }, conn_id: uuid::Uuid::new_v4(), + data: ClientDataEnum::Remote(ClientDataRemote { + session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), + cancel: CancellationToken::new(), + }), } } @@ -280,7 +261,7 @@ mod tests { { let mut client = Client::new(create_inner(), conn_info.clone(), ep_pool.clone()); assert_eq!(0, pool.get_global_connections_count()); - client.inner_mut().1.discard(); + client.inner().1.discard(); // Discard should not add the connection from the pool. assert_eq!(0, pool.get_global_connections_count()); } diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 8830cddf0c..00a8ac4768 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -11,10 +11,13 @@ use tokio_postgres::ReadyForQueryStatus; use tracing::{debug, info, Span}; use super::backend::HttpConnError; -use super::conn_pool::ClientInnerRemote; +use super::conn_pool::ClientDataRemote; +use super::http_conn_pool::ClientDataHttp; +use super::local_conn_pool::ClientDataLocal; use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; use crate::control_plane::messages::ColdStartInfo; +use crate::control_plane::messages::MetricsAuxInfo; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::{DbName, EndpointCacheKey, RoleName}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; @@ -41,8 +44,46 @@ impl ConnInfo { } } +pub(crate) enum ClientDataEnum { + Remote(ClientDataRemote), + Local(ClientDataLocal), + #[allow(dead_code)] + Http(ClientDataHttp), +} + +pub(crate) struct ClientInnerCommon { + pub(crate) inner: C, + pub(crate) aux: MetricsAuxInfo, + pub(crate) conn_id: uuid::Uuid, + pub(crate) data: ClientDataEnum, // custom client data like session, key, jti +} + +impl Drop for ClientInnerCommon { + fn drop(&mut self) { + match &mut self.data { + ClientDataEnum::Remote(remote_data) => { + remote_data.cancel(); + } + ClientDataEnum::Local(local_data) => { + local_data.cancel(); + } + ClientDataEnum::Http(_http_data) => (), + } + } +} + +impl ClientInnerCommon { + pub(crate) fn get_conn_id(&self) -> uuid::Uuid { + self.conn_id + } + + pub(crate) fn get_data(&mut self) -> &mut ClientDataEnum { + &mut self.data + } +} + pub(crate) struct ConnPoolEntry { - pub(crate) conn: ClientInnerRemote, + pub(crate) conn: ClientInnerCommon, pub(crate) _last_access: std::time::Instant, } @@ -55,10 +96,33 @@ pub(crate) struct EndpointConnPool { _guard: HttpEndpointPoolsGuard<'static>, global_connections_count: Arc, global_pool_size_max_conns: usize, + pool_name: String, } impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { + pub(crate) fn new( + hmap: HashMap<(DbName, RoleName), DbUserConnPool>, + tconns: usize, + max_conns_per_endpoint: usize, + global_connections_count: 
Arc, + max_total_conns: usize, + pname: String, + ) -> Self { + Self { + pools: hmap, + total_conns: tconns, + max_conns: max_conns_per_endpoint, + _guard: Metrics::get().proxy.http_endpoint_pools.guard(), + global_connections_count, + global_pool_size_max_conns: max_total_conns, + pool_name: pname, + } + } + + pub(crate) fn get_conn_entry( + &mut self, + db_user: (DbName, RoleName), + ) -> Option> { let Self { pools, total_conns, @@ -84,9 +148,10 @@ impl EndpointConnPool { .. } = self; if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.get_conn_id() != conn_id); - let new_len = pool.conns.len(); + let old_len = pool.get_conns().len(); + pool.get_conns() + .retain(|conn| conn.conn.get_conn_id() != conn_id); + let new_len = pool.get_conns().len(); let removed = old_len - new_len; if removed > 0 { global_connections_count.fetch_sub(removed, atomic::Ordering::Relaxed); @@ -103,11 +168,26 @@ impl EndpointConnPool { } } - pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerRemote) { - let conn_id = client.get_conn_id(); + pub(crate) fn get_name(&self) -> &str { + &self.pool_name + } - if client.is_closed() { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because connection is closed"); + pub(crate) fn get_pool(&self, db_user: (DbName, RoleName)) -> Option<&DbUserConnPool> { + self.pools.get(&db_user) + } + + pub(crate) fn get_pool_mut( + &mut self, + db_user: (DbName, RoleName), + ) -> Option<&mut DbUserConnPool> { + self.pools.get_mut(&db_user) + } + + pub(crate) fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInnerCommon) { + let conn_id = client.get_conn_id(); + let pool_name = pool.read().get_name().to_string(); + if client.inner.is_closed() { + info!(%conn_id, "{}: throwing away connection '{conn_info}' because connection is closed", pool_name); return; } @@ -118,7 +198,7 @@ impl EndpointConnPool { .load(atomic::Ordering::Relaxed) >= global_max_conn { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full"); + info!(%conn_id, "{}: throwing away connection '{conn_info}' because pool is full", pool_name); return; } @@ -130,13 +210,13 @@ impl EndpointConnPool { if pool.total_conns < pool.max_conns { let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { + pool_entries.get_conns().push(ConnPoolEntry { conn: client, _last_access: std::time::Instant::now(), }); returned = true; - per_db_size = pool_entries.conns.len(); + per_db_size = pool_entries.get_conns().len(); pool.total_conns += 1; pool.global_connections_count @@ -153,9 +233,9 @@ impl EndpointConnPool { // do logging outside of the mutex if returned { - info!(%conn_id, "pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); + info!(%conn_id, "{pool_name}: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); } else { - info!(%conn_id, "pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); + info!(%conn_id, "{pool_name}: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); } } } @@ -176,19 +256,39 @@ impl Drop for EndpointConnPool { pub(crate) struct DbUserConnPool { pub(crate) conns: Vec>, + pub(crate) initialized: Option, // a bit ugly, exists only for local pools } impl Default for DbUserConnPool { fn default() -> Self { - 
Self { conns: Vec::new() } + Self { + conns: Vec::new(), + initialized: None, + } } } -impl DbUserConnPool { +pub(crate) trait DbUserConn: Default { + fn set_initialized(&mut self); + fn is_initialized(&self) -> bool; + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize; + fn get_conn_entry(&mut self, conns: &mut usize) -> (Option>, usize); + fn get_conns(&mut self) -> &mut Vec>; +} + +impl DbUserConn for DbUserConnPool { + fn set_initialized(&mut self) { + self.initialized = Some(true); + } + + fn is_initialized(&self) -> bool { + self.initialized.unwrap_or(false) + } + fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { let old_len = self.conns.len(); - self.conns.retain(|conn| !conn.conn.is_closed()); + self.conns.retain(|conn| !conn.conn.inner.is_closed()); let new_len = self.conns.len(); let removed = old_len - new_len; @@ -196,10 +296,7 @@ impl DbUserConnPool { removed } - pub(crate) fn get_conn_entry( - &mut self, - conns: &mut usize, - ) -> (Option>, usize) { + fn get_conn_entry(&mut self, conns: &mut usize) -> (Option>, usize) { let mut removed = self.clear_closed_clients(conns); let conn = self.conns.pop(); if conn.is_some() { @@ -215,6 +312,10 @@ impl DbUserConnPool { (conn, removed) } + + fn get_conns(&mut self) -> &mut Vec> { + &mut self.conns + } } pub(crate) struct GlobalConnPool { @@ -278,6 +379,60 @@ impl GlobalConnPool { self.config.pool_options.idle_timeout } + pub(crate) fn get( + self: &Arc, + ctx: &RequestMonitoring, + conn_info: &ConnInfo, + ) -> Result>, HttpConnError> { + let mut client: Option> = None; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + return Ok(None); + }; + + let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); + if let Some(entry) = endpoint_pool + .write() + .get_conn_entry(conn_info.db_and_user()) + { + client = Some(entry.conn); + } + let endpoint_pool = Arc::downgrade(&endpoint_pool); + + // ok return cached connection if found and establish a new one otherwise + if let Some(mut client) = client { + if client.inner.is_closed() { + info!("pool: cached connection '{conn_info}' is closed, opening a new one"); + return Ok(None); + } + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); + tracing::Span::current().record( + "pid", + tracing::field::display(client.inner.get_process_id()), + ); + info!( + cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), + "pool: reusing connection '{conn_info}'" + ); + + match client.get_data() { + ClientDataEnum::Local(data) => { + data.session().send(ctx.session_id())?; + } + + ClientDataEnum::Remote(data) => { + data.session().send(ctx.session_id())?; + } + ClientDataEnum::Http(_) => (), + } + + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); + ctx.success(); + return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); + } + Ok(None) + } + pub(crate) fn shutdown(&self) { // drops all strong references to endpoint-pools self.global_pool.clear(); @@ -374,6 +529,7 @@ impl GlobalConnPool { _guard: Metrics::get().proxy.http_endpoint_pools.guard(), global_connections_count: self.global_connections_count.clone(), global_pool_size_max_conns: self.config.pool_options.max_total_conns, + pool_name: String::from("remote"), })); // find or create a pool for this endpoint @@ -400,55 +556,23 @@ impl GlobalConnPool { pool } +} - pub(crate) fn get( - self: &Arc, - ctx: &RequestMonitoring, - conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { - let mut client: Option> = None; - let Some(endpoint) = 
conn_info.endpoint_cache_key() else { - return Ok(None); - }; +pub(crate) struct Client { + span: Span, + inner: Option>, + conn_info: ConnInfo, + pool: Weak>>, +} - let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - if let Some(entry) = endpoint_pool - .write() - .get_conn_entry(conn_info.db_and_user()) - { - client = Some(entry.conn); - } - let endpoint_pool = Arc::downgrade(&endpoint_pool); - - // ok return cached connection if found and establish a new one otherwise - if let Some(mut client) = client { - if client.is_closed() { - info!("pool: cached connection '{conn_info}' is closed, opening a new one"); - return Ok(None); - } - tracing::Span::current() - .record("conn_id", tracing::field::display(client.get_conn_id())); - tracing::Span::current().record( - "pid", - tracing::field::display(client.inner().get_process_id()), - ); - info!( - cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), - "pool: reusing connection '{conn_info}'" - ); - - client.session().send(ctx.session_id())?; - ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); - ctx.success(); - return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); - } - Ok(None) - } +pub(crate) struct Discard<'a, C: ClientInnerExt> { + conn_info: &'a ConnInfo, + pool: &'a mut Weak>>, } impl Client { pub(crate) fn new( - inner: ClientInnerRemote, + inner: ClientInnerCommon, conn_info: ConnInfo, pool: Weak>>, ) -> Self { @@ -460,7 +584,18 @@ impl Client { } } - pub(crate) fn inner_mut(&mut self) -> (&mut C, Discard<'_, C>) { + pub(crate) fn client_inner(&mut self) -> (&mut ClientInnerCommon, Discard<'_, C>) { + let Self { + inner, + pool, + conn_info, + span: _, + } = self; + let inner_m = inner.as_mut().expect("client inner should not be removed"); + (inner_m, Discard { conn_info, pool }) + } + + pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { let Self { inner, pool, @@ -468,12 +603,11 @@ impl Client { span: _, } = self; let inner = inner.as_mut().expect("client inner should not be removed"); - let inner_ref = inner.inner_mut(); - (inner_ref, Discard { conn_info, pool }) + (&mut inner.inner, Discard { conn_info, pool }) } pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux(); + let aux = &self.inner.as_ref().unwrap().aux; USAGE_METRICS.register(Ids { endpoint_id: aux.endpoint_id, branch_id: aux.branch_id, @@ -498,13 +632,6 @@ impl Client { } } -pub(crate) struct Client { - span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>>, -} - impl Drop for Client { fn drop(&mut self) { if let Some(drop) = self.do_drop() { @@ -517,10 +644,11 @@ impl Deref for Client { type Target = C; fn deref(&self) -> &Self::Target { - self.inner + &self + .inner .as_ref() .expect("client inner should not be removed") - .inner() + .inner } } @@ -539,11 +667,6 @@ impl ClientInnerExt for tokio_postgres::Client { } } -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>>, -} - impl Discard<'_, C> { pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { let conn_info = &self.conn_info; diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index b92ae31310..56be70abec 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -7,9 +7,11 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; +use std::result::Result::Ok; use tokio::net::TcpStream; use tracing::{debug, error, 
info, info_span, Instrument}; +use super::backend::HttpConnError; use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; @@ -28,6 +30,8 @@ pub(crate) struct ConnPoolEntry { aux: MetricsAuxInfo, } +pub(crate) struct ClientDataHttp(); + // Per-endpoint connection pool // Number of open connections is limited by the `max_conns_per_endpoint`. pub(crate) struct EndpointConnPool { @@ -206,14 +210,22 @@ impl GlobalConnPool { } } + #[expect(unused_results)] pub(crate) fn get( self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Option> { - let endpoint = conn_info.endpoint_cache_key()?; + ) -> Result>, HttpConnError> { + let result: Result>, HttpConnError>; + let Some(endpoint) = conn_info.endpoint_cache_key() else { + result = Ok(None); + return result; + }; let endpoint_pool = self.get_or_create_endpoint_pool(&endpoint); - let client = endpoint_pool.write().get_conn_entry()?; + let Some(client) = endpoint_pool.write().get_conn_entry() else { + result = Ok(None); + return result; + }; tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); info!( @@ -222,7 +234,7 @@ impl GlobalConnPool { ); ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.success(); - Some(Client::new(client.conn, client.aux)) + Ok(Some(Client::new(client.conn, client.aux))) } fn get_or_create_endpoint_pool( diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index 064e7db7b3..99d4329f88 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -11,7 +11,8 @@ use std::collections::HashMap; use std::pin::pin; -use std::sync::{Arc, Weak}; +use std::sync::atomic::AtomicUsize; +use std::sync::Arc; use std::task::{ready, Poll}; use std::time::Duration; @@ -26,177 +27,42 @@ use signature::Signer; use tokio::time::Instant; use tokio_postgres::tls::NoTlsStream; use tokio_postgres::types::ToSql; -use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; +use tokio_postgres::{AsyncMessage, Socket}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, info_span, warn, Instrument, Span}; +use tracing::{error, info, info_span, warn, Instrument}; use super::backend::HttpConnError; -use super::conn_pool_lib::{ClientInnerExt, ConnInfo}; +use super::conn_pool_lib::{ + Client, ClientDataEnum, ClientInnerCommon, ClientInnerExt, ConnInfo, DbUserConn, + EndpointConnPool, +}; use crate::context::RequestMonitoring; use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::Metrics; -use crate::types::{DbName, RoleName}; -use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; pub(crate) const EXT_NAME: &str = "pg_session_jwt"; pub(crate) const EXT_VERSION: &str = "0.1.2"; pub(crate) const EXT_SCHEMA: &str = "auth"; -struct ConnPoolEntry { - conn: ClientInner, - _last_access: std::time::Instant, +pub(crate) struct ClientDataLocal { + session: tokio::sync::watch::Sender, + cancel: CancellationToken, + key: SigningKey, + jti: u64, } -// Per-endpoint connection pool, (dbname, username) -> DbUserConnPool -// Number of open connections is limited by the `max_conns_per_endpoint`. 
-pub(crate) struct EndpointConnPool { - pools: HashMap<(DbName, RoleName), DbUserConnPool>, - total_conns: usize, - max_conns: usize, - global_pool_size_max_conns: usize, -} - -impl EndpointConnPool { - fn get_conn_entry(&mut self, db_user: (DbName, RoleName)) -> Option> { - let Self { - pools, total_conns, .. - } = self; - pools - .get_mut(&db_user) - .and_then(|pool_entries| pool_entries.get_conn_entry(total_conns)) +impl ClientDataLocal { + pub fn session(&mut self) -> &mut tokio::sync::watch::Sender { + &mut self.session } - fn remove_client(&mut self, db_user: (DbName, RoleName), conn_id: uuid::Uuid) -> bool { - let Self { - pools, total_conns, .. - } = self; - if let Some(pool) = pools.get_mut(&db_user) { - let old_len = pool.conns.len(); - pool.conns.retain(|conn| conn.conn.conn_id != conn_id); - let new_len = pool.conns.len(); - let removed = old_len - new_len; - if removed > 0 { - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - } - *total_conns -= removed; - removed > 0 - } else { - false - } - } - - fn put(pool: &RwLock, conn_info: &ConnInfo, client: ClientInner) { - let conn_id = client.conn_id; - - if client.is_closed() { - info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because connection is closed"); - return; - } - let global_max_conn = pool.read().global_pool_size_max_conns; - if pool.read().total_conns >= global_max_conn { - info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full"); - return; - } - - // return connection to the pool - let mut returned = false; - let mut per_db_size = 0; - let total_conns = { - let mut pool = pool.write(); - - if pool.total_conns < pool.max_conns { - let pool_entries = pool.pools.entry(conn_info.db_and_user()).or_default(); - pool_entries.conns.push(ConnPoolEntry { - conn: client, - _last_access: std::time::Instant::now(), - }); - - returned = true; - per_db_size = pool_entries.conns.len(); - - pool.total_conns += 1; - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .inc(); - } - - pool.total_conns - }; - - // do logging outside of the mutex - if returned { - info!(%conn_id, "local_pool: returning connection '{conn_info}' back to the pool, total_conns={total_conns}, for this (db, user)={per_db_size}"); - } else { - info!(%conn_id, "local_pool: throwing away connection '{conn_info}' because pool is full, total_conns={total_conns}"); - } - } -} - -impl Drop for EndpointConnPool { - fn drop(&mut self) { - if self.total_conns > 0 { - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(self.total_conns as i64); - } - } -} - -pub(crate) struct DbUserConnPool { - conns: Vec>, - - // true if we have definitely installed the extension and - // granted the role access to the auth schema. 
- initialized: bool, -} - -impl Default for DbUserConnPool { - fn default() -> Self { - Self { - conns: Vec::new(), - initialized: false, - } - } -} - -impl DbUserConnPool { - fn clear_closed_clients(&mut self, conns: &mut usize) -> usize { - let old_len = self.conns.len(); - - self.conns.retain(|conn| !conn.conn.is_closed()); - - let new_len = self.conns.len(); - let removed = old_len - new_len; - *conns -= removed; - removed - } - - fn get_conn_entry(&mut self, conns: &mut usize) -> Option> { - let mut removed = self.clear_closed_clients(conns); - let conn = self.conns.pop(); - if conn.is_some() { - *conns -= 1; - removed += 1; - } - Metrics::get() - .proxy - .http_pool_opened_connections - .get_metric() - .dec_by(removed as i64); - conn + pub fn cancel(&mut self) { + self.cancel.cancel(); } } pub(crate) struct LocalConnPool { - global_pool: RwLock>, + global_pool: Arc>>, config: &'static crate::config::HttpConfig, } @@ -204,12 +70,14 @@ pub(crate) struct LocalConnPool { impl LocalConnPool { pub(crate) fn new(config: &'static crate::config::HttpConfig) -> Arc { Arc::new(Self { - global_pool: RwLock::new(EndpointConnPool { - pools: HashMap::new(), - total_conns: 0, - max_conns: config.pool_options.max_conns_per_endpoint, - global_pool_size_max_conns: config.pool_options.max_total_conns, - }), + global_pool: Arc::new(RwLock::new(EndpointConnPool::new( + HashMap::new(), + 0, + config.pool_options.max_conns_per_endpoint, + Arc::new(AtomicUsize::new(0)), + config.pool_options.max_total_conns, + String::from("local_pool"), + ))), config, }) } @@ -222,7 +90,7 @@ impl LocalConnPool { self: &Arc, ctx: &RequestMonitoring, conn_info: &ConnInfo, - ) -> Result>, HttpConnError> { + ) -> Result>, HttpConnError> { let client = self .global_pool .write() @@ -230,12 +98,14 @@ impl LocalConnPool { .map(|entry| entry.conn); // ok return cached connection if found and establish a new one otherwise - if let Some(client) = client { - if client.is_closed() { + if let Some(mut client) = client { + if client.inner.is_closed() { info!("local_pool: cached connection '{conn_info}' is closed, opening a new one"); return Ok(None); } - tracing::Span::current().record("conn_id", tracing::field::display(client.conn_id)); + + tracing::Span::current() + .record("conn_id", tracing::field::display(client.get_conn_id())); tracing::Span::current().record( "pid", tracing::field::display(client.inner.get_process_id()), @@ -244,47 +114,59 @@ impl LocalConnPool { cold_start_info = ColdStartInfo::HttpPoolHit.as_str(), "local_pool: reusing connection '{conn_info}'" ); - client.session.send(ctx.session_id())?; + + match client.get_data() { + ClientDataEnum::Local(data) => { + data.session().send(ctx.session_id())?; + } + + ClientDataEnum::Remote(data) => { + data.session().send(ctx.session_id())?; + } + ClientDataEnum::Http(_) => (), + } + ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit); ctx.success(); - return Ok(Some(LocalClient::new( + + return Ok(Some(Client::new( client, conn_info.clone(), - Arc::downgrade(self), + Arc::downgrade(&self.global_pool), ))); } Ok(None) } pub(crate) fn initialized(self: &Arc, conn_info: &ConnInfo) -> bool { - self.global_pool - .read() - .pools - .get(&conn_info.db_and_user()) - .map_or(false, |pool| pool.initialized) + if let Some(pool) = self.global_pool.read().get_pool(conn_info.db_and_user()) { + return pool.is_initialized(); + } + false } pub(crate) fn set_initialized(self: &Arc, conn_info: &ConnInfo) { - self.global_pool + if let Some(pool) = self + .global_pool .write() - .pools - 
.entry(conn_info.db_and_user()) - .or_default() - .initialized = true; + .get_pool_mut(conn_info.db_and_user()) + { + pool.set_initialized(); + } } } #[allow(clippy::too_many_arguments)] -pub(crate) fn poll_client( - global_pool: Arc>, +pub(crate) fn poll_client( + global_pool: Arc>, ctx: &RequestMonitoring, conn_info: ConnInfo, - client: tokio_postgres::Client, + client: C, mut connection: tokio_postgres::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, -) -> LocalClient { +) -> Client { let conn_gauge = Metrics::get().proxy.db_connections.guard(ctx.protocol()); let mut session_id = ctx.session_id(); let (tx, mut rx) = tokio::sync::watch::channel(session_id); @@ -377,111 +259,47 @@ pub(crate) fn poll_client( } .instrument(span)); - let inner = ClientInner { + let inner = ClientInnerCommon { inner: client, - session: tx, - cancel, aux, conn_id, - key, - jti: 0, + data: ClientDataEnum::Local(ClientDataLocal { + session: tx, + cancel, + key, + jti: 0, + }), }; - LocalClient::new(inner, conn_info, pool_clone) + + Client::new( + inner, + conn_info, + Arc::downgrade(&pool_clone.upgrade().unwrap().global_pool), + ) } -pub(crate) struct ClientInner { - inner: C, - session: tokio::sync::watch::Sender, - cancel: CancellationToken, - aux: MetricsAuxInfo, - conn_id: uuid::Uuid, - - // needed for pg_session_jwt state - key: SigningKey, - jti: u64, -} - -impl Drop for ClientInner { - fn drop(&mut self) { - // on client drop, tell the conn to shut down - self.cancel.cancel(); - } -} - -impl ClientInner { - pub(crate) fn is_closed(&self) -> bool { - self.inner.is_closed() - } -} - -impl ClientInner { +impl ClientInnerCommon { pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { - self.jti += 1; - let token = resign_jwt(&self.key, payload, self.jti)?; + if let ClientDataEnum::Local(local_data) = &mut self.data { + local_data.jti += 1; + let token = resign_jwt(&local_data.key, payload, local_data.jti)?; - // initiates the auth session - self.inner.simple_query("discard all").await?; - self.inner - .query( - "select auth.jwt_session_init($1)", - &[&token as &(dyn ToSql + Sync)], - ) - .await?; + // initiates the auth session + self.inner.simple_query("discard all").await?; + self.inner + .query( + "select auth.jwt_session_init($1)", + &[&token as &(dyn ToSql + Sync)], + ) + .await?; - let pid = self.inner.get_process_id(); - info!(pid, jti = self.jti, "user session state init"); - - Ok(()) - } -} - -pub(crate) struct LocalClient { - span: Span, - inner: Option>, - conn_info: ConnInfo, - pool: Weak>, -} - -pub(crate) struct Discard<'a, C: ClientInnerExt> { - conn_info: &'a ConnInfo, - pool: &'a mut Weak>, -} - -impl LocalClient { - pub(self) fn new( - inner: ClientInner, - conn_info: ConnInfo, - pool: Weak>, - ) -> Self { - Self { - inner: Some(inner), - span: Span::current(), - conn_info, - pool, + let pid = self.inner.get_process_id(); + info!(pid, jti = local_data.jti, "user session state init"); + Ok(()) + } else { + panic!("unexpected client data type"); } } - - pub(crate) fn client_inner(&mut self) -> (&mut ClientInner, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner_m = inner.as_mut().expect("client inner should not be removed"); - (inner_m, Discard { conn_info, pool }) - } - - pub(crate) fn inner(&mut self) -> (&mut C, Discard<'_, C>) { - let Self { - inner, - pool, - conn_info, - span: _, - } = self; - let inner = inner.as_mut().expect("client inner should not be removed"); - (&mut 
inner.inner, Discard { conn_info, pool }) - } } /// implements relatively efficient in-place json object key upserting @@ -547,58 +365,6 @@ fn sign_jwt(sk: &SigningKey, payload: &[u8]) -> String { jwt } -impl LocalClient { - pub(crate) fn metrics(&self) -> Arc { - let aux = &self.inner.as_ref().unwrap().aux; - USAGE_METRICS.register(Ids { - endpoint_id: aux.endpoint_id, - branch_id: aux.branch_id, - }) - } - - fn do_drop(&mut self) -> Option> { - let conn_info = self.conn_info.clone(); - let client = self - .inner - .take() - .expect("client inner should not be removed"); - if let Some(conn_pool) = std::mem::take(&mut self.pool).upgrade() { - let current_span = self.span.clone(); - // return connection to the pool - return Some(move || { - let _span = current_span.enter(); - EndpointConnPool::put(&conn_pool.global_pool, &conn_info, client); - }); - } - None - } -} - -impl Drop for LocalClient { - fn drop(&mut self) { - if let Some(drop) = self.do_drop() { - tokio::task::spawn_blocking(drop); - } - } -} - -impl Discard<'_, C> { - pub(crate) fn check_idle(&mut self, status: ReadyForQueryStatus) { - let conn_info = &self.conn_info; - if status != ReadyForQueryStatus::Idle && std::mem::take(self.pool).strong_count() > 0 { - info!( - "local_pool: throwing away connection '{conn_info}' because connection is not idle" - ); - } - } - pub(crate) fn discard(&mut self) { - let conn_info = &self.conn_info; - if std::mem::take(self.pool).strong_count() > 0 { - info!("local_pool: throwing away connection '{conn_info}' because connection is potentially in a broken state"); - } - } -} - #[cfg(test)] mod tests { use p256::ecdsa::SigningKey; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index 0713c27d65..f0975617d4 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -31,7 +31,6 @@ use super::conn_pool_lib::{self, ConnInfo}; use super::error::HttpCodeError; use super::http_util::json_response; use super::json::{json_to_pg_text, pg_text_row_to_json, JsonConversionError}; -use super::local_conn_pool; use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; use crate::auth::{endpoint_sni, ComputeUserInfoParseError}; use crate::config::{AuthenticationConfig, HttpConfig, ProxyConfig, TlsConfig}; @@ -1052,12 +1051,12 @@ async fn query_to_json( enum Client { Remote(conn_pool_lib::Client), - Local(local_conn_pool::LocalClient), + Local(conn_pool_lib::Client), } enum Discard<'a> { Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), - Local(local_conn_pool::Discard<'a, tokio_postgres::Client>), + Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>), } impl Client { @@ -1071,7 +1070,7 @@ impl Client { fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { match self { Client::Remote(client) => { - let (c, d) = client.inner_mut(); + let (c, d) = client.inner(); (c, Discard::Remote(d)) } Client::Local(local_client) => { From babfeb70ba3c8036210a4ff2e8e33dd9ecde0dc4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 5 Nov 2024 18:05:30 +0100 Subject: [PATCH 167/239] safekeeper: don't allocate send buffers on stack (#9644) ## Problem While experimenting with `MAX_SEND_SIZE` for benchmarking, I saw stack overflows when increasing it to 1 MB. Turns out a few buffers of this size are stack-allocated rather than heap-allocated. Even at the default 128 KB size, that's a bit large to allocate on the stack. ## Summary of changes Heap-allocate buffers of size `MAX_SEND_SIZE`. 
--- safekeeper/src/copy_timeline.rs | 2 +- safekeeper/src/debug_dump.rs | 2 +- safekeeper/src/send_wal.rs | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/safekeeper/src/copy_timeline.rs b/safekeeper/src/copy_timeline.rs index 1bf0cc668f..07fa98212f 100644 --- a/safekeeper/src/copy_timeline.rs +++ b/safekeeper/src/copy_timeline.rs @@ -172,7 +172,7 @@ async fn copy_disk_segments( ) -> Result<()> { let mut wal_reader = tli.get_walreader(start_lsn).await?; - let mut buf = [0u8; MAX_SEND_SIZE]; + let mut buf = vec![0u8; MAX_SEND_SIZE]; let first_segment = start_lsn.segment_number(wal_seg_size); let last_segment = end_lsn.segment_number(wal_seg_size); diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 125f5af7f3..a2d0c49768 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -383,7 +383,7 @@ pub async fn calculate_digest( let mut wal_reader = tli.get_walreader(request.from_lsn).await?; let mut hasher = Sha256::new(); - let mut buf = [0u8; MAX_SEND_SIZE]; + let mut buf = vec![0u8; MAX_SEND_SIZE]; let mut bytes_left = (request.until_lsn.0 - request.from_lsn.0) as usize; while bytes_left > 0 { diff --git a/safekeeper/src/send_wal.rs b/safekeeper/src/send_wal.rs index 6d677f405a..6d94ff98b1 100644 --- a/safekeeper/src/send_wal.rs +++ b/safekeeper/src/send_wal.rs @@ -467,7 +467,7 @@ impl SafekeeperPostgresHandler { end_watch, ws_guard: ws_guard.clone(), wal_reader, - send_buf: [0; MAX_SEND_SIZE], + send_buf: vec![0u8; MAX_SEND_SIZE], }; let mut reply_reader = ReplyReader { reader, @@ -548,7 +548,7 @@ struct WalSender<'a, IO> { ws_guard: Arc, wal_reader: WalReader, // buffer for readling WAL into to send it - send_buf: [u8; MAX_SEND_SIZE], + send_buf: Vec, } const POLL_STATE_TIMEOUT: Duration = Duration::from_secs(1); From fcde40d60035ef5187615b9864321041cf7bc024 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 5 Nov 2024 17:23:00 +0000 Subject: [PATCH 168/239] [proxy] use the proxy protocol v2 command to silence some logs (#9620) The PROXY Protocol V2 offers a "command" concept. It can be of two different values. "Local" and "Proxy". The spec suggests that "Local" be used for health-checks. We can thus use this to silence logging for such health checks such as those from NLB. This additionally refactors the flow to be a bit more type-safe, self documenting and using zerocopy deser. 
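
As a rough sketch of the idea (the command byte values come from the PROXY protocol v2 spec and match the constants introduced below; this is not the proxy's actual parser):

```rust
// Minimal sketch of the version/command byte handling described above.
const LOCAL_V2: u8 = 0x20; // version 2, LOCAL command: load-balancer health check
const PROXY_V2: u8 = 0x21; // version 2, PROXY command: real client, address block follows

enum ConnectKind {
    Local,   // skip noisy logging and close the connection
    Proxied, // continue and parse the original client address
}

fn classify(version_and_command: u8) -> Result<ConnectKind, String> {
    match version_and_command {
        LOCAL_V2 => Ok(ConnectKind::Local),
        PROXY_V2 => Ok(ConnectKind::Proxied),
        other => Err(format!("unexpected PROXY v2 command byte 0x{other:02X}")),
    }
}

fn main() {
    assert!(matches!(classify(0x20), Ok(ConnectKind::Local)));
    assert!(matches!(classify(0x21), Ok(ConnectKind::Proxied)));
    assert!(classify(0x13).is_err());
}
```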
--- Cargo.lock | 3 + Cargo.toml | 1 + proxy/Cargo.toml | 1 + proxy/src/console_redirect_proxy.rs | 19 +- proxy/src/protocol2.rs | 319 +++++++++++++++++----------- proxy/src/proxy/mod.rs | 19 +- proxy/src/serverless/mod.rs | 15 +- workspace_hack/Cargo.toml | 2 + 8 files changed, 238 insertions(+), 141 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 484769bd16..9c2a0b455e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4365,6 +4365,7 @@ dependencies = [ "walkdir", "workspace_hack", "x509-parser", + "zerocopy", ] [[package]] @@ -7374,6 +7375,7 @@ dependencies = [ "tracing", "tracing-core", "url", + "zerocopy", "zeroize", "zstd", "zstd-safe", @@ -7446,6 +7448,7 @@ version = "0.7.31" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1c4061bedbb353041c12f413700357bec76df2c7e2ca8e4df8bac24c6bf68e3d" dependencies = [ + "byteorder", "zerocopy-derive", ] diff --git a/Cargo.toml b/Cargo.toml index e5f7719e7f..8207726caa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -196,6 +196,7 @@ walkdir = "2.3.2" rustls-native-certs = "0.8" x509-parser = "0.16" whoami = "1.5.1" +zerocopy = { version = "0.7", features = ["derive"] } ## TODO replace this with tracing env_logger = "0.10" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index 2580b1cf8a..bc4da95a91 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -98,6 +98,7 @@ rustls-native-certs.workspace = true x509-parser.workspace = true postgres-protocol.workspace = true redis.workspace = true +zerocopy.workspace = true # jwt stuff jose-jwa = "0.1.2" diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 243ef07854..cc456f3667 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -3,7 +3,7 @@ use std::sync::Arc; use futures::TryFutureExt; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, Instrument}; +use tracing::{debug, error, info, Instrument}; use crate::auth::backend::ConsoleRedirectBackend; use crate::cancellation::{CancellationHandlerMain, CancellationHandlerMainInternal}; @@ -11,7 +11,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestMonitoring; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectionInfo}; +use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; use crate::proxy::connect_compute::{connect_to_compute, TcpMechanism}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::proxy::passthrough::ProxyPassthrough; @@ -49,7 +49,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); connections.spawn(async move { let (socket, peer_addr) = match read_proxy_protocol(socket).await { @@ -57,16 +57,21 @@ pub async fn task_main( error!("per-client task finished with an error: {e:#}"); return; } - Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + // our load balancers will not send any more data. 
let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { error!("missing required proxy protocol header"); return; } - Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { error!("proxy protocol header not supported"); return; } - Ok((socket, Some(info))) => (socket, info), - Ok((socket, None)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo{ addr: peer_addr, extra: None }), }; match socket.inner.set_nodelay(true) { diff --git a/proxy/src/protocol2.rs b/proxy/src/protocol2.rs index d1084ca2ff..33a5eb5e1e 100644 --- a/proxy/src/protocol2.rs +++ b/proxy/src/protocol2.rs @@ -11,6 +11,7 @@ use bytes::{Buf, Bytes, BytesMut}; use pin_project_lite::pin_project; use strum_macros::FromRepr; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, ReadBuf}; +use zerocopy::{FromBytes, FromZeroes}; pin_project! { /// A chained [`AsyncRead`] with [`AsyncWrite`] passthrough @@ -57,16 +58,31 @@ impl AsyncWrite for ChainRW { } /// Proxy Protocol Version 2 Header -const HEADER: [u8; 12] = [ +const SIGNATURE: [u8; 12] = [ 0x0D, 0x0A, 0x0D, 0x0A, 0x00, 0x0D, 0x0A, 0x51, 0x55, 0x49, 0x54, 0x0A, ]; +const LOCAL_V2: u8 = 0x20; +const PROXY_V2: u8 = 0x21; + +const TCP_OVER_IPV4: u8 = 0x11; +const UDP_OVER_IPV4: u8 = 0x12; +const TCP_OVER_IPV6: u8 = 0x21; +const UDP_OVER_IPV6: u8 = 0x22; + #[derive(PartialEq, Eq, Clone, Debug)] pub struct ConnectionInfo { pub addr: SocketAddr, pub extra: Option, } +#[derive(PartialEq, Eq, Clone, Debug)] +pub enum ConnectHeader { + Missing, + Local, + Proxy(ConnectionInfo), +} + impl fmt::Display for ConnectionInfo { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match &self.extra { @@ -89,96 +105,31 @@ pub enum ConnectionInfoExtra { pub(crate) async fn read_proxy_protocol( mut read: T, -) -> std::io::Result<(ChainRW, Option)> { +) -> std::io::Result<(ChainRW, ConnectHeader)> { let mut buf = BytesMut::with_capacity(128); - while buf.len() < 16 { + let header = loop { let bytes_read = read.read_buf(&mut buf).await?; - // exit for bad header - let len = usize::min(buf.len(), HEADER.len()); - if buf[..len] != HEADER[..len] { - return Ok((ChainRW { inner: read, buf }, None)); + // exit for bad header signature + let len = usize::min(buf.len(), SIGNATURE.len()); + if buf[..len] != SIGNATURE[..len] { + return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); } // if no more bytes available then exit if bytes_read == 0 { - return Ok((ChainRW { inner: read, buf }, None)); + return Ok((ChainRW { inner: read, buf }, ConnectHeader::Missing)); }; - } - let header = buf.split_to(16); - - // The next byte (the 13th one) is the protocol version and command. - // The highest four bits contains the version. As of this specification, it must - // always be sent as \x2 and the receiver must only accept this value. - let vc = header[12]; - let version = vc >> 4; - let command = vc & 0b1111; - if version != 2 { - return Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol version. expected version 2", - )); - } - match command { - // the connection was established on purpose by the proxy - // without being relayed. 
The connection endpoints are the sender and the - // receiver. Such connections exist when the proxy sends health-checks to the - // server. The receiver must accept this connection as valid and must use the - // real connection endpoints and discard the protocol block including the - // family which is ignored. - 0 => {} - // the connection was established on behalf of another node, - // and reflects the original connection endpoints. The receiver must then use - // the information provided in the protocol block to get original the address. - 1 => {} - // other values are unassigned and must not be emitted by senders. Receivers - // must drop connections presenting unexpected values here. - _ => { - return Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol command. expected local (0) or proxy (1)", - )) + // check if we have enough bytes to continue + if let Some(header) = buf.try_get::() { + break header; } }; - // The 14th byte contains the transport protocol and address family. The highest 4 - // bits contain the address family, the lowest 4 bits contain the protocol. - let ft = header[13]; - let address_length = match ft { - // - \x11 : TCP over IPv4 : the forwarded connection uses TCP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - // - \x12 : UDP over IPv4 : the forwarded connection uses UDP over the AF_INET - // protocol family. Address length is 2*4 + 2*2 = 12 bytes. - 0x11 | 0x12 => 12, - // - \x21 : TCP over IPv6 : the forwarded connection uses TCP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - // - \x22 : UDP over IPv6 : the forwarded connection uses UDP over the AF_INET6 - // protocol family. Address length is 2*16 + 2*2 = 36 bytes. - 0x21 | 0x22 => 36, - // unspecified or unix stream. ignore the addresses - _ => 0, - }; + let remaining_length = usize::from(header.len.get()); - // The 15th and 16th bytes is the address length in bytes in network endian order. - // It is used so that the receiver knows how many address bytes to skip even when - // it does not implement the presented protocol. Thus the length of the protocol - // header in bytes is always exactly 16 + this value. When a sender presents a - // LOCAL connection, it should not present any address so it sets this field to - // zero. Receivers MUST always consider this field to skip the appropriate number - // of bytes and must not assume zero is presented for LOCAL connections. When a - // receiver accepts an incoming connection showing an UNSPEC address family or - // protocol, it may or may not decide to log the address information if present. - let remaining_length = u16::from_be_bytes(header[14..16].try_into().unwrap()); - if remaining_length < address_length { - return Err(io::Error::new( - io::ErrorKind::Other, - "invalid proxy protocol length. not enough to fit requested IP addresses", - )); - } - drop(header); - - while buf.len() < remaining_length as usize { + while buf.len() < remaining_length { if read.read_buf(&mut buf).await? == 0 { return Err(io::Error::new( io::ErrorKind::UnexpectedEof, @@ -186,36 +137,69 @@ pub(crate) async fn read_proxy_protocol( )); } } + let payload = buf.split_to(remaining_length); - // Starting from the 17th byte, addresses are presented in network byte order. 
- // The address order is always the same : - // - source layer 3 address in network byte order - // - destination layer 3 address in network byte order - // - source layer 4 address if any, in network byte order (port) - // - destination layer 4 address if any, in network byte order (port) - let mut header = buf.split_to(usize::from(remaining_length)); - let mut addr = header.split_to(usize::from(address_length)); - let socket = match addr.len() { - 12 => { - let src_addr = Ipv4Addr::from_bits(addr.get_u32()); - let _dst_addr = Ipv4Addr::from_bits(addr.get_u32()); - let src_port = addr.get_u16(); - let _dst_port = addr.get_u16(); - Some(SocketAddr::from((src_addr, src_port))) + let res = process_proxy_payload(header, payload)?; + Ok((ChainRW { inner: read, buf }, res)) +} + +fn process_proxy_payload( + header: ProxyProtocolV2Header, + mut payload: BytesMut, +) -> std::io::Result { + match header.version_and_command { + // the connection was established on purpose by the proxy + // without being relayed. The connection endpoints are the sender and the + // receiver. Such connections exist when the proxy sends health-checks to the + // server. The receiver must accept this connection as valid and must use the + // real connection endpoints and discard the protocol block including the + // family which is ignored. + LOCAL_V2 => return Ok(ConnectHeader::Local), + // the connection was established on behalf of another node, + // and reflects the original connection endpoints. The receiver must then use + // the information provided in the protocol block to get original the address. + PROXY_V2 => {} + // other values are unassigned and must not be emitted by senders. Receivers + // must drop connections presenting unexpected values here. + #[rustfmt::skip] // https://github.com/rust-lang/rustfmt/issues/6384 + _ => return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "invalid proxy protocol command 0x{:02X}. expected local (0x20) or proxy (0x21)", + header.version_and_command + ), + )), + }; + + let size_err = + "invalid proxy protocol length. payload not large enough to fit requested IP addresses"; + let addr = match header.protocol_and_family { + TCP_OVER_IPV4 | UDP_OVER_IPV4 => { + let addr = payload + .try_get::() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?; + + SocketAddr::from((addr.src_addr.get(), addr.src_port.get())) } - 36 => { - let src_addr = Ipv6Addr::from_bits(addr.get_u128()); - let _dst_addr = Ipv6Addr::from_bits(addr.get_u128()); - let src_port = addr.get_u16(); - let _dst_port = addr.get_u16(); - Some(SocketAddr::from((src_addr, src_port))) + TCP_OVER_IPV6 | UDP_OVER_IPV6 => { + let addr = payload + .try_get::() + .ok_or_else(|| io::Error::new(io::ErrorKind::Other, size_err))?; + + SocketAddr::from((addr.src_addr.get(), addr.src_port.get())) + } + // unspecified or unix stream. 
ignore the addresses + _ => { + return Err(io::Error::new( + io::ErrorKind::Other, + "invalid proxy protocol address family/transport protocol.", + )) } - _ => None, }; let mut extra = None; - while let Some(mut tlv) = read_tlv(&mut header) { + while let Some(mut tlv) = read_tlv(&mut payload) { match Pp2Kind::from_repr(tlv.kind) { Some(Pp2Kind::Aws) => { if tlv.value.is_empty() { @@ -259,9 +243,7 @@ pub(crate) async fn read_proxy_protocol( } } - let conn_info = socket.map(|addr| ConnectionInfo { addr, extra }); - - Ok((ChainRW { inner: read, buf }, conn_info)) + Ok(ConnectHeader::Proxy(ConnectionInfo { addr, extra })) } #[derive(FromRepr, Debug, Copy, Clone)] @@ -337,27 +319,93 @@ struct Tlv { } fn read_tlv(b: &mut BytesMut) -> Option { - if b.len() < 3 { - return None; - } - let kind = b.get_u8(); - let len = usize::from(b.get_u16()); + let tlv_header = b.try_get::()?; + let len = usize::from(tlv_header.len.get()); if b.len() < len { return None; } - let value = b.split_to(len).freeze(); - Some(Tlv { kind, value }) + Some(Tlv { + kind: tlv_header.kind, + value: b.split_to(len).freeze(), + }) +} + +trait BufExt: Sized { + fn try_get(&mut self) -> Option; +} +impl BufExt for BytesMut { + fn try_get(&mut self) -> Option { + let res = T::read_from_prefix(self)?; + self.advance(size_of::()); + Some(res) + } +} + +#[derive(FromBytes, FromZeroes, Copy, Clone)] +#[repr(C)] +struct ProxyProtocolV2Header { + signature: [u8; 12], + version_and_command: u8, + protocol_and_family: u8, + len: zerocopy::byteorder::network_endian::U16, +} + +#[derive(FromBytes, FromZeroes, Copy, Clone)] +#[repr(C)] +struct ProxyProtocolV2HeaderV4 { + src_addr: NetworkEndianIpv4, + dst_addr: NetworkEndianIpv4, + src_port: zerocopy::byteorder::network_endian::U16, + dst_port: zerocopy::byteorder::network_endian::U16, +} + +#[derive(FromBytes, FromZeroes, Copy, Clone)] +#[repr(C)] +struct ProxyProtocolV2HeaderV6 { + src_addr: NetworkEndianIpv6, + dst_addr: NetworkEndianIpv6, + src_port: zerocopy::byteorder::network_endian::U16, + dst_port: zerocopy::byteorder::network_endian::U16, +} + +#[derive(FromBytes, FromZeroes, Copy, Clone)] +#[repr(C)] +struct TlvHeader { + kind: u8, + len: zerocopy::byteorder::network_endian::U16, +} + +#[derive(FromBytes, FromZeroes, Copy, Clone)] +#[repr(transparent)] +struct NetworkEndianIpv4(zerocopy::byteorder::network_endian::U32); +impl NetworkEndianIpv4 { + #[inline] + fn get(self) -> Ipv4Addr { + Ipv4Addr::from_bits(self.0.get()) + } +} + +#[derive(FromBytes, FromZeroes, Copy, Clone)] +#[repr(transparent)] +struct NetworkEndianIpv6(zerocopy::byteorder::network_endian::U128); +impl NetworkEndianIpv6 { + #[inline] + fn get(self) -> Ipv6Addr { + Ipv6Addr::from_bits(self.0.get()) + } } #[cfg(test)] mod tests { use tokio::io::AsyncReadExt; - use crate::protocol2::read_proxy_protocol; + use crate::protocol2::{ + read_proxy_protocol, ConnectHeader, LOCAL_V2, PROXY_V2, TCP_OVER_IPV4, UDP_OVER_IPV6, + }; #[tokio::test] async fn test_ipv4() { - let header = super::HEADER + let header = super::SIGNATURE // Proxy command, IPV4 | TCP .chain([(2 << 4) | 1, (1 << 4) | 1].as_slice()) // 12 + 3 bytes @@ -384,15 +432,17 @@ mod tests { assert_eq!(bytes, extra_data); - let info = info.unwrap(); + let ConnectHeader::Proxy(info) = info else { + panic!() + }; assert_eq!(info.addr, ([127, 0, 0, 1], 65535).into()); } #[tokio::test] async fn test_ipv6() { - let header = super::HEADER + let header = super::SIGNATURE // Proxy command, IPV6 | UDP - .chain([(2 << 4) | 1, (2 << 4) | 2].as_slice()) + .chain([PROXY_V2, 
UDP_OVER_IPV6].as_slice()) // 36 + 3 bytes .chain([0, 39].as_slice()) // src ip @@ -417,7 +467,9 @@ mod tests { assert_eq!(bytes, extra_data); - let info = info.unwrap(); + let ConnectHeader::Proxy(info) = info else { + panic!() + }; assert_eq!( info.addr, ([15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0], 257).into() @@ -433,7 +485,7 @@ mod tests { let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(info, None); + assert_eq!(info, ConnectHeader::Missing); } #[tokio::test] @@ -445,7 +497,7 @@ mod tests { let mut bytes = vec![]; read.read_to_end(&mut bytes).await.unwrap(); assert_eq!(bytes, data); - assert_eq!(info, None); + assert_eq!(info, ConnectHeader::Missing); } #[tokio::test] @@ -454,9 +506,9 @@ mod tests { let tlv_len = (tlv.len() as u16).to_be_bytes(); let len = (12 + 3 + tlv.len() as u16).to_be_bytes(); - let header = super::HEADER + let header = super::SIGNATURE // Proxy command, Inet << 4 | Stream - .chain([(2 << 4) | 1, (1 << 4) | 1].as_slice()) + .chain([PROXY_V2, TCP_OVER_IPV4].as_slice()) // 12 + 3 bytes .chain(len.as_slice()) // src ip @@ -483,7 +535,30 @@ mod tests { assert_eq!(bytes, extra_data); - let info = info.unwrap(); + let ConnectHeader::Proxy(info) = info else { + panic!() + }; assert_eq!(info.addr, ([55, 56, 57, 58], 65535).into()); } + + #[tokio::test] + async fn test_local() { + let len = 0u16.to_be_bytes(); + let header = super::SIGNATURE + .chain([LOCAL_V2, 0x00].as_slice()) + .chain(len.as_slice()); + + let extra_data = [0xaa; 256]; + + let (mut read, info) = read_proxy_protocol(header.chain(extra_data.as_slice())) + .await + .unwrap(); + + let mut bytes = vec![]; + read.read_to_end(&mut bytes).await.unwrap(); + + assert_eq!(bytes, extra_data); + + let ConnectHeader::Local = info else { panic!() }; + } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 922646d889..17721c23d5 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -19,7 +19,7 @@ use smol_str::{format_smolstr, SmolStr}; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt}; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn, Instrument}; +use tracing::{debug, error, info, warn, Instrument}; use self::connect_compute::{connect_to_compute, TcpMechanism}; use self::passthrough::ProxyPassthrough; @@ -28,7 +28,7 @@ use crate::config::{ProxyConfig, ProxyProtocolV2, TlsConfig}; use crate::context::RequestMonitoring; use crate::error::ReportableError; use crate::metrics::{Metrics, NumClientConnectionsGuard}; -use crate::protocol2::{read_proxy_protocol, ConnectionInfo}; +use crate::protocol2::{read_proxy_protocol, ConnectHeader, ConnectionInfo}; use crate::proxy::handshake::{handshake, HandshakeData}; use crate::rate_limiter::EndpointRateLimiter; use crate::stream::{PqStream, Stream}; @@ -83,7 +83,7 @@ pub async fn task_main( let session_id = uuid::Uuid::new_v4(); let cancellation_handler = Arc::clone(&cancellation_handler); - tracing::info!(protocol = "tcp", %session_id, "accepted new TCP connection"); + debug!(protocol = "tcp", %session_id, "accepted new TCP connection"); let endpoint_rate_limiter2 = endpoint_rate_limiter.clone(); connections.spawn(async move { @@ -92,16 +92,21 @@ pub async fn task_main( warn!("per-client task finished with an error: {e:#}"); return; } - Ok((_socket, None)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + // our load balancers will not send any more data. 
let's just exit immediately + Ok((_socket, ConnectHeader::Local)) => { + debug!("healthcheck received"); + return; + } + Ok((_socket, ConnectHeader::Missing)) if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { warn!("missing required proxy protocol header"); return; } - Ok((_socket, Some(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + Ok((_socket, ConnectHeader::Proxy(_))) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { warn!("proxy protocol header not supported"); return; } - Ok((socket, Some(info))) => (socket, info), - Ok((socket, None)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), + Ok((socket, ConnectHeader::Proxy(info))) => (socket, info), + Ok((socket, ConnectHeader::Missing)) => (socket, ConnectionInfo { addr: peer_addr, extra: None }), }; match socket.inner.set_nodelay(true) { diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index 4b60ddf60f..cf758855fa 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -47,7 +47,7 @@ use crate::cancellation::CancellationHandlerMain; use crate::config::{ProxyConfig, ProxyProtocolV2}; use crate::context::RequestMonitoring; use crate::metrics::Metrics; -use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectionInfo}; +use crate::protocol2::{read_proxy_protocol, ChainRW, ConnectHeader, ConnectionInfo}; use crate::proxy::run_until_cancelled; use crate::rate_limiter::EndpointRateLimiter; use crate::serverless::backend::PoolingBackend; @@ -251,16 +251,21 @@ async fn connection_startup( }; let conn_info = match peer { - None if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { + // our load balancers will not send any more data. let's just exit immediately + ConnectHeader::Local => { + tracing::debug!("healthcheck received"); + return None; + } + ConnectHeader::Missing if config.proxy_protocol_v2 == ProxyProtocolV2::Required => { tracing::warn!("missing required proxy protocol header"); return None; } - Some(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { + ConnectHeader::Proxy(_) if config.proxy_protocol_v2 == ProxyProtocolV2::Rejected => { tracing::warn!("proxy protocol header not supported"); return None; } - Some(info) => info, - None => ConnectionInfo { + ConnectHeader::Proxy(info) => info, + ConnectHeader::Missing => ConnectionInfo { addr: peer_addr, extra: None, }, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 8d83d9d9e2..71ebab4119 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -88,6 +88,7 @@ tower = { version = "0.4", default-features = false, features = ["balance", "buf tracing = { version = "0.1", features = ["log"] } tracing-core = { version = "0.1" } url = { version = "2", features = ["serde"] } +zerocopy = { version = "0.7", features = ["derive", "simd"] } zeroize = { version = "1", features = ["derive", "serde"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] } @@ -126,6 +127,7 @@ serde = { version = "1", features = ["alloc", "derive"] } syn = { version = "2", features = ["extra-traits", "fold", "full", "visit", "visit-mut"] } time-macros = { version = "0.2", default-features = false, features = ["formatting", "parsing", "serde"] } toml_edit = { version = "0.22", features = ["serde"] } +zerocopy = { version = "0.7", features = ["derive", "simd"] } zstd = { version = "0.13" } zstd-safe = { version = "7", default-features = false, features = ["arrays", 
"legacy", "std", "zdict_builder"] } zstd-sys = { version = "2", default-features = false, features = ["legacy", "std", "zdict_builder"] } From 754d2950a37935224620a0c8db0b2d311ae0c226 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 5 Nov 2024 22:32:33 +0100 Subject: [PATCH 169/239] proxy: Revert ControlPlaneEvent back to struct (#9649) Due to neondatabase/cloud#19815 we need to be more tolerant when reading events. --- proxy/src/cache/endpoints.rs | 77 ++++++++++++++++++++++-------------- 1 file changed, 47 insertions(+), 30 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index 400c76291e..a488d8a9cd 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -19,26 +19,28 @@ use crate::rate_limiter::GlobalRateLimiter; use crate::redis::connection_with_credentials_provider::ConnectionWithCredentialsProvider; use crate::types::EndpointId; -#[allow(clippy::enum_variant_names)] -#[derive(Deserialize, Debug, Clone)] -#[serde(tag = "type", rename_all(deserialize = "snake_case"))] -enum ControlPlaneEvent { - EndpointCreated { endpoint_created: EndpointCreated }, - BranchCreated { branch_created: BranchCreated }, - ProjectCreated { project_created: ProjectCreated }, +// TODO: this could be an enum, but events in Redis need to be fixed first. +// ProjectCreated was sent with type:branch_created. So we ignore type. +#[derive(Deserialize, Debug, Clone, PartialEq)] +struct ControlPlaneEvent { + endpoint_created: Option, + branch_created: Option, + project_created: Option, + #[serde(rename = "type")] + _type: Option, } -#[derive(Deserialize, Debug, Clone)] +#[derive(Deserialize, Debug, Clone, PartialEq)] struct EndpointCreated { endpoint_id: String, } -#[derive(Deserialize, Debug, Clone)] +#[derive(Deserialize, Debug, Clone, PartialEq)] struct BranchCreated { branch_id: String, } -#[derive(Deserialize, Debug, Clone)] +#[derive(Deserialize, Debug, Clone, PartialEq)] struct ProjectCreated { project_id: String, } @@ -104,24 +106,28 @@ impl EndpointsCache { } fn insert_event(&self, event: ControlPlaneEvent) { - let counter = match event { - ControlPlaneEvent::EndpointCreated { endpoint_created } => { - self.endpoints - .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); - RedisEventsCount::EndpointCreated - } - ControlPlaneEvent::BranchCreated { branch_created } => { - self.branches - .insert(BranchIdInt::from(&branch_created.branch_id.into())); - RedisEventsCount::BranchCreated - } - ControlPlaneEvent::ProjectCreated { project_created } => { - self.projects - .insert(ProjectIdInt::from(&project_created.project_id.into())); - RedisEventsCount::ProjectCreated - } - }; - Metrics::get().proxy.redis_events_count.inc(counter); + if let Some(endpoint_created) = event.endpoint_created { + self.endpoints + .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::EndpointCreated); + } else if let Some(branch_created) = event.branch_created { + self.branches + .insert(BranchIdInt::from(&branch_created.branch_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::BranchCreated); + } else if let Some(project_created) = event.project_created { + self.projects + .insert(ProjectIdInt::from(&project_created.project_id.into())); + Metrics::get() + .proxy + .redis_events_count + .inc(RedisEventsCount::ProjectCreated); + } } pub async fn do_read( @@ -235,11 +241,22 @@ impl EndpointsCache { #[cfg(test)] mod tests { - use 
super::ControlPlaneEvent; + use super::*; #[test] fn test_parse_control_plane_event() { let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#; - serde_json::from_str::(s).unwrap(); + + assert_eq!( + serde_json::from_str::(s).unwrap(), + ControlPlaneEvent { + endpoint_created: Some(EndpointCreated { + endpoint_id: "ep-rapid-thunder-w0qqw2q9".into() + }), + branch_created: None, + project_created: None, + _type: Some("endpoint_created".into()), + } + ); } } From ebc43efebc0adb99043d6b2cd6600abf4a588810 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Tue, 5 Nov 2024 23:03:53 +0100 Subject: [PATCH 170/239] proxy: Refactor cplane types (#9643) The overall idea of the PR is to rename a few types to make their purpose more clear, reduce abstraction where not needed, and move types to to more better suited modules. --- proxy/src/auth/backend/console_redirect.rs | 3 +- proxy/src/auth/backend/mod.rs | 41 +- proxy/src/bin/proxy.rs | 13 +- proxy/src/compute.rs | 2 +- proxy/src/config.rs | 10 +- .../{provider => client}/mock.rs | 19 +- proxy/src/control_plane/client/mod.rs | 281 +++++++++ .../{provider => client}/neon.rs | 44 +- proxy/src/control_plane/errors.rs | 216 +++++++ proxy/src/control_plane/messages.rs | 10 +- proxy/src/control_plane/mod.rs | 127 +++- proxy/src/control_plane/provider/mod.rs | 588 ------------------ proxy/src/proxy/tests/mod.rs | 34 +- proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/serverless/backend.rs | 2 +- 15 files changed, 708 insertions(+), 684 deletions(-) rename proxy/src/control_plane/{provider => client}/mock.rs (94%) create mode 100644 proxy/src/control_plane/client/mod.rs rename proxy/src/control_plane/{provider => client}/neon.rs (93%) create mode 100644 proxy/src/control_plane/errors.rs delete mode 100644 proxy/src/control_plane/provider/mod.rs diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 255e1fed54..1df59b4893 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -9,8 +9,7 @@ use super::ComputeCredentialKeys; use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; -use crate::control_plane::provider::NodeInfo; -use crate::control_plane::{self, CachedNodeInfo}; +use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::{ReportableError, UserFacingError}; use crate::proxy::connect_compute::ComputeConnectBackend; use crate::stream::PqStream; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 0eb68e6412..ccd5cf3071 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -21,11 +21,11 @@ use crate::auth::{self, validate_password_and_exchange, AuthError, ComputeUserIn use crate::cache::Cached; use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; +use crate::control_plane::client::ControlPlaneClient; use crate::control_plane::errors::GetAuthInfoError; -use crate::control_plane::provider::{ - CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneBackend, +use crate::control_plane::{ + self, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, }; -use crate::control_plane::{self, Api, AuthSecret}; use crate::intern::EndpointIdInt; use crate::metrics::Metrics; use crate::proxy::connect_compute::ComputeConnectBackend; @@ -62,42 +62,26 @@ impl std::ops::Deref for 
MaybeOwned<'_, T> { /// backends which require them for the authentication process. pub enum Backend<'a, T> { /// Cloud API (V2). - ControlPlane(MaybeOwned<'a, ControlPlaneBackend>, T), + ControlPlane(MaybeOwned<'a, ControlPlaneClient>, T), /// Local proxy uses configured auth credentials and does not wake compute Local(MaybeOwned<'a, LocalBackend>), } -#[cfg(test)] -pub(crate) trait TestBackend: Send + Sync + 'static { - fn wake_compute(&self) -> Result; - fn get_allowed_ips_and_secret( - &self, - ) -> Result<(CachedAllowedIps, Option), control_plane::errors::GetAuthInfoError>; - fn dyn_clone(&self) -> Box; -} - -#[cfg(test)] -impl Clone for Box { - fn clone(&self) -> Self { - TestBackend::dyn_clone(&**self) - } -} - impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { - ControlPlaneBackend::Management(endpoint) => fmt - .debug_tuple("ControlPlane::Management") + ControlPlaneClient::Neon(endpoint) => fmt + .debug_tuple("ControlPlane::Neon") .field(&endpoint.url()) .finish(), #[cfg(any(test, feature = "testing"))] - ControlPlaneBackend::PostgresMock(endpoint) => fmt + ControlPlaneClient::PostgresMock(endpoint) => fmt .debug_tuple("ControlPlane::PostgresMock") .field(&endpoint.url()) .finish(), #[cfg(test)] - ControlPlaneBackend::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), + ControlPlaneClient::Test(_) => fmt.debug_tuple("ControlPlane::Test").finish(), }, Self::Local(_) => fmt.debug_tuple("Local").finish(), } @@ -282,7 +266,7 @@ impl AuthenticationConfig { /// All authentication flows will emit an AuthenticationOk message if successful. async fn auth_quirks( ctx: &RequestMonitoring, - api: &impl control_plane::Api, + api: &impl control_plane::ControlPlaneApi, user_info: ComputeUserInfoMaybeEndpoint, client: &mut stream::PqStream>, allow_cleartext: bool, @@ -499,12 +483,12 @@ mod tests { use std::time::Duration; use bytes::BytesMut; + use control_plane::AuthSecret; use fallible_iterator::FallibleIterator; use once_cell::sync::Lazy; use postgres_protocol::authentication::sasl::{ChannelBinding, ScramSha256}; use postgres_protocol::message::backend::Message as PgMessage; use postgres_protocol::message::frontend; - use provider::AuthSecret; use tokio::io::{AsyncRead, AsyncReadExt, AsyncWriteExt}; use super::jwt::JwkCache; @@ -513,8 +497,7 @@ mod tests { use crate::auth::{ComputeUserInfoMaybeEndpoint, IpPattern}; use crate::config::AuthenticationConfig; use crate::context::RequestMonitoring; - use crate::control_plane::provider::{self, CachedAllowedIps, CachedRoleSecret}; - use crate::control_plane::{self, CachedNodeInfo}; + use crate::control_plane::{self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret}; use crate::proxy::NeonOptions; use crate::rate_limiter::{EndpointRateLimiter, RateBucketInfo}; use crate::scram::threadpool::ThreadPool; @@ -526,7 +509,7 @@ mod tests { secret: AuthSecret, } - impl control_plane::Api for Auth { + impl control_plane::ControlPlaneApi for Auth { async fn get_role_secret( &self, _ctx: &RequestMonitoring, diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 82c259efc8..3179a929c4 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -513,7 +513,7 @@ async fn main() -> anyhow::Result<()> { } if let Either::Left(auth::Backend::ControlPlane(api, _)) = &auth_backend { - if let proxy::control_plane::provider::ControlPlaneBackend::Management(api) = &**api { + if let 
proxy::control_plane::client::ControlPlaneClient::Neon(api) = &**api { match (redis_notifications_client, regional_redis_client.clone()) { (None, None) => {} (client1, client2) => { @@ -732,13 +732,13 @@ fn build_auth_backend( RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); - let api = control_plane::provider::neon::Api::new( + let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, caches, locks, wake_compute_endpoint_rate_limiter, ); - let api = control_plane::provider::ControlPlaneBackend::Management(api); + let api = control_plane::client::ControlPlaneClient::Neon(api); let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); let config = Box::leak(Box::new(auth_backend)); @@ -749,8 +749,11 @@ fn build_auth_backend( #[cfg(feature = "testing")] AuthBackendType::Postgres => { let url = args.auth_endpoint.parse()?; - let api = control_plane::provider::mock::Api::new(url, !args.is_private_access_proxy); - let api = control_plane::provider::ControlPlaneBackend::PostgresMock(api); + let api = control_plane::client::mock::MockControlPlane::new( + url, + !args.is_private_access_proxy, + ); + let api = control_plane::client::ControlPlaneClient::PostgresMock(api); let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 65b6dd215b..d397fc5160 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -19,9 +19,9 @@ use tracing::{error, info, warn}; use crate::auth::parse_endpoint_param; use crate::cancellation::CancelClosure; use crate::context::RequestMonitoring; +use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::WakeComputeError; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::provider::ApiLockError; use crate::error::{ReportableError, UserFacingError}; use crate::metrics::{Metrics, NumDbConnectionsGuard}; use crate::proxy::neon_option; diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 2870e100b7..f63d7e45aa 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -366,7 +366,7 @@ pub struct EndpointCacheConfig { } impl EndpointCacheConfig { - /// Default options for [`crate::control_plane::provider::NodeInfoCache`]. + /// Default options for [`crate::control_plane::NodeInfoCache`]. /// Notice that by default the limiter is empty, which means that cache is disabled. pub const CACHE_DEFAULT_OPTIONS: &'static str = "initial_batch_size=1000,default_batch_size=10,xread_timeout=5m,stream_name=controlPlane,disable_cache=true,limiter_info=1000@1s,retry_interval=1s"; @@ -441,7 +441,7 @@ pub struct CacheOptions { } impl CacheOptions { - /// Default options for [`crate::control_plane::provider::NodeInfoCache`]. + /// Default options for [`crate::control_plane::NodeInfoCache`]. pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=4000,ttl=4m"; /// Parse cache options passed via cmdline. @@ -497,7 +497,7 @@ pub struct ProjectInfoCacheOptions { } impl ProjectInfoCacheOptions { - /// Default options for [`crate::control_plane::provider::NodeInfoCache`]. + /// Default options for [`crate::control_plane::NodeInfoCache`]. pub const CACHE_DEFAULT_OPTIONS: &'static str = "size=10000,ttl=4m,max_roles=10,gc_interval=60m"; @@ -616,9 +616,9 @@ pub struct ConcurrencyLockOptions { } impl ConcurrencyLockOptions { - /// Default options for [`crate::control_plane::provider::ApiLocks`]. 
+ /// Default options for [`crate::control_plane::client::ApiLocks`]. pub const DEFAULT_OPTIONS_WAKE_COMPUTE_LOCK: &'static str = "permits=0"; - /// Default options for [`crate::control_plane::provider::ApiLocks`]. + /// Default options for [`crate::control_plane::client::ApiLocks`]. pub const DEFAULT_OPTIONS_CONNECT_COMPUTE_LOCK: &'static str = "shards=64,permits=100,epoch=10m,timeout=10ms"; diff --git a/proxy/src/control_plane/provider/mock.rs b/proxy/src/control_plane/client/mock.rs similarity index 94% rename from proxy/src/control_plane/provider/mock.rs rename to proxy/src/control_plane/client/mock.rs index 75a242d8d3..fd333d2aac 100644 --- a/proxy/src/control_plane/provider/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -9,16 +9,17 @@ use tokio_postgres::config::SslMode; use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; -use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; -use super::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; use crate::auth::IpPattern; use crate::cache::Cached; use crate::context::RequestMonitoring; -use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::client::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; use crate::control_plane::messages::MetricsAuxInfo; -use crate::control_plane::provider::{CachedAllowedIps, CachedRoleSecret}; +use crate::control_plane::{AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo}; use crate::error::io_error; use crate::intern::RoleNameInt; use crate::types::{BranchId, EndpointId, ProjectId, RoleName}; @@ -31,25 +32,25 @@ enum MockApiError { PasswordNotSet(tokio_postgres::Error), } -impl From for ApiError { +impl From for ControlPlaneError { fn from(e: MockApiError) -> Self { io_error(e).into() } } -impl From for ApiError { +impl From for ControlPlaneError { fn from(e: tokio_postgres::Error) -> Self { io_error(e).into() } } #[derive(Clone)] -pub struct Api { +pub struct MockControlPlane { endpoint: ApiUrl, ip_allowlist_check_enabled: bool, } -impl Api { +impl MockControlPlane { pub fn new(endpoint: ApiUrl, ip_allowlist_check_enabled: bool) -> Self { Self { endpoint, @@ -201,7 +202,7 @@ async fn get_execute_postgres_query( Ok(Some(entry)) } -impl super::Api for Api { +impl super::ControlPlaneApi for MockControlPlane { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs new file mode 100644 index 0000000000..e388d8a538 --- /dev/null +++ b/proxy/src/control_plane/client/mod.rs @@ -0,0 +1,281 @@ +#[cfg(any(test, feature = "testing"))] +pub mod mock; +pub mod neon; + +use std::hash::Hash; +use std::sync::Arc; +use std::time::Duration; + +use dashmap::DashMap; +use tokio::time::Instant; +use tracing::info; + +use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::endpoints::EndpointsCache; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; +use crate::context::RequestMonitoring; +use crate::control_plane::{ + errors, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, ControlPlaneApi, NodeInfoCache, +}; +use crate::error::ReportableError; +use crate::metrics::ApiLockMetrics; +use 
crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; +use crate::types::EndpointId; + +#[non_exhaustive] +#[derive(Clone)] +pub enum ControlPlaneClient { + /// Current Management API (V2). + Neon(neon::NeonControlPlaneClient), + /// Local mock control plane. + #[cfg(any(test, feature = "testing"))] + PostgresMock(mock::MockControlPlane), + /// Internal testing + #[cfg(test)] + #[allow(private_interfaces)] + Test(Box), +} + +impl ControlPlaneApi for ControlPlaneClient { + async fn get_role_secret( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::Neon(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, + #[cfg(test)] + Self::Test(_) => { + unreachable!("this function should never be called in the test backend") + } + } + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { + match self { + Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.get_allowed_ips_and_secret(), + } + } + + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> Result, errors::GetEndpointJwksError> { + match self { + Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, + #[cfg(test)] + Self::Test(_api) => Ok(vec![]), + } + } + + async fn wake_compute( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result { + match self { + Self::Neon(api) => api.wake_compute(ctx, user_info).await, + #[cfg(any(test, feature = "testing"))] + Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, + #[cfg(test)] + Self::Test(api) => api.wake_compute(), + } + } +} + +#[cfg(test)] +pub(crate) trait TestControlPlaneClient: Send + Sync + 'static { + fn wake_compute(&self) -> Result; + + fn get_allowed_ips_and_secret( + &self, + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + + fn dyn_clone(&self) -> Box; +} + +#[cfg(test)] +impl Clone for Box { + fn clone(&self) -> Self { + TestControlPlaneClient::dyn_clone(&**self) + } +} + +/// Various caches for [`control_plane`](super). +pub struct ApiCaches { + /// Cache for the `wake_compute` API method. + pub(crate) node_info: NodeInfoCache, + /// Cache which stores project_id -> endpoint_ids mapping. + pub project_info: Arc, + /// List of all valid endpoints. + pub endpoints_cache: Arc, +} + +impl ApiCaches { + pub fn new( + wake_compute_cache_config: CacheOptions, + project_info_cache_config: ProjectInfoCacheOptions, + endpoint_cache_config: EndpointCacheConfig, + ) -> Self { + Self { + node_info: NodeInfoCache::new( + "node_info_cache", + wake_compute_cache_config.size, + wake_compute_cache_config.ttl, + true, + ), + project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), + endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), + } + } +} + +/// Various caches for [`control_plane`](super). 
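+/// In practice these are per-key concurrency locks rather than caches: each key maps
+/// to a [`DynamicLimiter`], bounding how many expensive control-plane calls (for
+/// example `wake_compute`) may be in flight for the same key at once.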
+pub struct ApiLocks { + name: &'static str, + node_locks: DashMap>, + config: RateLimiterConfig, + timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, +} + +#[derive(Debug, thiserror::Error)] +pub(crate) enum ApiLockError { + #[error("timeout acquiring resource permit")] + TimeoutError(#[from] tokio::time::error::Elapsed), +} + +impl ReportableError for ApiLockError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, + } + } +} + +impl ApiLocks { + pub fn new( + name: &'static str, + config: RateLimiterConfig, + shards: usize, + timeout: Duration, + epoch: std::time::Duration, + metrics: &'static ApiLockMetrics, + ) -> prometheus::Result { + Ok(Self { + name, + node_locks: DashMap::with_shard_amount(shards), + config, + timeout, + epoch, + metrics, + }) + } + + pub(crate) async fn get_permit(&self, key: &K) -> Result { + if self.config.initial_limit == 0 { + return Ok(WakeComputePermit { + permit: Token::disabled(), + }); + } + let now = Instant::now(); + let semaphore = { + // get fast path + if let Some(semaphore) = self.node_locks.get(key) { + semaphore.clone() + } else { + self.node_locks + .entry(key.clone()) + .or_insert_with(|| { + self.metrics.semaphores_registered.inc(); + DynamicLimiter::new(self.config) + }) + .clone() + } + }; + let permit = semaphore.acquire_timeout(self.timeout).await; + + self.metrics + .semaphore_acquire_seconds + .observe(now.elapsed().as_secs_f64()); + info!("acquired permit {:?}", now.elapsed().as_secs_f64()); + Ok(WakeComputePermit { permit: permit? }) + } + + pub async fn garbage_collect_worker(&self) { + if self.config.initial_limit == 0 { + return; + } + let mut interval = + tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); + loop { + for (i, shard) in self.node_locks.shards().iter().enumerate() { + interval.tick().await; + // temporary lock a single shard and then clear any semaphores that aren't currently checked out + // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked + // therefore releasing it is safe from race conditions + info!( + name = self.name, + shard = i, + "performing epoch reclamation on api lock" + ); + let mut lock = shard.write(); + let timer = self.metrics.reclamation_lag_seconds.start_timer(); + let count = lock + .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) + .count(); + drop(lock); + self.metrics.semaphores_unregistered.inc_by(count as u64); + timer.observe(); + } + } + } +} + +pub(crate) struct WakeComputePermit { + permit: Token, +} + +impl WakeComputePermit { + pub(crate) fn should_check_cache(&self) -> bool { + !self.permit.is_disabled() + } + pub(crate) fn release(self, outcome: Outcome) { + self.permit.release(outcome); + } + pub(crate) fn release_result(self, res: Result) -> Result { + match res { + Ok(_) => self.release(Outcome::Success), + Err(_) => self.release(Outcome::Overload), + } + res + } +} + +impl FetchAuthRules for ControlPlaneClient { + async fn fetch_auth_rules( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> Result, FetchAuthRulesError> { + self.get_endpoint_jwks(ctx, endpoint) + .await + .map_err(FetchAuthRulesError::GetEndpointJwks) + } +} diff --git a/proxy/src/control_plane/provider/neon.rs b/proxy/src/control_plane/client/neon.rs similarity index 93% rename from proxy/src/control_plane/provider/neon.rs rename to proxy/src/control_plane/client/neon.rs 
index 8ea91d7875..6c67d2df96 100644 --- a/proxy/src/control_plane/provider/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -10,18 +10,20 @@ use tokio::time::Instant; use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; -use super::super::messages::{ControlPlaneError, GetRoleSecret, WakeCompute}; -use super::errors::{ApiError, GetAuthInfoError, WakeComputeError}; -use super::{ - ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, - NodeInfo, -}; +use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; use crate::auth::backend::jwt::AuthRule; use crate::auth::backend::ComputeUserInfo; use crate::cache::Cached; use crate::context::RequestMonitoring; -use crate::control_plane::errors::GetEndpointJwksError; +use crate::control_plane::caches::ApiCaches; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; +use crate::control_plane::locks::ApiLocks; use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::{ + AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, +}; use crate::metrics::{CacheOutcome, Metrics}; use crate::rate_limiter::WakeComputeRateLimiter; use crate::types::{EndpointCacheKey, EndpointId}; @@ -30,7 +32,7 @@ use crate::{compute, http, scram}; const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); #[derive(Clone)] -pub struct Api { +pub struct NeonControlPlaneClient { endpoint: http::Endpoint, pub caches: &'static ApiCaches, pub(crate) locks: &'static ApiLocks, @@ -39,7 +41,7 @@ pub struct Api { jwt: Arc, } -impl Api { +impl NeonControlPlaneClient { /// Construct an API object containing the auth parameters. pub fn new( endpoint: http::Endpoint, @@ -256,7 +258,7 @@ impl Api { } } -impl super::Api for Api { +impl super::ControlPlaneApi for NeonControlPlaneClient { #[tracing::instrument(skip_all)] async fn get_role_secret( &self, @@ -356,7 +358,7 @@ impl super::Api for Api { let (cached, info) = cached.take_value(); let info = info.map_err(|c| { info!(key = &*key, "found cached wake_compute error"); - WakeComputeError::ApiError(ApiError::ControlPlane(Box::new(*c))) + WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) })?; debug!(key = &*key, "found cached compute node info"); @@ -403,9 +405,11 @@ impl super::Api for Api { Ok(cached.map(|()| node)) } Err(err) => match err { - WakeComputeError::ApiError(ApiError::ControlPlane(err)) => { + WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { let Some(status) = &err.status else { - return Err(WakeComputeError::ApiError(ApiError::ControlPlane(err))); + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); }; let reason = status @@ -415,7 +419,9 @@ impl super::Api for Api { // if we can retry this error, do not cache it. if reason.can_retry() { - return Err(WakeComputeError::ApiError(ApiError::ControlPlane(err))); + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); } // at this point, we should only have quota errors. 
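(Not part of the patch: the unchanged code between these two hunks stores the quota error in the wake_compute cache with a short TTL — judging by the Duration::from_secs(30) argument in the next hunk and the "found cached wake_compute error" log above — so repeated wake attempts for an over-quota endpoint back off instead of hitting the control plane every time. Below is only a minimal, self-contained sketch of that negative-caching idea; the names NegativeCache, insert_err and get_err are illustrative, not the project's API.)

use std::collections::HashMap;
use std::hash::Hash;
use std::time::{Duration, Instant};

/// Illustrative only: remember a failed lookup for a short TTL so callers
/// back off instead of re-querying the upstream API on every attempt.
struct NegativeCache<K, E> {
    entries: HashMap<K, (E, Instant)>,
    ttl: Duration,
}

impl<K: Hash + Eq, E: Clone> NegativeCache<K, E> {
    fn new(ttl: Duration) -> Self {
        Self { entries: HashMap::new(), ttl }
    }

    /// Record a failure; it stays visible until `ttl` has elapsed.
    fn insert_err(&mut self, key: K, err: E) {
        self.entries.insert(key, (err, Instant::now() + self.ttl));
    }

    /// Return the cached failure if it has not expired yet.
    fn get_err(&mut self, key: &K) -> Option<E> {
        if let Some((err, deadline)) = self.entries.get(key) {
            if Instant::now() < *deadline {
                return Some(err.clone());
            }
        }
        // Either absent or expired; drop any stale entry.
        self.entries.remove(key);
        None
    }
}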
@@ -430,7 +436,9 @@ impl super::Api for Api { Duration::from_secs(30), ); - Err(WakeComputeError::ApiError(ApiError::ControlPlane(err))) + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))) } err => return Err(err), }, @@ -441,7 +449,7 @@ impl super::Api for Api { /// Parse http response body, taking status code into account. async fn parse_body serde::Deserialize<'a>>( response: http::Response, -) -> Result { +) -> Result { let status = response.status(); if status.is_success() { // We shouldn't log raw body because it may contain secrets. @@ -456,7 +464,7 @@ async fn parse_body serde::Deserialize<'a>>( // as the fact that the request itself has failed. let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { warn!("failed to parse error body: {e}"); - ControlPlaneError { + ControlPlaneErrorMessage { error: "reason unclear (malformed error message)".into(), http_status_code: status, status: None, @@ -465,7 +473,7 @@ async fn parse_body serde::Deserialize<'a>>( body.http_status_code = status; warn!("console responded with an error ({status}): {body:?}"); - Err(ApiError::ControlPlane(Box::new(body))) + Err(ControlPlaneError::Message(Box::new(body))) } fn parse_host_port(input: &str) -> Option<(&str, u16)> { diff --git a/proxy/src/control_plane/errors.rs b/proxy/src/control_plane/errors.rs new file mode 100644 index 0000000000..d6f565e34a --- /dev/null +++ b/proxy/src/control_plane/errors.rs @@ -0,0 +1,216 @@ +use thiserror::Error; + +use crate::control_plane::client::ApiLockError; +use crate::control_plane::messages::{self, ControlPlaneErrorMessage, Reason}; +use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; +use crate::proxy::retry::CouldRetry; + +/// A go-to error message which doesn't leak any detail. +pub(crate) const REQUEST_FAILED: &str = "Console request failed"; + +/// Common console API error. +#[derive(Debug, Error)] +pub(crate) enum ControlPlaneError { + /// Error returned by the console itself. + #[error("{REQUEST_FAILED} with {0}")] + Message(Box), + + /// Various IO errors like broken pipe or malformed payload. + #[error("{REQUEST_FAILED}: {0}")] + Transport(#[from] std::io::Error), +} + +impl ControlPlaneError { + /// Returns HTTP status code if it's the reason for failure. + pub(crate) fn get_reason(&self) -> messages::Reason { + match self { + ControlPlaneError::Message(e) => e.get_reason(), + ControlPlaneError::Transport(_) => messages::Reason::Unknown, + } + } +} + +impl UserFacingError for ControlPlaneError { + fn to_string_client(&self) -> String { + match self { + // To minimize risks, only select errors are forwarded to users. 
+ ControlPlaneError::Message(c) => c.get_user_facing_message(), + ControlPlaneError::Transport(_) => REQUEST_FAILED.to_owned(), + } + } +} + +impl ReportableError for ControlPlaneError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + ControlPlaneError::Message(e) => match e.get_reason() { + Reason::RoleProtected => ErrorKind::User, + Reason::ResourceNotFound => ErrorKind::User, + Reason::ProjectNotFound => ErrorKind::User, + Reason::EndpointNotFound => ErrorKind::User, + Reason::BranchNotFound => ErrorKind::User, + Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, + Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota, + Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota, + Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota, + Reason::WrittenDataQuotaExceeded => ErrorKind::Quota, + Reason::DataTransferQuotaExceeded => ErrorKind::Quota, + Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota, + Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, + Reason::LockAlreadyTaken => ErrorKind::ControlPlane, + Reason::RunningOperations => ErrorKind::ControlPlane, + Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane, + Reason::Unknown => ErrorKind::ControlPlane, + }, + ControlPlaneError::Transport(_) => crate::error::ErrorKind::ControlPlane, + } + } +} + +impl CouldRetry for ControlPlaneError { + fn could_retry(&self) -> bool { + match self { + // retry some transport errors + Self::Transport(io) => io.could_retry(), + Self::Message(e) => e.could_retry(), + } + } +} + +impl From for ControlPlaneError { + fn from(e: reqwest::Error) -> Self { + io_error(e).into() + } +} + +impl From for ControlPlaneError { + fn from(e: reqwest_middleware::Error) -> Self { + io_error(e).into() + } +} + +#[derive(Debug, Error)] +pub(crate) enum GetAuthInfoError { + // We shouldn't include the actual secret here. + #[error("Console responded with a malformed auth secret")] + BadSecret, + + #[error(transparent)] + ApiError(ControlPlaneError), +} + +// This allows more useful interactions than `#[from]`. +impl> From for GetAuthInfoError { + fn from(e: E) -> Self { + Self::ApiError(e.into()) + } +} + +impl UserFacingError for GetAuthInfoError { + fn to_string_client(&self) -> String { + match self { + // We absolutely should not leak any secrets! + Self::BadSecret => REQUEST_FAILED.to_owned(), + // However, API might return a meaningful error. + Self::ApiError(e) => e.to_string_client(), + } + } +} + +impl ReportableError for GetAuthInfoError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Self::BadSecret => crate::error::ErrorKind::ControlPlane, + Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, + } + } +} + +#[derive(Debug, Error)] +pub(crate) enum WakeComputeError { + #[error("Console responded with a malformed compute address: {0}")] + BadComputeAddress(Box), + + #[error(transparent)] + ControlPlane(ControlPlaneError), + + #[error("Too many connections attempts")] + TooManyConnections, + + #[error("error acquiring resource permit: {0}")] + TooManyConnectionAttempts(#[from] ApiLockError), +} + +// This allows more useful interactions than `#[from]`. +impl> From for WakeComputeError { + fn from(e: E) -> Self { + Self::ControlPlane(e.into()) + } +} + +impl UserFacingError for WakeComputeError { + fn to_string_client(&self) -> String { + match self { + // We shouldn't show user the address even if it's broken. + // Besides, user is unlikely to care about this detail. 
+ Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(), + // However, control plane might return a meaningful error. + Self::ControlPlane(e) => e.to_string_client(), + + Self::TooManyConnections => self.to_string(), + + Self::TooManyConnectionAttempts(_) => { + "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() + } + } + } +} + +impl ReportableError for WakeComputeError { + fn get_error_kind(&self) -> crate::error::ErrorKind { + match self { + Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, + Self::ControlPlane(e) => e.get_error_kind(), + Self::TooManyConnections => crate::error::ErrorKind::RateLimit, + Self::TooManyConnectionAttempts(e) => e.get_error_kind(), + } + } +} + +impl CouldRetry for WakeComputeError { + fn could_retry(&self) -> bool { + match self { + Self::BadComputeAddress(_) => false, + Self::ControlPlane(e) => e.could_retry(), + Self::TooManyConnections => false, + Self::TooManyConnectionAttempts(_) => false, + } + } +} + +#[derive(Debug, Error)] +pub enum GetEndpointJwksError { + #[error("endpoint not found")] + EndpointNotFound, + + #[error("failed to build control plane request: {0}")] + RequestBuild(#[source] reqwest::Error), + + #[error("failed to send control plane request: {0}")] + RequestExecute(#[source] reqwest_middleware::Error), + + #[error(transparent)] + ControlPlane(#[from] ControlPlaneError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TokioPostgres(#[from] tokio_postgres::Error), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + ParseUrl(#[from] url::ParseError), + + #[cfg(any(test, feature = "testing"))] + #[error(transparent)] + TaskJoin(#[from] tokio::task::JoinError), +} diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 13a54145b1..75c932e6ab 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -10,14 +10,14 @@ use crate::proxy::retry::CouldRetry; /// Generic error response with human-readable description. /// Note that we can't always present it to user as is. 
#[derive(Debug, Deserialize, Clone)] -pub(crate) struct ControlPlaneError { +pub(crate) struct ControlPlaneErrorMessage { pub(crate) error: Box, #[serde(skip)] pub(crate) http_status_code: http::StatusCode, pub(crate) status: Option, } -impl ControlPlaneError { +impl ControlPlaneErrorMessage { pub(crate) fn get_reason(&self) -> Reason { self.status .as_ref() @@ -26,7 +26,7 @@ impl ControlPlaneError { } pub(crate) fn get_user_facing_message(&self) -> String { - use super::provider::errors::REQUEST_FAILED; + use super::errors::REQUEST_FAILED; self.status .as_ref() .and_then(|s| s.details.user_facing_message.as_ref()) @@ -51,7 +51,7 @@ impl ControlPlaneError { } } -impl Display for ControlPlaneError { +impl Display for ControlPlaneErrorMessage { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { let msg: &str = self .status @@ -62,7 +62,7 @@ impl Display for ControlPlaneError { } } -impl CouldRetry for ControlPlaneError { +impl CouldRetry for ControlPlaneErrorMessage { fn could_retry(&self) -> bool { // If the error message does not have a status, // the error is unknown and probably should not retry automatically diff --git a/proxy/src/control_plane/mod.rs b/proxy/src/control_plane/mod.rs index 87d8e781aa..70607ac0d0 100644 --- a/proxy/src/control_plane/mod.rs +++ b/proxy/src/control_plane/mod.rs @@ -5,18 +5,137 @@ pub mod messages; /// Wrappers for console APIs and their mocks. -pub mod provider; -pub(crate) use provider::{errors, Api, AuthSecret, CachedNodeInfo, NodeInfo}; +pub mod client; + +pub(crate) mod errors; + +use std::sync::Arc; +use std::time::Duration; + +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; +use crate::auth::IpPattern; +use crate::cache::project_info::ProjectInfoCacheImpl; +use crate::cache::{Cached, TimedLru}; +use crate::context::RequestMonitoring; +use crate::control_plane::messages::{ControlPlaneErrorMessage, MetricsAuxInfo}; +use crate::intern::ProjectIdInt; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, scram}; /// Various cache-related types. pub mod caches { - pub use super::provider::ApiCaches; + pub use super::client::ApiCaches; } /// Various cache-related types. pub mod locks { - pub use super::provider::ApiLocks; + pub use super::client::ApiLocks; } /// Console's management API. pub mod mgmt; + +/// Auth secret which is managed by the cloud. +#[derive(Clone, Eq, PartialEq, Debug)] +pub(crate) enum AuthSecret { + #[cfg(any(test, feature = "testing"))] + /// Md5 hash of user's password. + Md5([u8; 16]), + + /// [SCRAM](crate::scram) authentication info. + Scram(scram::ServerSecret), +} + +#[derive(Default)] +pub(crate) struct AuthInfo { + pub(crate) secret: Option, + /// List of IP addresses allowed for the autorization. + pub(crate) allowed_ips: Vec, + /// Project ID. This is used for cache invalidation. + pub(crate) project_id: Option, +} + +/// Info for establishing a connection to a compute node. +/// This is what we get after auth succeeded, but not before! +#[derive(Clone)] +pub(crate) struct NodeInfo { + /// Compute node connection params. + /// It's sad that we have to clone this, but this will improve + /// once we migrate to a bespoke connection logic. + pub(crate) config: compute::ConnCfg, + + /// Labels for proxy's metrics. 
+ pub(crate) aux: MetricsAuxInfo, + + /// Whether we should accept self-signed certificates (for testing) + pub(crate) allow_self_signed_compute: bool, +} + +impl NodeInfo { + pub(crate) async fn connect( + &self, + ctx: &RequestMonitoring, + timeout: Duration, + ) -> Result { + self.config + .connect( + ctx, + self.allow_self_signed_compute, + self.aux.clone(), + timeout, + ) + .await + } + pub(crate) fn reuse_settings(&mut self, other: Self) { + self.allow_self_signed_compute = other.allow_self_signed_compute; + self.config.reuse_password(other.config); + } + + pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { + match keys { + #[cfg(any(test, feature = "testing"))] + ComputeCredentialKeys::Password(password) => self.config.password(password), + ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), + ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config, + }; + } +} + +pub(crate) type NodeInfoCache = + TimedLru>>; +pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; +pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; +pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; + +/// This will allocate per each call, but the http requests alone +/// already require a few allocations, so it should be fine. +pub(crate) trait ControlPlaneApi { + /// Get the client's auth secret for authentication. + /// Returns option because user not found situation is special. + /// We still have to mock the scram to avoid leaking information that user doesn't exist. + async fn get_role_secret( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result; + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; + + async fn get_endpoint_jwks( + &self, + ctx: &RequestMonitoring, + endpoint: EndpointId, + ) -> Result, errors::GetEndpointJwksError>; + + /// Wake up the compute node and return the corresponding connection info. 
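+    ///
+    /// Results may be cached by the implementation (see [`NodeInfoCache`]) and actual
+    /// wake-ups may be rate-limited, so callers should not assume every call reaches
+    /// the control plane.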
+ async fn wake_compute( + &self, + ctx: &RequestMonitoring, + user_info: &ComputeUserInfo, + ) -> Result; +} diff --git a/proxy/src/control_plane/provider/mod.rs b/proxy/src/control_plane/provider/mod.rs deleted file mode 100644 index 49e57b6b7e..0000000000 --- a/proxy/src/control_plane/provider/mod.rs +++ /dev/null @@ -1,588 +0,0 @@ -#[cfg(any(test, feature = "testing"))] -pub mod mock; -pub mod neon; - -use std::hash::Hash; -use std::sync::Arc; -use std::time::Duration; - -use dashmap::DashMap; -use tokio::time::Instant; -use tracing::info; - -use super::messages::{ControlPlaneError, MetricsAuxInfo}; -use crate::auth::backend::jwt::{AuthRule, FetchAuthRules, FetchAuthRulesError}; -use crate::auth::backend::{ComputeCredentialKeys, ComputeUserInfo}; -use crate::auth::IpPattern; -use crate::cache::endpoints::EndpointsCache; -use crate::cache::project_info::ProjectInfoCacheImpl; -use crate::cache::{Cached, TimedLru}; -use crate::config::{CacheOptions, EndpointCacheConfig, ProjectInfoCacheOptions}; -use crate::context::RequestMonitoring; -use crate::error::ReportableError; -use crate::intern::ProjectIdInt; -use crate::metrics::ApiLockMetrics; -use crate::rate_limiter::{DynamicLimiter, Outcome, RateLimiterConfig, Token}; -use crate::types::{EndpointCacheKey, EndpointId}; -use crate::{compute, scram}; - -pub(crate) mod errors { - use thiserror::Error; - - use super::ApiLockError; - use crate::control_plane::messages::{self, ControlPlaneError, Reason}; - use crate::error::{io_error, ErrorKind, ReportableError, UserFacingError}; - use crate::proxy::retry::CouldRetry; - - /// A go-to error message which doesn't leak any detail. - pub(crate) const REQUEST_FAILED: &str = "Console request failed"; - - /// Common console API error. - #[derive(Debug, Error)] - pub(crate) enum ApiError { - /// Error returned by the console itself. - #[error("{REQUEST_FAILED} with {0}")] - ControlPlane(Box), - - /// Various IO errors like broken pipe or malformed payload. - #[error("{REQUEST_FAILED}: {0}")] - Transport(#[from] std::io::Error), - } - - impl ApiError { - /// Returns HTTP status code if it's the reason for failure. - pub(crate) fn get_reason(&self) -> messages::Reason { - match self { - ApiError::ControlPlane(e) => e.get_reason(), - ApiError::Transport(_) => messages::Reason::Unknown, - } - } - } - - impl UserFacingError for ApiError { - fn to_string_client(&self) -> String { - match self { - // To minimize risks, only select errors are forwarded to users. 
- ApiError::ControlPlane(c) => c.get_user_facing_message(), - ApiError::Transport(_) => REQUEST_FAILED.to_owned(), - } - } - } - - impl ReportableError for ApiError { - fn get_error_kind(&self) -> crate::error::ErrorKind { - match self { - ApiError::ControlPlane(e) => match e.get_reason() { - Reason::RoleProtected => ErrorKind::User, - Reason::ResourceNotFound => ErrorKind::User, - Reason::ProjectNotFound => ErrorKind::User, - Reason::EndpointNotFound => ErrorKind::User, - Reason::BranchNotFound => ErrorKind::User, - Reason::RateLimitExceeded => ErrorKind::ServiceRateLimit, - Reason::NonDefaultBranchComputeTimeExceeded => ErrorKind::Quota, - Reason::ActiveTimeQuotaExceeded => ErrorKind::Quota, - Reason::ComputeTimeQuotaExceeded => ErrorKind::Quota, - Reason::WrittenDataQuotaExceeded => ErrorKind::Quota, - Reason::DataTransferQuotaExceeded => ErrorKind::Quota, - Reason::LogicalSizeQuotaExceeded => ErrorKind::Quota, - Reason::ConcurrencyLimitReached => ErrorKind::ControlPlane, - Reason::LockAlreadyTaken => ErrorKind::ControlPlane, - Reason::RunningOperations => ErrorKind::ControlPlane, - Reason::ActiveEndpointsLimitExceeded => ErrorKind::ControlPlane, - Reason::Unknown => ErrorKind::ControlPlane, - }, - ApiError::Transport(_) => crate::error::ErrorKind::ControlPlane, - } - } - } - - impl CouldRetry for ApiError { - fn could_retry(&self) -> bool { - match self { - // retry some transport errors - Self::Transport(io) => io.could_retry(), - Self::ControlPlane(e) => e.could_retry(), - } - } - } - - impl From for ApiError { - fn from(e: reqwest::Error) -> Self { - io_error(e).into() - } - } - - impl From for ApiError { - fn from(e: reqwest_middleware::Error) -> Self { - io_error(e).into() - } - } - - #[derive(Debug, Error)] - pub(crate) enum GetAuthInfoError { - // We shouldn't include the actual secret here. - #[error("Console responded with a malformed auth secret")] - BadSecret, - - #[error(transparent)] - ApiError(ApiError), - } - - // This allows more useful interactions than `#[from]`. - impl> From for GetAuthInfoError { - fn from(e: E) -> Self { - Self::ApiError(e.into()) - } - } - - impl UserFacingError for GetAuthInfoError { - fn to_string_client(&self) -> String { - match self { - // We absolutely should not leak any secrets! - Self::BadSecret => REQUEST_FAILED.to_owned(), - // However, API might return a meaningful error. - Self::ApiError(e) => e.to_string_client(), - } - } - } - - impl ReportableError for GetAuthInfoError { - fn get_error_kind(&self) -> crate::error::ErrorKind { - match self { - Self::BadSecret => crate::error::ErrorKind::ControlPlane, - Self::ApiError(_) => crate::error::ErrorKind::ControlPlane, - } - } - } - - #[derive(Debug, Error)] - pub(crate) enum WakeComputeError { - #[error("Console responded with a malformed compute address: {0}")] - BadComputeAddress(Box), - - #[error(transparent)] - ApiError(ApiError), - - #[error("Too many connections attempts")] - TooManyConnections, - - #[error("error acquiring resource permit: {0}")] - TooManyConnectionAttempts(#[from] ApiLockError), - } - - // This allows more useful interactions than `#[from]`. - impl> From for WakeComputeError { - fn from(e: E) -> Self { - Self::ApiError(e.into()) - } - } - - impl UserFacingError for WakeComputeError { - fn to_string_client(&self) -> String { - match self { - // We shouldn't show user the address even if it's broken. - // Besides, user is unlikely to care about this detail. - Self::BadComputeAddress(_) => REQUEST_FAILED.to_owned(), - // However, API might return a meaningful error. 
- Self::ApiError(e) => e.to_string_client(), - - Self::TooManyConnections => self.to_string(), - - Self::TooManyConnectionAttempts(_) => { - "Failed to acquire permit to connect to the database. Too many database connection attempts are currently ongoing.".to_owned() - } - } - } - } - - impl ReportableError for WakeComputeError { - fn get_error_kind(&self) -> crate::error::ErrorKind { - match self { - Self::BadComputeAddress(_) => crate::error::ErrorKind::ControlPlane, - Self::ApiError(e) => e.get_error_kind(), - Self::TooManyConnections => crate::error::ErrorKind::RateLimit, - Self::TooManyConnectionAttempts(e) => e.get_error_kind(), - } - } - } - - impl CouldRetry for WakeComputeError { - fn could_retry(&self) -> bool { - match self { - Self::BadComputeAddress(_) => false, - Self::ApiError(e) => e.could_retry(), - Self::TooManyConnections => false, - Self::TooManyConnectionAttempts(_) => false, - } - } - } - - #[derive(Debug, Error)] - pub enum GetEndpointJwksError { - #[error("endpoint not found")] - EndpointNotFound, - - #[error("failed to build control plane request: {0}")] - RequestBuild(#[source] reqwest::Error), - - #[error("failed to send control plane request: {0}")] - RequestExecute(#[source] reqwest_middleware::Error), - - #[error(transparent)] - ControlPlane(#[from] ApiError), - - #[cfg(any(test, feature = "testing"))] - #[error(transparent)] - TokioPostgres(#[from] tokio_postgres::Error), - - #[cfg(any(test, feature = "testing"))] - #[error(transparent)] - ParseUrl(#[from] url::ParseError), - - #[cfg(any(test, feature = "testing"))] - #[error(transparent)] - TaskJoin(#[from] tokio::task::JoinError), - } -} - -/// Auth secret which is managed by the cloud. -#[derive(Clone, Eq, PartialEq, Debug)] -pub(crate) enum AuthSecret { - #[cfg(any(test, feature = "testing"))] - /// Md5 hash of user's password. - Md5([u8; 16]), - - /// [SCRAM](crate::scram) authentication info. - Scram(scram::ServerSecret), -} - -#[derive(Default)] -pub(crate) struct AuthInfo { - pub(crate) secret: Option, - /// List of IP addresses allowed for the autorization. - pub(crate) allowed_ips: Vec, - /// Project ID. This is used for cache invalidation. - pub(crate) project_id: Option, -} - -/// Info for establishing a connection to a compute node. -/// This is what we get after auth succeeded, but not before! -#[derive(Clone)] -pub(crate) struct NodeInfo { - /// Compute node connection params. - /// It's sad that we have to clone this, but this will improve - /// once we migrate to a bespoke connection logic. - pub(crate) config: compute::ConnCfg, - - /// Labels for proxy's metrics. 
- pub(crate) aux: MetricsAuxInfo, - - /// Whether we should accept self-signed certificates (for testing) - pub(crate) allow_self_signed_compute: bool, -} - -impl NodeInfo { - pub(crate) async fn connect( - &self, - ctx: &RequestMonitoring, - timeout: Duration, - ) -> Result { - self.config - .connect( - ctx, - self.allow_self_signed_compute, - self.aux.clone(), - timeout, - ) - .await - } - pub(crate) fn reuse_settings(&mut self, other: Self) { - self.allow_self_signed_compute = other.allow_self_signed_compute; - self.config.reuse_password(other.config); - } - - pub(crate) fn set_keys(&mut self, keys: &ComputeCredentialKeys) { - match keys { - #[cfg(any(test, feature = "testing"))] - ComputeCredentialKeys::Password(password) => self.config.password(password), - ComputeCredentialKeys::AuthKeys(auth_keys) => self.config.auth_keys(*auth_keys), - ComputeCredentialKeys::JwtPayload(_) | ComputeCredentialKeys::None => &mut self.config, - }; - } -} - -pub(crate) type NodeInfoCache = - TimedLru>>; -pub(crate) type CachedNodeInfo = Cached<&'static NodeInfoCache, NodeInfo>; -pub(crate) type CachedRoleSecret = Cached<&'static ProjectInfoCacheImpl, Option>; -pub(crate) type CachedAllowedIps = Cached<&'static ProjectInfoCacheImpl, Arc>>; - -/// This will allocate per each call, but the http requests alone -/// already require a few allocations, so it should be fine. -pub(crate) trait Api { - /// Get the client's auth secret for authentication. - /// Returns option because user not found situation is special. - /// We still have to mock the scram to avoid leaking information that user doesn't exist. - async fn get_role_secret( - &self, - ctx: &RequestMonitoring, - user_info: &ComputeUserInfo, - ) -> Result; - - async fn get_allowed_ips_and_secret( - &self, - ctx: &RequestMonitoring, - user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError>; - - async fn get_endpoint_jwks( - &self, - ctx: &RequestMonitoring, - endpoint: EndpointId, - ) -> Result, errors::GetEndpointJwksError>; - - /// Wake up the compute node and return the corresponding connection info. - async fn wake_compute( - &self, - ctx: &RequestMonitoring, - user_info: &ComputeUserInfo, - ) -> Result; -} - -#[non_exhaustive] -#[derive(Clone)] -pub enum ControlPlaneBackend { - /// Current Management API (V2). - Management(neon::Api), - /// Local mock control plane. 
- #[cfg(any(test, feature = "testing"))] - PostgresMock(mock::Api), - /// Internal testing - #[cfg(test)] - #[allow(private_interfaces)] - Test(Box), -} - -impl Api for ControlPlaneBackend { - async fn get_role_secret( - &self, - ctx: &RequestMonitoring, - user_info: &ComputeUserInfo, - ) -> Result { - match self { - Self::Management(api) => api.get_role_secret(ctx, user_info).await, - #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, - #[cfg(test)] - Self::Test(_) => { - unreachable!("this function should never be called in the test backend") - } - } - } - - async fn get_allowed_ips_and_secret( - &self, - ctx: &RequestMonitoring, - user_info: &ComputeUserInfo, - ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { - match self { - Self::Management(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, - #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, - #[cfg(test)] - Self::Test(api) => api.get_allowed_ips_and_secret(), - } - } - - async fn get_endpoint_jwks( - &self, - ctx: &RequestMonitoring, - endpoint: EndpointId, - ) -> Result, errors::GetEndpointJwksError> { - match self { - Self::Management(api) => api.get_endpoint_jwks(ctx, endpoint).await, - #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, - #[cfg(test)] - Self::Test(_api) => Ok(vec![]), - } - } - - async fn wake_compute( - &self, - ctx: &RequestMonitoring, - user_info: &ComputeUserInfo, - ) -> Result { - match self { - Self::Management(api) => api.wake_compute(ctx, user_info).await, - #[cfg(any(test, feature = "testing"))] - Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, - #[cfg(test)] - Self::Test(api) => api.wake_compute(), - } - } -} - -/// Various caches for [`control_plane`](super). -pub struct ApiCaches { - /// Cache for the `wake_compute` API method. - pub(crate) node_info: NodeInfoCache, - /// Cache which stores project_id -> endpoint_ids mapping. - pub project_info: Arc, - /// List of all valid endpoints. - pub endpoints_cache: Arc, -} - -impl ApiCaches { - pub fn new( - wake_compute_cache_config: CacheOptions, - project_info_cache_config: ProjectInfoCacheOptions, - endpoint_cache_config: EndpointCacheConfig, - ) -> Self { - Self { - node_info: NodeInfoCache::new( - "node_info_cache", - wake_compute_cache_config.size, - wake_compute_cache_config.ttl, - true, - ), - project_info: Arc::new(ProjectInfoCacheImpl::new(project_info_cache_config)), - endpoints_cache: Arc::new(EndpointsCache::new(endpoint_cache_config)), - } - } -} - -/// Various caches for [`control_plane`](super). 
-pub struct ApiLocks { - name: &'static str, - node_locks: DashMap>, - config: RateLimiterConfig, - timeout: Duration, - epoch: std::time::Duration, - metrics: &'static ApiLockMetrics, -} - -#[derive(Debug, thiserror::Error)] -pub(crate) enum ApiLockError { - #[error("timeout acquiring resource permit")] - TimeoutError(#[from] tokio::time::error::Elapsed), -} - -impl ReportableError for ApiLockError { - fn get_error_kind(&self) -> crate::error::ErrorKind { - match self { - ApiLockError::TimeoutError(_) => crate::error::ErrorKind::RateLimit, - } - } -} - -impl ApiLocks { - pub fn new( - name: &'static str, - config: RateLimiterConfig, - shards: usize, - timeout: Duration, - epoch: std::time::Duration, - metrics: &'static ApiLockMetrics, - ) -> prometheus::Result { - Ok(Self { - name, - node_locks: DashMap::with_shard_amount(shards), - config, - timeout, - epoch, - metrics, - }) - } - - pub(crate) async fn get_permit(&self, key: &K) -> Result { - if self.config.initial_limit == 0 { - return Ok(WakeComputePermit { - permit: Token::disabled(), - }); - } - let now = Instant::now(); - let semaphore = { - // get fast path - if let Some(semaphore) = self.node_locks.get(key) { - semaphore.clone() - } else { - self.node_locks - .entry(key.clone()) - .or_insert_with(|| { - self.metrics.semaphores_registered.inc(); - DynamicLimiter::new(self.config) - }) - .clone() - } - }; - let permit = semaphore.acquire_timeout(self.timeout).await; - - self.metrics - .semaphore_acquire_seconds - .observe(now.elapsed().as_secs_f64()); - info!("acquired permit {:?}", now.elapsed().as_secs_f64()); - Ok(WakeComputePermit { permit: permit? }) - } - - pub async fn garbage_collect_worker(&self) { - if self.config.initial_limit == 0 { - return; - } - let mut interval = - tokio::time::interval(self.epoch / (self.node_locks.shards().len()) as u32); - loop { - for (i, shard) in self.node_locks.shards().iter().enumerate() { - interval.tick().await; - // temporary lock a single shard and then clear any semaphores that aren't currently checked out - // race conditions: if strong_count == 1, there's no way that it can increase while the shard is locked - // therefore releasing it is safe from race conditions - info!( - name = self.name, - shard = i, - "performing epoch reclamation on api lock" - ); - let mut lock = shard.write(); - let timer = self.metrics.reclamation_lag_seconds.start_timer(); - let count = lock - .extract_if(|_, semaphore| Arc::strong_count(semaphore.get_mut()) == 1) - .count(); - drop(lock); - self.metrics.semaphores_unregistered.inc_by(count as u64); - timer.observe(); - } - } - } -} - -pub(crate) struct WakeComputePermit { - permit: Token, -} - -impl WakeComputePermit { - pub(crate) fn should_check_cache(&self) -> bool { - !self.permit.is_disabled() - } - pub(crate) fn release(self, outcome: Outcome) { - self.permit.release(outcome); - } - pub(crate) fn release_result(self, res: Result) -> Result { - match res { - Ok(_) => self.release(Outcome::Success), - Err(_) => self.release(Outcome::Overload), - } - res - } -} - -impl FetchAuthRules for ControlPlaneBackend { - async fn fetch_auth_rules( - &self, - ctx: &RequestMonitoring, - endpoint: EndpointId, - ) -> Result, FetchAuthRulesError> { - self.get_endpoint_jwks(ctx, endpoint) - .await - .map_err(FetchAuthRulesError::GetEndpointJwks) - } -} diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index abb0599d08..be821925b5 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -20,14 +20,14 @@ use 
super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; use super::*; use crate::auth::backend::{ - ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, TestBackend, + ComputeCredentialKeys, ComputeCredentials, ComputeUserInfo, MaybeOwned, }; use crate::config::{CertResolver, RetryConfig}; -use crate::control_plane::messages::{ControlPlaneError, Details, MetricsAuxInfo, Status}; -use crate::control_plane::provider::{ - CachedAllowedIps, CachedRoleSecret, ControlPlaneBackend, NodeInfoCache, +use crate::control_plane::client::{ControlPlaneClient, TestControlPlaneClient}; +use crate::control_plane::messages::{ControlPlaneErrorMessage, Details, MetricsAuxInfo, Status}; +use crate::control_plane::{ + self, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, NodeInfoCache, }; -use crate::control_plane::{self, CachedNodeInfo, NodeInfo}; use crate::error::ErrorKind; use crate::types::{BranchId, EndpointId, ProjectId}; use crate::{sasl, scram}; @@ -490,7 +490,7 @@ impl ConnectMechanism for TestConnectMechanism { fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {} } -impl TestBackend for TestConnectMechanism { +impl TestControlPlaneClient for TestConnectMechanism { fn wake_compute(&self) -> Result { let mut counter = self.counter.lock().unwrap(); let action = self.sequence[*counter]; @@ -498,18 +498,19 @@ impl TestBackend for TestConnectMechanism { match action { ConnectAction::Wake => Ok(helper_create_cached_node_info(self.cache)), ConnectAction::WakeFail => { - let err = - control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + let err = control_plane::errors::ControlPlaneError::Message(Box::new( + ControlPlaneErrorMessage { http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: None, - })); + }, + )); assert!(!err.could_retry()); - Err(control_plane::errors::WakeComputeError::ApiError(err)) + Err(control_plane::errors::WakeComputeError::ControlPlane(err)) } ConnectAction::WakeRetry => { - let err = - control_plane::errors::ApiError::ControlPlane(Box::new(ControlPlaneError { + let err = control_plane::errors::ControlPlaneError::Message(Box::new( + ControlPlaneErrorMessage { http_status_code: StatusCode::BAD_REQUEST, error: "TEST".into(), status: Some(Status { @@ -523,9 +524,10 @@ impl TestBackend for TestConnectMechanism { user_facing_message: None, }, }), - })); + }, + )); assert!(err.could_retry()); - Err(control_plane::errors::WakeComputeError::ApiError(err)) + Err(control_plane::errors::WakeComputeError::ControlPlane(err)) } x => panic!("expecting action {x:?}, wake_compute is called instead"), } @@ -538,7 +540,7 @@ impl TestBackend for TestConnectMechanism { unimplemented!("not used in tests") } - fn dyn_clone(&self) -> Box { + fn dyn_clone(&self) -> Box { Box::new(self.clone()) } } @@ -562,7 +564,7 @@ fn helper_create_connect_info( mechanism: &TestConnectMechanism, ) -> auth::Backend<'static, ComputeCredentials> { let user_info = auth::Backend::ControlPlane( - MaybeOwned::Owned(ControlPlaneBackend::Test(Box::new(mechanism.clone()))), + MaybeOwned::Owned(ControlPlaneClient::Test(Box::new(mechanism.clone()))), ComputeCredentials { info: ComputeUserInfo { endpoint: "endpoint".into(), diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 4e61094264..f9f46bb66c 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -4,7 +4,7 @@ use super::connect_compute::ComputeConnectBackend; use crate::config::RetryConfig; use 
crate::context::RequestMonitoring; use crate::control_plane::errors::WakeComputeError; -use crate::control_plane::provider::CachedNodeInfo; +use crate::control_plane::CachedNodeInfo; use crate::error::ReportableError; use crate::metrics::{ ConnectOutcome, ConnectionFailuresBreakdownGroup, Metrics, RetriesMetricGroup, RetryType, diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index c2b0de1876..7fc5bd236d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -24,9 +24,9 @@ use crate::compute_ctl::{ }; use crate::config::ProxyConfig; use crate::context::RequestMonitoring; +use crate::control_plane::client::ApiLockError; use crate::control_plane::errors::{GetAuthInfoError, WakeComputeError}; use crate::control_plane::locks::ApiLocks; -use crate::control_plane::provider::ApiLockError; use crate::control_plane::CachedNodeInfo; use crate::error::{ErrorKind, ReportableError, UserFacingError}; use crate::intern::EndpointIdInt; From 5d8284c7fec4b8c58cd66a1b6219fb09d08f504b Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 6 Nov 2024 11:27:55 +0100 Subject: [PATCH 171/239] proxy: Read cplane JWT with clap arg (#9654) --- proxy/Cargo.toml | 2 +- proxy/src/bin/proxy.rs | 9 +++++++++ proxy/src/control_plane/client/neon.rs | 4 +--- workspace_hack/Cargo.toml | 4 ++-- 4 files changed, 13 insertions(+), 6 deletions(-) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index bc4da95a91..efd336dbea 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -23,7 +23,7 @@ bstr.workspace = true bytes = { workspace = true, features = ["serde"] } camino.workspace = true chrono.workspace = true -clap.workspace = true +clap = { workspace = true, features = ["derive", "env"] } compute_api.workspace = true consumption_metrics.workspace = true dashmap.workspace = true diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index 3179a929c4..efb3747829 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -92,6 +92,14 @@ struct ProxyCliArgs { default_value = "http://localhost:3000/authenticate_proxy_request/" )] auth_endpoint: String, + /// JWT used to connect to control plane. + #[clap( + long, + value_name = "JWT", + default_value = "", + env = "NEON_PROXY_TO_CONTROLPLANE_TOKEN" + )] + control_plane_token: Arc, /// if this is not local proxy, this toggles whether we accept jwt or passwords for http #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)] is_auth_broker: bool, @@ -734,6 +742,7 @@ fn build_auth_backend( Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, + args.control_plane_token.clone(), caches, locks, wake_compute_endpoint_rate_limiter, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 6c67d2df96..1588e50423 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -45,13 +45,11 @@ impl NeonControlPlaneClient { /// Construct an API object containing the auth parameters. 
pub fn new( endpoint: http::Endpoint, + jwt: Arc, caches: &'static ApiCaches, locks: &'static ApiLocks, wake_compute_endpoint_rate_limiter: Arc, ) -> Self { - let jwt = std::env::var("NEON_PROXY_TO_CONTROLPLANE_TOKEN") - .unwrap_or_default() - .into(); Self { endpoint, caches, diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 71ebab4119..02deecd385 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -24,8 +24,8 @@ base64ct = { version = "1", default-features = false, features = ["std"] } bytes = { version = "1", features = ["serde"] } camino = { version = "1", default-features = false, features = ["serde1"] } chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] } -clap = { version = "4", features = ["derive", "string"] } -clap_builder = { version = "4", default-features = false, features = ["color", "help", "std", "string", "suggestions", "usage"] } +clap = { version = "4", features = ["derive", "env", "string"] } +clap_builder = { version = "4", default-features = false, features = ["color", "env", "help", "std", "string", "suggestions", "usage"] } crypto-bigint = { version = "0.5", features = ["generic-array", "zeroize"] } der = { version = "0.7", default-features = false, features = ["oid", "pem", "std"] } deranged = { version = "0.3", default-features = false, features = ["powerfmt", "serde", "std"] } From bdd492b1d871271c94462736e438e8d38b1fa997 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 6 Nov 2024 12:03:38 +0100 Subject: [PATCH 172/239] proxy: Replace "web(auth)" with "console redirect" everywhere (#9655) --- proxy/src/auth/backend/console_redirect.rs | 20 +++++++++++--------- proxy/src/auth/backend/mod.rs | 4 ++-- proxy/src/auth/mod.rs | 6 +++--- proxy/src/bin/local_proxy.rs | 2 +- proxy/src/bin/proxy.rs | 21 +++++++++++---------- proxy/src/compute.rs | 4 ++-- proxy/src/config.rs | 4 ++-- proxy/src/context/mod.rs | 2 +- proxy/src/context/parquet.rs | 2 +- proxy/src/control_plane/messages.rs | 2 +- proxy/src/control_plane/mgmt.rs | 8 ++++---- 11 files changed, 39 insertions(+), 36 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 1df59b4893..e25dc3d45e 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -16,7 +16,7 @@ use crate::stream::PqStream; use crate::{auth, compute, waiters}; #[derive(Debug, Error)] -pub(crate) enum WebAuthError { +pub(crate) enum ConsoleRedirectError { #[error(transparent)] WaiterRegister(#[from] waiters::RegisterError), @@ -32,13 +32,13 @@ pub struct ConsoleRedirectBackend { console_uri: reqwest::Url, } -impl UserFacingError for WebAuthError { +impl UserFacingError for ConsoleRedirectError { fn to_string_client(&self) -> String { "Internal error".to_string() } } -impl ReportableError for WebAuthError { +impl ReportableError for ConsoleRedirectError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { Self::WaiterRegister(_) => crate::error::ErrorKind::Service, @@ -103,7 +103,7 @@ async fn authenticate( link_uri: &reqwest::Url, client: &mut PqStream, ) -> auth::Result { - ctx.set_auth_method(crate::context::AuthMethod::Web); + ctx.set_auth_method(crate::context::AuthMethod::ConsoleRedirect); // registering waiter can fail if we get unlucky with rng. // just try again. 
@@ -116,7 +116,7 @@ async fn authenticate( } }; - let span = info_span!("web", psql_session_id = &psql_session_id); + let span = info_span!("console_redirect", psql_session_id = &psql_session_id); let greeting = hello_message(link_uri, &psql_session_id); // Give user a URL to spawn a new database. @@ -127,14 +127,16 @@ async fn authenticate( .write_message(&Be::NoticeResponse(&greeting)) .await?; - // Wait for web console response (see `mgmt`). + // Wait for console response via control plane (see `mgmt`). info!(parent: &span, "waiting for console's reply..."); - let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter) + let db_info = tokio::time::timeout(auth_config.console_redirect_confirmation_timeout, waiter) .await .map_err(|_elapsed| { - auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into()) + auth::AuthError::confirmation_timeout( + auth_config.console_redirect_confirmation_timeout.into(), + ) })? - .map_err(WebAuthError::from)?; + .map_err(ConsoleRedirectError::from)?; if auth_config.ip_allowlist_check_enabled { if let Some(allowed_ips) = &db_info.allowed_ips { diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index ccd5cf3071..242fe99de2 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -9,7 +9,7 @@ use std::sync::Arc; use std::time::Duration; pub use console_redirect::ConsoleRedirectBackend; -pub(crate) use console_redirect::WebAuthError; +pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; use tokio::io::{AsyncRead, AsyncWrite}; @@ -560,7 +560,7 @@ mod tests { ip_allowlist_check_enabled: true, is_auth_broker: false, accept_jwts: false, - webauth_confirmation_timeout: std::time::Duration::from_secs(5), + console_redirect_confirmation_timeout: std::time::Duration::from_secs(5), }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/mod.rs b/proxy/src/auth/mod.rs index 2bd7a2da3d..0198cc306e 100644 --- a/proxy/src/auth/mod.rs +++ b/proxy/src/auth/mod.rs @@ -32,7 +32,7 @@ pub(crate) type Result = std::result::Result; #[derive(Debug, Error)] pub(crate) enum AuthError { #[error(transparent)] - Web(#[from] backend::WebAuthError), + ConsoleRedirect(#[from] backend::ConsoleRedirectError), #[error(transparent)] GetAuthInfo(#[from] control_plane::errors::GetAuthInfoError), @@ -115,7 +115,7 @@ impl AuthError { impl UserFacingError for AuthError { fn to_string_client(&self) -> String { match self { - Self::Web(e) => e.to_string_client(), + Self::ConsoleRedirect(e) => e.to_string_client(), Self::GetAuthInfo(e) => e.to_string_client(), Self::Sasl(e) => e.to_string_client(), Self::PasswordFailed(_) => self.to_string(), @@ -135,7 +135,7 @@ impl UserFacingError for AuthError { impl ReportableError for AuthError { fn get_error_kind(&self) -> crate::error::ErrorKind { match self { - Self::Web(e) => e.get_error_kind(), + Self::ConsoleRedirect(e) => e.get_error_kind(), Self::GetAuthInfo(e) => e.get_error_kind(), Self::Sasl(e) => e.get_error_kind(), Self::PasswordFailed(_) => crate::error::ErrorKind::User, diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index df3628465f..fbdb1dec15 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -281,7 +281,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig ip_allowlist_check_enabled: true, is_auth_broker: false, accept_jwts: true, - 
webauth_confirmation_timeout: Duration::ZERO, + console_redirect_confirmation_timeout: Duration::ZERO, }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index efb3747829..fda5b25961 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -51,11 +51,11 @@ static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; #[derive(Clone, Debug, ValueEnum)] enum AuthBackendType { - Console, - // clap only shows the name, not the alias, in usage text. - // TODO: swap name/alias and deprecate "link" - #[value(name("link"), alias("web"))] - Web, + #[value(name("console"), alias("cplane"))] + ControlPlane, + + #[value(name("link"), alias("control-redirect"))] + ConsoleRedirect, #[cfg(feature = "testing")] Postgres, @@ -71,7 +71,7 @@ struct ProxyCliArgs { /// listen for incoming client connections on ip:port #[clap(short, long, default_value = "127.0.0.1:4432")] proxy: String, - #[clap(value_enum, long, default_value_t = AuthBackendType::Web)] + #[clap(value_enum, long, default_value_t = AuthBackendType::ConsoleRedirect)] auth_backend: AuthBackendType, /// listen for management callback connection on ip:port #[clap(short, long, default_value = "127.0.0.1:7000")] @@ -82,7 +82,7 @@ struct ProxyCliArgs { /// listen for incoming wss connections on ip:port #[clap(long)] wss: Option, - /// redirect unauthenticated users to the given uri in case of web auth + /// redirect unauthenticated users to the given uri in case of console redirect auth #[clap(short, long, default_value = "http://localhost:3000/psql_session/")] uri: String, /// cloud API endpoint for authenticating users @@ -231,6 +231,7 @@ struct ProxyCliArgs { proxy_protocol_v2: ProxyProtocolV2, /// Time the proxy waits for the webauth session to be confirmed by the control plane. + // TODO: rename to `console_redirect_confirmation_timeout`. #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] webauth_confirmation_timeout: std::time::Duration, } @@ -667,7 +668,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { ip_allowlist_check_enabled: !args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, accept_jwts: args.is_auth_broker, - webauth_confirmation_timeout: args.webauth_confirmation_timeout, + console_redirect_confirmation_timeout: args.webauth_confirmation_timeout, }; let config = ProxyConfig { @@ -698,7 +699,7 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { - AuthBackendType::Console => { + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = args.project_info_cache.parse()?; @@ -771,7 +772,7 @@ fn build_auth_backend( Ok(Either::Left(config)) } - AuthBackendType::Web => { + AuthBackendType::ConsoleRedirect => { let url = args.uri.parse()?; let backend = ConsoleRedirectBackend::new(url); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index d397fc5160..ca4a348ed8 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -135,13 +135,13 @@ impl ConnCfg { /// Apply startup message params to the connection config. pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { // Only set `user` if it's not present in the config. - // Web auth flow takes username from the console's response. 
+ // Console redirect auth flow takes username from the console's response. if let (None, Some(user)) = (self.get_user(), params.get("user")) { self.user(user); } // Only set `dbname` if it's not present in the config. - // Web auth flow takes dbname from the console's response. + // Console redirect auth flow takes dbname from the console's response. if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { self.dbname(dbname); } diff --git a/proxy/src/config.rs b/proxy/src/config.rs index f63d7e45aa..b048c9d389 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -78,7 +78,7 @@ pub struct AuthenticationConfig { pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, - pub webauth_confirmation_timeout: tokio::time::Duration, + pub console_redirect_confirmation_timeout: tokio::time::Duration, } impl TlsConfig { @@ -271,7 +271,7 @@ impl CertResolver { // auth-broker does not use SNI and instead uses the Neon-Connection-String header. // Auth broker has the subdomain `apiauth` we need to remove for the purposes of validating the Neon-Connection-String. // - // Console Web proxy does not use any wildcard domains and does not need any certificate selection or conn string + // Console Redirect proxy does not use any wildcard domains and does not need any certificate selection or conn string // validation, so let's we can continue with any common-name let common_name = if let Some(s) = common_name.strip_prefix("CN=*.") { s.to_string() diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 2a6c9c5969..6cf99c0c97 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -75,7 +75,7 @@ struct RequestMonitoringInner { #[derive(Clone, Debug)] pub(crate) enum AuthMethod { // aka passwordless, fka link - Web, + ConsoleRedirect, ScramSha256, ScramSha256Plus, Cleartext, diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index adbb74c8e5..4112de646f 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -134,7 +134,7 @@ impl From<&RequestMonitoringInner> for RequestData { .as_ref() .and_then(|options| serde_json::to_string(&Options { options }).ok()), auth_method: value.auth_method.as_ref().map(|x| match x { - super::AuthMethod::Web => "web", + super::AuthMethod::ConsoleRedirect => "console_redirect", super::AuthMethod::ScramSha256 => "scram_sha_256", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 75c932e6ab..8762ba874b 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -245,7 +245,7 @@ pub(crate) struct WakeCompute { pub(crate) aux: MetricsAuxInfo, } -/// Async response which concludes the web auth flow. +/// Async response which concludes the console redirect auth flow. /// Also known as `kickResponse` in the console. #[derive(Debug, Deserialize)] pub(crate) struct KickSession<'a> { diff --git a/proxy/src/control_plane/mgmt.rs b/proxy/src/control_plane/mgmt.rs index 5ac3acd28a..2f7359240d 100644 --- a/proxy/src/control_plane/mgmt.rs +++ b/proxy/src/control_plane/mgmt.rs @@ -24,8 +24,8 @@ pub(crate) fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), wai CPLANE_WAITERS.notify(psql_session_id, msg) } -/// Console management API listener task. -/// It spawns console response handlers needed for the web auth. +/// Management API listener task. 
+/// It spawns management response handlers needed for the console redirect auth flow. pub async fn task_main(listener: TcpListener) -> anyhow::Result { scopeguard::defer! { info!("mgmt has shut down"); @@ -43,13 +43,13 @@ pub async fn task_main(listener: TcpListener) -> anyhow::Result { tokio::task::spawn( async move { - info!("serving a new console management API connection"); + info!("serving a new management API connection"); // these might be long running connections, have a separate logging for cancelling // on shutdown and other ways of stopping. let cancelled = scopeguard::guard(tracing::Span::current(), |span| { let _e = span.entered(); - info!("console management API task cancelled"); + info!("management API task cancelled"); }); if let Err(e) = handle_connection(socket).await { From 4dfa0c221b1ea4164e4260da8f62bcd98e816920 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 6 Nov 2024 14:10:32 +0000 Subject: [PATCH 173/239] pageserver: ingest pre-serialized batches of values (#9579) ## Problem https://github.com/neondatabase/neon/pull/9524 split the decoding and interpretation step from ingestion. The output of the first phase is a `wal_decoder::models::InterpretedWalRecord`. Before this patch set, that struct contained a list of `Value` instances. We wish to lift the decoding and interpretation step to the safekeeper, but it would be nice if the safekeeper gave us a batch containing the raw data instead of actual values. ## Summary of changes The main goal here is to make `InterpretedWalRecord` hold a raw buffer which contains pre-serialized Values. For this we do: 1. Add a `SerializedValueBatch` type. This is `inmemory_layer::SerializedBatch` with some extra functionality for extension, observing values for shard 0 and tests. 2. Replace `inmemory_layer::SerializedBatch` with `SerializedValueBatch`. 3. Make `DatadirModification` maintain a `SerializedValueBatch`. ### `DatadirModification` changes `DatadirModification` now maintains a `SerializedValueBatch` and extends it as new WAL records come in (to avoid flushing to disk on every record). In turn, this cascaded into a number of modifications to `DatadirModification`: 1. Replace `pending_data_pages` and `pending_zero_data_pages` with `pending_data_batch`. 2. Remove `pending_zero_data_pages` and its cousin `on_wal_record_end`. 3. Rename `pending_bytes` to `pending_metadata_bytes` since this is what it tracks now. 4. Adapt various utility methods like `len`, `approx_pending_bytes` and `has_dirty_data_pages`. Removal of `pending_zero_data_pages` and the optimisation associated with it ((1) and (2)) deserves more detail. Previously, all zero data pages went through `pending_zero_data_pages`. We wrote zero data pages when filling gaps caused by relation extension (case A) and when handling special WAL records (case B). If the same WAL record contained a non-zero write for an entry in `pending_zero_data_pages`, we skipped the zero write. Case A: We handle this differently now. When ingesting the `SerializedValueBatch` associated with one PG WAL record, we identify the gaps and fill them in one go. Essentially, we move from a per-key process (gaps were filled after each new key) to a per-record process. Hence, the optimisation is not required anymore. Case B: When the handling of a special record needs to zero out a key, it just adds that to the current batch. I inspected the code, and I don't think the optimisation kicked in here.
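To make the new flow concrete, here is a small, hypothetical usage sketch (not part of the diff). It assumes the `wal_decoder`, `pageserver_api`, `utils` and `bytes` workspace crates are available, and the `RelTag`, block numbers and LSN are invented for illustration. It shows a per-record batch being folded into an accumulating batch and a relation-extension gap being zero-filled in a single per-record pass, roughly what `DatadirModification` does after this change.

```rust
// Hypothetical usage sketch of SerializedValueBatch; the RelTag, block numbers
// and LSN below are made up for illustration.
use bytes::Bytes;
use pageserver_api::key::rel_block_to_key;
use pageserver_api::keyspace::KeySpace;
use pageserver_api::reltag::RelTag;
use pageserver_api::value::Value;
use utils::lsn::Lsn;
use wal_decoder::serialized_batch::SerializedValueBatch;

fn example() {
    // forknum 0 = main fork; the OIDs are arbitrary.
    let rel = RelTag { spcnode: 1663, dbnode: 16384, relnode: 16385, forknum: 0 };
    let lsn = Lsn(0x10);

    // One batch per decoded WAL record; `put` serializes the value into the raw
    // buffer and records its (key, LSN, offset, len) metadata.
    let mut per_record = SerializedValueBatch::default();
    per_record.put(
        rel_block_to_key(rel, 4).to_compact(),
        Value::Image(Bytes::from(vec![0u8; 8192])),
        lsn,
    );

    // DatadirModification-style accumulation: merge per-record batches instead of
    // flushing after every record.
    let mut pending = SerializedValueBatch::default();
    pending.extend(per_record);

    // Case A from above: block 4 implicitly extended the relation, so blocks 1..4
    // are gaps that get zero images appended in one per-record pass.
    let gaps = KeySpace {
        ranges: vec![rel_block_to_key(rel, 1)..rel_block_to_key(rel, 4)],
    };
    pending.zero_gaps(vec![(gaps, lsn)]);

    assert_eq!(pending.len(), 4); // one real image + three zero pages
}
```

Keeping the values pre-serialized means the pageserver can write the raw buffer straight to the ephemeral file, while shard 0 can still observe non-local keys for relation size tracking without carrying their payloads.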
--- libs/wal_decoder/Cargo.toml | 2 +- libs/wal_decoder/src/decoder.rs | 113 +-- libs/wal_decoder/src/lib.rs | 1 + libs/wal_decoder/src/models.rs | 16 +- libs/wal_decoder/src/serialized_batch.rs | 862 ++++++++++++++++++ pageserver/benches/bench_ingest.rs | 6 +- pageserver/src/pgdatadir_mapping.rs | 361 ++++++-- .../tenant/storage_layer/inmemory_layer.rs | 116 +-- pageserver/src/tenant/timeline.rs | 35 +- .../walreceiver/walreceiver_connection.rs | 14 +- pageserver/src/walingest.rs | 84 +- 11 files changed, 1247 insertions(+), 363 deletions(-) create mode 100644 libs/wal_decoder/src/serialized_batch.rs diff --git a/libs/wal_decoder/Cargo.toml b/libs/wal_decoder/Cargo.toml index 3f80f8fcdb..c8c0f4c990 100644 --- a/libs/wal_decoder/Cargo.toml +++ b/libs/wal_decoder/Cargo.toml @@ -5,7 +5,7 @@ edition.workspace = true license.workspace = true [features] -testing = [] +testing = ["pageserver_api/testing"] [dependencies] anyhow.workspace = true diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 780fce3d69..684718d220 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -2,15 +2,13 @@ //! raw bytes which represent a raw Postgres WAL record. use crate::models::*; -use bytes::{Buf, Bytes, BytesMut}; -use pageserver_api::key::rel_block_to_key; -use pageserver_api::record::NeonWalRecord; +use crate::serialized_batch::SerializedValueBatch; +use bytes::{Buf, Bytes}; use pageserver_api::reltag::{RelTag, SlruKind}; use pageserver_api::shard::ShardIdentity; -use pageserver_api::value::Value; +use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::VISIBILITYMAP_FORKNUM; use postgres_ffi::walrecord::*; -use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; use utils::lsn::Lsn; impl InterpretedWalRecord { @@ -21,11 +19,12 @@ impl InterpretedWalRecord { pub fn from_bytes_filtered( buf: Bytes, shard: &ShardIdentity, - lsn: Lsn, + record_end_lsn: Lsn, pg_version: u32, ) -> anyhow::Result { let mut decoded = DecodedWALRecord::default(); decode_wal_record(buf, &mut decoded, pg_version)?; + let xid = decoded.xl_xid; let flush_uncommitted = if decoded.is_dbase_create_copy(pg_version) { FlushUncommittedRecords::Yes @@ -33,96 +32,20 @@ impl InterpretedWalRecord { FlushUncommittedRecords::No }; - let metadata_record = MetadataRecord::from_decoded(&decoded, lsn, pg_version)?; - - let mut blocks = Vec::default(); - for blk in decoded.blocks.iter() { - let rel = RelTag { - spcnode: blk.rnode_spcnode, - dbnode: blk.rnode_dbnode, - relnode: blk.rnode_relnode, - forknum: blk.forknum, - }; - - let key = rel_block_to_key(rel, blk.blkno); - - if !key.is_valid_key_on_write_path() { - anyhow::bail!("Unsupported key decoded at LSN {}: {}", lsn, key); - } - - let key_is_local = shard.is_key_local(&key); - - tracing::debug!( - lsn=%lsn, - key=%key, - "ingest: shard decision {}", - if !key_is_local { "drop" } else { "keep" }, - ); - - if !key_is_local { - if shard.is_shard_zero() { - // Shard 0 tracks relation sizes. Although we will not store this block, we will observe - // its blkno in case it implicitly extends a relation. - blocks.push((key.to_compact(), None)); - } - - continue; - } - - // Instead of storing full-page-image WAL record, - // it is better to store extracted image: we can skip wal-redo - // in this case. Also some FPI records may contain multiple (up to 32) pages, - // so them have to be copied multiple times. 
- // - let value = if blk.apply_image - && blk.has_image - && decoded.xl_rmid == pg_constants::RM_XLOG_ID - && (decoded.xl_info == pg_constants::XLOG_FPI - || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) - // compression of WAL is not yet supported: fall back to storing the original WAL record - && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version) - // do not materialize null pages because them most likely be soon replaced with real data - && blk.bimg_len != 0 - { - // Extract page image from FPI record - let img_len = blk.bimg_len as usize; - let img_offs = blk.bimg_offset as usize; - let mut image = BytesMut::with_capacity(BLCKSZ as usize); - // TODO(vlad): skip the copy - image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); - - if blk.hole_length != 0 { - let tail = image.split_off(blk.hole_offset as usize); - image.resize(image.len() + blk.hole_length as usize, 0u8); - image.unsplit(tail); - } - // - // Match the logic of XLogReadBufferForRedoExtended: - // The page may be uninitialized. If so, we can't set the LSN because - // that would corrupt the page. - // - if !page_is_new(&image) { - page_set_lsn(&mut image, lsn) - } - assert_eq!(image.len(), BLCKSZ as usize); - - Value::Image(image.freeze()) - } else { - Value::WalRecord(NeonWalRecord::Postgres { - will_init: blk.will_init || blk.apply_image, - rec: decoded.record.clone(), - }) - }; - - blocks.push((key.to_compact(), Some(value))); - } + let metadata_record = MetadataRecord::from_decoded(&decoded, record_end_lsn, pg_version)?; + let batch = SerializedValueBatch::from_decoded_filtered( + decoded, + shard, + record_end_lsn, + pg_version, + )?; Ok(InterpretedWalRecord { metadata_record, - blocks, - lsn, + batch, + end_lsn: record_end_lsn, flush_uncommitted, - xid: decoded.xl_xid, + xid, }) } } @@ -130,7 +53,7 @@ impl InterpretedWalRecord { impl MetadataRecord { fn from_decoded( decoded: &DecodedWALRecord, - lsn: Lsn, + record_end_lsn: Lsn, pg_version: u32, ) -> anyhow::Result> { // Note: this doesn't actually copy the bytes since @@ -151,7 +74,7 @@ impl MetadataRecord { Ok(None) } pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version), - pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, lsn), + pg_constants::RM_XACT_ID => Self::decode_xact_record(&mut buf, decoded, record_end_lsn), pg_constants::RM_MULTIXACT_ID => { Self::decode_multixact_record(&mut buf, decoded, pg_version) } @@ -163,7 +86,7 @@ impl MetadataRecord { // // Alternatively, one can make the checkpoint part of the subscription protocol // to the pageserver. This should work fine, but can be done at a later point. - pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, lsn), + pg_constants::RM_XLOG_ID => Self::decode_xlog_record(&mut buf, decoded, record_end_lsn), pg_constants::RM_LOGICALMSG_ID => { Self::decode_logical_message_record(&mut buf, decoded) } diff --git a/libs/wal_decoder/src/lib.rs b/libs/wal_decoder/src/lib.rs index 05349d17c9..a8a26956e6 100644 --- a/libs/wal_decoder/src/lib.rs +++ b/libs/wal_decoder/src/lib.rs @@ -1,2 +1,3 @@ pub mod decoder; pub mod models; +pub mod serialized_batch; diff --git a/libs/wal_decoder/src/models.rs b/libs/wal_decoder/src/models.rs index 92b66fcefd..5d90eeb69c 100644 --- a/libs/wal_decoder/src/models.rs +++ b/libs/wal_decoder/src/models.rs @@ -2,7 +2,8 @@ //! ready for the pageserver to interpret. They are derived from the original //! WAL records, so that each struct corresponds closely to one WAL record of //! 
a specific kind. They contain the same information as the original WAL records, -//! just decoded into structs and fields for easier access. +//! but the values are already serialized in a [`SerializedValueBatch`], which +//! is the format that the pageserver is expecting them in. //! //! The ingestion code uses these structs to help with parsing the WAL records, //! and it splits them into a stream of modifications to the key-value pairs that @@ -25,9 +26,7 @@ //! |--> write to KV store within the pageserver use bytes::Bytes; -use pageserver_api::key::CompactKey; use pageserver_api::reltag::{RelTag, SlruKind}; -use pageserver_api::value::Value; use postgres_ffi::walrecord::{ XlMultiXactCreate, XlMultiXactTruncate, XlRelmapUpdate, XlReploriginDrop, XlReploriginSet, XlSmgrTruncate, XlXactParsedRecord, @@ -35,6 +34,8 @@ use postgres_ffi::walrecord::{ use postgres_ffi::{Oid, TransactionId}; use utils::lsn::Lsn; +use crate::serialized_batch::SerializedValueBatch; + pub enum FlushUncommittedRecords { Yes, No, @@ -45,12 +46,11 @@ pub struct InterpretedWalRecord { /// Optional metadata record - may cause writes to metadata keys /// in the storage engine pub metadata_record: Option, - /// Images or deltas for blocks modified in the original WAL record. - /// The [`Value`] is optional to avoid sending superfluous data to - /// shard 0 for relation size tracking. - pub blocks: Vec<(CompactKey, Option)>, + /// A pre-serialized batch along with the required metadata for ingestion + /// by the pageserver + pub batch: SerializedValueBatch, /// Byte offset within WAL for the end of the original PG WAL record - pub lsn: Lsn, + pub end_lsn: Lsn, /// Whether to flush all uncommitted modifications to the storage engine /// before ingesting this record. This is currently only used for legacy PG /// database creations which read pages from a template database. Such WAL diff --git a/libs/wal_decoder/src/serialized_batch.rs b/libs/wal_decoder/src/serialized_batch.rs new file mode 100644 index 0000000000..8f33291023 --- /dev/null +++ b/libs/wal_decoder/src/serialized_batch.rs @@ -0,0 +1,862 @@ +//! This module implements batch type for serialized [`pageserver_api::value::Value`] +//! instances. Each batch contains a raw buffer (serialized values) +//! and a list of metadata for each (key, LSN) tuple present in the batch. +//! +//! Such batches are created from decoded PG wal records and ingested +//! by the pageserver by writing directly to the ephemeral file. + +use std::collections::BTreeSet; + +use bytes::{Bytes, BytesMut}; +use pageserver_api::key::rel_block_to_key; +use pageserver_api::keyspace::KeySpace; +use pageserver_api::record::NeonWalRecord; +use pageserver_api::reltag::RelTag; +use pageserver_api::shard::ShardIdentity; +use pageserver_api::{key::CompactKey, value::Value}; +use postgres_ffi::walrecord::{DecodedBkpBlock, DecodedWALRecord}; +use postgres_ffi::{page_is_new, page_set_lsn, pg_constants, BLCKSZ}; +use utils::bin_ser::BeSer; +use utils::lsn::Lsn; + +use pageserver_api::key::Key; + +static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); + +/// Accompanying metadata for the batch +/// A value may be serialized and stored into the batch or just "observed". +/// Shard 0 currently "observes" all values in order to accurately track +/// relation sizes. In the case of "observed" values, we only need to know +/// the key and LSN, so two types of metadata are supported to save on network +/// bandwidth. 
+pub enum ValueMeta { + Serialized(SerializedValueMeta), + Observed(ObservedValueMeta), +} + +impl ValueMeta { + pub fn key(&self) -> CompactKey { + match self { + Self::Serialized(ser) => ser.key, + Self::Observed(obs) => obs.key, + } + } + + pub fn lsn(&self) -> Lsn { + match self { + Self::Serialized(ser) => ser.lsn, + Self::Observed(obs) => obs.lsn, + } + } +} + +/// Wrapper around [`ValueMeta`] that implements ordering by +/// (key, LSN) tuples +struct OrderedValueMeta(ValueMeta); + +impl Ord for OrderedValueMeta { + fn cmp(&self, other: &Self) -> std::cmp::Ordering { + (self.0.key(), self.0.lsn()).cmp(&(other.0.key(), other.0.lsn())) + } +} + +impl PartialOrd for OrderedValueMeta { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl PartialEq for OrderedValueMeta { + fn eq(&self, other: &Self) -> bool { + (self.0.key(), self.0.lsn()) == (other.0.key(), other.0.lsn()) + } +} + +impl Eq for OrderedValueMeta {} + +/// Metadata for a [`Value`] serialized into the batch. +pub struct SerializedValueMeta { + pub key: CompactKey, + pub lsn: Lsn, + /// Starting offset of the value for the (key, LSN) tuple + /// in [`SerializedValueBatch::raw`] + pub batch_offset: u64, + pub len: usize, + pub will_init: bool, +} + +/// Metadata for a [`Value`] observed by the batch +pub struct ObservedValueMeta { + pub key: CompactKey, + pub lsn: Lsn, +} + +/// Batch of serialized [`Value`]s. +pub struct SerializedValueBatch { + /// [`Value`]s serialized in EphemeralFile's native format, + /// ready for disk write by the pageserver + pub raw: Vec, + + /// Metadata to make sense of the bytes in [`Self::raw`] + /// and represent "observed" values. + /// + /// Invariant: Metadata entries for any given key are ordered + /// by LSN. Note that entries for a key do not have to be contiguous. + pub metadata: Vec, + + /// The highest LSN of any value in the batch + pub max_lsn: Lsn, + + /// Number of values encoded by [`Self::raw`] + pub len: usize, +} + +impl Default for SerializedValueBatch { + fn default() -> Self { + Self { + raw: Default::default(), + metadata: Default::default(), + max_lsn: Lsn(0), + len: 0, + } + } +} + +impl SerializedValueBatch { + /// Build a batch of serialized values from a decoded PG WAL record + /// + /// The batch will only contain values for keys targeting the specifiec + /// shard. Shard 0 is a special case, where any keys that don't belong to + /// it are "observed" by the batch (i.e. present in [`SerializedValueBatch::metadata`], + /// but absent from the raw buffer [`SerializedValueBatch::raw`]). + pub(crate) fn from_decoded_filtered( + decoded: DecodedWALRecord, + shard: &ShardIdentity, + record_end_lsn: Lsn, + pg_version: u32, + ) -> anyhow::Result { + // First determine how big the buffer needs to be and allocate it up-front. + // This duplicates some of the work below, but it's empirically much faster. 
+ let estimated_buffer_size = Self::estimate_buffer_size(&decoded, shard, pg_version); + let mut buf = Vec::::with_capacity(estimated_buffer_size); + + let mut metadata: Vec = Vec::with_capacity(decoded.blocks.len()); + let mut max_lsn: Lsn = Lsn(0); + let mut len: usize = 0; + for blk in decoded.blocks.iter() { + let relative_off = buf.len() as u64; + + let rel = RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum, + }; + + let key = rel_block_to_key(rel, blk.blkno); + + if !key.is_valid_key_on_write_path() { + anyhow::bail!("Unsupported key decoded at LSN {}: {}", record_end_lsn, key); + } + + let key_is_local = shard.is_key_local(&key); + + tracing::debug!( + lsn=%record_end_lsn, + key=%key, + "ingest: shard decision {}", + if !key_is_local { "drop" } else { "keep" }, + ); + + if !key_is_local { + if shard.is_shard_zero() { + // Shard 0 tracks relation sizes. Although we will not store this block, we will observe + // its blkno in case it implicitly extends a relation. + metadata.push(ValueMeta::Observed(ObservedValueMeta { + key: key.to_compact(), + lsn: record_end_lsn, + })) + } + + continue; + } + + // Instead of storing full-page-image WAL record, + // it is better to store extracted image: we can skip wal-redo + // in this case. Also some FPI records may contain multiple (up to 32) pages, + // so them have to be copied multiple times. + // + let val = if Self::block_is_image(&decoded, blk, pg_version) { + // Extract page image from FPI record + let img_len = blk.bimg_len as usize; + let img_offs = blk.bimg_offset as usize; + let mut image = BytesMut::with_capacity(BLCKSZ as usize); + // TODO(vlad): skip the copy + image.extend_from_slice(&decoded.record[img_offs..img_offs + img_len]); + + if blk.hole_length != 0 { + let tail = image.split_off(blk.hole_offset as usize); + image.resize(image.len() + blk.hole_length as usize, 0u8); + image.unsplit(tail); + } + // + // Match the logic of XLogReadBufferForRedoExtended: + // The page may be uninitialized. If so, we can't set the LSN because + // that would corrupt the page. + // + if !page_is_new(&image) { + page_set_lsn(&mut image, record_end_lsn) + } + assert_eq!(image.len(), BLCKSZ as usize); + + Value::Image(image.freeze()) + } else { + Value::WalRecord(NeonWalRecord::Postgres { + will_init: blk.will_init || blk.apply_image, + rec: decoded.record.clone(), + }) + }; + + val.ser_into(&mut buf) + .expect("Writing into in-memory buffer is infallible"); + + let val_ser_size = buf.len() - relative_off as usize; + + metadata.push(ValueMeta::Serialized(SerializedValueMeta { + key: key.to_compact(), + lsn: record_end_lsn, + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + })); + max_lsn = std::cmp::max(max_lsn, record_end_lsn); + len += 1; + } + + if cfg!(any(debug_assertions, test)) { + let batch = Self { + raw: buf, + metadata, + max_lsn, + len, + }; + + batch.validate_lsn_order(); + + return Ok(batch); + } + + Ok(Self { + raw: buf, + metadata, + max_lsn, + len, + }) + } + + /// Look into the decoded PG WAL record and determine + /// roughly how large the buffer for serialized values needs to be. 
+ fn estimate_buffer_size( + decoded: &DecodedWALRecord, + shard: &ShardIdentity, + pg_version: u32, + ) -> usize { + let mut estimate: usize = 0; + + for blk in decoded.blocks.iter() { + let rel = RelTag { + spcnode: blk.rnode_spcnode, + dbnode: blk.rnode_dbnode, + relnode: blk.rnode_relnode, + forknum: blk.forknum, + }; + + let key = rel_block_to_key(rel, blk.blkno); + + if !shard.is_key_local(&key) { + continue; + } + + if Self::block_is_image(decoded, blk, pg_version) { + // 4 bytes for the Value::Image discriminator + // 8 bytes for encoding the size of the buffer + // BLCKSZ for the raw image + estimate += (4 + 8 + BLCKSZ) as usize; + } else { + // 4 bytes for the Value::WalRecord discriminator + // 4 bytes for the NeonWalRecord::Postgres discriminator + // 1 bytes for NeonWalRecord::Postgres::will_init + // 8 bytes for encoding the size of the buffer + // length of the raw record + estimate += 8 + 1 + 8 + decoded.record.len(); + } + } + + estimate + } + + fn block_is_image(decoded: &DecodedWALRecord, blk: &DecodedBkpBlock, pg_version: u32) -> bool { + blk.apply_image + && blk.has_image + && decoded.xl_rmid == pg_constants::RM_XLOG_ID + && (decoded.xl_info == pg_constants::XLOG_FPI + || decoded.xl_info == pg_constants::XLOG_FPI_FOR_HINT) + // compression of WAL is not yet supported: fall back to storing the original WAL record + && !postgres_ffi::bkpimage_is_compressed(blk.bimg_info, pg_version) + // do not materialize null pages because them most likely be soon replaced with real data + && blk.bimg_len != 0 + } + + /// Encode a list of values and metadata into a serialized batch + /// + /// This is used by the pageserver ingest code to conveniently generate + /// batches for metadata writes. + pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> Self { + // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by + // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] + let buffer_size = batch.iter().map(|i| i.2).sum::(); + let mut buf = Vec::::with_capacity(buffer_size); + + let mut metadata: Vec = Vec::with_capacity(batch.len()); + let mut max_lsn: Lsn = Lsn(0); + let len = batch.len(); + for (key, lsn, val_ser_size, val) in batch { + let relative_off = buf.len() as u64; + + val.ser_into(&mut buf) + .expect("Writing into in-memory buffer is infallible"); + + metadata.push(ValueMeta::Serialized(SerializedValueMeta { + key, + lsn, + batch_offset: relative_off, + len: val_ser_size, + will_init: val.will_init(), + })); + max_lsn = std::cmp::max(max_lsn, lsn); + } + + // Assert that we didn't do any extra allocations while building buffer. + debug_assert!(buf.len() <= buffer_size); + + if cfg!(any(debug_assertions, test)) { + let batch = Self { + raw: buf, + metadata, + max_lsn, + len, + }; + + batch.validate_lsn_order(); + + return batch; + } + + Self { + raw: buf, + metadata, + max_lsn, + len, + } + } + + /// Add one value to the batch + /// + /// This is used by the pageserver ingest code to include metadata block + /// updates for a single key. 
+ pub fn put(&mut self, key: CompactKey, value: Value, lsn: Lsn) { + let relative_off = self.raw.len() as u64; + value.ser_into(&mut self.raw).unwrap(); + + let val_ser_size = self.raw.len() - relative_off as usize; + self.metadata + .push(ValueMeta::Serialized(SerializedValueMeta { + key, + lsn, + batch_offset: relative_off, + len: val_ser_size, + will_init: value.will_init(), + })); + + self.max_lsn = std::cmp::max(self.max_lsn, lsn); + self.len += 1; + + if cfg!(any(debug_assertions, test)) { + self.validate_lsn_order(); + } + } + + /// Extend with the contents of another batch + /// + /// One batch is generated for each decoded PG WAL record. + /// They are then merged to accumulate reasonably sized writes. + pub fn extend(&mut self, mut other: SerializedValueBatch) { + let extend_batch_start_offset = self.raw.len() as u64; + + self.raw.extend(other.raw); + + // Shift the offsets in the batch we are extending with + other.metadata.iter_mut().for_each(|meta| match meta { + ValueMeta::Serialized(ser) => { + ser.batch_offset += extend_batch_start_offset; + if cfg!(debug_assertions) { + let value_end = ser.batch_offset + ser.len as u64; + assert!((value_end as usize) <= self.raw.len()); + } + } + ValueMeta::Observed(_) => {} + }); + self.metadata.extend(other.metadata); + + self.max_lsn = std::cmp::max(self.max_lsn, other.max_lsn); + + self.len += other.len; + + if cfg!(any(debug_assertions, test)) { + self.validate_lsn_order(); + } + } + + /// Add zero images for the (key, LSN) tuples specified + /// + /// PG versions below 16 do not zero out pages before extending + /// a relation and may leave gaps. Such gaps need to be identified + /// by the pageserver ingest logic and get patched up here. + /// + /// Note that this function does not validate that the gaps have been + /// identified correctly (it does not know relation sizes), so it's up + /// to the call-site to do it properly. + pub fn zero_gaps(&mut self, gaps: Vec<(KeySpace, Lsn)>) { + // Implementation note: + // + // Values within [`SerializedValueBatch::raw`] do not have any ordering requirements, + // but the metadata entries should be ordered properly (see + // [`SerializedValueBatch::metadata`]). + // + // Exploiting this observation we do: + // 1. Drain all the metadata entries into an ordered set. + // The use of a BTreeSet keyed by (Key, Lsn) relies on the observation that Postgres never + // includes more than one update to the same block in the same WAL record. + // 2. For each (key, LSN) gap tuple, append a zero image to the raw buffer + // and add an index entry to the ordered metadata set. + // 3. Drain the ordered set back into a metadata vector + + let mut ordered_metas = self + .metadata + .drain(..) + .map(OrderedValueMeta) + .collect::>(); + for (keyspace, lsn) in gaps { + self.max_lsn = std::cmp::max(self.max_lsn, lsn); + + for gap_range in keyspace.ranges { + let mut key = gap_range.start; + while key != gap_range.end { + let relative_off = self.raw.len() as u64; + + // TODO(vlad): Can we be cheeky and write only one zero image, and + // make all index entries requiring a zero page point to it? + // Alternatively, we can change the index entry format to represent zero pages + // without writing them at all. 
+ Value::Image(ZERO_PAGE.clone()) + .ser_into(&mut self.raw) + .unwrap(); + let val_ser_size = self.raw.len() - relative_off as usize; + + ordered_metas.insert(OrderedValueMeta(ValueMeta::Serialized( + SerializedValueMeta { + key: key.to_compact(), + lsn, + batch_offset: relative_off, + len: val_ser_size, + will_init: true, + }, + ))); + + self.len += 1; + + key = key.next(); + } + } + } + + self.metadata = ordered_metas.into_iter().map(|ord| ord.0).collect(); + + if cfg!(any(debug_assertions, test)) { + self.validate_lsn_order(); + } + } + + /// Checks if the batch is empty + /// + /// A batch is empty when it contains no serialized values. + /// Note that it may still contain observed values. + pub fn is_empty(&self) -> bool { + let empty = self.raw.is_empty(); + + if cfg!(debug_assertions) && empty { + assert!(self + .metadata + .iter() + .all(|meta| matches!(meta, ValueMeta::Observed(_)))); + } + + empty + } + + /// Returns the number of values serialized in the batch + pub fn len(&self) -> usize { + self.len + } + + /// Returns the size of the buffer wrapped by the batch + pub fn buffer_size(&self) -> usize { + self.raw.len() + } + + pub fn updates_key(&self, key: &Key) -> bool { + self.metadata.iter().any(|meta| match meta { + ValueMeta::Serialized(ser) => key.to_compact() == ser.key, + ValueMeta::Observed(_) => false, + }) + } + + pub fn validate_lsn_order(&self) { + use std::collections::HashMap; + + let mut last_seen_lsn_per_key: HashMap = HashMap::default(); + + for meta in self.metadata.iter() { + let lsn = meta.lsn(); + let key = meta.key(); + + if let Some(prev_lsn) = last_seen_lsn_per_key.insert(key, lsn) { + assert!( + lsn >= prev_lsn, + "Ordering violated by {}: {} < {}", + Key::from_compact(key), + lsn, + prev_lsn + ); + } + } + } +} + +#[cfg(all(test, feature = "testing"))] +mod tests { + use super::*; + + fn validate_batch( + batch: &SerializedValueBatch, + values: &[(CompactKey, Lsn, usize, Value)], + gaps: Option<&Vec<(KeySpace, Lsn)>>, + ) { + // Invariant 1: The metadata for a given entry in the batch + // is correct and can be used to deserialize back to the original value. + for (key, lsn, size, value) in values.iter() { + let meta = batch + .metadata + .iter() + .find(|meta| (meta.key(), meta.lsn()) == (*key, *lsn)) + .unwrap(); + let meta = match meta { + ValueMeta::Serialized(ser) => ser, + ValueMeta::Observed(_) => unreachable!(), + }; + + assert_eq!(meta.len, *size); + assert_eq!(meta.will_init, value.will_init()); + + let start = meta.batch_offset as usize; + let end = meta.batch_offset as usize + meta.len; + let value_from_batch = Value::des(&batch.raw[start..end]).unwrap(); + assert_eq!(&value_from_batch, value); + } + + let mut expected_buffer_size: usize = values.iter().map(|(_, _, size, _)| size).sum(); + let mut gap_pages_count: usize = 0; + + // Invariant 2: Zero pages were added for identified gaps and their metadata + // is correct. 
+ if let Some(gaps) = gaps { + for (gap_keyspace, lsn) in gaps { + for gap_range in &gap_keyspace.ranges { + let mut gap_key = gap_range.start; + while gap_key != gap_range.end { + let meta = batch + .metadata + .iter() + .find(|meta| (meta.key(), meta.lsn()) == (gap_key.to_compact(), *lsn)) + .unwrap(); + let meta = match meta { + ValueMeta::Serialized(ser) => ser, + ValueMeta::Observed(_) => unreachable!(), + }; + + let zero_value = Value::Image(ZERO_PAGE.clone()); + let zero_value_size = zero_value.serialized_size().unwrap() as usize; + + assert_eq!(meta.len, zero_value_size); + assert_eq!(meta.will_init, zero_value.will_init()); + + let start = meta.batch_offset as usize; + let end = meta.batch_offset as usize + meta.len; + let value_from_batch = Value::des(&batch.raw[start..end]).unwrap(); + assert_eq!(value_from_batch, zero_value); + + gap_pages_count += 1; + expected_buffer_size += zero_value_size; + gap_key = gap_key.next(); + } + } + } + } + + // Invariant 3: The length of the batch is equal to the number + // of values inserted, plus the number of gap pages. This extends + // to the raw buffer size. + assert_eq!(batch.len(), values.len() + gap_pages_count); + assert_eq!(expected_buffer_size, batch.buffer_size()); + + // Invariant 4: Metadata entries for any given key are sorted in LSN order. + batch.validate_lsn_order(); + } + + #[test] + fn test_creation_from_values() { + const LSN: Lsn = Lsn(0x10); + let key = Key::from_hex("110000000033333333444444445500000001").unwrap(); + + let values = vec![ + ( + key.to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("foo")), + ), + ( + key.next().to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("bar")), + ), + ( + key.to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("baz")), + ), + ( + key.next().next().to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("taz")), + ), + ]; + + let values = values + .into_iter() + .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value)) + .collect::>(); + let batch = SerializedValueBatch::from_values(values.clone()); + + validate_batch(&batch, &values, None); + + assert!(!batch.is_empty()); + } + + #[test] + fn test_put() { + const LSN: Lsn = Lsn(0x10); + let key = Key::from_hex("110000000033333333444444445500000001").unwrap(); + + let values = vec![ + ( + key.to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("foo")), + ), + ( + key.next().to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("bar")), + ), + ]; + + let mut values = values + .into_iter() + .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value)) + .collect::>(); + let mut batch = SerializedValueBatch::from_values(values.clone()); + + validate_batch(&batch, &values, None); + + let value = ( + key.to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("baz")), + ); + let serialized_size = value.2.serialized_size().unwrap() as usize; + let value = (value.0, value.1, serialized_size, value.2); + values.push(value.clone()); + batch.put(value.0, value.3, value.1); + + validate_batch(&batch, &values, None); + + let value = ( + key.next().next().to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("taz")), + ); + let serialized_size = value.2.serialized_size().unwrap() as usize; + let value = (value.0, value.1, serialized_size, value.2); + values.push(value.clone()); + batch.put(value.0, value.3, value.1); + + validate_batch(&batch, &values, None); 
+ } + + #[test] + fn test_extension() { + const LSN: Lsn = Lsn(0x10); + let key = Key::from_hex("110000000033333333444444445500000001").unwrap(); + + let values = vec![ + ( + key.to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("foo")), + ), + ( + key.next().to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("bar")), + ), + ( + key.next().next().to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("taz")), + ), + ]; + + let mut values = values + .into_iter() + .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value)) + .collect::>(); + let mut batch = SerializedValueBatch::from_values(values.clone()); + + let other_values = vec![ + ( + key.to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("foo")), + ), + ( + key.next().to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("bar")), + ), + ( + key.next().next().to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("taz")), + ), + ]; + + let other_values = other_values + .into_iter() + .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value)) + .collect::>(); + let other_batch = SerializedValueBatch::from_values(other_values.clone()); + + values.extend(other_values); + batch.extend(other_batch); + + validate_batch(&batch, &values, None); + } + + #[test] + fn test_gap_zeroing() { + const LSN: Lsn = Lsn(0x10); + let rel_foo_base_key = Key::from_hex("110000000033333333444444445500000001").unwrap(); + + let rel_bar_base_key = { + let mut key = rel_foo_base_key; + key.field4 += 1; + key + }; + + let values = vec![ + ( + rel_foo_base_key.to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("foo1")), + ), + ( + rel_foo_base_key.add(1).to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("foo2")), + ), + ( + rel_foo_base_key.add(5).to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("foo3")), + ), + ( + rel_foo_base_key.add(1).to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("foo4")), + ), + ( + rel_foo_base_key.add(10).to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("foo5")), + ), + ( + rel_foo_base_key.add(11).to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("foo6")), + ), + ( + rel_foo_base_key.add(12).to_compact(), + Lsn(LSN.0 + 0x10), + Value::WalRecord(NeonWalRecord::wal_append("foo7")), + ), + ( + rel_bar_base_key.to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("bar1")), + ), + ( + rel_bar_base_key.add(4).to_compact(), + LSN, + Value::WalRecord(NeonWalRecord::wal_append("bar2")), + ), + ]; + + let values = values + .into_iter() + .map(|(key, lsn, value)| (key, lsn, value.serialized_size().unwrap() as usize, value)) + .collect::>(); + + let mut batch = SerializedValueBatch::from_values(values.clone()); + + let gaps = vec![ + ( + KeySpace { + ranges: vec![ + rel_foo_base_key.add(2)..rel_foo_base_key.add(5), + rel_bar_base_key.add(1)..rel_bar_base_key.add(4), + ], + }, + LSN, + ), + ( + KeySpace { + ranges: vec![rel_foo_base_key.add(6)..rel_foo_base_key.add(10)], + }, + Lsn(LSN.0 + 0x10), + ), + ]; + + batch.zero_gaps(gaps.clone()); + validate_batch(&batch, &values, Some(&gaps)); + } +} diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 0a1ad9cd6b..f6b2a8e031 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -9,7 +9,6 @@ use 
pageserver::{ l0_flush::{L0FlushConfig, L0FlushGlobalState}, page_cache, task_mgr::TaskKind, - tenant::storage_layer::inmemory_layer::SerializedBatch, tenant::storage_layer::InMemoryLayer, virtual_file, }; @@ -18,6 +17,7 @@ use utils::{ bin_ser::BeSer, id::{TenantId, TimelineId}, }; +use wal_decoder::serialized_batch::SerializedValueBatch; // A very cheap hash for generating non-sequential keys. fn murmurhash32(mut h: u32) -> u32 { @@ -102,13 +102,13 @@ async fn ingest( batch.push((key.to_compact(), lsn, data_ser_size, data.clone())); if batch.len() >= BATCH_SIZE { let this_batch = std::mem::take(&mut batch); - let serialized = SerializedBatch::from_values(this_batch).unwrap(); + let serialized = SerializedValueBatch::from_values(this_batch); layer.put_batch(serialized, &ctx).await?; } } if !batch.is_empty() { let this_batch = std::mem::take(&mut batch); - let serialized = SerializedBatch::from_values(this_batch).unwrap(); + let serialized = SerializedValueBatch::from_values(this_batch); layer.put_batch(serialized, &ctx).await?; } layer.freeze(lsn + 1).await; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index dc2dc08b53..7b106569a4 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -24,6 +24,7 @@ use pageserver_api::key::{ use pageserver_api::keyspace::SparseKeySpace; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; +use pageserver_api::shard::ShardIdentity; use pageserver_api::value::Value; use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::BLCKSZ; @@ -38,6 +39,7 @@ use tracing::{debug, trace, warn}; use utils::bin_ser::DeserializeError; use utils::pausable_failpoint; use utils::{bin_ser::BeSer, lsn::Lsn}; +use wal_decoder::serialized_batch::SerializedValueBatch; /// Max delta records appended to the AUX_FILES_KEY (for aux v1). The write path will write a full image once this threshold is reached. pub const MAX_AUX_FILE_DELTAS: usize = 1024; @@ -170,12 +172,11 @@ impl Timeline { tline: self, pending_lsns: Vec::new(), pending_metadata_pages: HashMap::new(), - pending_data_pages: Vec::new(), - pending_zero_data_pages: Default::default(), + pending_data_batch: None, pending_deletions: Vec::new(), pending_nblocks: 0, pending_directory_entries: Vec::new(), - pending_bytes: 0, + pending_metadata_bytes: 0, lsn, } } @@ -1025,21 +1026,14 @@ pub struct DatadirModification<'a> { /// Data writes, ready to be flushed into an ephemeral layer. See [`Self::is_data_key`] for /// which keys are stored here. - pending_data_pages: Vec<(CompactKey, Lsn, usize, Value)>, - - // Sometimes during ingest, for example when extending a relation, we would like to write a zero page. However, - // if we encounter a write from postgres in the same wal record, we will drop this entry. - // - // Unlike other 'pending' fields, this does not last until the next call to commit(): it is flushed - // at the end of each wal record, and all these writes implicitly are at lsn Self::lsn - pending_zero_data_pages: HashSet, + pending_data_batch: Option, /// For special "directory" keys that store key-value maps, track the size of the map /// if it was updated in this modification. pending_directory_entries: Vec<(DirectoryKind, usize)>, - /// An **approximation** of how large our EphemeralFile write will be when committed. - pending_bytes: usize, + /// An **approximation** of how many metadata bytes will be written to the EphemeralFile. 
+ pending_metadata_bytes: usize, } impl<'a> DatadirModification<'a> { @@ -1054,11 +1048,17 @@ impl<'a> DatadirModification<'a> { } pub(crate) fn approx_pending_bytes(&self) -> usize { - self.pending_bytes + self.pending_data_batch + .as_ref() + .map_or(0, |b| b.buffer_size()) + + self.pending_metadata_bytes } - pub(crate) fn has_dirty_data_pages(&self) -> bool { - (!self.pending_data_pages.is_empty()) || (!self.pending_zero_data_pages.is_empty()) + pub(crate) fn has_dirty_data(&self) -> bool { + !self + .pending_data_batch + .as_ref() + .map_or(true, |b| b.is_empty()) } /// Set the current lsn @@ -1070,9 +1070,6 @@ impl<'a> DatadirModification<'a> { self.lsn ); - // If we are advancing LSN, then state from previous wal record should have been flushed. - assert!(self.pending_zero_data_pages.is_empty()); - if lsn > self.lsn { self.pending_lsns.push(self.lsn); self.lsn = lsn; @@ -1147,6 +1144,107 @@ impl<'a> DatadirModification<'a> { Ok(()) } + /// Creates a relation if it is not already present. + /// Returns the current size of the relation + pub(crate) async fn create_relation_if_required( + &mut self, + rel: RelTag, + ctx: &RequestContext, + ) -> Result { + // Get current size and put rel creation if rel doesn't exist + // + // NOTE: we check the cache first even though get_rel_exists and get_rel_size would + // check the cache too. This is because eagerly checking the cache results in + // less work overall and 10% better performance. It's more work on cache miss + // but cache miss is rare. + if let Some(nblocks) = self.tline.get_cached_rel_size(&rel, self.get_lsn()) { + Ok(nblocks) + } else if !self + .tline + .get_rel_exists(rel, Version::Modified(self), ctx) + .await? + { + // create it with 0 size initially, the logic below will extend it + self.put_rel_creation(rel, 0, ctx) + .await + .context("Relation Error")?; + Ok(0) + } else { + self.tline + .get_rel_size(rel, Version::Modified(self), ctx) + .await + } + } + + /// Given a block number for a relation (which represents a newly written block), + /// the previous block count of the relation, and the shard info, find the gaps + /// that were created by the newly written block if any. 
+ fn find_gaps( + rel: RelTag, + blkno: u32, + previous_nblocks: u32, + shard: &ShardIdentity, + ) -> Option { + let mut key = rel_block_to_key(rel, blkno); + let mut gap_accum = None; + + for gap_blkno in previous_nblocks..blkno { + key.field6 = gap_blkno; + + if shard.get_shard_number(&key) != shard.number { + continue; + } + + gap_accum + .get_or_insert_with(KeySpaceAccum::new) + .add_key(key); + } + + gap_accum.map(|accum| accum.to_keyspace()) + } + + pub async fn ingest_batch( + &mut self, + mut batch: SerializedValueBatch, + // TODO(vlad): remove this argument and replace the shard check with is_key_local + shard: &ShardIdentity, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + let mut gaps_at_lsns = Vec::default(); + + for meta in batch.metadata.iter() { + let (rel, blkno) = Key::from_compact(meta.key()).to_rel_block()?; + let new_nblocks = blkno + 1; + + let old_nblocks = self.create_relation_if_required(rel, ctx).await?; + if new_nblocks > old_nblocks { + self.put_rel_extend(rel, new_nblocks, ctx).await?; + } + + if let Some(gaps) = Self::find_gaps(rel, blkno, old_nblocks, shard) { + gaps_at_lsns.push((gaps, meta.lsn())); + } + } + + if !gaps_at_lsns.is_empty() { + batch.zero_gaps(gaps_at_lsns); + } + + match self.pending_data_batch.as_mut() { + Some(pending_batch) => { + pending_batch.extend(batch); + } + None if !batch.is_empty() => { + self.pending_data_batch = Some(batch); + } + None => { + // Nothing to initialize the batch with + } + } + + Ok(()) + } + /// Put a new page version that can be constructed from a WAL record /// /// NOTE: this will *not* implicitly extend the relation, if the page is beyond the @@ -1229,8 +1327,13 @@ impl<'a> DatadirModification<'a> { self.lsn ); } - self.pending_zero_data_pages.insert(key.to_compact()); - self.pending_bytes += ZERO_PAGE.len(); + + let batch = self + .pending_data_batch + .get_or_insert_with(SerializedValueBatch::default); + + batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn); + Ok(()) } @@ -1248,17 +1351,14 @@ impl<'a> DatadirModification<'a> { self.lsn ); } - self.pending_zero_data_pages.insert(key.to_compact()); - self.pending_bytes += ZERO_PAGE.len(); - Ok(()) - } - /// Call this at the end of each WAL record. - pub(crate) fn on_record_end(&mut self) { - let pending_zero_data_pages = std::mem::take(&mut self.pending_zero_data_pages); - for key in pending_zero_data_pages { - self.put_data(key, Value::Image(ZERO_PAGE.clone())); - } + let batch = self + .pending_data_batch + .get_or_insert_with(SerializedValueBatch::default); + + batch.put(key.to_compact(), Value::Image(ZERO_PAGE.clone()), self.lsn); + + Ok(()) } /// Store a relmapper file (pg_filenode.map) in the repository @@ -1750,12 +1850,17 @@ impl<'a> DatadirModification<'a> { let mut writer = self.tline.writer().await; // Flush relation and SLRU data blocks, keep metadata. - let pending_data_pages = std::mem::take(&mut self.pending_data_pages); + if let Some(batch) = self.pending_data_batch.take() { + tracing::debug!( + "Flushing batch with max_lsn={}. Last record LSN is {}", + batch.max_lsn, + self.tline.get_last_record_lsn() + ); - // This bails out on first error without modifying pending_updates. - // That's Ok, cf this function's doc comment. - writer.put_batch(pending_data_pages, ctx).await?; - self.pending_bytes = 0; + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. 
+ writer.put_batch(batch, ctx).await?; + } if pending_nblocks != 0 { writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ)); @@ -1775,9 +1880,6 @@ impl<'a> DatadirModification<'a> { /// All the modifications in this atomic update are stamped by the specified LSN. /// pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> { - // Commit should never be called mid-wal-record - assert!(self.pending_zero_data_pages.is_empty()); - let mut writer = self.tline.writer().await; let pending_nblocks = self.pending_nblocks; @@ -1785,21 +1887,49 @@ impl<'a> DatadirModification<'a> { // Ordering: the items in this batch do not need to be in any global order, but values for // a particular Key must be in Lsn order relative to one another. InMemoryLayer relies on - // this to do efficient updates to its index. - let mut write_batch = std::mem::take(&mut self.pending_data_pages); + // this to do efficient updates to its index. See [`wal_decoder::serialized_batch`] for + // more details. - write_batch.extend( - self.pending_metadata_pages + let metadata_batch = { + let pending_meta = self + .pending_metadata_pages .drain() .flat_map(|(key, values)| { values .into_iter() .map(move |(lsn, value_size, value)| (key, lsn, value_size, value)) - }), - ); + }) + .collect::>(); - if !write_batch.is_empty() { - writer.put_batch(write_batch, ctx).await?; + if pending_meta.is_empty() { + None + } else { + Some(SerializedValueBatch::from_values(pending_meta)) + } + }; + + let data_batch = self.pending_data_batch.take(); + + let maybe_batch = match (data_batch, metadata_batch) { + (Some(mut data), Some(metadata)) => { + data.extend(metadata); + Some(data) + } + (Some(data), None) => Some(data), + (None, Some(metadata)) => Some(metadata), + (None, None) => None, + }; + + if let Some(batch) = maybe_batch { + tracing::debug!( + "Flushing batch with max_lsn={}. Last record LSN is {}", + batch.max_lsn, + self.tline.get_last_record_lsn() + ); + + // This bails out on first error without modifying pending_updates. + // That's Ok, cf this function's doc comment. + writer.put_batch(batch, ctx).await?; } if !self.pending_deletions.is_empty() { @@ -1809,6 +1939,9 @@ impl<'a> DatadirModification<'a> { self.pending_lsns.push(self.lsn); for pending_lsn in self.pending_lsns.drain(..) { + // TODO(vlad): pretty sure the comment below is not valid anymore + // and we can call finish write with the latest LSN + // // Ideally, we should be able to call writer.finish_write() only once // with the highest LSN. However, the last_record_lsn variable in the // timeline keeps track of the latest LSN and the immediate previous LSN @@ -1824,14 +1957,14 @@ impl<'a> DatadirModification<'a> { writer.update_directory_entries_count(kind, count as u64); } - self.pending_bytes = 0; + self.pending_metadata_bytes = 0; Ok(()) } pub(crate) fn len(&self) -> usize { self.pending_metadata_pages.len() - + self.pending_data_pages.len() + + self.pending_data_batch.as_ref().map_or(0, |b| b.len()) + self.pending_deletions.len() } @@ -1873,11 +2006,10 @@ impl<'a> DatadirModification<'a> { // modifications before ingesting DB create operations, which are the only kind that reads // data pages during ingest. 
if cfg!(debug_assertions) { - for (dirty_key, _, _, _) in &self.pending_data_pages { - debug_assert!(&key.to_compact() != dirty_key); - } - - debug_assert!(!self.pending_zero_data_pages.contains(&key.to_compact())) + assert!(!self + .pending_data_batch + .as_ref() + .map_or(false, |b| b.updates_key(&key))); } } @@ -1895,18 +2027,10 @@ impl<'a> DatadirModification<'a> { } fn put_data(&mut self, key: CompactKey, val: Value) { - let val_serialized_size = val.serialized_size().unwrap() as usize; - - // If this page was previously zero'd in the same WalRecord, then drop the previous zero page write. This - // is an optimization that avoids persisting both the zero page generated by us (e.g. during a relation extend), - // and the subsequent postgres-originating write - if self.pending_zero_data_pages.remove(&key) { - self.pending_bytes -= ZERO_PAGE.len(); - } - - self.pending_bytes += val_serialized_size; - self.pending_data_pages - .push((key, self.lsn, val_serialized_size, val)) + let batch = self + .pending_data_batch + .get_or_insert_with(SerializedValueBatch::default); + batch.put(key, val, self.lsn); } fn put_metadata(&mut self, key: CompactKey, val: Value) { @@ -1914,10 +2038,10 @@ impl<'a> DatadirModification<'a> { // Replace the previous value if it exists at the same lsn if let Some((last_lsn, last_value_ser_size, last_value)) = values.last_mut() { if *last_lsn == self.lsn { - // Update the pending_bytes contribution from this entry, and update the serialized size in place - self.pending_bytes -= *last_value_ser_size; + // Update the pending_metadata_bytes contribution from this entry, and update the serialized size in place + self.pending_metadata_bytes -= *last_value_ser_size; *last_value_ser_size = val.serialized_size().unwrap() as usize; - self.pending_bytes += *last_value_ser_size; + self.pending_metadata_bytes += *last_value_ser_size; // Use the latest value, this replaces any earlier write to the same (key,lsn), such as much // have been generated by synthesized zero page writes prior to the first real write to a page. @@ -1927,8 +2051,12 @@ impl<'a> DatadirModification<'a> { } let val_serialized_size = val.serialized_size().unwrap() as usize; - self.pending_bytes += val_serialized_size; + self.pending_metadata_bytes += val_serialized_size; values.push((self.lsn, val_serialized_size, val)); + + if key == CHECKPOINT_KEY.to_compact() { + tracing::debug!("Checkpoint key added to pending with size {val_serialized_size}"); + } } fn delete(&mut self, key_range: Range) { @@ -2037,7 +2165,11 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]); #[cfg(test)] mod tests { use hex_literal::hex; - use utils::id::TimelineId; + use pageserver_api::{models::ShardParameters, shard::ShardStripeSize}; + use utils::{ + id::TimelineId, + shard::{ShardCount, ShardNumber}, + }; use super::*; @@ -2091,6 +2223,93 @@ mod tests { Ok(()) } + #[test] + fn gap_finding() { + let rel = RelTag { + spcnode: 1663, + dbnode: 208101, + relnode: 2620, + forknum: 0, + }; + let base_blkno = 1; + + let base_key = rel_block_to_key(rel, base_blkno); + let before_base_key = rel_block_to_key(rel, base_blkno - 1); + + let shard = ShardIdentity::unsharded(); + + let mut previous_nblocks = 0; + for i in 0..10 { + let crnt_blkno = base_blkno + i; + let gaps = DatadirModification::find_gaps(rel, crnt_blkno, previous_nblocks, &shard); + + previous_nblocks = crnt_blkno + 1; + + if i == 0 { + // The first block we write is 1, so we should find the gap. 
+ assert_eq!(gaps.unwrap(), KeySpace::single(before_base_key..base_key)); + } else { + assert!(gaps.is_none()); + } + } + + // This is an update to an already existing block. No gaps here. + let update_blkno = 5; + let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard); + assert!(gaps.is_none()); + + // This is an update past the current end block. + let after_gap_blkno = 20; + let gaps = DatadirModification::find_gaps(rel, after_gap_blkno, previous_nblocks, &shard); + + let gap_start_key = rel_block_to_key(rel, previous_nblocks); + let after_gap_key = rel_block_to_key(rel, after_gap_blkno); + assert_eq!( + gaps.unwrap(), + KeySpace::single(gap_start_key..after_gap_key) + ); + } + + #[test] + fn sharded_gap_finding() { + let rel = RelTag { + spcnode: 1663, + dbnode: 208101, + relnode: 2620, + forknum: 0, + }; + + let first_blkno = 6; + + // This shard will get the even blocks + let shard = ShardIdentity::from_params( + ShardNumber(0), + &ShardParameters { + count: ShardCount(2), + stripe_size: ShardStripeSize(1), + }, + ); + + // Only keys belonging to this shard are considered as gaps. + let mut previous_nblocks = 0; + let gaps = + DatadirModification::find_gaps(rel, first_blkno, previous_nblocks, &shard).unwrap(); + assert!(!gaps.ranges.is_empty()); + for gap_range in gaps.ranges { + let mut k = gap_range.start; + while k != gap_range.end { + assert_eq!(shard.get_shard_number(&k), shard.number); + k = k.next(); + } + } + + previous_nblocks = first_blkno; + + let update_blkno = 2; + let gaps = DatadirModification::find_gaps(rel, update_blkno, previous_nblocks, &shard); + assert!(gaps.is_none()); + } + /* fn assert_current_logical_size(timeline: &DatadirTimeline, lsn: Lsn) { let incremental = timeline.get_current_logical_size(); diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index df448a0963..2ce26ed2eb 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -12,7 +12,7 @@ use crate::tenant::timeline::GetVectoredError; use crate::tenant::PageReconstructError; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::{l0_flush, page_cache}; -use anyhow::{anyhow, Context, Result}; +use anyhow::{anyhow, Result}; use camino::Utf8PathBuf; use pageserver_api::key::CompactKey; use pageserver_api::key::Key; @@ -25,6 +25,7 @@ use std::sync::{Arc, OnceLock}; use std::time::Instant; use tracing::*; use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap}; +use wal_decoder::serialized_batch::{SerializedValueBatch, SerializedValueMeta, ValueMeta}; // avoid binding to Write (conflicts with std::io::Write) // while being able to use std::fmt::Write's methods use crate::metrics::TIMELINE_EPHEMERAL_BYTES; @@ -452,6 +453,7 @@ impl InMemoryLayer { len, will_init, } = index_entry.unpack(); + reads.entry(key).or_default().push(ValueRead { entry_lsn: *entry_lsn, read: vectored_dio_read::LogicalRead::new( @@ -513,68 +515,6 @@ impl InMemoryLayer { } } -/// Offset of a particular Value within a serialized batch. -struct SerializedBatchOffset { - key: CompactKey, - lsn: Lsn, - // TODO: separate type when we start serde-serializing this value, to avoid coupling - // in-memory representation to serialization format. - index_entry: IndexEntry, -} - -pub struct SerializedBatch { - /// Blobs serialized in EphemeralFile's native format, ready for passing to [`EphemeralFile::write_raw`]. 
- pub(crate) raw: Vec, - - /// Index of values in [`Self::raw`], using offsets relative to the start of the buffer. - offsets: Vec, - - /// The highest LSN of any value in the batch - pub(crate) max_lsn: Lsn, -} - -impl SerializedBatch { - pub fn from_values(batch: Vec<(CompactKey, Lsn, usize, Value)>) -> anyhow::Result { - // Pre-allocate a big flat buffer to write into. This should be large but not huge: it is soft-limited in practice by - // [`crate::pgdatadir_mapping::DatadirModification::MAX_PENDING_BYTES`] - let buffer_size = batch.iter().map(|i| i.2).sum::(); - let mut cursor = std::io::Cursor::new(Vec::::with_capacity(buffer_size)); - - let mut offsets: Vec = Vec::with_capacity(batch.len()); - let mut max_lsn: Lsn = Lsn(0); - for (key, lsn, val_ser_size, val) in batch { - let relative_off = cursor.position(); - - val.ser_into(&mut cursor) - .expect("Writing into in-memory buffer is infallible"); - - offsets.push(SerializedBatchOffset { - key, - lsn, - index_entry: IndexEntry::new(IndexEntryNewArgs { - base_offset: 0, - batch_offset: relative_off, - len: val_ser_size, - will_init: val.will_init(), - }) - .context("higher-level code ensures that values are within supported ranges")?, - }); - max_lsn = std::cmp::max(max_lsn, lsn); - } - - let buffer = cursor.into_inner(); - - // Assert that we didn't do any extra allocations while building buffer. - debug_assert!(buffer.len() <= buffer_size); - - Ok(Self { - raw: buffer, - offsets, - max_lsn, - }) - } -} - fn inmem_layer_display(mut f: impl Write, start_lsn: Lsn, end_lsn: Lsn) -> std::fmt::Result { write!(f, "inmem-{:016X}-{:016X}", start_lsn.0, end_lsn.0) } @@ -642,7 +582,7 @@ impl InMemoryLayer { /// TODO: it can be made retryable if we aborted the process on EphemeralFile write errors. pub async fn put_batch( &self, - serialized_batch: SerializedBatch, + serialized_batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { let mut inner = self.inner.write().await; @@ -650,27 +590,13 @@ impl InMemoryLayer { let base_offset = inner.file.len(); - let SerializedBatch { + let SerializedValueBatch { raw, - mut offsets, + metadata, max_lsn: _, + len: _, } = serialized_batch; - // Add the base_offset to the batch's index entries which are relative to the batch start. - for offset in &mut offsets { - let IndexEntryUnpacked { - will_init, - len, - pos, - } = offset.index_entry.unpack(); - offset.index_entry = IndexEntry::new(IndexEntryNewArgs { - base_offset, - batch_offset: pos, - len: len.into_usize(), - will_init, - })?; - } - // Write the batch to the file inner.file.write_raw(&raw, ctx).await?; let new_size = inner.file.len(); @@ -683,12 +609,28 @@ impl InMemoryLayer { assert_eq!(new_size, expected_new_len); // Update the index with the new entries - for SerializedBatchOffset { - key, - lsn, - index_entry, - } in offsets - { + for meta in metadata { + let SerializedValueMeta { + key, + lsn, + batch_offset, + len, + will_init, + } = match meta { + ValueMeta::Serialized(ser) => ser, + ValueMeta::Observed(_) => { + continue; + } + }; + + // Add the base_offset to the batch's index entries which are relative to the batch start. 
+ let index_entry = IndexEntry::new(IndexEntryNewArgs { + base_offset, + batch_offset, + len, + will_init, + })?; + let vec_map = inner.index.entry(key).or_default(); let old = vec_map.append_or_update_last(lsn, index_entry).unwrap().0; if old.is_some() { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 12919866a3..ee823beca8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -24,8 +24,8 @@ use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ key::{ - CompactKey, KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, - NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, + KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, + NON_INHERITED_SPARSE_RANGE, }, keyspace::{KeySpaceAccum, KeySpaceRandomAccum, SparseKeyPartitioning}, models::{ @@ -49,6 +49,7 @@ use utils::{ fs_ext, pausable_failpoint, sync::gate::{Gate, GateGuard}, }; +use wal_decoder::serialized_batch::SerializedValueBatch; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; @@ -131,7 +132,6 @@ use crate::task_mgr::TaskKind; use crate::tenant::gc_result::GcResult; use crate::ZERO_PAGE; use pageserver_api::key::Key; -use pageserver_api::value::Value; use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; @@ -141,9 +141,7 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::{ - config::TenantConf, - storage_layer::{inmemory_layer, LayerVisibilityHint}, - upload_queue::NotInitialized, + config::TenantConf, storage_layer::LayerVisibilityHint, upload_queue::NotInitialized, MaybeOffloaded, }; use super::{debug_assert_current_span_has_tenant_and_timeline_id, AttachedTenantConf}; @@ -157,6 +155,9 @@ use super::{ GcError, }; +#[cfg(test)] +use pageserver_api::value::Value; + #[derive(Debug, PartialEq, Eq, Clone, Copy)] pub(crate) enum FlushLoopState { NotStarted, @@ -5736,23 +5737,22 @@ impl<'a> TimelineWriter<'a> { /// Put a batch of keys at the specified Lsns. pub(crate) async fn put_batch( &mut self, - batch: Vec<(CompactKey, Lsn, usize, Value)>, + batch: SerializedValueBatch, ctx: &RequestContext, ) -> anyhow::Result<()> { if batch.is_empty() { return Ok(()); } - let serialized_batch = inmemory_layer::SerializedBatch::from_values(batch)?; - let batch_max_lsn = serialized_batch.max_lsn; - let buf_size: u64 = serialized_batch.raw.len() as u64; + let batch_max_lsn = batch.max_lsn; + let buf_size: u64 = batch.buffer_size() as u64; let action = self.get_open_layer_action(batch_max_lsn, buf_size); let layer = self .handle_open_layer_action(batch_max_lsn, action, ctx) .await?; - let res = layer.put_batch(serialized_batch, ctx).await; + let res = layer.put_batch(batch, ctx).await; if res.is_ok() { // Update the current size only when the entire write was ok. 
@@ -5787,11 +5787,14 @@ impl<'a> TimelineWriter<'a> { ); } let val_ser_size = value.serialized_size().unwrap() as usize; - self.put_batch( - vec![(key.to_compact(), lsn, val_ser_size, value.clone())], - ctx, - ) - .await + let batch = SerializedValueBatch::from_values(vec![( + key.to_compact(), + lsn, + val_ser_size, + value.clone(), + )]); + + self.put_batch(batch, ctx).await } pub(crate) async fn delete_batch( diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index eb19fb691f..34bf959058 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -331,11 +331,11 @@ pub(super) async fn handle_walreceiver_connection( Ok(()) } - while let Some((lsn, recdata)) = waldecoder.poll_decode()? { + while let Some((record_end_lsn, recdata)) = waldecoder.poll_decode()? { // It is important to deal with the aligned records as lsn in getPage@LSN is // aligned and can be several bytes bigger. Without this alignment we are // at risk of hitting a deadlock. - if !lsn.is_aligned() { + if !record_end_lsn.is_aligned() { return Err(WalReceiverError::Other(anyhow!("LSN not aligned"))); } @@ -343,7 +343,7 @@ pub(super) async fn handle_walreceiver_connection( let interpreted = InterpretedWalRecord::from_bytes_filtered( recdata, modification.tline.get_shard_identity(), - lsn, + record_end_lsn, modification.tline.pg_version, )?; @@ -366,9 +366,11 @@ pub(super) async fn handle_walreceiver_connection( let ingested = walingest .ingest_record(interpreted, &mut modification, &ctx) .await - .with_context(|| format!("could not ingest record at {lsn}"))?; + .with_context(|| { + format!("could not ingest record at {record_end_lsn}") + })?; if !ingested { - tracing::debug!("ingest: filtered out record @ LSN {lsn}"); + tracing::debug!("ingest: filtered out record @ LSN {record_end_lsn}"); WAL_INGEST.records_filtered.inc(); filtered_records += 1; } @@ -378,7 +380,7 @@ pub(super) async fn handle_walreceiver_connection( // to timeout the tests. fail_point!("walreceiver-after-ingest"); - last_rec_lsn = lsn; + last_rec_lsn = record_end_lsn; // Commit every ingest_batch_size records. Even if we filtered out // all records, we still need to call commit to advance the LSN. 
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 84353970b7..c3ccd8a2e4 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -28,14 +28,13 @@ use std::time::Duration; use std::time::Instant; use std::time::SystemTime; -use pageserver_api::key::Key; use pageserver_api::shard::ShardIdentity; use postgres_ffi::fsm_logical_to_physical; use postgres_ffi::walrecord::*; use postgres_ffi::{dispatch_pgversion, enum_pgversion, enum_pgversion_dispatch, TimestampTz}; use wal_decoder::models::*; -use anyhow::{bail, Context, Result}; +use anyhow::{bail, Result}; use bytes::{Buf, Bytes}; use tracing::*; use utils::failpoint_support; @@ -51,7 +50,6 @@ use crate::ZERO_PAGE; use pageserver_api::key::rel_block_to_key; use pageserver_api::record::NeonWalRecord; use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind}; -use pageserver_api::value::Value; use postgres_ffi::pg_constants; use postgres_ffi::relfile_utils::{FSM_FORKNUM, INIT_FORKNUM, MAIN_FORKNUM, VISIBILITYMAP_FORKNUM}; use postgres_ffi::TransactionId; @@ -156,12 +154,12 @@ impl WalIngest { WAL_INGEST.records_received.inc(); let prev_len = modification.len(); - modification.set_lsn(interpreted.lsn)?; + modification.set_lsn(interpreted.end_lsn)?; if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) { // Records of this type should always be preceded by a commit(), as they // rely on reading data pages back from the Timeline. - assert!(!modification.has_dirty_data_pages()); + assert!(!modification.has_dirty_data()); } assert!(!self.checkpoint_modified); @@ -275,28 +273,9 @@ impl WalIngest { } } - // Iterate through all the key value pairs provided in the interpreted block - // and update the modification currently in-flight to include them. - for (compact_key, maybe_value) in interpreted.blocks.into_iter() { - let (rel, blk) = Key::from_compact(compact_key).to_rel_block()?; - match maybe_value { - Some(Value::Image(img)) => { - self.put_rel_page_image(modification, rel, blk, img, ctx) - .await?; - } - Some(Value::WalRecord(rec)) => { - self.put_rel_wal_record(modification, rel, blk, rec, ctx) - .await?; - } - None => { - // Shard 0 tracks relation sizes. We will observe - // its blkno in case it implicitly extends a relation. - assert!(self.shard.is_shard_zero()); - self.observe_decoded_block(modification, rel, blk, ctx) - .await?; - } - } - } + modification + .ingest_batch(interpreted.batch, &self.shard, ctx) + .await?; // If checkpoint data was updated, store the new version in the repository if self.checkpoint_modified { @@ -310,8 +289,6 @@ impl WalIngest { // until commit() is called to flush the data into the repository and update // the latest LSN. - modification.on_record_end(); - Ok(modification.len() > prev_len) } @@ -334,17 +311,6 @@ impl WalIngest { Ok((epoch as u64) << 32 | xid as u64) } - /// Do not store this block, but observe it for the purposes of updating our relation size state. - async fn observe_decoded_block( - &mut self, - modification: &mut DatadirModification<'_>, - rel: RelTag, - blkno: BlockNumber, - ctx: &RequestContext, - ) -> Result<(), PageReconstructError> { - self.handle_rel_extend(modification, rel, blkno, ctx).await - } - async fn ingest_clear_vm_bits( &mut self, clear_vm_bits: ClearVmBits, @@ -1248,6 +1214,7 @@ impl WalIngest { Ok(()) } + #[cfg(test)] async fn put_rel_page_image( &mut self, modification: &mut DatadirModification<'_>, @@ -1297,36 +1264,7 @@ impl WalIngest { let new_nblocks = blknum + 1; // Check if the relation exists. 
We implicitly create relations on first // record. - // TODO: would be nice if to be more explicit about it - - // Get current size and put rel creation if rel doesn't exist - // - // NOTE: we check the cache first even though get_rel_exists and get_rel_size would - // check the cache too. This is because eagerly checking the cache results in - // less work overall and 10% better performance. It's more work on cache miss - // but cache miss is rare. - let old_nblocks = if let Some(nblocks) = modification - .tline - .get_cached_rel_size(&rel, modification.get_lsn()) - { - nblocks - } else if !modification - .tline - .get_rel_exists(rel, Version::Modified(modification), ctx) - .await? - { - // create it with 0 size initially, the logic below will extend it - modification - .put_rel_creation(rel, 0, ctx) - .await - .context("Relation Error")?; - 0 - } else { - modification - .tline - .get_rel_size(rel, Version::Modified(modification), ctx) - .await? - }; + let old_nblocks = modification.create_relation_if_required(rel, ctx).await?; if new_nblocks > old_nblocks { //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks); @@ -1553,25 +1491,21 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx) .await?; - m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x30)); walingest .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx) .await?; - m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x40)); walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx) .await?; - m.on_record_end(); m.commit(&ctx).await?; let mut m = tline.begin_modification(Lsn(0x50)); walingest .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx) .await?; - m.on_record_end(); m.commit(&ctx).await?; assert_current_logical_size(&tline, Lsn(0x50)); @@ -1713,7 +1647,6 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx) .await?; - m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline @@ -1739,7 +1672,6 @@ mod tests { walingest .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx) .await?; - m.on_record_end(); m.commit(&ctx).await?; assert_eq!( tline From d182ff294c85f01e77bd0cf4e5345e3357986f23 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 6 Nov 2024 15:14:43 +0000 Subject: [PATCH 174/239] storcon: respect tenant scheduling policy in drain/fill (#9657) ## Problem Pinning a tenant by setting Pause scheduling policy doesn't work because drain/fill code moves the tenant around during deploys. Closes: https://github.com/neondatabase/neon/issues/9612 ## Summary of changes - In drain, only move a tenant if it is in Active or Essential mode - In fill, only move a tenant if it is in Active mode. The asymmetry is a bit annoying, but it faithfully respects the purposes of the modes: Essential is meant to endeavor to keep the tenant available, which means it needs to be drained but doesn't need to be migrated during fills. 
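For illustration, the gating rule above can be sketched as a pair of predicates. This is a simplified, self-contained sketch rather than the controller's actual code: the local `ShardSchedulingPolicy` enum and the two helper functions are stand-ins for the real types and call sites touched in the diff below.

```rust
// Stand-in for pageserver_api::controller_api::ShardSchedulingPolicy (sketch only).
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum ShardSchedulingPolicy {
    Active,
    Essential,
    Pause,
    Stop,
}

/// Drains move Active and Essential shards: evacuating the node is required
/// to keep an Essential tenant available.
fn may_migrate_during_drain(policy: ShardSchedulingPolicy) -> bool {
    matches!(
        policy,
        ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential
    )
}

/// Fills only move Active shards: rebalancing onto a freshly filled node is an
/// optimization, not something availability depends on.
fn may_migrate_during_fill(policy: ShardSchedulingPolicy) -> bool {
    matches!(policy, ShardSchedulingPolicy::Active)
}

fn main() {
    assert!(may_migrate_during_drain(ShardSchedulingPolicy::Essential));
    assert!(!may_migrate_during_fill(ShardSchedulingPolicy::Essential));
    assert!(!may_migrate_during_drain(ShardSchedulingPolicy::Pause));
    assert!(!may_migrate_during_fill(ShardSchedulingPolicy::Stop));
}
```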
--- storage_controller/src/drain_utils.rs | 16 +++++++++++++++- storage_controller/src/service.rs | 10 ++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/storage_controller/src/drain_utils.rs b/storage_controller/src/drain_utils.rs index dea1f04649..47f4276ff2 100644 --- a/storage_controller/src/drain_utils.rs +++ b/storage_controller/src/drain_utils.rs @@ -3,7 +3,7 @@ use std::{ sync::Arc, }; -use pageserver_api::controller_api::NodeSchedulingPolicy; +use pageserver_api::controller_api::{NodeSchedulingPolicy, ShardSchedulingPolicy}; use utils::{id::NodeId, shard::TenantShardId}; use crate::{ @@ -98,6 +98,20 @@ impl TenantShardDrain { return None; } + // Only tenants with a normal (Active) scheduling policy are proactively moved + // around during a node drain. Shards which have been manually configured to a different + // policy are only rescheduled by manual intervention. + match tenant_shard.get_scheduling_policy() { + ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => { + // A migration during drain is classed as 'essential' because it is required to + // uphold our availability goals for the tenant: this shard is elegible for migration. + } + ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => { + // If we have been asked to avoid rescheduling this shard, then do not migrate it during a drain + return None; + } + } + match scheduler.node_preferred(tenant_shard.intent.get_secondary()) { Some(node) => Some(node), None => { diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3f6cbfef59..e3a147bc06 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6721,6 +6721,16 @@ impl Service { .tenants .iter_mut() .filter_map(|(tid, tenant_shard)| { + if !matches!( + tenant_shard.get_scheduling_policy(), + ShardSchedulingPolicy::Active + ) { + // Only include tenants in fills if they have a normal (Active) scheduling policy. We + // even exclude Essential, because moving to fill a node is not essential to keeping this + // tenant available. + return None; + } + if tenant_shard.intent.get_secondary().contains(&node_id) { if let Some(primary) = tenant_shard.intent.get_attached() { return Some((*primary, *tid)); From 73bdc9a2d0d3731c0191d0c2063039985825f26f Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 6 Nov 2024 17:40:40 +0000 Subject: [PATCH 175/239] [proxy]: minor changes to endpoint-cache handling (#9666) I think I meant to make these changes over 6 months ago. alas, better late than never. 1. should_reject doesn't eagerly intern the endpoint string 2. Rate limiter uses a std Mutex instead of a tokio Mutex. 3. Recently I introduced a `-local-proxy` endpoint suffix. I forgot to add this to normalize. 4. Random but a small cleanup making the ControlPlaneEvent deser directly to the interned strings. 
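As a rough illustration of item 3, the suffix handling can be sketched on plain `&str` (the interned `EndpointId`/`EndpointIdInt` types are omitted; the suffix constants match the ones in the diff below, but the helper itself is hypothetical):

```rust
const POOLER_SUFFIX: &str = "-pooler";
const LOCAL_PROXY_SUFFIX: &str = "-local-proxy";

/// Strip a known routing suffix, if any, so "ep-foo-pooler" and
/// "ep-foo-local-proxy" both normalize to "ep-foo".
fn normalize_endpoint(endpoint: &str) -> &str {
    endpoint
        .strip_suffix(POOLER_SUFFIX)
        .or_else(|| endpoint.strip_suffix(LOCAL_PROXY_SUFFIX))
        .unwrap_or(endpoint)
}

fn main() {
    assert_eq!(normalize_endpoint("ep-foo-pooler"), "ep-foo");
    assert_eq!(normalize_endpoint("ep-foo-local-proxy"), "ep-foo");
    assert_eq!(normalize_endpoint("ep-foo"), "ep-foo");
}
```

Item 1 is in the same spirit: `should_reject` now looks endpoints up with the non-interning `EndpointIdInt::get` and treats a miss as "not in the cache", instead of interning every probed string.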
--- proxy/src/cache/endpoints.rs | 74 ++++++++++++++++---------- proxy/src/control_plane/client/neon.rs | 2 - proxy/src/types.rs | 29 +++++----- 3 files changed, 59 insertions(+), 46 deletions(-) diff --git a/proxy/src/cache/endpoints.rs b/proxy/src/cache/endpoints.rs index a488d8a9cd..07769e053c 100644 --- a/proxy/src/cache/endpoints.rs +++ b/proxy/src/cache/endpoints.rs @@ -1,13 +1,12 @@ use std::convert::Infallible; use std::future::pending; use std::sync::atomic::{AtomicBool, Ordering}; -use std::sync::Arc; +use std::sync::{Arc, Mutex}; use dashmap::DashSet; use redis::streams::{StreamReadOptions, StreamReadReply}; use redis::{AsyncCommands, FromRedisValue, Value}; use serde::Deserialize; -use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; use tracing::info; @@ -32,17 +31,17 @@ struct ControlPlaneEvent { #[derive(Deserialize, Debug, Clone, PartialEq)] struct EndpointCreated { - endpoint_id: String, + endpoint_id: EndpointIdInt, } #[derive(Deserialize, Debug, Clone, PartialEq)] struct BranchCreated { - branch_id: String, + branch_id: BranchIdInt, } #[derive(Deserialize, Debug, Clone, PartialEq)] struct ProjectCreated { - project_id: String, + project_id: ProjectIdInt, } impl TryFrom<&Value> for ControlPlaneEvent { @@ -76,53 +75,72 @@ impl EndpointsCache { } } - pub(crate) async fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { + pub(crate) fn is_valid(&self, ctx: &RequestMonitoring, endpoint: &EndpointId) -> bool { if !self.ready.load(Ordering::Acquire) { + // the endpoint cache is not yet fully initialised. return true; } - let rejected = self.should_reject(endpoint); - ctx.set_rejected(rejected); - info!(?rejected, "check endpoint is valid, disabled cache"); - // If cache is disabled, just collect the metrics and return or - // If the limiter allows, we don't need to check the cache. - if self.config.disable_cache || self.limiter.lock().await.check() { + + if !self.should_reject(endpoint) { + ctx.set_rejected(false); return true; } - !rejected + + // report that we might want to reject this endpoint + ctx.set_rejected(true); + + // If cache is disabled, just collect the metrics and return. + if self.config.disable_cache { + return true; + } + + // If the limiter allows, we can pretend like it's valid + // (incase it is, due to redis channel lag). + if self.limiter.lock().unwrap().check() { + return true; + } + + // endpoint not found, and there's too much load. + false } fn should_reject(&self, endpoint: &EndpointId) -> bool { if endpoint.is_endpoint() { - !self.endpoints.contains(&EndpointIdInt::from(endpoint)) + let Some(endpoint) = EndpointIdInt::get(endpoint) else { + // if we haven't interned this endpoint, it's not in the cache. + return true; + }; + !self.endpoints.contains(&endpoint) } else if endpoint.is_branch() { - !self - .branches - .contains(&BranchIdInt::from(&endpoint.as_branch())) + let Some(branch) = BranchIdInt::get(endpoint) else { + // if we haven't interned this branch, it's not in the cache. + return true; + }; + !self.branches.contains(&branch) } else { - !self - .projects - .contains(&ProjectIdInt::from(&endpoint.as_project())) + let Some(project) = ProjectIdInt::get(endpoint) else { + // if we haven't interned this project, it's not in the cache. 
+ return true; + }; + !self.projects.contains(&project) } } fn insert_event(&self, event: ControlPlaneEvent) { if let Some(endpoint_created) = event.endpoint_created { - self.endpoints - .insert(EndpointIdInt::from(&endpoint_created.endpoint_id.into())); + self.endpoints.insert(endpoint_created.endpoint_id); Metrics::get() .proxy .redis_events_count .inc(RedisEventsCount::EndpointCreated); } else if let Some(branch_created) = event.branch_created { - self.branches - .insert(BranchIdInt::from(&branch_created.branch_id.into())); + self.branches.insert(branch_created.branch_id); Metrics::get() .proxy .redis_events_count .inc(RedisEventsCount::BranchCreated); } else if let Some(project_created) = event.project_created { - self.projects - .insert(ProjectIdInt::from(&project_created.project_id.into())); + self.projects.insert(project_created.project_id); Metrics::get() .proxy .redis_events_count @@ -247,11 +265,13 @@ mod tests { fn test_parse_control_plane_event() { let s = r#"{"branch_created":null,"endpoint_created":{"endpoint_id":"ep-rapid-thunder-w0qqw2q9"},"project_created":null,"type":"endpoint_created"}"#; + let endpoint_id: EndpointId = "ep-rapid-thunder-w0qqw2q9".into(); + assert_eq!( serde_json::from_str::(s).unwrap(), ControlPlaneEvent { endpoint_created: Some(EndpointCreated { - endpoint_id: "ep-rapid-thunder-w0qqw2q9".into() + endpoint_id: endpoint_id.into(), }), branch_created: None, project_created: None, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 1588e50423..26ff4e1402 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -72,7 +72,6 @@ impl NeonControlPlaneClient { .caches .endpoints_cache .is_valid(ctx, &user_info.endpoint.normalize()) - .await { info!("endpoint is not valid, skipping the request"); return Ok(AuthInfo::default()); @@ -145,7 +144,6 @@ impl NeonControlPlaneClient { .caches .endpoints_cache .is_valid(ctx, &endpoint.normalize()) - .await { return Err(GetEndpointJwksError::EndpointNotFound); } diff --git a/proxy/src/types.rs b/proxy/src/types.rs index b0408a51d1..6e0bd61c94 100644 --- a/proxy/src/types.rs +++ b/proxy/src/types.rs @@ -64,24 +64,28 @@ macro_rules! smol_str_wrapper { } const POOLER_SUFFIX: &str = "-pooler"; +pub(crate) const LOCAL_PROXY_SUFFIX: &str = "-local-proxy"; impl EndpointId { #[must_use] - pub fn normalize(&self) -> Self { + fn normalize_str(&self) -> &str { if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { - stripped.into() + stripped + } else if let Some(stripped) = self.as_ref().strip_suffix(LOCAL_PROXY_SUFFIX) { + stripped } else { - self.clone() + self } } + #[must_use] + pub fn normalize(&self) -> Self { + self.normalize_str().into() + } + #[must_use] pub fn normalize_intern(&self) -> EndpointIdInt { - if let Some(stripped) = self.as_ref().strip_suffix(POOLER_SUFFIX) { - EndpointIdTag::get_interner().get_or_intern(stripped) - } else { - self.into() - } + EndpointIdTag::get_interner().get_or_intern(self.normalize_str()) } } @@ -110,13 +114,4 @@ impl EndpointId { pub(crate) fn is_branch(&self) -> bool { self.0.starts_with("br-") } - // pub(crate) fn is_project(&self) -> bool { - // !self.is_endpoint() && !self.is_branch() - // } - pub(crate) fn as_branch(&self) -> BranchId { - BranchId(self.0.clone()) - } - pub(crate) fn as_project(&self) -> ProjectId { - ProjectId(self.0.clone()) - } } From 1d3559d4bc0d9be3f657fe15de5acf55e19d5c0a Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Wed, 6 Nov 2024 13:17:02 -0500 Subject: [PATCH 176/239] feat(pageserver): add fast path for sparse keyspace read (#9631) In https://github.com/neondatabase/neon/issues/9441, the tenant has a lot of aux keys spread in multiple aux files. The perf tool shows that a significant amount of time is spent on remove_overlapping_keys. For sparse keyspaces, we don't need to report missing key errors anyways, and it's very likely that we will need to read all layers intersecting with the key range. Therefore, this patch adds a new fast path for sparse keyspace reads that we do not track `unmapped_keyspace` in a fine-grained way. We only modify it when we find an image layer. In debug mode, it was ~5min to read the aux files for a dump of the tenant, and now it's only 8s, that's a 60x speedup. ## Summary of changes * Do not add sparse keys into `keys_done` so that remove_overlapping does nothing. * Allow `ValueReconstructSituation::Complete` to be updated again in `ValuesReconstructState::update_key` for sparse keyspaces. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/storage_layer.rs | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 8f4219bbbc..9e3a25cbbc 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -12,7 +12,7 @@ pub mod merge_iterator; use crate::context::{AccessStatsBehavior, RequestContext}; use bytes::Bytes; -use pageserver_api::key::Key; +use pageserver_api::key::{Key, NON_INHERITED_SPARSE_RANGE}; use pageserver_api::keyspace::{KeySpace, KeySpaceRandomAccum}; use pageserver_api::record::NeonWalRecord; use pageserver_api::value::Value; @@ -196,6 +196,9 @@ impl ValuesReconstructState { /// Returns true if this was the last value needed for the key and false otherwise. /// /// If the key is done after the update, mark it as such. + /// + /// If the key is in the sparse keyspace (i.e., aux files), we do not track them in + /// `key_done`. pub(crate) fn update_key( &mut self, key: &Key, @@ -206,10 +209,18 @@ impl ValuesReconstructState { .keys .entry(*key) .or_insert(Ok(VectoredValueReconstructState::default())); - + let is_sparse_key = NON_INHERITED_SPARSE_RANGE.contains(key); if let Ok(state) = state { let key_done = match state.situation { - ValueReconstructSituation::Complete => unreachable!(), + ValueReconstructSituation::Complete => { + if is_sparse_key { + // Sparse keyspace might be visited multiple times because + // we don't track unmapped keyspaces. + return ValueReconstructSituation::Complete; + } else { + unreachable!() + } + } ValueReconstructSituation::Continue => match value { Value::Image(img) => { state.img = Some((lsn, img)); @@ -234,7 +245,9 @@ impl ValuesReconstructState { if key_done && state.situation == ValueReconstructSituation::Continue { state.situation = ValueReconstructSituation::Complete; - self.keys_done.add_key(*key); + if !is_sparse_key { + self.keys_done.add_key(*key); + } } state.situation From 93123f2623e5a8df90b0814aba3bf14edac27351 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 6 Nov 2024 13:28:23 -0600 Subject: [PATCH 177/239] Rename compute_backpressure_throttling_ms to compute_backpressure_throttling_seconds This is in line with the Prometheus guidance[0]. We also haven't started using this metric, so renaming is essentially free. 
Link: https://prometheus.io/docs/practices/naming/ [0] Signed-off-by: Tristan Partin --- compute/etc/neon_collector.jsonnet | 2 +- .../etc/sql_exporter/compute_backpressure_throttling_ms.sql | 1 - ...nnet => compute_backpressure_throttling_seconds.libsonnet} | 4 ++-- .../sql_exporter/compute_backpressure_throttling_seconds.sql | 1 + 4 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql rename compute/etc/sql_exporter/{compute_backpressure_throttling_ms.libsonnet => compute_backpressure_throttling_seconds.libsonnet} (71%) create mode 100644 compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index e73fb132ee..c6fa645b41 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -3,7 +3,7 @@ metrics: [ import 'sql_exporter/checkpoints_req.libsonnet', import 'sql_exporter/checkpoints_timed.libsonnet', - import 'sql_exporter/compute_backpressure_throttling_ms.libsonnet', + import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql deleted file mode 100644 index 1fa62d38a4..0000000000 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.sql +++ /dev/null @@ -1 +0,0 @@ -SELECT neon.backpressure_throttling_time() AS throttled; diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet similarity index 71% rename from compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet rename to compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet index b25bb73d0f..02c803cfa6 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_ms.libsonnet +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.libsonnet @@ -1,10 +1,10 @@ { - metric_name: 'compute_backpressure_throttling_ms', + metric_name: 'compute_backpressure_throttling_seconds', type: 'gauge', help: 'Time compute has spent throttled', key_labels: null, values: [ 'throttled', ], - query: importstr 'sql_exporter/compute_backpressure_throttling_ms.sql', + query: importstr 'sql_exporter/compute_backpressure_throttling_seconds.sql', } diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql new file mode 100644 index 0000000000..459c586d18 --- /dev/null +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql @@ -0,0 +1 @@ +SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled; From 11fc1a4c12e435215e44f5f0d8a424335f7018f6 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:18:21 -0500 Subject: [PATCH 178/239] fix(test): use layer map dump in `test_readonly_node_gc` to validate layers protected by leases (#9551) Fixes #9518. ## Problem After removing the assertion `layers_removed == 0` in #9506, we could miss breakage if we solely rely on the successful execution of the `SELECT` query to check if lease is properly protecting layers. Details listed in #9518. 
Also, in integration tests, we sometimes run into the race condition where getpage request comes before the lease get renewed (item 2 of #8817), even if compute_ctl sends a lease renewal as soon as it sees a `/configure` API calls that updates the `pageserver_connstr`. In this case, we would observe a getpage request error stating that we `tried to request a page version that was garbage collected` (as we seen in [Allure Report](https://neon-github-public-dev.s3.amazonaws.com/reports/pr-8613/11550393107/index.html#suites/3ccffb1d100105b98aed3dc19b717917/d1a1ba47bc180493)). ## Summary of changes - Use layer map dump to verify if the lease protects what it claimed: Record all historical layers that has `start_lsn <= lease_lsn` before and after running timeline gc. This is the same check as https://github.com/neondatabase/neon/blob/ad79f4246030583e5af418cb087bf7582266accc/pageserver/src/tenant/timeline.rs#L5025-L5027 The set recorded after GC should contain every layer in the set recorded before GC. - Wait until log contains another successful lease request before running the `SELECT` query after GC. We argued in #8817 that the bad request can only exist within a short period after migration/restart, and our test shows that as long as a lease renewal is done before the first getpage request sent after reconfiguration, we will not have bad request. Signed-off-by: Yuchen Liang --- test_runner/regress/test_readonly_node.py | 71 ++++++++++++++++++++--- 1 file changed, 64 insertions(+), 7 deletions(-) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 8151160477..f257f0853b 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -1,19 +1,22 @@ from __future__ import annotations import time +from typing import Union import pytest -from fixtures.common_types import Lsn +from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, + LogCursor, NeonEnv, NeonEnvBuilder, last_flush_lsn_upload, tenant_get_shards, ) +from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, wait_until # @@ -169,23 +172,63 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) return last_flush_lsn - def trigger_gc_and_select(env: NeonEnv, ep_static: Endpoint, ctx: str): + def get_layers_protected_by_lease( + ps_http: PageserverHttpClient, + tenant_id: Union[TenantId, TenantShardId], + timeline_id: TimelineId, + lease_lsn: Lsn, + ) -> set[str]: + """Get all layers whose start_lsn is less than or equal to the lease lsn.""" + layer_map_info = ps_http.layer_map_info(tenant_id, timeline_id) + return set( + x.layer_file_name + for x in layer_map_info.historic_layers + if Lsn(x.lsn_start) <= lease_lsn + ) + + def trigger_gc_and_select( + env: NeonEnv, + ep_static: Endpoint, + lease_lsn: Lsn, + ctx: str, + offset: None | LogCursor = None, + ) -> LogCursor: """ Trigger GC manually on all pageservers. Then run an `SELECT` query. 
""" for shard, ps in tenant_get_shards(env, env.initial_tenant): client = ps.http_client() + layers_guarded_before_gc = get_layers_protected_by_lease( + client, shard, env.initial_timeline, lease_lsn=lsn + ) gc_result = client.timeline_gc(shard, env.initial_timeline, 0) + layers_guarded_after_gc = get_layers_protected_by_lease( + client, shard, env.initial_timeline, lease_lsn=lsn + ) + # Note: cannot assert on `layers_removed` here because it could be layers - # not guarded by the lease. Rely on successful execution of the query instead. + # not guarded by the lease. Instead, use layer map dump. + assert layers_guarded_before_gc.issubset( + layers_guarded_after_gc + ), "Layers guarded by lease before GC should not be removed" + log.info(f"{gc_result=}") + # wait for lease renewal before running query. + _, offset = wait_until( + 20, + 0.5, + lambda: ep_static.assert_log_contains( + "lsn_lease_bg_task.*Request succeeded", offset=offset + ), + ) with ep_static.cursor() as cur: # Following query should succeed if pages are properly guarded by leases. cur.execute("SELECT count(*) FROM t0") assert cur.fetchone() == (ROW_COUNT,) log.info(f"`SELECT` query succeed after GC, {ctx=}") + return offset # Insert some records on main branch with env.endpoints.create_start("main") as ep_main: @@ -213,7 +256,9 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): generate_updates_on_main(env, ep_main, 3, end=100) - trigger_gc_and_select(env, ep_static, ctx="Before pageservers restart") + offset = trigger_gc_and_select( + env, ep_static, lease_lsn=lsn, ctx="Before pageservers restart" + ) # Trigger Pageserver restarts for ps in env.pageservers: @@ -222,7 +267,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): time.sleep(LSN_LEASE_LENGTH / 2) ps.start() - trigger_gc_and_select(env, ep_static, ctx="After pageservers restart") + trigger_gc_and_select( + env, + ep_static, + lease_lsn=lsn, + ctx="After pageservers restart", + offset=offset, + ) # Reconfigure pageservers env.pageservers[0].stop() @@ -231,7 +282,13 @@ def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): ) env.storage_controller.reconcile_until_idle() - trigger_gc_and_select(env, ep_static, ctx="After putting pageserver 0 offline") + trigger_gc_and_select( + env, + ep_static, + lease_lsn=lsn, + ctx="After putting pageserver 0 offline", + offset=offset, + ) # Do some update so we can increment latest_gc_cutoff generate_updates_on_main(env, ep_main, i, end=100) From 2a95a51a0df6602aa444fd07842e4b347f970ac9 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Wed, 6 Nov 2024 15:41:01 -0500 Subject: [PATCH 179/239] refactor(pageserver): better pageservice command parsing (#9597) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit close https://github.com/neondatabase/neon/issues/9460 ## Summary of changes A full rewrite of pagestream cmdline parsing to make it more robust and readable. --------- Signed-off-by: Alex Chi Z Co-authored-by: Arpad Müller --- pageserver/src/page_service.rs | 682 ++++++++++++++++++++++++--------- 1 file changed, 504 insertions(+), 178 deletions(-) diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index aed8a87851..f07474df6a 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1,10 +1,11 @@ //! The Page Service listens for client connections and serves their GetPage@LSN //! requests. 
-use anyhow::Context; +use anyhow::{bail, Context}; use async_compression::tokio::write::GzipEncoder; use bytes::Buf; use futures::FutureExt; +use itertools::Itertools; use once_cell::sync::OnceCell; use pageserver_api::models::TenantState; use pageserver_api::models::{ @@ -1221,6 +1222,222 @@ impl PageServerHandler { } } +/// `basebackup tenant timeline [lsn] [--gzip] [--replica]` +#[derive(Debug, Clone, Eq, PartialEq)] +struct BaseBackupCmd { + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Option, + gzip: bool, + replica: bool, +} + +/// `fullbackup tenant timeline [lsn] [prev_lsn]` +#[derive(Debug, Clone, Eq, PartialEq)] +struct FullBackupCmd { + tenant_id: TenantId, + timeline_id: TimelineId, + lsn: Option, + prev_lsn: Option, +} + +/// `pagestream_v2 tenant timeline` +#[derive(Debug, Clone, Eq, PartialEq)] +struct PageStreamCmd { + tenant_id: TenantId, + timeline_id: TimelineId, +} + +/// `lease lsn tenant timeline lsn` +#[derive(Debug, Clone, Eq, PartialEq)] +struct LeaseLsnCmd { + tenant_shard_id: TenantShardId, + timeline_id: TimelineId, + lsn: Lsn, +} + +#[derive(Debug, Clone, Eq, PartialEq)] +enum PageServiceCmd { + Set, + PageStream(PageStreamCmd), + BaseBackup(BaseBackupCmd), + FullBackup(FullBackupCmd), + LeaseLsn(LeaseLsnCmd), +} + +impl PageStreamCmd { + fn parse(query: &str) -> anyhow::Result { + let parameters = query.split_whitespace().collect_vec(); + if parameters.len() != 2 { + bail!( + "invalid number of parameters for pagestream command: {}", + query + ); + } + let tenant_id = TenantId::from_str(parameters[0]) + .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; + let timeline_id = TimelineId::from_str(parameters[1]) + .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; + Ok(Self { + tenant_id, + timeline_id, + }) + } +} + +impl FullBackupCmd { + fn parse(query: &str) -> anyhow::Result { + let parameters = query.split_whitespace().collect_vec(); + if parameters.len() < 2 || parameters.len() > 4 { + bail!( + "invalid number of parameters for basebackup command: {}", + query + ); + } + let tenant_id = TenantId::from_str(parameters[0]) + .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; + let timeline_id = TimelineId::from_str(parameters[1]) + .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; + // The caller is responsible for providing correct lsn and prev_lsn. 
+ let lsn = if let Some(lsn_str) = parameters.get(2) { + Some( + Lsn::from_str(lsn_str) + .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, + ) + } else { + None + }; + let prev_lsn = if let Some(prev_lsn_str) = parameters.get(3) { + Some( + Lsn::from_str(prev_lsn_str) + .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, + ) + } else { + None + }; + Ok(Self { + tenant_id, + timeline_id, + lsn, + prev_lsn, + }) + } +} + +impl BaseBackupCmd { + fn parse(query: &str) -> anyhow::Result { + let parameters = query.split_whitespace().collect_vec(); + if parameters.len() < 2 { + bail!( + "invalid number of parameters for basebackup command: {}", + query + ); + } + let tenant_id = TenantId::from_str(parameters[0]) + .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; + let timeline_id = TimelineId::from_str(parameters[1]) + .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; + let lsn; + let flags_parse_from; + if let Some(maybe_lsn) = parameters.get(2) { + if *maybe_lsn == "latest" { + lsn = None; + flags_parse_from = 3; + } else if maybe_lsn.starts_with("--") { + lsn = None; + flags_parse_from = 2; + } else { + lsn = Some( + Lsn::from_str(maybe_lsn) + .with_context(|| format!("Failed to parse lsn from {maybe_lsn}"))?, + ); + flags_parse_from = 3; + } + } else { + lsn = None; + flags_parse_from = 2; + } + + let mut gzip = false; + let mut replica = false; + + for ¶m in ¶meters[flags_parse_from..] { + match param { + "--gzip" => { + if gzip { + bail!("duplicate parameter for basebackup command: {param}") + } + gzip = true + } + "--replica" => { + if replica { + bail!("duplicate parameter for basebackup command: {param}") + } + replica = true + } + _ => bail!("invalid parameter for basebackup command: {param}"), + } + } + Ok(Self { + tenant_id, + timeline_id, + lsn, + gzip, + replica, + }) + } +} + +impl LeaseLsnCmd { + fn parse(query: &str) -> anyhow::Result { + let parameters = query.split_whitespace().collect_vec(); + if parameters.len() != 3 { + bail!( + "invalid number of parameters for lease lsn command: {}", + query + ); + } + let tenant_shard_id = TenantShardId::from_str(parameters[0]) + .with_context(|| format!("Failed to parse tenant id from {}", parameters[0]))?; + let timeline_id = TimelineId::from_str(parameters[1]) + .with_context(|| format!("Failed to parse timeline id from {}", parameters[1]))?; + let lsn = Lsn::from_str(parameters[2]) + .with_context(|| format!("Failed to parse lsn from {}", parameters[2]))?; + Ok(Self { + tenant_shard_id, + timeline_id, + lsn, + }) + } +} + +impl PageServiceCmd { + fn parse(query: &str) -> anyhow::Result { + let query = query.trim(); + let Some((cmd, other)) = query.split_once(' ') else { + bail!("cannot parse query: {query}") + }; + match cmd.to_ascii_lowercase().as_str() { + "pagestream_v2" => Ok(Self::PageStream(PageStreamCmd::parse(other)?)), + "basebackup" => Ok(Self::BaseBackup(BaseBackupCmd::parse(other)?)), + "fullbackup" => Ok(Self::FullBackup(FullBackupCmd::parse(other)?)), + "lease" => { + let Some((cmd2, other)) = other.split_once(' ') else { + bail!("invalid lease command: {cmd}"); + }; + let cmd2 = cmd2.to_ascii_lowercase(); + if cmd2 == "lsn" { + Ok(Self::LeaseLsn(LeaseLsnCmd::parse(other)?)) + } else { + bail!("invalid lease command: {cmd}"); + } + } + "set" => Ok(Self::Set), + _ => Err(anyhow::anyhow!("unsupported command {cmd} in {query}")), + } + } +} + impl postgres_backend::Handler for PageServerHandler where IO: AsyncRead + AsyncWrite 
+ Send + Sync + Unpin, @@ -1277,206 +1494,137 @@ where fail::fail_point!("ps::connection-start::process-query"); let ctx = self.connection_ctx.attached_child(); - debug!("process query {query_string:?}"); - let parts = query_string.split_whitespace().collect::>(); - if let Some(params) = parts.strip_prefix(&["pagestream_v2"]) { - if params.len() != 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for pagestream command" - ))); - } - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::PageStreamV2) - .inc(); - - self.handle_pagerequests( - pgb, + debug!("process query {query_string}"); + let query = PageServiceCmd::parse(query_string)?; + match query { + PageServiceCmd::PageStream(PageStreamCmd { tenant_id, timeline_id, - PagestreamProtocolVersion::V2, - ctx, - ) - .await?; - } else if let Some(params) = parts.strip_prefix(&["basebackup"]) { - if params.len() < 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for basebackup command" - ))); + }) => { + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_id))?; + + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::PageStreamV2) + .inc(); + + self.handle_pagerequests( + pgb, + tenant_id, + timeline_id, + PagestreamProtocolVersion::V2, + ctx, + ) + .await?; } + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn, + gzip, + replica, + }) => { + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; + self.check_permission(Some(tenant_id))?; - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::Basebackup) - .inc(); - - let mut lsn = None; - let mut replica = false; - let mut gzip = false; - for param in ¶ms[2..] 
{ - if param.starts_with("--") { - match *param { - "--gzip" => gzip = true, - "--replica" => replica = true, - _ => { - return Err(QueryError::Other(anyhow::anyhow!( - "Unknown parameter {param}", - ))) - } - } - } else { - lsn = Some( - Lsn::from_str(param) - .with_context(|| format!("Failed to parse Lsn from {param}"))?, - ); + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Basebackup) + .inc(); + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let res = async { + self.handle_basebackup_request( + pgb, + tenant_id, + timeline_id, + lsn, + None, + false, + gzip, + replica, + &ctx, + ) + .await?; + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; + Result::<(), QueryError>::Ok(()) } + .await; + metric_recording.observe(&res); + res?; } + // same as basebackup, but result includes relational data as well + PageServiceCmd::FullBackup(FullBackupCmd { + tenant_id, + timeline_id, + lsn, + prev_lsn, + }) => { + tracing::Span::current() + .record("tenant_id", field::display(tenant_id)) + .record("timeline_id", field::display(timeline_id)); - let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); - let res = async { + self.check_permission(Some(tenant_id))?; + + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::Fullbackup) + .inc(); + + // Check that the timeline exists self.handle_basebackup_request( pgb, tenant_id, timeline_id, lsn, - None, + prev_lsn, + true, + false, false, - gzip, - replica, &ctx, ) .await?; pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - Result::<(), QueryError>::Ok(()) } - .await; - metric_recording.observe(&res); - res?; - } - // same as basebackup, but result includes relational data as well - else if let Some(params) = parts.strip_prefix(&["fullbackup"]) { - if params.len() < 2 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number for fullbackup command" - ))); + PageServiceCmd::Set => { + // important because psycopg2 executes "SET datestyle TO 'ISO'" + // on connect + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; } - - let tenant_id = TenantId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_id)) - .record("timeline_id", field::display(timeline_id)); - - // The caller is responsible for providing correct lsn and prev_lsn. 
- let lsn = if let Some(lsn_str) = params.get(2) { - Some( - Lsn::from_str(lsn_str) - .with_context(|| format!("Failed to parse Lsn from {lsn_str}"))?, - ) - } else { - None - }; - let prev_lsn = if let Some(prev_lsn_str) = params.get(3) { - Some( - Lsn::from_str(prev_lsn_str) - .with_context(|| format!("Failed to parse Lsn from {prev_lsn_str}"))?, - ) - } else { - None - }; - - self.check_permission(Some(tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::Fullbackup) - .inc(); - - // Check that the timeline exists - self.handle_basebackup_request( - pgb, - tenant_id, + PageServiceCmd::LeaseLsn(LeaseLsnCmd { + tenant_shard_id, timeline_id, lsn, - prev_lsn, - true, - false, - false, - &ctx, - ) - .await?; - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.to_ascii_lowercase().starts_with("set ") { - // important because psycopg2 executes "SET datestyle TO 'ISO'" - // on connect - pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?; - } else if query_string.starts_with("lease lsn ") { - let params = &parts[2..]; - if params.len() != 3 { - return Err(QueryError::Other(anyhow::anyhow!( - "invalid param number {} for lease lsn command", - params.len() - ))); + }) => { + tracing::Span::current() + .record("tenant_id", field::display(tenant_shard_id)) + .record("timeline_id", field::display(timeline_id)); + + self.check_permission(Some(tenant_shard_id.tenant_id))?; + + COMPUTE_COMMANDS_COUNTERS + .for_command(ComputeCommandKind::LeaseLsn) + .inc(); + + match self + .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) + .await + { + Ok(()) => { + pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))? + } + Err(e) => { + error!("error obtaining lsn lease for {lsn}: {e:?}"); + pgb.write_message_noflush(&BeMessage::ErrorResponse( + &e.to_string(), + Some(e.pg_error_code()), + ))? + } + }; } - - let tenant_shard_id = TenantShardId::from_str(params[0]) - .with_context(|| format!("Failed to parse tenant id from {}", params[0]))?; - let timeline_id = TimelineId::from_str(params[1]) - .with_context(|| format!("Failed to parse timeline id from {}", params[1]))?; - - tracing::Span::current() - .record("tenant_id", field::display(tenant_shard_id)) - .record("timeline_id", field::display(timeline_id)); - - self.check_permission(Some(tenant_shard_id.tenant_id))?; - - COMPUTE_COMMANDS_COUNTERS - .for_command(ComputeCommandKind::LeaseLsn) - .inc(); - - // The caller is responsible for providing correct lsn. - let lsn = Lsn::from_str(params[2]) - .with_context(|| format!("Failed to parse Lsn from {}", params[2]))?; - - match self - .handle_make_lsn_lease(pgb, tenant_shard_id, timeline_id, lsn, &ctx) - .await - { - Ok(()) => pgb.write_message_noflush(&BeMessage::CommandComplete(b"SELECT 1"))?, - Err(e) => { - error!("error obtaining lsn lease for {lsn}: {e:?}"); - pgb.write_message_noflush(&BeMessage::ErrorResponse( - &e.to_string(), - Some(e.pg_error_code()), - ))? 
- } - }; - } else { - return Err(QueryError::Other(anyhow::anyhow!( - "unknown command {query_string}" - ))); } Ok(()) @@ -1525,3 +1673,181 @@ fn set_tracing_field_shard_id(timeline: &Timeline) { ); debug_assert_current_span_has_tenant_and_timeline_id(); } + +#[cfg(test)] +mod tests { + use utils::shard::ShardCount; + + use super::*; + + #[test] + fn pageservice_cmd_parse() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let cmd = + PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id} {timeline_id}")).unwrap(); + assert_eq!( + cmd, + PageServiceCmd::PageStream(PageStreamCmd { + tenant_id, + timeline_id + }) + ); + let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id}")).unwrap(); + assert_eq!( + cmd, + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn: None, + gzip: false, + replica: false + }) + ); + let cmd = + PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} --gzip")).unwrap(); + assert_eq!( + cmd, + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn: None, + gzip: true, + replica: false + }) + ); + let cmd = + PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} latest")).unwrap(); + assert_eq!( + cmd, + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn: None, + gzip: false, + replica: false + }) + ); + let cmd = PageServiceCmd::parse(&format!("basebackup {tenant_id} {timeline_id} 0/16ABCDE")) + .unwrap(); + assert_eq!( + cmd, + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()), + gzip: false, + replica: false + }) + ); + let cmd = PageServiceCmd::parse(&format!( + "basebackup {tenant_id} {timeline_id} --replica --gzip" + )) + .unwrap(); + assert_eq!( + cmd, + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn: None, + gzip: true, + replica: true + }) + ); + let cmd = PageServiceCmd::parse(&format!( + "basebackup {tenant_id} {timeline_id} 0/16ABCDE --replica --gzip" + )) + .unwrap(); + assert_eq!( + cmd, + PageServiceCmd::BaseBackup(BaseBackupCmd { + tenant_id, + timeline_id, + lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()), + gzip: true, + replica: true + }) + ); + let cmd = PageServiceCmd::parse(&format!("fullbackup {tenant_id} {timeline_id}")).unwrap(); + assert_eq!( + cmd, + PageServiceCmd::FullBackup(FullBackupCmd { + tenant_id, + timeline_id, + lsn: None, + prev_lsn: None + }) + ); + let cmd = PageServiceCmd::parse(&format!( + "fullbackup {tenant_id} {timeline_id} 0/16ABCDE 0/16ABCDF" + )) + .unwrap(); + assert_eq!( + cmd, + PageServiceCmd::FullBackup(FullBackupCmd { + tenant_id, + timeline_id, + lsn: Some(Lsn::from_str("0/16ABCDE").unwrap()), + prev_lsn: Some(Lsn::from_str("0/16ABCDF").unwrap()), + }) + ); + let tenant_shard_id = TenantShardId::unsharded(tenant_id); + let cmd = PageServiceCmd::parse(&format!( + "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE" + )) + .unwrap(); + assert_eq!( + cmd, + PageServiceCmd::LeaseLsn(LeaseLsnCmd { + tenant_shard_id, + timeline_id, + lsn: Lsn::from_str("0/16ABCDE").unwrap(), + }) + ); + let tenant_shard_id = TenantShardId::split(&tenant_shard_id, ShardCount(8))[1]; + let cmd = PageServiceCmd::parse(&format!( + "lease lsn {tenant_shard_id} {timeline_id} 0/16ABCDE" + )) + .unwrap(); + assert_eq!( + cmd, + PageServiceCmd::LeaseLsn(LeaseLsnCmd { + tenant_shard_id, + timeline_id, + lsn: Lsn::from_str("0/16ABCDE").unwrap(), + }) + ); + let cmd = 
PageServiceCmd::parse("set a = b").unwrap(); + assert_eq!(cmd, PageServiceCmd::Set); + let cmd = PageServiceCmd::parse("SET foo").unwrap(); + assert_eq!(cmd, PageServiceCmd::Set); + } + + #[test] + fn pageservice_cmd_err_handling() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + let cmd = PageServiceCmd::parse("unknown_command"); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse("pagestream_v2"); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx")); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse(&format!("pagestream_v2 {tenant_id}xxx {timeline_id}xxx")); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse(&format!( + "basebackup {tenant_id} {timeline_id} --gzip --gzip" + )); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse(&format!( + "basebackup {tenant_id} {timeline_id} --gzip --unknown" + )); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse(&format!( + "basebackup {tenant_id} {timeline_id} --gzip 0/16ABCDE" + )); + assert!(cmd.is_err()); + let cmd = PageServiceCmd::parse(&format!("lease {tenant_id} {timeline_id} gzip 0/16ABCDE")); + assert!(cmd.is_err()); + } +} From 011c0a175f91d3f636acb11dc8c7613b64ccb6cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 7 Nov 2024 01:53:58 +0100 Subject: [PATCH 180/239] Support copying layers in detach_ancestor from before shard splits (#9669) We need to use the shard associated with the layer file, not the shard associated with our current tenant shard ID. Due to shard splits, the shard IDs can refer to older files. close https://github.com/neondatabase/neon/issues/9667 --- .../src/tenant/remote_timeline_client.rs | 6 +-- .../src/tenant/timeline/detach_ancestor.rs | 14 +++++-- .../regress/test_timeline_detach_ancestor.py | 41 ++++++++++++++++--- 3 files changed, 49 insertions(+), 12 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 03ec18c882..0aa8d61036 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -1445,7 +1445,7 @@ impl RemoteTimelineClient { let remote_path = remote_layer_path( &self.tenant_shard_id.tenant_id, &self.timeline_id, - self.tenant_shard_id.to_index(), + uploaded.metadata().shard, &uploaded.layer_desc().layer_name(), uploaded.metadata().generation, ); @@ -1486,7 +1486,7 @@ impl RemoteTimelineClient { &adopted .get_timeline_id() .expect("Source timeline should be alive"), - self.tenant_shard_id.to_index(), + adopted.metadata().shard, &adopted.layer_desc().layer_name(), adopted.metadata().generation, ); @@ -1494,7 +1494,7 @@ impl RemoteTimelineClient { let target_remote_path = remote_layer_path( &self.tenant_shard_id.tenant_id, &self.timeline_id, - self.tenant_shard_id.to_index(), + adopted_as.metadata().shard, &adopted_as.layer_desc().layer_name(), adopted_as.metadata().generation, ); diff --git a/pageserver/src/tenant/timeline/detach_ancestor.rs b/pageserver/src/tenant/timeline/detach_ancestor.rs index b4c0ab0329..f8bc4352e2 100644 --- a/pageserver/src/tenant/timeline/detach_ancestor.rs +++ b/pageserver/src/tenant/timeline/detach_ancestor.rs @@ -12,7 +12,7 @@ use crate::{ virtual_file::{MaybeFatalIo, VirtualFile}, }; use anyhow::Context; -use pageserver_api::models::detach_ancestor::AncestorDetached; +use pageserver_api::{models::detach_ancestor::AncestorDetached, shard::ShardIdentity}; use tokio::sync::Semaphore; use 
tokio_util::sync::CancellationToken; use tracing::Instrument; @@ -376,8 +376,14 @@ pub(super) async fn prepare( tasks.spawn( async move { let _permit = limiter.acquire().await; - let owned = - remote_copy(&adopted, &timeline, timeline.generation, &timeline.cancel).await?; + let owned = remote_copy( + &adopted, + &timeline, + timeline.generation, + timeline.shard_identity, + &timeline.cancel, + ) + .await?; tracing::info!(layer=%owned, "remote copied"); Ok(owned) } @@ -629,6 +635,7 @@ async fn remote_copy( adopted: &Layer, adoptee: &Arc, generation: Generation, + shard_identity: ShardIdentity, cancel: &CancellationToken, ) -> Result { // depending if Layer::keep_resident we could hardlink @@ -636,6 +643,7 @@ async fn remote_copy( let mut metadata = adopted.metadata(); debug_assert!(metadata.generation <= generation); metadata.generation = generation; + metadata.shard = shard_identity.shard_index(); let owned = crate::tenant::storage_layer::Layer::for_evicted( adoptee.conf, diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index ed47f9432b..0e8519e07b 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -16,6 +16,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, flush_ep_to_pageserver, + last_flush_lsn_upload, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import HistoricLayerInfo, PageserverApiException @@ -576,27 +577,49 @@ def test_compaction_induced_by_detaches_in_history( assert_pageserver_backups_equal(fullbackup_before, fullbackup_after, set()) -@pytest.mark.parametrize("sharded", [True, False]) +@pytest.mark.parametrize("shards_initial_after", [(1, 1), (2, 2), (1, 4)]) def test_timeline_ancestor_detach_idempotent_success( - neon_env_builder: NeonEnvBuilder, sharded: bool + neon_env_builder: NeonEnvBuilder, shards_initial_after: tuple[int, int] ): - shards = 2 if sharded else 1 + shards_initial = shards_initial_after[0] + shards_after = shards_initial_after[1] - neon_env_builder.num_pageservers = shards - env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None) + neon_env_builder.num_pageservers = shards_after + env = neon_env_builder.init_start( + initial_tenant_shard_count=shards_initial if shards_initial > 1 else None, + initial_tenant_conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 512 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 512 * 1024, + # disable background compaction and GC. We invoke it manually when we want it to happen. + "gc_period": "0s", + "compaction_period": "0s", + }, + ) pageservers = dict((int(p.id), p) for p in env.pageservers) for ps in pageservers.values(): ps.allowed_errors.extend(SHUTDOWN_ALLOWED_ERRORS) - if sharded: + if shards_after > 1: # FIXME: should this be in the neon_env_builder.init_start? 
env.storage_controller.reconcile_until_idle() client = env.storage_controller.pageserver_api() else: client = env.pageserver.http_client() + # Write some data so that we have some layers to copy + with env.endpoints.create_start("main", tenant_id=env.initial_tenant) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + "INSERT INTO foo SELECT FROM generate_series(1,1024)", + ] + ) + last_flush_lsn_upload(env, endpoint, env.initial_tenant, env.initial_timeline) + first_branch = env.create_branch("first_branch") _ = env.create_branch("second_branch", ancestor_branch_name="first_branch") @@ -607,6 +630,12 @@ def test_timeline_ancestor_detach_idempotent_success( reparented1 = env.create_branch("first_reparented", ancestor_branch_name="main") reparented2 = env.create_branch("second_reparented", ancestor_branch_name="main") + if shards_after > shards_initial: + # Do a shard split + # This is a reproducer for https://github.com/neondatabase/neon/issues/9667 + env.storage_controller.tenant_shard_split(env.initial_tenant, shards_after) + env.storage_controller.reconcile_until_idle() + first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) assert set(first_reparenting_response) == {reparented1, reparented2} From e1d0b73824dfd42895202c666c24779023d4c4b0 Mon Sep 17 00:00:00 2001 From: Cheng Chen Date: Wed, 6 Nov 2024 20:41:18 -0800 Subject: [PATCH 181/239] chore(compute): Bump pg_mooncake to the latest version --- compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 30126de56c..f070f66c0a 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1151,8 +1151,8 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ # The topmost commit in the `neon` branch at the time of writing this # https://github.com/Mooncake-Labs/pg_mooncake/commits/neon/ -# https://github.com/Mooncake-Labs/pg_mooncake/commit/568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d -ENV PG_MOONCAKE_VERSION=568b5a82b5fc16136bdf4ca5aac3e0cc261ab48d +# https://github.com/Mooncake-Labs/pg_mooncake/commit/077c92c452bb6896a7b7776ee95f039984f076af +ENV PG_MOONCAKE_VERSION=077c92c452bb6896a7b7776ee95f039984f076af ENV PATH="/usr/local/pgsql/bin/:$PATH" RUN case "${PG_VERSION}" in \ From d6aa26a533844671a3b48e03ed9bb8c2345bb137 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 7 Nov 2024 11:38:39 +0100 Subject: [PATCH 182/239] postgres_ffi: make `WalGenerator` generic over record generator (#9614) ## Problem Benchmarks need more control over the WAL generated by `WalGenerator`. In particular, they need to vary the size of logical messages. ## Summary of changes * Make `WalGenerator` generic over `RecordGenerator`, which constructs WAL records. * Add `LogicalMessageGenerator` which emits logical messages, with a configurable payload. * Minor tweaks and code reorganization. There are no changes to the core logic or emitted WAL. 
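To make the new split concrete, here is a minimal sketch of driving the generic generator with the logical-message payload generator (type and method names are taken from the diff below; the `v17` module choice, the payload contents, the `take(3)`, and the prints are illustrative assumptions, not code from this patch):

```rust
use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator};

fn main() {
    // WalGenerator is now parameterized by a RecordGenerator; this one emits
    // logical messages (noops) with a caller-chosen prefix and payload.
    let mut wal = WalGenerator::new(LogicalMessageGenerator::new(c"bench", &[0u8; 1024]));

    // As an iterator, each step yields the record's start LSN and the encoded
    // WAL bytes, with page headers already inserted where the record spans pages.
    for (lsn, bytes) in wal.by_ref().take(3) {
        println!("appended {} bytes at {lsn}", bytes.len());
    }

    // Convenience path for a one-off logical message at the current position.
    let (lsn, bytes) = wal.append_logical_message(c"bench", b"message");
    println!("appended {} bytes at {lsn}", bytes.len());
}
```

A benchmark can then vary the payload size simply by swapping the message passed to `LogicalMessageGenerator::new`, without touching the WAL framing logic.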
--- libs/postgres_ffi/src/wal_generator.rs | 236 +++++++++++------- libs/postgres_ffi/src/xlog_utils.rs | 14 +- .../tests/walproposer_sim/walproposer_disk.rs | 8 +- 3 files changed, 156 insertions(+), 102 deletions(-) diff --git a/libs/postgres_ffi/src/wal_generator.rs b/libs/postgres_ffi/src/wal_generator.rs index 97968c269b..dc679eea33 100644 --- a/libs/postgres_ffi/src/wal_generator.rs +++ b/libs/postgres_ffi/src/wal_generator.rs @@ -1,10 +1,10 @@ -use std::ffi::CStr; +use std::ffi::{CStr, CString}; use bytes::{Bytes, BytesMut}; use crc32c::crc32c_append; use utils::lsn::Lsn; -use super::bindings::{XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; +use super::bindings::{RmgrId, XLogLongPageHeaderData, XLogPageHeaderData, XLOG_PAGE_MAGIC}; use super::xlog_utils::{ XlLogicalMessage, XLOG_RECORD_CRC_OFFS, XLOG_SIZE_OF_XLOG_RECORD, XLP_BKP_REMOVABLE, XLP_FIRST_IS_CONTRECORD, @@ -16,11 +16,65 @@ use crate::pg_constants::{ }; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; -/// Generates binary WAL records for use in tests and benchmarks. Currently only generates logical -/// messages (effectively noops) with a fixed payload. It is used as an iterator which yields -/// encoded bytes for a single WAL record, including internal page headers if it spans pages. -/// Concatenating the bytes will yield a complete, well-formed WAL, which can be chunked at segment -/// boundaries if desired. Not optimized for performance. +/// A WAL record payload. Will be prefixed by an XLogRecord header when encoded. +pub struct Record { + pub rmid: RmgrId, + pub info: u8, + pub data: Bytes, +} + +impl Record { + /// Encodes the WAL record including an XLogRecord header. prev_lsn is the start position of + /// the previous record in the WAL -- this is ignored by the Safekeeper, but not Postgres. + pub fn encode(&self, prev_lsn: Lsn) -> Bytes { + // Prefix data with block ID and length. + let data_header = Bytes::from(match self.data.len() { + 0 => vec![], + 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, self.data.len() as u8], + 256.. => { + let len_bytes = (self.data.len() as u32).to_le_bytes(); + [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() + } + }); + + // Construct the WAL record header. + let mut header = XLogRecord { + xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + self.data.len()) as u32, + xl_xid: 0, + xl_prev: prev_lsn.into(), + xl_info: self.info, + xl_rmid: self.rmid, + __bindgen_padding_0: [0; 2], + xl_crc: 0, // see below + }; + + // Compute the CRC checksum for the data, and the header up to the CRC field. + let mut crc = 0; + crc = crc32c_append(crc, &data_header); + crc = crc32c_append(crc, &self.data); + crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); + header.xl_crc = crc; + + // Encode the final header and record. + let header = header.encode().unwrap(); + + [header, data_header, self.data.clone()].concat().into() + } +} + +/// Generates WAL record payloads. +/// +/// TODO: currently only provides LogicalMessageGenerator for trivial noop messages. Add a generator +/// that creates a table and inserts rows. +pub trait RecordGenerator: Iterator {} + +impl> RecordGenerator for I {} + +/// Generates binary WAL for use in tests and benchmarks. The provided record generator constructs +/// the WAL records. It is used as an iterator which yields encoded bytes for a single WAL record, +/// including internal page headers if it spans pages. 
Concatenating the bytes will yield a +/// complete, well-formed WAL, which can be chunked at segment boundaries if desired. Not optimized +/// for performance. /// /// The WAL format is version-dependant (see e.g. `XLOG_PAGE_MAGIC`), so make sure to import this /// for the appropriate Postgres version (e.g. `postgres_ffi::v17::wal_generator::WalGenerator`). @@ -31,10 +85,10 @@ use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; /// | Segment 1 | Segment 2 | Segment 3 | /// | Page 1 | Page 2 | Page 3 | Page 4 | Page 5 | Page 6 | Page 7 | Page 8 | Page 9 | /// | R1 | R2 |R3| R4 | R5 | R6 | R7 | R8 | -/// -/// TODO: support generating actual tables and rows. #[derive(Default)] -pub struct WalGenerator { +pub struct WalGenerator { + /// Generates record payloads for the WAL. + pub record_generator: R, /// Current LSN to append the next record at. /// /// Callers can modify this (and prev_lsn) to restart generation at a different LSN, but should @@ -46,73 +100,35 @@ pub struct WalGenerator { pub prev_lsn: Lsn, } -impl WalGenerator { - // For now, hardcode the message payload. - // TODO: support specifying the payload size. - const PREFIX: &CStr = c"prefix"; - const MESSAGE: &[u8] = b"message"; - - // Hardcode the sys, timeline, and DB IDs. We can make them configurable if we care about them. +impl WalGenerator { + // Hardcode the sys and timeline ID. We can make them configurable if we care about them. const SYS_ID: u64 = 0; const TIMELINE_ID: u32 = 1; - const DB_ID: u32 = 0; - /// Creates a new WAL generator, which emits logical message records (noops). - pub fn new() -> Self { - Self::default() + /// Creates a new WAL generator with the given record generator. + pub fn new(record_generator: R) -> WalGenerator { + Self { + record_generator, + lsn: Lsn(0), + prev_lsn: Lsn(0), + } } - /// Encodes a logical message (basically a noop), with the given prefix and message. - pub(crate) fn encode_logical_message(prefix: &CStr, message: &[u8]) -> Bytes { - let prefix = prefix.to_bytes_with_nul(); - let header = XlLogicalMessage { - db_id: Self::DB_ID, - transactional: 0, - prefix_size: prefix.len() as u64, - message_size: message.len() as u64, - }; - [&header.encode(), prefix, message].concat().into() + /// Appends a record with an arbitrary payload at the current LSN, then increments the LSN. + /// Returns the WAL bytes for the record, including page headers and padding, and the start LSN. + fn append_record(&mut self, record: Record) -> (Lsn, Bytes) { + let record = record.encode(self.prev_lsn); + let record = Self::insert_pages(record, self.lsn); + let record = Self::pad_record(record, self.lsn); + let lsn = self.lsn; + self.prev_lsn = self.lsn; + self.lsn += record.len() as u64; + (lsn, record) } - /// Encode a WAL record with the given payload data (e.g. a logical message). - pub(crate) fn encode_record(data: Bytes, rmid: u8, info: u8, prev_lsn: Lsn) -> Bytes { - // Prefix data with block ID and length. - let data_header = Bytes::from(match data.len() { - 0 => vec![], - 1..=255 => vec![XLR_BLOCK_ID_DATA_SHORT, data.len() as u8], - 256.. => { - let len_bytes = (data.len() as u32).to_le_bytes(); - [&[XLR_BLOCK_ID_DATA_LONG], len_bytes.as_slice()].concat() - } - }); - - // Construct the WAL record header. 
- let mut header = XLogRecord { - xl_tot_len: (XLOG_SIZE_OF_XLOG_RECORD + data_header.len() + data.len()) as u32, - xl_xid: 0, - xl_prev: prev_lsn.into(), - xl_info: info, - xl_rmid: rmid, - __bindgen_padding_0: [0; 2], - xl_crc: 0, // see below - }; - - // Compute the CRC checksum for the data, and the header up to the CRC field. - let mut crc = 0; - crc = crc32c_append(crc, &data_header); - crc = crc32c_append(crc, &data); - crc = crc32c_append(crc, &header.encode().unwrap()[0..XLOG_RECORD_CRC_OFFS]); - header.xl_crc = crc; - - // Encode the final header and record. - let header = header.encode().unwrap(); - - [header, data_header, data].concat().into() - } - - /// Injects page headers on 8KB page boundaries. Takes the current LSN position where the record + /// Inserts page headers on 8KB page boundaries. Takes the current LSN position where the record /// is to be appended. - fn encode_pages(record: Bytes, mut lsn: Lsn) -> Bytes { + fn insert_pages(record: Bytes, mut lsn: Lsn) -> Bytes { // Fast path: record fits in current page, and the page already has a header. if lsn.remaining_in_block() as usize >= record.len() && lsn.block_offset() > 0 { return record; @@ -173,31 +189,71 @@ impl WalGenerator { } [record, Bytes::from(vec![0; padding])].concat().into() } - - /// Generates a record with an arbitrary payload at the current LSN, then increments the LSN. - pub fn generate_record(&mut self, data: Bytes, rmid: u8, info: u8) -> Bytes { - let record = Self::encode_record(data, rmid, info, self.prev_lsn); - let record = Self::encode_pages(record, self.lsn); - let record = Self::pad_record(record, self.lsn); - self.prev_lsn = self.lsn; - self.lsn += record.len() as u64; - record - } - - /// Generates a logical message at the current LSN. Can be used to construct arbitrary messages. - pub fn generate_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> Bytes { - let data = Self::encode_logical_message(prefix, message); - self.generate_record(data, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE) - } } -/// Generate WAL records as an iterator. -impl Iterator for WalGenerator { +/// Generates WAL records as an iterator. +impl Iterator for WalGenerator { type Item = (Lsn, Bytes); fn next(&mut self) -> Option { - let lsn = self.lsn; - let record = self.generate_logical_message(Self::PREFIX, Self::MESSAGE); - Some((lsn, record)) + let record = self.record_generator.next()?; + Some(self.append_record(record)) + } +} + +/// Generates logical message records (effectively noops) with a fixed message. +pub struct LogicalMessageGenerator { + prefix: CString, + message: Vec, +} + +impl LogicalMessageGenerator { + const DB_ID: u32 = 0; // hardcoded for now + const RM_ID: RmgrId = RM_LOGICALMSG_ID; + const INFO: u8 = XLOG_LOGICAL_MESSAGE; + + /// Creates a new LogicalMessageGenerator. + pub fn new(prefix: &CStr, message: &[u8]) -> Self { + Self { + prefix: prefix.to_owned(), + message: message.to_owned(), + } + } + + /// Encodes a logical message. 
+ fn encode(prefix: &CStr, message: &[u8]) -> Bytes { + let prefix = prefix.to_bytes_with_nul(); + let header = XlLogicalMessage { + db_id: Self::DB_ID, + transactional: 0, + prefix_size: prefix.len() as u64, + message_size: message.len() as u64, + }; + [&header.encode(), prefix, message].concat().into() + } +} + +impl Iterator for LogicalMessageGenerator { + type Item = Record; + + fn next(&mut self) -> Option { + Some(Record { + rmid: Self::RM_ID, + info: Self::INFO, + data: Self::encode(&self.prefix, &self.message), + }) + } +} + +impl WalGenerator { + /// Convenience method for appending a WAL record with an arbitrary logical message at the + /// current WAL LSN position. Returns the start LSN and resulting WAL bytes. + pub fn append_logical_message(&mut self, prefix: &CStr, message: &[u8]) -> (Lsn, Bytes) { + let record = Record { + rmid: LogicalMessageGenerator::RM_ID, + info: LogicalMessageGenerator::INFO, + data: LogicalMessageGenerator::encode(prefix, message), + }; + self.append_record(record) } } diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs index 78a965174f..852b20eace 100644 --- a/libs/postgres_ffi/src/xlog_utils.rs +++ b/libs/postgres_ffi/src/xlog_utils.rs @@ -12,9 +12,9 @@ use super::bindings::{ CheckPoint, ControlFileData, DBState_DB_SHUTDOWNED, FullTransactionId, TimeLineID, TimestampTz, XLogLongPageHeaderData, XLogPageHeaderData, XLogRecPtr, XLogRecord, XLogSegNo, XLOG_PAGE_MAGIC, }; -use super::wal_generator::WalGenerator; +use super::wal_generator::LogicalMessageGenerator; use super::PG_MAJORVERSION; -use crate::pg_constants::{self, RM_LOGICALMSG_ID, XLOG_LOGICAL_MESSAGE}; +use crate::pg_constants; use crate::PG_TLI; use crate::{uint32, uint64, Oid}; use crate::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ}; @@ -493,12 +493,10 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Bytes { // This function can take untrusted input, so discard any NUL bytes in the prefix string. let prefix = CString::new(prefix.replace('\0', "")).expect("no NULs"); let message = message.as_bytes(); - WalGenerator::encode_record( - WalGenerator::encode_logical_message(&prefix, message), - RM_LOGICALMSG_ID, - XLOG_LOGICAL_MESSAGE, - Lsn(0), - ) + LogicalMessageGenerator::new(&prefix, message) + .next() + .unwrap() + .encode(Lsn(0)) } #[cfg(test)] diff --git a/safekeeper/tests/walproposer_sim/walproposer_disk.rs b/safekeeper/tests/walproposer_sim/walproposer_disk.rs index f70cd65dfc..aefb3919a1 100644 --- a/safekeeper/tests/walproposer_sim/walproposer_disk.rs +++ b/safekeeper/tests/walproposer_sim/walproposer_disk.rs @@ -1,7 +1,7 @@ use std::{ffi::CStr, sync::Arc}; use parking_lot::{Mutex, MutexGuard}; -use postgres_ffi::v16::wal_generator::WalGenerator; +use postgres_ffi::v16::wal_generator::{LogicalMessageGenerator, WalGenerator}; use utils::lsn::Lsn; use super::block_storage::BlockStorage; @@ -18,7 +18,7 @@ impl DiskWalProposer { internal_available_lsn: Lsn(0), prev_lsn: Lsn(0), disk: BlockStorage::new(), - wal_generator: WalGenerator::new(), + wal_generator: WalGenerator::new(LogicalMessageGenerator::new(c"", &[])), }), }) } @@ -36,7 +36,7 @@ pub struct State { // actual WAL storage disk: BlockStorage, // WAL record generator - wal_generator: WalGenerator, + wal_generator: WalGenerator, } impl State { @@ -64,7 +64,7 @@ impl State { /// Inserts a logical record in the WAL at the current LSN. 
pub fn insert_logical_message(&mut self, prefix: &CStr, msg: &[u8]) { - let record = self.wal_generator.generate_logical_message(prefix, msg); + let (_, record) = self.wal_generator.append_logical_message(prefix, msg); self.disk.write(self.internal_available_lsn.into(), &record); self.prev_lsn = self.internal_available_lsn; self.internal_available_lsn += record.len() as u64; From f54f0e8e2d4da4f2d297d73ffaceee37de4500d0 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 7 Nov 2024 14:29:52 +0300 Subject: [PATCH 183/239] Fix direct reading from WAL buffers. (#9639) Fix direct reading from WAL buffers. Pointer wasn't advanced which resulted in sending corrupted WAL if part of read used WAL buffers and part read from the file. Also move it to neon_walreader so that e.g. replication could also make use of it. ref https://github.com/neondatabase/cloud/issues/19567 --- pgxn/neon/neon_walreader.c | 11 ++++++++ pgxn/neon/walproposer.c | 40 +++++++++++++++------------ pgxn/neon/walproposer_pg.c | 32 ++++----------------- test_runner/fixtures/neon_fixtures.py | 12 ++++++-- 4 files changed, 48 insertions(+), 47 deletions(-) diff --git a/pgxn/neon/neon_walreader.c b/pgxn/neon/neon_walreader.c index b575712dbe..5854a7ef0f 100644 --- a/pgxn/neon/neon_walreader.c +++ b/pgxn/neon/neon_walreader.c @@ -611,6 +611,17 @@ NeonWALReadLocal(NeonWALReader *state, char *buf, XLogRecPtr startptr, Size coun recptr = startptr; nbytes = count; +/* Try to read directly from WAL buffers first. */ +#if PG_MAJORVERSION_NUM >= 17 + { + Size rbytes; + rbytes = WALReadFromBuffers(p, recptr, nbytes, tli); + recptr += rbytes; + nbytes -= rbytes; + p += rbytes; + } +#endif + while (nbytes > 0) { uint32 startoff; diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index d2a6104c74..e89ffdb628 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -1361,29 +1361,35 @@ SendAppendRequests(Safekeeper *sk) if (sk->active_state == SS_ACTIVE_READ_WAL) { char *errmsg; + int req_len; req = &sk->appendRequest; + req_len = req->endLsn - req->beginLsn; - switch (wp->api.wal_read(sk, - &sk->outbuf.data[sk->outbuf.len], - req->beginLsn, - req->endLsn - req->beginLsn, - &errmsg)) + /* We send zero sized AppenRequests as heartbeats; don't wal_read for these. 
*/ + if (req_len > 0) { - case NEON_WALREAD_SUCCESS: - break; - case NEON_WALREAD_WOULDBLOCK: - return true; - case NEON_WALREAD_ERROR: - wp_log(WARNING, "WAL reading for node %s:%s failed: %s", - sk->host, sk->port, errmsg); - ShutdownConnection(sk); - return false; - default: - Assert(false); + switch (wp->api.wal_read(sk, + &sk->outbuf.data[sk->outbuf.len], + req->beginLsn, + req_len, + &errmsg)) + { + case NEON_WALREAD_SUCCESS: + break; + case NEON_WALREAD_WOULDBLOCK: + return true; + case NEON_WALREAD_ERROR: + wp_log(WARNING, "WAL reading for node %s:%s failed: %s", + sk->host, sk->port, errmsg); + ShutdownConnection(sk); + return false; + default: + Assert(false); + } } - sk->outbuf.len += req->endLsn - req->beginLsn; + sk->outbuf.len += req_len; writeResult = wp->api.conn_async_write(sk, sk->outbuf.data, sk->outbuf.len); diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 706941c3f0..86444084ff 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1489,33 +1489,11 @@ walprop_pg_wal_read(Safekeeper *sk, char *buf, XLogRecPtr startptr, Size count, { NeonWALReadResult res; -#if PG_MAJORVERSION_NUM >= 17 - if (!sk->wp->config->syncSafekeepers) - { - Size rbytes; - rbytes = WALReadFromBuffers(buf, startptr, count, - walprop_pg_get_timeline_id()); - - startptr += rbytes; - count -= rbytes; - } -#endif - - if (count == 0) - { - res = NEON_WALREAD_SUCCESS; - } - else - { - Assert(count > 0); - - /* Now read the remaining WAL from the WAL file */ - res = NeonWALRead(sk->xlogreader, - buf, - startptr, - count, - walprop_pg_get_timeline_id()); - } + res = NeonWALRead(sk->xlogreader, + buf, + startptr, + count, + walprop_pg_get_timeline_id()); if (res == NEON_WALREAD_SUCCESS) { diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e4d6e6da5d..e23f46d1ca 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -4177,9 +4177,15 @@ class Safekeeper(LogUtils): return self def assert_no_errors(self): - assert not self.log_contains("manager task finished prematurely") - assert not self.log_contains("error while acquiring WalResidentTimeline guard") - assert not self.log_contains("timeout while acquiring WalResidentTimeline guard") + not_allowed = [ + "manager task finished prematurely", + "error while acquiring WalResidentTimeline guard", + "timeout while acquiring WalResidentTimeline guard", + "invalid xlog page header:", + "WAL record crc mismatch at", + ] + for na in not_allowed: + assert not self.log_contains(na) def append_logical_message( self, tenant_id: TenantId, timeline_id: TimelineId, request: dict[str, Any] From 01265b7bc66275a8f974fa42e8aeb8ad050404b3 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 7 Nov 2024 14:24:03 +0100 Subject: [PATCH 184/239] safekeeper: add basic WAL ingestion benchmarks (#9531) ## Problem We don't have any benchmarks for Safekeeper WAL ingestion. ## Summary of changes Add some basic benchmarks for WAL ingestion, specifically for `SafeKeeper::process_msg()` (single append) and `WalAcceptor` (pipelined batch ingestion). Also add some baseline file write benchmarks. 
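For orientation, a baseline file-write benchmark of the kind mentioned above can be sketched as follows. This is a standalone illustration, not the code added in `receive_wal.rs`; the 8 KiB block size, the `fsync` handling via `sync_data`, and the benchmark naming are assumptions made for the example.

```rust
use std::io::Write as _;

use criterion::{criterion_group, criterion_main, Criterion};

// Sequential appends to an anonymous temp file, with and without fsync, to give
// a lower bound to compare the WAL ingestion numbers against.
fn bench_file_write(c: &mut Criterion) {
    let mut group = c.benchmark_group("file_write");
    for fsync in [false, true] {
        group.bench_function(format!("fsync={fsync}"), |b| {
            let mut file = camino_tempfile::tempfile().unwrap();
            let block = vec![0u8; 8192];
            b.iter(|| {
                file.write_all(&block).unwrap();
                if fsync {
                    file.sync_data().unwrap();
                }
            });
        });
    }
    group.finish();
}

criterion_group!(benches, bench_file_write);
criterion_main!(benches);
```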
--- Cargo.lock | 2 + safekeeper/Cargo.toml | 6 + safekeeper/benches/README.md | 22 ++ safekeeper/benches/benchutils.rs | 102 +++++++++ safekeeper/benches/receive_wal.rs | 341 ++++++++++++++++++++++++++++++ safekeeper/src/lib.rs | 4 +- safekeeper/src/state.rs | 1 - safekeeper/src/timeline.rs | 61 ++++-- 8 files changed, 516 insertions(+), 23 deletions(-) create mode 100644 safekeeper/benches/README.md create mode 100644 safekeeper/benches/benchutils.rs create mode 100644 safekeeper/benches/receive_wal.rs diff --git a/Cargo.lock b/Cargo.lock index 9c2a0b455e..7d18f44aec 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5146,6 +5146,7 @@ dependencies = [ "chrono", "clap", "crc32c", + "criterion", "desim", "fail", "futures", @@ -5153,6 +5154,7 @@ dependencies = [ "http 1.1.0", "humantime", "hyper 0.14.30", + "itertools 0.10.5", "metrics", "once_cell", "parking_lot 0.12.1", diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index ec08d02240..85561e4aff 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -61,8 +61,14 @@ utils.workspace = true workspace_hack.workspace = true [dev-dependencies] +criterion.workspace = true +itertools.workspace = true walproposer.workspace = true rand.workspace = true desim.workspace = true tracing.workspace = true tracing-subscriber = { workspace = true, features = ["json"] } + +[[bench]] +name = "receive_wal" +harness = false diff --git a/safekeeper/benches/README.md b/safekeeper/benches/README.md new file mode 100644 index 0000000000..4119cc8d6e --- /dev/null +++ b/safekeeper/benches/README.md @@ -0,0 +1,22 @@ +## Safekeeper Benchmarks + +To run benchmarks: + +```sh +# All benchmarks. +cargo bench --package safekeeper + +# Specific file. +cargo bench --package safekeeper --bench receive_wal + +# Specific benchmark. +cargo bench --package safekeeper --bench receive_wal process_msg/fsync=false + +# List available benchmarks. +cargo bench --package safekeeper --benches -- --list +``` + +Additional charts and statistics are available in `target/criterion/report/index.html`. + +Benchmarks are automatically compared against the previous run. To compare against other runs, see +`--baseline` and `--save-baseline`. \ No newline at end of file diff --git a/safekeeper/benches/benchutils.rs b/safekeeper/benches/benchutils.rs new file mode 100644 index 0000000000..4e8dc58c49 --- /dev/null +++ b/safekeeper/benches/benchutils.rs @@ -0,0 +1,102 @@ +use std::sync::Arc; + +use camino_tempfile::Utf8TempDir; +use safekeeper::rate_limit::RateLimiter; +use safekeeper::safekeeper::{ProposerAcceptorMessage, ProposerElected, SafeKeeper, TermHistory}; +use safekeeper::state::{TimelinePersistentState, TimelineState}; +use safekeeper::timeline::{get_timeline_dir, SharedState, StateSK, Timeline}; +use safekeeper::timelines_set::TimelinesSet; +use safekeeper::wal_backup::remote_timeline_path; +use safekeeper::{control_file, wal_storage, SafeKeeperConf}; +use tokio::fs::create_dir_all; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + +/// A Safekeeper benchmarking environment. Uses a tempdir for storage, removed on drop. +pub struct Env { + /// Whether to enable fsync. + pub fsync: bool, + /// Benchmark directory. Deleted when dropped. + pub tempdir: Utf8TempDir, +} + +impl Env { + /// Creates a new benchmarking environment in a temporary directory. fsync controls whether to + /// enable fsyncing. 
+ pub fn new(fsync: bool) -> anyhow::Result { + let tempdir = camino_tempfile::tempdir()?; + Ok(Self { fsync, tempdir }) + } + + /// Constructs a Safekeeper config for the given node ID. + fn make_conf(&self, node_id: NodeId) -> SafeKeeperConf { + let mut conf = SafeKeeperConf::dummy(); + conf.my_id = node_id; + conf.no_sync = !self.fsync; + conf.workdir = self.tempdir.path().join(format!("safekeeper-{node_id}")); + conf + } + + /// Constructs a Safekeeper with the given node and tenant/timeline ID. + /// + /// TODO: we should support using in-memory storage, to measure non-IO costs. This would be + /// easier if SafeKeeper used trait objects for storage rather than generics. It's also not + /// currently possible to construct a timeline using non-file storage since StateSK only accepts + /// SafeKeeper. + pub async fn make_safekeeper( + &self, + node_id: NodeId, + ttid: TenantTimelineId, + ) -> anyhow::Result> { + let conf = self.make_conf(node_id); + + let timeline_dir = get_timeline_dir(&conf, &ttid); + create_dir_all(&timeline_dir).await?; + + let mut pstate = TimelinePersistentState::empty(); + pstate.tenant_id = ttid.tenant_id; + pstate.timeline_id = ttid.timeline_id; + + let wal = wal_storage::PhysicalStorage::new(&ttid, &timeline_dir, &pstate, conf.no_sync)?; + let ctrl = + control_file::FileStorage::create_new(&timeline_dir, pstate, conf.no_sync).await?; + let state = TimelineState::new(ctrl); + let mut safekeeper = SafeKeeper::new(state, wal, conf.my_id)?; + + // Emulate an initial election. + safekeeper + .process_msg(&ProposerAcceptorMessage::Elected(ProposerElected { + term: 1, + start_streaming_at: Lsn(0), + term_history: TermHistory(vec![(1, Lsn(0)).into()]), + timeline_start_lsn: Lsn(0), + })) + .await?; + + Ok(safekeeper) + } + + /// Constructs a timeline, including a new Safekeeper with the given node ID, and spawns its + /// manager task. + pub async fn make_timeline( + &self, + node_id: NodeId, + ttid: TenantTimelineId, + ) -> anyhow::Result> { + let conf = self.make_conf(node_id); + let timeline_dir = get_timeline_dir(&conf, &ttid); + let remote_path = remote_timeline_path(&ttid)?; + + let safekeeper = self.make_safekeeper(node_id, ttid).await?; + let shared_state = SharedState::new(StateSK::Loaded(safekeeper)); + + let timeline = Timeline::new(ttid, &timeline_dir, &remote_path, shared_state); + timeline.bootstrap( + &mut timeline.write_shared_state().await, + &conf, + Arc::new(TimelinesSet::default()), // ignored for now + RateLimiter::new(0, 0), + ); + Ok(timeline) + } +} diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs new file mode 100644 index 0000000000..e32d7526ca --- /dev/null +++ b/safekeeper/benches/receive_wal.rs @@ -0,0 +1,341 @@ +//! WAL ingestion benchmarks. + +#[path = "benchutils.rs"] +mod benchutils; + +use std::io::Write as _; + +use benchutils::Env; +use camino_tempfile::tempfile; +use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion}; +use itertools::Itertools as _; +use postgres_ffi::v17::wal_generator::{LogicalMessageGenerator, WalGenerator}; +use safekeeper::receive_wal::{self, WalAcceptor}; +use safekeeper::safekeeper::{ + AcceptorProposerMessage, AppendRequest, AppendRequestHeader, ProposerAcceptorMessage, +}; +use tokio::io::AsyncWriteExt as _; +use utils::id::{NodeId, TenantTimelineId}; +use utils::lsn::Lsn; + +const KB: usize = 1024; +const MB: usize = 1024 * KB; +const GB: usize = 1024 * MB; + +// Register benchmarks with Criterion. 
+criterion_group!( + benches, + bench_process_msg, + bench_wal_acceptor, + bench_wal_acceptor_throughput, + bench_file_write +); +criterion_main!(benches); + +/// Benchmarks SafeKeeper::process_msg() as time per message and throughput. Each message is an +/// AppendRequest with a single WAL record containing an XlLogicalMessage of varying size. When +/// measuring throughput, only the logical message payload is considered, excluding +/// segment/page/record headers. +fn bench_process_msg(c: &mut Criterion) { + let mut g = c.benchmark_group("process_msg"); + for fsync in [false, true] { + for commit in [false, true] { + for size in [8, KB, 8 * KB, 128 * KB, MB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way + // to vary it per benchmark. It works. + g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| { + run_bench(b, size, fsync, commit).unwrap() + }); + } + } + } + + // The actual benchmark. If commit is true, advance the commit LSN on every message. + fn run_bench(b: &mut Bencher, size: usize, fsync: bool, commit: bool) -> anyhow::Result<()> { + let runtime = tokio::runtime::Builder::new_current_thread() // single is fine, sync IO only + .enable_all() + .build()?; + + // Construct the payload. The prefix counts towards the payload (including NUL terminator). + let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(size >= prefixlen); + let message = vec![0; size - prefixlen]; + + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + + // Set up the Safekeeper. + let env = Env::new(fsync)?; + let mut safekeeper = + runtime.block_on(env.make_safekeeper(NodeId(1), TenantTimelineId::generate()))?; + + b.iter_batched_ref( + // Pre-construct WAL records and requests. Criterion will batch them. + || { + let (lsn, record) = walgen.next().expect("endless WAL"); + ProposerAcceptorMessage::AppendRequest(AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(0), + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }) + }, + // Benchmark message processing (time per message). + |msg| { + runtime + .block_on(safekeeper.process_msg(msg)) + .expect("message failed") + }, + BatchSize::SmallInput, // automatically determine a batch size + ); + Ok(()) + } +} + +/// Benchmarks WalAcceptor message processing time by sending it a batch of WAL records and waiting +/// for it to confirm that the last LSN has been flushed to storage. We pipeline a bunch of messages +/// instead of measuring each individual message to amortize costs (e.g. fsync), which is more +/// realistic. Records are XlLogicalMessage with a tiny payload (~64 bytes per record including +/// headers). Records are pre-constructed to avoid skewing the benchmark. +/// +/// TODO: add benchmarks with in-memory storage, see comment on `Env::make_safekeeper()`: +fn bench_wal_acceptor(c: &mut Criterion) { + let mut g = c.benchmark_group("wal_acceptor"); + for fsync in [false, true] { + for n in [1, 100, 10000] { + g.bench_function(format!("fsync={fsync}/n={n}"), |b| { + run_bench(b, n, fsync).unwrap() + }); + } + } + + /// The actual benchmark. n is the number of WAL records to send in a pipelined batch. 
+ fn run_bench(b: &mut Bencher, n: usize, fsync: bool) -> anyhow::Result<()> { + let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded + + let env = Env::new(fsync)?; + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(c"prefix", b"message")); + + // Create buffered channels that can fit all requests, to avoid blocking on channels. + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(n); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(n); + + // Spawn the WalAcceptor task. + runtime.block_on(async { + // TODO: WalAcceptor doesn't actually need a full timeline, only + // Safekeeper::process_msg(). Consider decoupling them to simplify the setup. + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate()) + .await? + .wal_residence_guard() + .await?; + WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0)); + anyhow::Ok(()) + })?; + + b.iter_batched( + // Pre-construct a batch of WAL records and requests. + || { + walgen + .take(n) + .map(|(lsn, record)| AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(0), + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: Lsn(0), + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }) + .collect_vec() + }, + // Benchmark batch ingestion (time per batch). + |reqs| { + runtime.block_on(async { + let final_lsn = reqs.last().unwrap().h.end_lsn; + // Stuff all the messages into the buffered channel to pipeline them. + for req in reqs { + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await.expect("send failed"); + } + // Wait for the last message to get flushed. + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= final_lsn { + return; + } + } + } + panic!("disconnected") + }) + }, + BatchSize::PerIteration, // only run one request batch at a time + ); + Ok(()) + } +} + +/// Benchmarks WalAcceptor throughput by sending 1 GB of data with varying message sizes and waiting +/// for the last LSN to be flushed to storage. Only the actual message payload counts towards +/// throughput, headers are excluded and considered overhead. Records are XlLogicalMessage. +/// +/// To avoid running out of memory, messages are constructed during the benchmark. +fn bench_wal_acceptor_throughput(c: &mut Criterion) { + const VOLUME: usize = GB; // NB: excludes message/page/segment headers and padding + + let mut g = c.benchmark_group("wal_acceptor_throughput"); + g.sample_size(10); + g.throughput(criterion::Throughput::Bytes(VOLUME as u64)); + + for fsync in [false, true] { + for commit in [false, true] { + for size in [KB, 8 * KB, 128 * KB, MB] { + assert_eq!(VOLUME % size, 0, "volume must be divisible by size"); + let count = VOLUME / size; + g.bench_function(format!("fsync={fsync}/commit={commit}/size={size}"), |b| { + run_bench(b, count, size, fsync, commit).unwrap() + }); + } + } + } + + /// The actual benchmark. size is the payload size per message, count is the number of messages. + /// If commit is true, advance the commit LSN on each message. + fn run_bench( + b: &mut Bencher, + count: usize, + size: usize, + fsync: bool, + commit: bool, + ) -> anyhow::Result<()> { + let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded + + // Construct the payload. The prefix counts towards the payload (including NUL terminator). 
+ let prefix = c"p"; + let prefixlen = prefix.to_bytes_with_nul().len(); + assert!(size >= prefixlen); + let message = vec![0; size - prefixlen]; + + let walgen = &mut WalGenerator::new(LogicalMessageGenerator::new(prefix, &message)); + + // Construct and spawn the WalAcceptor task. + let env = Env::new(fsync)?; + + let (msg_tx, msg_rx) = tokio::sync::mpsc::channel(receive_wal::MSG_QUEUE_SIZE); + let (reply_tx, mut reply_rx) = tokio::sync::mpsc::channel(receive_wal::REPLY_QUEUE_SIZE); + + runtime.block_on(async { + let tli = env + .make_timeline(NodeId(1), TenantTimelineId::generate()) + .await? + .wal_residence_guard() + .await?; + WalAcceptor::spawn(tli, msg_rx, reply_tx, Some(0)); + anyhow::Ok(()) + })?; + + // Ingest the WAL. + b.iter(|| { + runtime.block_on(async { + let reqgen = walgen.take(count).map(|(lsn, record)| AppendRequest { + h: AppendRequestHeader { + term: 1, + term_start_lsn: Lsn(0), + begin_lsn: lsn, + end_lsn: lsn + record.len() as u64, + commit_lsn: if commit { lsn } else { Lsn(0) }, // commit previous record + truncate_lsn: Lsn(0), + proposer_uuid: [0; 16], + }, + wal_data: record, + }); + + // Send requests. + for req in reqgen { + _ = reply_rx.try_recv(); // discard any replies, to avoid blocking + let msg = ProposerAcceptorMessage::AppendRequest(req); + msg_tx.send(msg).await.expect("send failed"); + } + + // Wait for last message to get flushed. + while let Some(reply) = reply_rx.recv().await { + if let AcceptorProposerMessage::AppendResponse(resp) = reply { + if resp.flush_lsn >= walgen.lsn { + return; + } + } + } + panic!("disconnected") + }) + }); + Ok(()) + } +} + +/// Benchmarks OS write throughput by appending blocks of a given size to a file. This is intended +/// to compare Tokio and stdlib writes, and give a baseline for optimal WAL throughput. +fn bench_file_write(c: &mut Criterion) { + let mut g = c.benchmark_group("file_write"); + + for kind in ["stdlib", "tokio"] { + for fsync in [false, true] { + for size in [8, KB, 8 * KB, 128 * KB, MB] { + // Kind of weird to change the group throughput per benchmark, but it's the only way to + // vary it per benchmark. It works. 
+ g.throughput(criterion::Throughput::Bytes(size as u64)); + g.bench_function( + format!("{kind}/fsync={fsync}/size={size}"), + |b| match kind { + "stdlib" => run_bench_stdlib(b, size, fsync).unwrap(), + "tokio" => run_bench_tokio(b, size, fsync).unwrap(), + name => panic!("unknown kind {name}"), + }, + ); + } + } + } + + fn run_bench_stdlib(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> { + let mut file = tempfile()?; + let buf = vec![0u8; size]; + + b.iter(|| { + file.write_all(&buf).unwrap(); + file.flush().unwrap(); + if fsync { + file.sync_data().unwrap(); + } + }); + + Ok(()) + } + + fn run_bench_tokio(b: &mut Bencher, size: usize, fsync: bool) -> anyhow::Result<()> { + let runtime = tokio::runtime::Runtime::new()?; // needs multithreaded + + let mut file = tokio::fs::File::from_std(tempfile()?); + let buf = vec![0u8; size]; + + b.iter(|| { + runtime.block_on(async { + file.write_all(&buf).await.unwrap(); + file.flush().await.unwrap(); + if fsync { + file.sync_data().await.unwrap(); + } + }) + }); + + Ok(()) + } +} diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index b1cddaf062..6d68b6b59b 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -112,9 +112,7 @@ impl SafeKeeperConf { } impl SafeKeeperConf { - #[cfg(test)] - #[allow(unused)] - fn dummy() -> Self { + pub fn dummy() -> Self { SafeKeeperConf { workdir: Utf8PathBuf::from("./"), no_sync: false, diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index 0826a148ec..b8925d785e 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -138,7 +138,6 @@ impl TimelinePersistentState { }) } - #[cfg(test)] pub fn empty() -> Self { TimelinePersistentState::new( &TenantTimelineId::empty(), diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index f0113978c4..fa91241177 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -2,7 +2,7 @@ //! to glue together SafeKeeper and all other background services. use anyhow::{anyhow, bail, Result}; -use camino::Utf8PathBuf; +use camino::{Utf8Path, Utf8PathBuf}; use remote_storage::RemotePath; use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; @@ -325,8 +325,17 @@ pub struct SharedState { } impl SharedState { + /// Creates a new SharedState. + pub fn new(sk: StateSK) -> Self { + Self { + sk, + peers_info: PeersInfo(vec![]), + wal_removal_on_hold: false, + } + } + /// Restore SharedState from control file. If file doesn't exist, bails out. - fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { + pub fn restore(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Result { let timeline_dir = get_timeline_dir(conf, ttid); let control_store = control_file::FileStorage::restore_new(&timeline_dir, conf.no_sync)?; if control_store.server.wal_seg_size == 0 { @@ -352,11 +361,7 @@ impl SharedState { } }; - Ok(Self { - sk, - peers_info: PeersInfo(vec![]), - wal_removal_on_hold: false, - }) + Ok(Self::new(sk)) } pub(crate) fn get_wal_seg_size(&self) -> usize { @@ -480,11 +485,13 @@ pub struct Timeline { } impl Timeline { - /// Load existing timeline from disk. - pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result> { - let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); - - let shared_state = SharedState::restore(conf, &ttid)?; + /// Constructs a new timeline. 
+ pub fn new( + ttid: TenantTimelineId, + timeline_dir: &Utf8Path, + remote_path: &RemotePath, + shared_state: SharedState, + ) -> Arc { let (commit_lsn_watch_tx, commit_lsn_watch_rx) = watch::channel(shared_state.sk.state().commit_lsn); let (term_flush_lsn_watch_tx, term_flush_lsn_watch_rx) = watch::channel(TermLsn::from(( @@ -494,10 +501,11 @@ impl Timeline { let (shared_state_version_tx, shared_state_version_rx) = watch::channel(0); let walreceivers = WalReceivers::new(); - let remote_path = remote_timeline_path(&ttid)?; - Ok(Arc::new(Timeline { + + Arc::new(Self { ttid, - remote_path, + remote_path: remote_path.to_owned(), + timeline_dir: timeline_dir.to_owned(), commit_lsn_watch_tx, commit_lsn_watch_rx, term_flush_lsn_watch_tx, @@ -508,13 +516,28 @@ impl Timeline { walsenders: WalSenders::new(walreceivers.clone()), walreceivers, cancel: CancellationToken::default(), - timeline_dir: get_timeline_dir(conf, &ttid), manager_ctl: ManagerCtl::new(), broker_active: AtomicBool::new(false), wal_backup_active: AtomicBool::new(false), last_removed_segno: AtomicU64::new(0), mgr_status: AtomicStatus::new(), - })) + }) + } + + /// Load existing timeline from disk. + pub fn load_timeline(conf: &SafeKeeperConf, ttid: TenantTimelineId) -> Result> { + let _enter = info_span!("load_timeline", timeline = %ttid.timeline_id).entered(); + + let shared_state = SharedState::restore(conf, &ttid)?; + let timeline_dir = get_timeline_dir(conf, &ttid); + let remote_path = remote_timeline_path(&ttid)?; + + Ok(Timeline::new( + ttid, + &timeline_dir, + &remote_path, + shared_state, + )) } /// Initialize fresh timeline on disk and start background tasks. If init @@ -1128,13 +1151,13 @@ async fn delete_dir(path: &Utf8PathBuf) -> Result { /// Get a path to the tenant directory. If you just need to get a timeline directory, /// use WalResidentTimeline::get_timeline_dir instead. -pub(crate) fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { +pub fn get_tenant_dir(conf: &SafeKeeperConf, tenant_id: &TenantId) -> Utf8PathBuf { conf.workdir.join(tenant_id.to_string()) } /// Get a path to the timeline directory. If you need to read WAL files from disk, /// use WalResidentTimeline::get_timeline_dir instead. This function does not check /// timeline eviction status and WAL files might not be present on disk. -pub(crate) fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { +pub fn get_timeline_dir(conf: &SafeKeeperConf, ttid: &TenantTimelineId) -> Utf8PathBuf { get_tenant_dir(conf, &ttid.tenant_id).join(ttid.timeline_id.to_string()) } From f18aa04b902204f2356b7ff67dc04bb4d3176327 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 7 Nov 2024 16:09:57 +0100 Subject: [PATCH 185/239] safekeeper: use `set_len()` to zero out segments (#9665) ## Problem When we create a new segment, we zero it out in order to avoid changing the length and fsyncing metadata on every write. However, we zeroed it out by writing 8 KB zero-pages, and Tokio file writes have non-trivial overhead. ## Summary of changes Zero out the segment using [`File::set_len()`](https://docs.rs/tokio/latest/i686-unknown-linux-gnu/tokio/fs/struct.File.html#method.set_len) instead. This will typically (depending on the filesystem) just write a sparse file and omit the 16 MB of data entirely. This improves WAL append throughput for large messages by over 400% with fsync disabled, and 100% with fsync enabled. 
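As a standalone illustration of the before/after difference described above (this is only a sketch with an assumed 16 MiB segment size and a plain `tokio::fs::File`, not the actual safekeeper code shown in the diff below):

```rust
use tokio::fs::File;
use tokio::io::AsyncWriteExt as _;

/// Illustrative segment size; the WAL segments discussed above are 16 MiB.
const SEG_SIZE: u64 = 16 * 1024 * 1024;

/// Old approach (sketch): fill the segment by writing 8 KiB zero pages.
async fn zero_fill(file: &mut File) -> std::io::Result<()> {
    let block = [0u8; 8192];
    let mut remaining = SEG_SIZE;
    while remaining > 0 {
        let n = remaining.min(block.len() as u64) as usize;
        file.write_all(&block[..n]).await?;
        remaining -= n as u64;
    }
    file.flush().await
}

/// New approach (sketch): extend the file via set_len(), which typically just
/// records the new length (a sparse file) instead of writing 16 MiB of zeroes.
async fn extend_with_set_len(file: &File) -> std::io::Result<()> {
    file.set_len(SEG_SIZE).await
}
```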
--- safekeeper/src/wal_storage.rs | 45 ++++++++----------- .../regress/test_wal_acceptor_async.py | 2 +- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 33b8bfe28e..4e67940c51 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -31,7 +31,6 @@ use crate::state::TimelinePersistentState; use crate::wal_backup::{read_object, remote_timeline_path}; use postgres_ffi::waldecoder::WalStreamDecoder; use postgres_ffi::XLogFileName; -use postgres_ffi::XLOG_BLCKSZ; use pq_proto::SystemId; use utils::{id::TenantTimelineId, lsn::Lsn}; @@ -223,6 +222,15 @@ impl PhysicalStorage { ) } + /// Call fsync if config requires so. + async fn fsync_file(&mut self, file: &File) -> Result<()> { + if !self.no_sync { + self.metrics + .observe_flush_seconds(time_io_closure(file.sync_all()).await?); + } + Ok(()) + } + /// Call fdatasync if config requires so. async fn fdatasync_file(&mut self, file: &File) -> Result<()> { if !self.no_sync { @@ -256,11 +264,15 @@ impl PhysicalStorage { // half initialized segment, first bake it under tmp filename and // then rename. let tmp_path = self.timeline_dir.join("waltmp"); - let mut file = File::create(&tmp_path) + let file = File::create(&tmp_path) .await .with_context(|| format!("Failed to open tmp wal file {:?}", &tmp_path))?; - write_zeroes(&mut file, self.wal_seg_size).await?; + fail::fail_point!("sk-zero-segment", |_| { + info!("sk-zero-segment failpoint hit"); + Err(anyhow::anyhow!("failpoint: sk-zero-segment")) + }); + file.set_len(self.wal_seg_size as u64).await?; // Note: this doesn't get into observe_flush_seconds metric. But // segment init should be separate metric, if any. @@ -486,12 +498,12 @@ impl Storage for PhysicalStorage { // Remove all segments after the given LSN. remove_segments_from_disk(&self.timeline_dir, self.wal_seg_size, |x| x > segno).await?; - let (mut file, is_partial) = self.open_or_create(segno).await?; + let (file, is_partial) = self.open_or_create(segno).await?; // Fill end with zeroes - file.seek(SeekFrom::Start(xlogoff as u64)).await?; - write_zeroes(&mut file, self.wal_seg_size - xlogoff).await?; - self.fdatasync_file(&file).await?; + file.set_len(xlogoff as u64).await?; + file.set_len(self.wal_seg_size as u64).await?; + self.fsync_file(&file).await?; if !is_partial { // Make segment partial once again @@ -751,25 +763,6 @@ impl WalReader { } } -/// Zero block for filling created WAL segments. -const ZERO_BLOCK: &[u8] = &[0u8; XLOG_BLCKSZ]; - -/// Helper for filling file with zeroes. -async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> { - fail::fail_point!("sk-write-zeroes", |_| { - info!("write_zeroes hit failpoint"); - Err(anyhow::anyhow!("failpoint: sk-write-zeroes")) - }); - - while count >= XLOG_BLCKSZ { - file.write_all(ZERO_BLOCK).await?; - count -= XLOG_BLCKSZ; - } - file.write_all(&ZERO_BLOCK[0..count]).await?; - file.flush().await?; - Ok(()) -} - /// Helper function for opening WAL segment `segno` in `dir`. Returns file and /// whether it is .partial. 
pub(crate) async fn open_wal_file( diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index 92306469f8..f328974264 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -602,7 +602,7 @@ async def run_segment_init_failure(env: NeonEnv): sk = env.safekeepers[0] sk_http = sk.http_client() - sk_http.configure_failpoints([("sk-write-zeroes", "return")]) + sk_http.configure_failpoints([("sk-zero-segment", "return")]) conn = await ep.connect_async() ep.safe_psql("select pg_switch_wal()") # jump to the segment boundary # next insertion should hang until failpoint is disabled. From a8d9939ea959ad8c00b376fb8e189f630d3013ab Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 7 Nov 2024 10:38:15 -0500 Subject: [PATCH 186/239] fix(pageserver): reduce aux compaction threshold (#9647) ref https://github.com/neondatabase/neon/issues/9441 The metrics from LR publisher testing project: ~300KB aux key deltas per 256MB files. Therefore, I think we can do compaction more aggressively as these deltas are small and compaction can reduce layer download latency. We also have a read path perf fix https://github.com/neondatabase/neon/pull/9631 but I'd still combine the read path fix with the reduce of the compaction threshold. ## Summary of changes * reduce metadata compaction threshold * use num of L1 delta layers as an indicator for metadata compaction * dump more logs Signed-off-by: Alex Chi Z --- pageserver/src/pgdatadir_mapping.rs | 2 +- pageserver/src/tenant/timeline.rs | 10 ++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index 7b106569a4..7c1abbf3e2 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -45,7 +45,7 @@ use wal_decoder::serialized_batch::SerializedValueBatch; pub const MAX_AUX_FILE_DELTAS: usize = 1024; /// Max number of aux-file-related delta layers. The compaction will create a new image layer once this threshold is reached. -pub const MAX_AUX_FILE_V2_DELTAS: usize = 64; +pub const MAX_AUX_FILE_V2_DELTAS: usize = 16; #[derive(Debug)] pub enum LsnForTimestamp { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index ee823beca8..6e082aecf5 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4090,6 +4090,7 @@ impl Timeline { ) -> Result { // Metadata keys image layer creation. 
let mut reconstruct_state = ValuesReconstructState::default(); + let begin = Instant::now(); let data = self .get_vectored_impl(partition.clone(), lsn, &mut reconstruct_state, ctx) .await?; @@ -4106,14 +4107,11 @@ impl Timeline { (new_data, total_kb_retrieved / 1024, total_keys_retrieved) }; let delta_files_accessed = reconstruct_state.get_delta_layers_visited(); + let elapsed = begin.elapsed(); let trigger_generation = delta_files_accessed as usize >= MAX_AUX_FILE_V2_DELTAS; - debug!( - trigger_generation, - delta_files_accessed, - total_kb_retrieved, - total_keys_retrieved, - "generate metadata images" + info!( + "metadata key compaction: trigger_generation={trigger_generation}, delta_files_accessed={delta_files_accessed}, total_kb_retrieved={total_kb_retrieved}, total_keys_retrieved={total_keys_retrieved}, read_time={}s", elapsed.as_secs_f64() ); if !trigger_generation && mode == ImageLayerCreationMode::Try { From 75aa19aa2dcc3f8907acd0eeae9e7c866f226f23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 7 Nov 2024 17:13:50 +0100 Subject: [PATCH 187/239] Don't attach is_archived to debug output (#9679) We are in branches where we know its value already. --- pageserver/src/tenant/timeline/offload.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index cccf24e303..2dc461c28d 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -47,15 +47,12 @@ pub(crate) async fn offload_timeline( match is_archived { Some(true) => (), Some(false) => { - tracing::warn!(?is_archived, "tried offloading a non-archived timeline"); + tracing::warn!("tried offloading a non-archived timeline"); return Err(OffloadError::NotArchived); } None => { // This is legal: calls to this function can race with the timeline shutting down - tracing::info!( - ?is_archived, - "tried offloading a timeline whose remote storage is not initialized" - ); + tracing::info!("tried offloading a timeline whose remote storage is not initialized"); return Err(OffloadError::Cancelled); } } From 82e3f0ecba8542cd5d1a95bf6d938aacbc073905 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Thu, 7 Nov 2024 16:24:38 +0000 Subject: [PATCH 188/239] [proxy/authorize]: improve JWKS reliability (#9676) While setting up some tests, I noticed that we didn't support keycloak. They make use of encryption JWKs as well as signature ones. Our current jwks crate does not support parsing encryption keys which caused the entire jwk set to fail to parse. Switching to lazy parsing fixes this. Also while setting up tests, I couldn't use localhost jwks server as we require HTTPS and we were using webpki so it was impossible to add a custom CA. Enabling native roots addresses this possibility. I saw some of our current e2e tests against our custom JWKS in s3 were taking a while to fetch. I've added a timeout + retries to address this. 
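The core of the "lazy JWKS parsing" change can be pictured with a small, simplified sketch. It is hypothetical: the real types in the diff below live in the proxy and use the jose_jwk crate, while this version assumes serde_json's `raw_value` feature and plain `serde_json::Value` keys.

```rust
use serde::Deserialize;
use serde_json::value::RawValue;

/// Keep each JWK as raw JSON first, so one unparseable key (for example an
/// encryption key) no longer makes the whole key set fail to parse.
#[derive(Deserialize)]
struct LazyJwkSet<'a> {
    #[serde(borrow)]
    keys: Vec<&'a RawValue>,
}

/// Parse only the keys we can understand and that are intended for signing.
fn parse_signing_keys(body: &str) -> serde_json::Result<Vec<serde_json::Value>> {
    let set: LazyJwkSet<'_> = serde_json::from_str(body)?;
    Ok(set
        .keys
        .iter()
        .filter_map(|raw| serde_json::from_str::<serde_json::Value>(raw.get()).ok())
        .filter(|key| {
            // Skip keys whose "use" is declared as something other than "sig".
            key.get("use")
                .and_then(|u| u.as_str())
                .map_or(true, |u| u == "sig")
        })
        .collect())
}
```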
--- Cargo.lock | 1 + proxy/Cargo.toml | 2 +- proxy/src/auth/backend/jwt.rs | 162 +++++++++++++++++++++++-- proxy/src/http/mod.rs | 22 ++-- proxy/src/serverless/conn_pool_lib.rs | 3 +- proxy/src/serverless/http_conn_pool.rs | 1 - workspace_hack/Cargo.toml | 2 +- 7 files changed, 168 insertions(+), 25 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7d18f44aec..00d58be2d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4743,6 +4743,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "rustls 0.22.4", + "rustls-native-certs 0.7.0", "rustls-pemfile 2.1.1", "rustls-pki-types", "serde", diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index efd336dbea..1665d6361a 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -60,7 +60,7 @@ prometheus.workspace = true rand.workspace = true regex.workspace = true remote_storage = { version = "0.1", path = "../libs/remote_storage/" } -reqwest.workspace = true +reqwest = { workspace = true, features = ["rustls-tls-native-roots"] } reqwest-middleware = { workspace = true, features = ["json"] } reqwest-retry.workspace = true reqwest-tracing.workspace = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 83c3617612..bfc674139b 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -7,8 +7,11 @@ use arc_swap::ArcSwapOption; use dashmap::DashMap; use jose_jwk::crypto::KeyInfo; use reqwest::{redirect, Client}; +use reqwest_retry::policies::ExponentialBackoff; +use reqwest_retry::RetryTransientMiddleware; use serde::de::Visitor; use serde::{Deserialize, Deserializer}; +use serde_json::value::RawValue; use signature::Verifier; use thiserror::Error; use tokio::time::Instant; @@ -16,7 +19,7 @@ use tokio::time::Instant; use crate::auth::backend::ComputeCredentialKeys; use crate::context::RequestMonitoring; use crate::control_plane::errors::GetEndpointJwksError; -use crate::http::parse_json_body_with_limit; +use crate::http::read_body_with_limit; use crate::intern::RoleNameInt; use crate::types::{EndpointId, RoleName}; @@ -28,6 +31,10 @@ const MAX_RENEW: Duration = Duration::from_secs(3600); const MAX_JWK_BODY_SIZE: usize = 64 * 1024; const JWKS_USER_AGENT: &str = "neon-proxy"; +const JWKS_CONNECT_TIMEOUT: Duration = Duration::from_secs(2); +const JWKS_FETCH_TIMEOUT: Duration = Duration::from_secs(5); +const JWKS_FETCH_RETRIES: u32 = 3; + /// How to get the JWT auth rules pub(crate) trait FetchAuthRules: Clone + Send + Sync + 'static { fn fetch_auth_rules( @@ -55,7 +62,7 @@ pub(crate) struct AuthRule { } pub struct JwkCache { - client: reqwest::Client, + client: reqwest_middleware::ClientWithMiddleware, map: DashMap<(EndpointId, RoleName), Arc>, } @@ -117,6 +124,14 @@ impl Default for JwkCacheEntryLock { } } +#[derive(Deserialize)] +struct JwkSet<'a> { + /// we parse into raw-value because not all keys in a JWKS are ones + /// we can parse directly, so we parse them lazily. + #[serde(borrow)] + keys: Vec<&'a RawValue>, +} + impl JwkCacheEntryLock { async fn acquire_permit<'a>(self: &'a Arc) -> JwkRenewalPermit<'a> { JwkRenewalPermit::acquire_permit(self).await @@ -130,7 +145,7 @@ impl JwkCacheEntryLock { &self, _permit: JwkRenewalPermit<'_>, ctx: &RequestMonitoring, - client: &reqwest::Client, + client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, auth_rules: &F, ) -> Result, JwtError> { @@ -154,22 +169,73 @@ impl JwkCacheEntryLock { let req = client.get(rule.jwks_url.clone()); // TODO(conrad): eventually switch to using reqwest_middleware/`new_client_with_timeout`. 
// TODO(conrad): We need to filter out URLs that point to local resources. Public internet only. - match req.send().await.and_then(|r| r.error_for_status()) { + match req.send().await.and_then(|r| { + r.error_for_status() + .map_err(reqwest_middleware::Error::Reqwest) + }) { // todo: should we re-insert JWKs if we want to keep this JWKs URL? // I expect these failures would be quite sparse. Err(e) => tracing::warn!(url=?rule.jwks_url, error=?e, "could not fetch JWKs"), Ok(r) => { let resp: http::Response = r.into(); - match parse_json_body_with_limit::( - resp.into_body(), - MAX_JWK_BODY_SIZE, - ) - .await + + let bytes = match read_body_with_limit(resp.into_body(), MAX_JWK_BODY_SIZE) + .await { + Ok(bytes) => bytes, + Err(e) => { + tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); + continue; + } + }; + + match serde_json::from_slice::(&bytes) { Err(e) => { tracing::warn!(url=?rule.jwks_url, error=?e, "could not decode JWKs"); } Ok(jwks) => { + // size_of::<&RawValue>() == 16 + // size_of::() == 288 + // better to not pre-allocate this as it might be pretty large - especially if it has many + // keys we don't want or need. + // trivial 'attack': `{"keys":[` + repeat(`0`).take(30000).join(`,`) + `]}` + // this would consume 8MiB just like that! + let mut keys = vec![]; + let mut failed = 0; + for key in jwks.keys { + match serde_json::from_str::(key.get()) { + Ok(key) => { + // if `use` (called `cls` in rust) is specified to be something other than signing, + // we can skip storing it. + if key + .prm + .cls + .as_ref() + .is_some_and(|c| *c != jose_jwk::Class::Signing) + { + continue; + } + + keys.push(key); + } + Err(e) => { + tracing::debug!(url=?rule.jwks_url, failed=?e, "could not decode JWK"); + failed += 1; + } + } + } + keys.shrink_to_fit(); + + if failed > 0 { + tracing::warn!(url=?rule.jwks_url, failed, "could not decode JWKs"); + } + + if keys.is_empty() { + tracing::warn!(url=?rule.jwks_url, "no valid JWKs found inside the response body"); + continue; + } + + let jwks = jose_jwk::JwkSet { keys }; key_sets.insert( rule.id, KeySet { @@ -179,7 +245,7 @@ impl JwkCacheEntryLock { }, ); } - } + }; } } } @@ -196,7 +262,7 @@ impl JwkCacheEntryLock { async fn get_or_update_jwk_cache( self: &Arc, ctx: &RequestMonitoring, - client: &reqwest::Client, + client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, fetch: &F, ) -> Result, JwtError> { @@ -250,7 +316,7 @@ impl JwkCacheEntryLock { self: &Arc, ctx: &RequestMonitoring, jwt: &str, - client: &reqwest::Client, + client: &reqwest_middleware::ClientWithMiddleware, endpoint: EndpointId, role_name: &RoleName, fetch: &F, @@ -369,8 +435,19 @@ impl Default for JwkCache { let client = Client::builder() .user_agent(JWKS_USER_AGENT) .redirect(redirect::Policy::none()) + .tls_built_in_native_certs(true) + .connect_timeout(JWKS_CONNECT_TIMEOUT) + .timeout(JWKS_FETCH_TIMEOUT) .build() - .expect("using &str and standard redirect::Policy"); + .expect("client config should be valid"); + + // Retry up to 3 times with increasing intervals between attempts. 
+ let retry_policy = ExponentialBackoff::builder().build_with_max_retries(JWKS_FETCH_RETRIES); + + let client = reqwest_middleware::ClientBuilder::new(client) + .with(RetryTransientMiddleware::new_with_policy(retry_policy)) + .build(); + JwkCache { client, map: DashMap::default(), @@ -1209,4 +1286,63 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL } } } + + #[tokio::test] + async fn check_jwk_keycloak_regression() { + let (rs, valid_jwk) = new_rsa_jwk(RS1, "rs1".into()); + let valid_jwk = serde_json::to_value(valid_jwk).unwrap(); + + // This is valid, but we cannot parse it as we have no support for encryption JWKs, only signature based ones. + // This is taken directly from keycloak. + let invalid_jwk = serde_json::json! { + { + "kid": "U-Jc9xRli84eNqRpYQoIPF-GNuRWV3ZvAIhziRW2sbQ", + "kty": "RSA", + "alg": "RSA-OAEP", + "use": "enc", + "n": "yypYWsEKmM_wWdcPnSGLSm5ytw1WG7P7EVkKSulcDRlrM6HWj3PR68YS8LySYM2D9Z-79oAdZGKhIfzutqL8rK1vS14zDuPpAM-RWY3JuQfm1O_-1DZM8-07PmVRegP5KPxsKblLf_My8ByH6sUOIa1p2rbe2q_b0dSTXYu1t0dW-cGL5VShc400YymvTwpc-5uYNsaVxZajnB7JP1OunOiuCJ48AuVp3PqsLzgoXqlXEB1ZZdch3xT3bxaTtNruGvG4xmLZY68O_T3yrwTCNH2h_jFdGPyXdyZToCMSMK2qSbytlfwfN55pT9Vv42Lz1YmoB7XRjI9aExKPc5AxFw", + "e": "AQAB", + "x5c": [ + "MIICmzCCAYMCBgGS41E6azANBgkqhkiG9w0BAQsFADARMQ8wDQYDVQQDDAZtYXN0ZXIwHhcNMjQxMDMxMTYwMTQ0WhcNMzQxMDMxMTYwMzI0WjARMQ8wDQYDVQQDDAZtYXN0ZXIwggEiMA0GCSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQDLKlhawQqYz/BZ1w+dIYtKbnK3DVYbs/sRWQpK6VwNGWszodaPc9HrxhLwvJJgzYP1n7v2gB1kYqEh/O62ovysrW9LXjMO4+kAz5FZjcm5B+bU7/7UNkzz7Ts+ZVF6A/ko/GwpuUt/8zLwHIfqxQ4hrWnatt7ar9vR1JNdi7W3R1b5wYvlVKFzjTRjKa9PClz7m5g2xpXFlqOcHsk/U66c6K4InjwC5Wnc+qwvOCheqVcQHVll1yHfFPdvFpO02u4a8bjGYtljrw79PfKvBMI0faH+MV0Y/Jd3JlOgIxIwrapJvK2V/B83nmlP1W/jYvPViagHtdGMj1oTEo9zkDEXAgMBAAEwDQYJKoZIhvcNAQELBQADggEBAECYX59+Q9v6c9sb6Q0/C6IgLWG2nVCgVE1YWwIzz+68WrhlmNCRuPjY94roB+tc2tdHbj+Nh3LMzJk7L1KCQoW1+LPK6A6E8W9ad0YPcuw8csV2pUA3+H56exQMH0fUAPQAU7tXWvnQ7otcpV1XA8afn/NTMTsnxi9mSkor8MLMYQ3aeRyh1+LAchHBthWiltqsSUqXrbJF59u5p0ghquuKcWR3TXsA7klGYBgGU5KAJifr9XT87rN0bOkGvbeWAgKvnQnjZwxdnLqTfp/pRY/PiJJHhgIBYPIA7STGnMPjmJ995i34zhnbnd8WHXJA3LxrIMqLW/l8eIdvtM1w8KI=" + ], + "x5t": "QhfzMMnuAfkReTgZ1HtrfyOeeZs", + "x5t#S256": "cmHDUdKgLiRCEN28D5FBy9IJLFmR7QWfm77SLhGTCTU" + } + }; + + let jwks = serde_json::json! 
{{ "keys": [invalid_jwk, valid_jwk ] }}; + let jwks_addr = jwks_server(move |path| match path { + "/" => Some(serde_json::to_vec(&jwks).unwrap()), + _ => None, + }) + .await; + + let role_name = RoleName::from("anonymous"); + let role = RoleNameInt::from(&role_name); + + let rules = vec![AuthRule { + id: "foo".to_owned(), + jwks_url: format!("http://{jwks_addr}/").parse().unwrap(), + audience: None, + role_names: vec![role], + }]; + + let fetch = Fetch(rules); + let jwk_cache = JwkCache::default(); + + let endpoint = EndpointId::from("ep"); + + let token = new_rsa_jwt("rs1".into(), rs); + + jwk_cache + .check_jwt( + &RequestMonitoring::test(), + endpoint.clone(), + &role_name, + &fetch, + &token, + ) + .await + .unwrap(); + } } diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index f1b632e704..b1642cedb3 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -6,7 +6,6 @@ pub mod health_server; use std::time::Duration; -use anyhow::bail; use bytes::Bytes; use http::Method; use http_body_util::BodyExt; @@ -16,7 +15,7 @@ use reqwest_middleware::RequestBuilder; pub(crate) use reqwest_middleware::{ClientWithMiddleware, Error}; pub(crate) use reqwest_retry::policies::ExponentialBackoff; pub(crate) use reqwest_retry::RetryTransientMiddleware; -use serde::de::DeserializeOwned; +use thiserror::Error; use crate::metrics::{ConsoleRequest, Metrics}; use crate::url::ApiUrl; @@ -122,10 +121,19 @@ impl Endpoint { } } -pub(crate) async fn parse_json_body_with_limit( +#[derive(Error, Debug)] +pub(crate) enum ReadBodyError { + #[error("Content length exceeds limit of {limit} bytes")] + BodyTooLarge { limit: usize }, + + #[error(transparent)] + Read(#[from] reqwest::Error), +} + +pub(crate) async fn read_body_with_limit( mut b: impl Body + Unpin, limit: usize, -) -> anyhow::Result { +) -> Result, ReadBodyError> { // We could use `b.limited().collect().await.to_bytes()` here // but this ends up being slightly more efficient as far as I can tell. @@ -133,20 +141,20 @@ pub(crate) async fn parse_json_body_with_limit( // in reqwest, this value is influenced by the Content-Length header. let lower_bound = match usize::try_from(b.size_hint().lower()) { Ok(bound) if bound <= limit => bound, - _ => bail!("Content length exceeds limit of {limit} bytes"), + _ => return Err(ReadBodyError::BodyTooLarge { limit }), }; let mut bytes = Vec::with_capacity(lower_bound); while let Some(frame) = b.frame().await.transpose()? { if let Ok(data) = frame.into_data() { if bytes.len() + data.len() > limit { - bail!("Content length exceeds limit of {limit} bytes") + return Err(ReadBodyError::BodyTooLarge { limit }); } bytes.extend_from_slice(&data); } } - Ok(serde_json::from_slice::(&bytes)?) 
+ Ok(bytes) } #[cfg(test)] diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index 00a8ac4768..61c39c32c9 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -16,8 +16,7 @@ use super::http_conn_pool::ClientDataHttp; use super::local_conn_pool::ClientDataLocal; use crate::auth::backend::ComputeUserInfo; use crate::context::RequestMonitoring; -use crate::control_plane::messages::ColdStartInfo; -use crate::control_plane::messages::MetricsAuxInfo; +use crate::control_plane::messages::{ColdStartInfo, MetricsAuxInfo}; use crate::metrics::{HttpEndpointPoolsGuard, Metrics}; use crate::types::{DbName, EndpointCacheKey, RoleName}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 56be70abec..a1d4473b01 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -7,7 +7,6 @@ use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; -use std::result::Result::Ok; use tokio::net::TcpStream; use tracing::{debug, error, info, info_span, Instrument}; diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 02deecd385..ae4018a884 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -64,7 +64,7 @@ rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } regex-syntax = { version = "0.8" } -reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "stream"] } +reqwest = { version = "0.12", default-features = false, features = ["blocking", "json", "rustls-tls", "rustls-tls-native-roots", "stream"] } rustls = { version = "0.23", default-features = false, features = ["logging", "ring", "std", "tls12"] } scopeguard = { version = "1" } serde = { version = "1", features = ["alloc", "derive"] } From 9132d80aa3e20c58878b945b86c442bf7756a30e Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Thu, 7 Nov 2024 19:00:25 +0100 Subject: [PATCH 189/239] add pgcopydb tool to build tools image (#9658) ## Problem build-tools image does not provide superuser, so additional packages can not be installed during GitHub benchmarking workflows but need to be added to the image ## Summary of changes install pgcopydb version 0.17-1 or higher into build-tools bookworm image ```bash docker run -it neondatabase/build-tools:-bookworm-arm64 /bin/bash ... 
nonroot@c23c6f4901ce:~$ LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; 13:58:19.768 8 INFO Running pgcopydb version 0.17 from "/pgcopydb/bin/pgcopydb" pgcopydb version 0.17 compiled with PostgreSQL 16.4 (Debian 16.4-1.pgdg120+2) on aarch64-unknown-linux-gnu, compiled by gcc (Debian 12.2.0-14) 12.2.0, 64-bit compatible with Postgres 11, 12, 13, 14, 15, and 16 ``` Example usage of that image in a workflow https://github.com/neondatabase/neon/actions/runs/11725718371/job/32662681172#step:7:14 --- build-tools.Dockerfile | 66 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 63 insertions(+), 3 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 93f1e48afa..c1190b13f4 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -1,12 +1,66 @@ ARG DEBIAN_VERSION=bullseye -FROM debian:${DEBIAN_VERSION}-slim +FROM debian:bookworm-slim AS pgcopydb_builder +ARG DEBIAN_VERSION + +RUN if [ "${DEBIAN_VERSION}" = "bookworm" ]; then \ + set -e && \ + apt update && \ + apt install -y --no-install-recommends \ + ca-certificates wget gpg && \ + wget -qO - https://www.postgresql.org/media/keys/ACCC4CF8.asc | gpg --dearmor -o /usr/share/keyrings/postgresql-keyring.gpg && \ + echo "deb [signed-by=/usr/share/keyrings/postgresql-keyring.gpg] http://apt.postgresql.org/pub/repos/apt bookworm-pgdg main" > /etc/apt/sources.list.d/pgdg.list && \ + apt-get update && \ + apt install -y --no-install-recommends \ + build-essential \ + autotools-dev \ + libedit-dev \ + libgc-dev \ + libpam0g-dev \ + libreadline-dev \ + libselinux1-dev \ + libxslt1-dev \ + libssl-dev \ + libkrb5-dev \ + zlib1g-dev \ + liblz4-dev \ + libpq5 \ + libpq-dev \ + libzstd-dev \ + postgresql-16 \ + postgresql-server-dev-16 \ + postgresql-common \ + python3-sphinx && \ + wget -O /tmp/pgcopydb.tar.gz https://github.com/dimitri/pgcopydb/archive/refs/tags/v0.17.tar.gz && \ + mkdir /tmp/pgcopydb && \ + tar -xzf /tmp/pgcopydb.tar.gz -C /tmp/pgcopydb --strip-components=1 && \ + cd /tmp/pgcopydb && \ + make -s clean && \ + make -s -j12 install && \ + libpq_path=$(find /lib /usr/lib -name "libpq.so.5" | head -n 1) && \ + mkdir -p /pgcopydb/lib && \ + cp "$libpq_path" /pgcopydb/lib/; \ + else \ + # copy command below will fail if we don't have dummy files, so we create them for other debian versions + mkdir -p /usr/lib/postgresql/16/bin && touch /usr/lib/postgresql/16/bin/pgcopydb && \ + mkdir -p mkdir -p /pgcopydb/lib && touch /pgcopydb/lib/libpq.so.5; \ + fi + +FROM debian:${DEBIAN_VERSION}-slim AS build_tools ARG DEBIAN_VERSION # Add nonroot user RUN useradd -ms /bin/bash nonroot -b /home SHELL ["/bin/bash", "-c"] +RUN mkdir -p /pgcopydb/bin && \ + mkdir -p /pgcopydb/lib && \ + chmod -R 755 /pgcopydb && \ + chown -R nonroot:nonroot /pgcopydb + +COPY --from=pgcopydb_builder /usr/lib/postgresql/16/bin/pgcopydb /pgcopydb/bin/pgcopydb +COPY --from=pgcopydb_builder /pgcopydb/lib/libpq.so.5 /pgcopydb/lib/libpq.so.5 + # System deps # # 'gdb' is included so that we get backtraces of core dumps produced in @@ -38,7 +92,7 @@ RUN set -e \ libseccomp-dev \ libsqlite3-dev \ libssl-dev \ - $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && libstdc++-10-dev || libstdc++-11-dev) \ + $([[ "${DEBIAN_VERSION}" = "bullseye" ]] && echo libstdc++-10-dev || echo libstdc++-11-dev) \ libtool \ libxml2-dev \ libxmlsec1-dev \ @@ -235,7 +289,13 @@ RUN whoami \ && cargo --version --verbose \ && rustup --version --verbose \ && rustc --version --verbose \ - && clang --version + && clang --version + +RUN if [ "${DEBIAN_VERSION}" = 
"bookworm" ]; then \ + LD_LIBRARY_PATH=/pgcopydb/lib /pgcopydb/bin/pgcopydb --version; \ +else \ + echo "pgcopydb is not available for ${DEBIAN_VERSION}"; \ +fi # Set following flag to check in Makefile if its running in Docker RUN touch /home/nonroot/.docker_build From 79929bb1b654391e63d0f02f668993834806b837 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 8 Nov 2024 10:35:03 +0200 Subject: [PATCH 190/239] Disable `rust_2024_compatibility` lint option (#9615) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compiling with nightly rust compiler, I'm getting a lot of errors like this: error: `if let` assigns a shorter lifetime since Edition 2024 --> proxy/src/auth/backend/jwt.rs:226:16 | 226 | if let Some(permit) = self.try_acquire_permit() { | ^^^^^^^^^^^^^^^^^^^------------------------- | | | this value has a significant drop implementation which may observe a major change in drop order and requires your discretion | = warning: this changes meaning in Rust 2024 = note: for more information, see issue #124085 help: the value is now dropped here in Edition 2024 --> proxy/src/auth/backend/jwt.rs:241:13 | 241 | } else { | ^ note: the lint level is defined here --> proxy/src/lib.rs:8:5 | 8 | rust_2024_compatibility | ^^^^^^^^^^^^^^^^^^^^^^^ = note: `#[deny(if_let_rescope)]` implied by `#[deny(rust_2024_compatibility)]` and this: error: these values and local bindings have significant drop implementation that will have a different drop order from that of Edition 2021 --> proxy/src/auth/backend/jwt.rs:376:18 | 369 | let client = Client::builder() | ------ these values have significant drop implementation and will observe changes in drop order under Edition 2024 ... 376 | map: DashMap::default(), | ^^^^^^^^^^^^^^^^^^ | = warning: this changes meaning in Rust 2024 = note: for more information, see issue #123739 = note: `#[deny(tail_expr_drop_order)]` implied by `#[deny(rust_2024_compatibility)]` They are caused by the `rust_2024_compatibility` lint option. When we actually switch to the 2024 edition, it makes sense to go through all these and check that the drop order changes don't break anything, but in the meanwhile, there's no easy way to avoid these errors. Disable it, to allow compiling with nightly again. Co-authored-by: Arpad Müller --- proxy/src/lib.rs | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index f95d645c23..ad7e1d2771 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -1,12 +1,6 @@ // rustc lints/lint groups // https://doc.rust-lang.org/rustc/lints/groups.html -#![deny( - deprecated, - future_incompatible, - let_underscore, - nonstandard_style, - rust_2024_compatibility -)] +#![deny(deprecated, future_incompatible, let_underscore, nonstandard_style)] #![warn(clippy::all, clippy::pedantic, clippy::cargo)] // List of denied lints from the clippy::restriction group. // https://rust-lang.github.io/rust-clippy/master/index.html#?groups=restriction From 027889b06ca9324604575183d84aede5f0c4c906 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?JC=20Gr=C3=BCnhage?= Date: Fri, 8 Nov 2024 10:44:59 +0100 Subject: [PATCH 191/239] ci: use set-docker-config-dir from dev-actions (#9638) set-docker-config-dir was replicated over multiple repositories. 
The replica of this action was removed from this repository and it's using the version from github.com/neondatabase/dev-actions instead --- .../actions/set-docker-config-dir/action.yml | 36 ------------------- .github/workflows/build-build-tools-image.yml | 2 +- .github/workflows/build_and_test.yml | 8 ++--- 3 files changed, 5 insertions(+), 41 deletions(-) delete mode 100644 .github/actions/set-docker-config-dir/action.yml diff --git a/.github/actions/set-docker-config-dir/action.yml b/.github/actions/set-docker-config-dir/action.yml deleted file mode 100644 index 3ee8bec8c6..0000000000 --- a/.github/actions/set-docker-config-dir/action.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: "Set custom docker config directory" -description: "Create a directory for docker config and set DOCKER_CONFIG" - -# Use custom DOCKER_CONFIG directory to avoid conflicts with default settings -runs: - using: "composite" - steps: - - name: Show warning on GitHub-hosted runners - if: runner.environment == 'github-hosted' - shell: bash -euo pipefail {0} - run: | - # Using the following environment variables to find a path to the workflow file - # ${GITHUB_WORKFLOW_REF} - octocat/hello-world/.github/workflows/my-workflow.yml@refs/heads/my_branch - # ${GITHUB_REPOSITORY} - octocat/hello-world - # ${GITHUB_REF} - refs/heads/my_branch - # From https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/variables - - filename_with_ref=${GITHUB_WORKFLOW_REF#"$GITHUB_REPOSITORY/"} - filename=${filename_with_ref%"@$GITHUB_REF"} - - # https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/workflow-commands-for-github-actions#setting-a-warning-message - title='Unnecessary usage of `.github/actions/set-docker-config-dir`' - message='No need to use `.github/actions/set-docker-config-dir` action on GitHub-hosted runners' - echo "::warning file=${filename},title=${title}::${message}" - - - uses: pyTooling/Actions/with-post-step@74afc5a42a17a046c90c68cb5cfa627e5c6c5b6b # v1.0.7 - env: - DOCKER_CONFIG: .docker-custom-${{ github.run_id }}-${{ github.run_attempt }} - with: - main: | - mkdir -p "${DOCKER_CONFIG}" - echo DOCKER_CONFIG=${DOCKER_CONFIG} | tee -a $GITHUB_ENV - post: | - if [ -d "${DOCKER_CONFIG}" ]; then - rm -r "${DOCKER_CONFIG}" - fi diff --git a/.github/workflows/build-build-tools-image.yml b/.github/workflows/build-build-tools-image.yml index 10750089b2..82b065c524 100644 --- a/.github/workflows/build-build-tools-image.yml +++ b/.github/workflows/build-build-tools-image.yml @@ -64,7 +64,7 @@ jobs: - uses: actions/checkout@v4 - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bba51ddc92..bcf021a9a1 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -552,7 +552,7 @@ jobs: with: submodules: true - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -643,7 +643,7 @@ jobs: with: submodules: true - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/setup-buildx-action@v3 with: cache-binary: false @@ -824,7 
+824,7 @@ jobs: curl -fL https://github.com/neondatabase/autoscaling/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder chmod +x vm-builder - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} @@ -860,7 +860,7 @@ jobs: steps: - uses: actions/checkout@v4 - - uses: ./.github/actions/set-docker-config-dir + - uses: neondatabase/dev-actions/set-docker-config-dir@6094485bf440001c94a94a3f9e221e81ff6b6193 - uses: docker/login-action@v3 with: username: ${{ secrets.NEON_DOCKERHUB_USERNAME }}

From f18aa04b902204f2356b7ff67dc04bb4d3176327 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Fri, 8 Nov 2024 10:16:04 +0000
Subject: [PATCH 192/239] pageserver: add `no_sync` for use in regression tests (1/2) (#9677)

## Problem

In test environments, the `syncfs` that the pageserver does on startup can take a long time, as other tests running concurrently might have many gigabytes of dirty pages.

## Summary of changes

- Add a `no_sync` option to the pageserver's config.
- Skip syncfs on startup if this option is set.
- A subsequent PR (https://github.com/neondatabase/neon/pull/9678) will enable this by default in tests. We need to wait until after the next release to avoid breaking compat tests, which would fail if we set no_sync & use an old pageserver binary.

Q: Why is this a different mechanism than the safekeeper, which has a --no-sync CLI flag?
A: Because the way we manage pageservers in neon_local depends on the pageserver.toml containing the full configuration, whereas safekeepers have a config file which is neon-local-specific and can drive a CLI flag.

Q: Why is the option no_sync rather than sync?
A: For boolean configs with a dangerous value, it's preferable to make "false" the safe option, so that any downstream future config tooling that might have a "booleans are false by default" behavior (e.g. golang structs) is safe by default.

Q: Why only skip the syncfs, and not all fsyncs?
A: Skipping all fsyncs would require more code changes, and the most acute problem isn't the fsyncs themselves (these just slow down a running test), it's the syncfs (which makes pageserver startup slow as a result of _other_ tests).
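To make the behaviour concrete, the change described above reduces to a small gate around the existing startup sync. The following is only an illustrative sketch: the function name and the injected `sync_fn` closure are assumptions, not the real pageserver internals, which call `syncfs` on the tenants directory as shown in the diff below.

```rust
use std::time::Instant;

/// Illustrative sketch of the `no_sync` startup gate. `sync_fn` stands in for the
/// real syncfs call so the example stays self-contained.
fn maybe_sync_on_startup(
    no_sync: bool,
    sync_fn: impl FnOnce() -> std::io::Result<()>,
) -> std::io::Result<()> {
    if no_sync {
        eprintln!("Skipping syncfs on startup");
        return Ok(());
    }
    let started = Instant::now();
    sync_fn()?;
    eprintln!(
        "made tenant directory contents durable in {} ms",
        started.elapsed().as_millis()
    );
    Ok(())
}
```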
+ no_sync: true, } }) .collect(), diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index 9dc2a0c36b..032c88a829 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -225,6 +225,7 @@ pub struct PageServerConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + pub no_sync: bool, } impl Default for PageServerConf { @@ -235,6 +236,7 @@ impl Default for PageServerConf { listen_http_addr: String::new(), pg_auth_type: AuthType::Trust, http_auth_type: AuthType::Trust, + no_sync: false, } } } @@ -249,6 +251,8 @@ pub struct NeonLocalInitPageserverConf { pub listen_http_addr: String, pub pg_auth_type: AuthType, pub http_auth_type: AuthType, + #[serde(default, skip_serializing_if = "std::ops::Not::not")] + pub no_sync: bool, #[serde(flatten)] pub other: HashMap, } @@ -261,6 +265,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, other: _, } = conf; Self { @@ -269,6 +274,7 @@ impl From<&NeonLocalInitPageserverConf> for PageServerConf { listen_http_addr: listen_http_addr.clone(), pg_auth_type: *pg_auth_type, http_auth_type: *http_auth_type, + no_sync: *no_sync, } } } @@ -569,6 +575,8 @@ impl LocalEnv { listen_http_addr: String, pg_auth_type: AuthType, http_auth_type: AuthType, + #[serde(default)] + no_sync: bool, } let config_toml_path = dentry.path().join("pageserver.toml"); let config_toml: PageserverConfigTomlSubset = toml_edit::de::from_str( @@ -591,6 +599,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, } = config_toml; let IdentityTomlSubset { id: identity_toml_id, @@ -607,6 +616,7 @@ impl LocalEnv { listen_http_addr, pg_auth_type, http_auth_type, + no_sync, }; pageservers.push(conf); } diff --git a/control_plane/src/pageserver.rs b/control_plane/src/pageserver.rs index eab76e14c3..ae5e22ddc6 100644 --- a/control_plane/src/pageserver.rs +++ b/control_plane/src/pageserver.rs @@ -273,6 +273,7 @@ impl PageServerNode { ) })?; let args = vec!["-D", datadir_path_str]; + background_process::start_process( "pageserver", &datadir, diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 00cc426c3c..6de34fdd35 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -106,6 +106,8 @@ pub struct ConfigToml { pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, pub virtual_file_io_mode: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub no_sync: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -389,6 +391,7 @@ impl Default for ConfigToml { l0_flush: None, virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), + no_sync: None, } } } diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 782122139e..fe2a31167d 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -154,13 +154,17 @@ fn main() -> anyhow::Result<()> { }, }; - let started = Instant::now(); - syncfs(dirfd)?; - let elapsed = started.elapsed(); - info!( - elapsed_ms = elapsed.as_millis(), - "made tenant directory contents durable" - ); + if conf.no_sync { + info!("Skipping syncfs on startup"); + } else { + let started = Instant::now(); + syncfs(dirfd)?; + let elapsed = started.elapsed(); + info!( + elapsed_ms = elapsed.as_millis(), + "made tenant directory contents durable" + ); + } } // Initialize up failpoints support diff --git 
a/pageserver/src/config.rs b/pageserver/src/config.rs index 06d4326459..d62066ac22 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -178,6 +178,9 @@ pub struct PageServerConf { /// Direct IO settings pub virtual_file_io_mode: virtual_file::IoMode, + + /// Optionally disable disk syncs (unsafe!) + pub no_sync: bool, } /// Token for authentication to safekeepers @@ -332,6 +335,7 @@ impl PageServerConf { concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, tenant_config, + no_sync, } = config_toml; let mut conf = PageServerConf { @@ -409,6 +413,7 @@ impl PageServerConf { .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), + no_sync: no_sync.unwrap_or(false), }; // ------------------------------------------------------------ From 17c002b660a173bb6cdec07ae77103cd8580ee98 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 8 Nov 2024 14:54:58 +0200 Subject: [PATCH 193/239] Do not copy logical replication slots to replica (#9458) ## Problem Replication slots are now persisted using the AUX files mechanism and included in the basebackup when a replica is launched. These slots are not used by the replica, but they still hold WAL, which may cause local disk space exhaustion. ## Summary of changes Add a `--replica` parameter to the basebackup request and do not include replication slot state files in the basebackup for replicas. ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section.
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- compute_tools/src/compute.rs | 28 +++++++++-- .../test_physical_and_logical_replicaiton.py | 50 +++++++++++++++++++ 2 files changed, 73 insertions(+), 5 deletions(-) create mode 100644 test_runner/regress/test_physical_and_logical_replicaiton.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index d3e42fe618..0a8cb14058 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -364,11 +364,29 @@ impl ComputeNode { let pageserver_connect_micros = start_time.elapsed().as_micros() as u64; let basebackup_cmd = match lsn { - Lsn(0) => format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id), - _ => format!( - "basebackup {} {} {} --gzip", - spec.tenant_id, spec.timeline_id, lsn - ), + Lsn(0) => { + if spec.spec.mode != ComputeMode::Primary { + format!( + "basebackup {} {} --gzip --replica", + spec.tenant_id, spec.timeline_id + ) + } else { + format!("basebackup {} {} --gzip", spec.tenant_id, spec.timeline_id) + } + } + _ => { + if spec.spec.mode != ComputeMode::Primary { + format!( + "basebackup {} {} {} --gzip --replica", + spec.tenant_id, spec.timeline_id, lsn + ) + } else { + format!( + "basebackup {} {} {} --gzip", + spec.tenant_id, spec.timeline_id, lsn + ) + } + } }; let copyreader = client.copy_out(basebackup_cmd.as_str())?; diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py new file mode 100644 index 0000000000..ec14e08a14 --- /dev/null +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -0,0 +1,50 @@ +from __future__ import annotations + +import time + +from fixtures.neon_fixtures import NeonEnv, logical_replication_sync + + +def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): + env = neon_simple_env + + n_records = 100000 + + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + config_lines=["min_wal_size=32MB", "max_wal_size=64MB"], + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + p_cur.execute("create publication pub1 for table t") + + # start subscriber to primary + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)") + connstr = primary.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + time.sleep(1) + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + config_lines=["min_wal_size=32MB", "max_wal_size=64MB"], + ) + + s_con = secondary.connect() + s_cur = s_con.cursor() + + for pk in range(n_records): + p_cur.execute("insert into t (pk) values (%s)", (pk,)) + + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_records + + logical_replication_sync(vanilla_pg, primary) + assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records + + # Check that LR slot is not copied to replica + s_cur.execute("select count(*) from pg_replication_slots") + assert s_cur.fetchall()[0][0] == 0 From 3525d2e381c008904d05347742771f021325c6f8 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 09:15:38 -0600 Subject: [PATCH 194/239] Update TimescaleDB to 2.17.1 for PG 17 Signed-off-by: Tristan Partin --- 
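When bumping the pinned version like this, the new `TIMESCALEDB_CHECKSUM` can be recomputed with a few lines of Python; a minimal sketch, assuming network access and Python 3 standard library only, using the same tarball URL pattern as the Dockerfile's `wget` step:

```python
import hashlib
import urllib.request


def timescaledb_sha256(version: str) -> str:
    # Mirrors the tarball URL pattern used by the Dockerfile's wget step.
    url = f"https://github.com/timescale/timescaledb/archive/refs/tags/{version}.tar.gz"
    digest = hashlib.sha256()
    with urllib.request.urlopen(url) as resp:
        for chunk in iter(lambda: resp.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()


if __name__ == "__main__":
    # Should print the TIMESCALEDB_CHECKSUM value pinned in the Dockerfile change below for v17.
    print(timescaledb_sha256("2.17.1"))
```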
compute/compute-node.Dockerfile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index f070f66c0a..6efef9e969 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -559,8 +559,8 @@ RUN case "${PG_VERSION}" in \ export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \ ;; \ "v17") \ - export TIMESCALEDB_VERSION=2.17.0 \ - export TIMESCALEDB_CHECKSUM=155bf64391d3558c42f31ca0e523cfc6252921974f75298c9039ccad1c89811a \ + export TIMESCALEDB_VERSION=2.17.1 \ + export TIMESCALEDB_CHECKSUM=6277cf43f5695e23dae1c5cfeba00474d730b66ed53665a84b787a6bb1a57e28 \ ;; \ esac && \ wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \ From f561cbe1c709f07c507ffe642e975838ee430ef6 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 8 Nov 2024 10:35:27 -0500 Subject: [PATCH 195/239] fix(pageserver): drain upload queue before detaching ancestor (#9651) In INC-317 https://neondb.slack.com/archives/C033RQ5SPDH/p1730815677932209, we saw an interesting series of operations that would remove valid layer files existing in the layer map. * Timeline A starts compaction and generates an image layer Z but not uploading it yet. * Timeline B/C starts ancestor detaching (which should not affect timeline A) * The tenant gets restarted as part of the ancestor detaching process, without increasing the generation number. * Timeline A reloads, discovering the layer Z is a future layer, and schedules a **deletion into the deletion queue**. This means that the file will be deleted any time in the future. * Timeline A starts compaction and generates layer Z again, adding it to the layer map. Note that because we don't bump generation number during ancestor detach, it has the same filename + generation number as the original Z. * Timeline A deletes layer Z from s3 + disk, and now we have a dangling reference in the layer map, blocking all compaction/logical_size_calculation process. ## Summary of changes * We wait until all layers to be uploaded before shutting down the tenants in `Flush` mode. * Ancestor detach restarts now use this mode. * Ancestor detach also waits for remote queue completion before starting the detaching process. * The patch ensures that we don't have any future image layer (or something similar) after restart, but not fixing the underlying problem around generation numbers. --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 15 +++++++++++ pageserver/src/tenant/mgr.rs | 2 +- .../src/tenant/remote_timeline_client.rs | 12 +++++++++ pageserver/src/tenant/timeline.rs | 25 ++++++++++++++----- 4 files changed, 47 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 72eb3e7ade..d57bd98e95 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2169,6 +2169,21 @@ async fn timeline_detach_ancestor_handler( let ctx = RequestContext::new(TaskKind::DetachAncestor, DownloadBehavior::Download); let ctx = &ctx; + // Flush the upload queues of all timelines before detaching ancestor. We do the same thing again + // during shutdown. This early upload ensures the pageserver does not need to upload too many + // things and creates downtime during timeline reloads. 
+ for timeline in tenant.list_timelines() { + timeline + .remote_client + .wait_completion() + .await + .map_err(|e| { + ApiError::PreconditionFailed(format!("cannot drain upload queue: {e}").into()) + })?; + } + + tracing::info!("all timeline upload queues are drained"); + let timeline = tenant.get_timeline(timeline_id, true)?; let progress = timeline diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index a4c458b737..4fc9d740c8 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -1959,7 +1959,7 @@ impl TenantManager { attempt.before_reset_tenant(); let (_guard, progress) = utils::completion::channel(); - match tenant.shutdown(progress, ShutdownMode::Hard).await { + match tenant.shutdown(progress, ShutdownMode::Flush).await { Ok(()) => { slot_guard.drop_old_value().expect("it was just shutdown"); } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 0aa8d61036..b37c16e133 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2201,6 +2201,18 @@ impl RemoteTimelineClient { inner.initialized_mut()?; Ok(UploadQueueAccessor { inner }) } + + pub(crate) fn no_pending_work(&self) -> bool { + let inner = self.upload_queue.lock().unwrap(); + match &*inner { + UploadQueue::Uninitialized + | UploadQueue::Stopped(UploadQueueStopped::Uninitialized) => true, + UploadQueue::Stopped(UploadQueueStopped::Deletable(x)) => { + x.upload_queue_for_deletion.no_pending_work() + } + UploadQueue::Initialized(x) => x.no_pending_work(), + } + } } pub(crate) struct UploadQueueAccessor<'a> { diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 6e082aecf5..4d086df2d1 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -852,6 +852,10 @@ pub(crate) enum ShutdownMode { /// While we are flushing, we continue to accept read I/O for LSNs ingested before /// the call to [`Timeline::shutdown`]. FreezeAndFlush, + /// Only flush the layers to the remote storage without freezing any open layers. This is the + /// mode used by ancestor detach and any other operations that reloads a tenant but not increasing + /// the generation number. + Flush, /// Shut down immediately, without waiting for any open layers to flush. Hard, } @@ -1678,11 +1682,6 @@ impl Timeline { pub(crate) async fn shutdown(&self, mode: ShutdownMode) { debug_assert_current_span_has_tenant_and_timeline_id(); - let try_freeze_and_flush = match mode { - ShutdownMode::FreezeAndFlush => true, - ShutdownMode::Hard => false, - }; - // Regardless of whether we're going to try_freeze_and_flush // or not, stop ingesting any more data. Walreceiver only provides // cancellation but no "wait until gone", because it uses the Timeline::gate. @@ -1704,7 +1703,7 @@ impl Timeline { // ... and inform any waiters for newer LSNs that there won't be any. self.last_record_lsn.shutdown(); - if try_freeze_and_flush { + if let ShutdownMode::FreezeAndFlush = mode { if let Some((open, frozen)) = self .layers .read() @@ -1746,6 +1745,20 @@ impl Timeline { warn!("failed to freeze and flush: {e:#}"); } } + + // `self.remote_client.shutdown().await` above should have already flushed everything from the queue, but + // we also do a final check here to ensure that the queue is empty. 
+ if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } + } + + if let ShutdownMode::Flush = mode { + // drain the upload queue + self.remote_client.shutdown().await; + if !self.remote_client.no_pending_work() { + warn!("still have pending work in remote upload queue, but continuing shutting down anyways"); + } } // Signal any subscribers to our cancellation token to drop out From 30680d1f3289093b532ecf2a417b6fe3309ea57b Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 8 Nov 2024 17:00:31 +0000 Subject: [PATCH 196/239] tests: use tighter storcon scopes (#9696) ## Problem https://github.com/neondatabase/neon/pull/9596 did not update tests because that would've broken the compat tests. ## Summary of Changes Use the infra scope where possible. --- test_runner/fixtures/auth_tokens.py | 1 + test_runner/fixtures/neon_fixtures.py | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/test_runner/fixtures/auth_tokens.py b/test_runner/fixtures/auth_tokens.py index 8ebaf61e5e..be16be81de 100644 --- a/test_runner/fixtures/auth_tokens.py +++ b/test_runner/fixtures/auth_tokens.py @@ -45,3 +45,4 @@ class TokenScope(str, Enum): SAFEKEEPER_DATA = "safekeeperdata" TENANT = "tenant" SCRUBBER = "scrubber" + INFRA = "infra" diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e23f46d1ca..83c68794c3 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1782,7 +1782,7 @@ class NeonStorageController(MetricsGetter, LogUtils): self.request( "PUT", f"{self.api}/control/v1/node/{node_id}/drain", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def cancel_node_drain(self, node_id): @@ -1790,7 +1790,7 @@ self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}/drain", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def node_fill(self, node_id): @@ -1798,7 +1798,7 @@ self.request( "PUT", f"{self.api}/control/v1/node/{node_id}/fill", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def cancel_node_fill(self, node_id): @@ -1806,14 +1806,14 @@ self.request( "DELETE", f"{self.api}/control/v1/node/{node_id}/fill", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) def node_status(self, node_id): response = self.request( "GET", f"{self.api}/control/v1/node/{node_id}", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) return response.json() @@ -1829,7 +1829,7 @@ response = self.request( "GET", f"{self.api}/control/v1/node", - headers=self.headers(TokenScope.ADMIN), + headers=self.headers(TokenScope.INFRA), ) return response.json() From b6bc954c5d3846214ee0a38010dd0228a7c2d7f5 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 8 Nov 2024 17:32:56 +0000 Subject: [PATCH 197/239] CI: move check codestyle python to reusable workflow and run on a merge_group (#9683) ## Problem To prevent breaking main after the Python 3.11 PR gets merged, we need to enable the merge queue and run the `check-codestyle-python` job on it ## Summary of changes - Move `check-codestyle-python` to a reusable workflow - Run this workflow on `merge_group` event (a local equivalent of the checks is sketched below)
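For reference, the three checks the reusable workflow runs can be reproduced locally; a minimal sketch, assuming `poetry` and the project's dev dependencies (installed via `./scripts/pysync`) are already available:

```python
import subprocess
import sys

# The same checks the reusable workflow runs, in the same order.
CHECKS = [
    ["poetry", "run", "ruff", "check", "."],
    ["poetry", "run", "ruff", "format", "--check", "."],
    ["poetry", "run", "mypy", "."],
]


def main() -> int:
    for cmd in CHECKS:
        print("+", " ".join(cmd), flush=True)
        if subprocess.run(cmd).returncode != 0:
            return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())
```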
--- .github/workflows/_check-codestyle-python.yml | 37 +++++++++++++++ .github/workflows/build_and_test.yml | 34 ++------------ .github/workflows/pre-merge-checks.yml | 47 +++++++++++++++++++ 3 files changed, 89 insertions(+), 29 deletions(-) create mode 100644 .github/workflows/_check-codestyle-python.yml create mode 100644 .github/workflows/pre-merge-checks.yml diff --git a/.github/workflows/_check-codestyle-python.yml b/.github/workflows/_check-codestyle-python.yml new file mode 100644 index 0000000000..9ae28a1379 --- /dev/null +++ b/.github/workflows/_check-codestyle-python.yml @@ -0,0 +1,37 @@ +name: Check Codestyle Python + +on: + workflow_call: + inputs: + build-tools-image: + description: 'build-tools image' + required: true + type: string + +defaults: + run: + shell: bash -euxo pipefail {0} + +jobs: + check-codestyle-python: + runs-on: [ self-hosted, small ] + container: + image: ${{ inputs.build-tools-image }} + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + + steps: + - uses: actions/checkout@v4 + + - uses: actions/cache@v4 + with: + path: ~/.cache/pypoetry/virtualenvs + key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} + + - run: ./scripts/pysync + + - run: poetry run ruff check . + - run: poetry run ruff format --check . + - run: poetry run mypy . diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index bcf021a9a1..d415e20db8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -90,35 +90,10 @@ jobs: check-codestyle-python: needs: [ check-permissions, build-build-tools-image ] - runs-on: [ self-hosted, small ] - container: - image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm - credentials: - username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} - password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} - options: --init - - steps: - - name: Checkout - uses: actions/checkout@v4 - - - name: Cache poetry deps - uses: actions/cache@v4 - with: - path: ~/.cache/pypoetry/virtualenvs - key: v2-${{ runner.os }}-${{ runner.arch }}-python-deps-bookworm-${{ hashFiles('poetry.lock') }} - - - name: Install Python deps - run: ./scripts/pysync - - - name: Run `ruff check` to ensure code format - run: poetry run ruff check . - - - name: Run `ruff format` to ensure code format - run: poetry run ruff format --check . - - - name: Run mypy to check types - run: poetry run mypy . + uses: ./.github/workflows/_check-codestyle-python.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit check-codestyle-jsonnet: needs: [ check-permissions, build-build-tools-image ] @@ -141,6 +116,7 @@ jobs: # Check that the vendor/postgres-* submodules point to the # corresponding REL_*_STABLE_neon branches. check-submodules: + needs: [ check-permissions ] runs-on: ubuntu-22.04 steps: - name: Checkout diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml new file mode 100644 index 0000000000..40ce644eb6 --- /dev/null +++ b/.github/workflows/pre-merge-checks.yml @@ -0,0 +1,47 @@ +name: + +on: + merge_group: + branches: + - main + +# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. 
+permissions: {} + +jobs: + get-changed-files: + runs-on: ubuntu-22.04 + outputs: + any_changed: ${{ steps.src.outputs.any_changed }} + steps: + - uses: actions/checkout@v4 + - uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c # v45.0.3 + id: src + with: + files: | + .github/workflows/pre-merge-checks.yml + **/**.py + poetry.lock + pyproject.toml + + - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES + env: + ALL_CHANGED_FILES: ${{ steps.src.outputs.all_changed_files }} + run: echo "${ALL_CHANGED_FILES}" + + check-build-tools-image: + uses: ./.github/workflows/check-build-tools-image.yml + + build-build-tools-image: + needs: [ check-build-tools-image ] + uses: ./.github/workflows/build-build-tools-image.yml + with: + image-tag: ${{ needs.check-build-tools-image.outputs.image-tag }} + secrets: inherit + + check-codestyle-python: + needs: [ build-build-tools-image ] + uses: ./.github/workflows/_check-codestyle-python.yml + with: + build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm + secrets: inherit From 34a4eb6f2a7ddb9bd98b1b1f7b8959fa57b3007a Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 12:19:18 -0600 Subject: [PATCH 198/239] Switch compute-related locales to C.UTF-8 by default Right now, our environments create databases with the C locale, which is really unfortunate for users who have data stored in other languages that they want to analyze. For instance, show_trgm on Hebrew text currently doesn't work in staging or production. I don't envision this being the final solution. I think this is just a way to set a known value so the pageserver doesn't use its parent environment. The final solution to me is exposing initdb parameters to users in the console. Then they could use a different locale or encoding if they so chose. 
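The effective settings can be spot-checked against a running compute with any Postgres client; a minimal sketch, assuming `psycopg2` is installed and a `DATABASE_URL` environment variable points at the endpoint:

```python
import os

import psycopg2

# Connection string of the compute to inspect (assumed to be provided by the caller).
conn = psycopg2.connect(os.environ["DATABASE_URL"])
with conn, conn.cursor() as cur:
    cur.execute(
        "SELECT current_setting('lc_messages'), current_setting('lc_monetary'), "
        "current_setting('lc_numeric'), current_setting('lc_time')"
    )
    print("lc_* settings:", cur.fetchone())  # expected: all 'C.UTF-8'

    cur.execute(
        "SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()"
    )
    print("datcollate/datctype:", cur.fetchone())  # expected: ('C.UTF-8', 'C.UTF-8')
conn.close()
```

The expected values mirror what the new `test_compute_locales.py` regression test asserts.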
Signed-off-by: Tristan Partin --- compute_tools/src/config.rs | 6 ++ libs/pageserver_api/src/config.rs | 3 + libs/utils/scripts/restore_from_wal.sh | 39 ++++++++++++- pageserver/src/config.rs | 3 + pageserver/src/tenant.rs | 28 ++++++++-- test_runner/regress/test_compute_locales.py | 61 +++++++++++++++++++++ test_runner/regress/test_wal_restore.py | 2 + 7 files changed, 136 insertions(+), 6 deletions(-) create mode 100644 test_runner/regress/test_compute_locales.py diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 479100eb89..50e2a95e9d 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -73,6 +73,12 @@ pub fn write_postgres_conf( )?; } + // Locales + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + match spec.mode { ComputeMode::Primary => {} ComputeMode::Static(lsn) => { diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 6de34fdd35..4272181954 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -64,6 +64,7 @@ pub struct ConfigToml { #[serde(with = "humantime_serde")] pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, pub pg_distrib_dir: Option, @@ -276,6 +277,7 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; + pub const DEFAULT_LOCALE: &str = "C.UTF-8"; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; @@ -326,6 +328,7 @@ impl Default for ConfigToml { wal_redo_timeout: (humantime::parse_duration(DEFAULT_WAL_REDO_TIMEOUT) .expect("cannot parse default wal redo timeout")), superuser: (DEFAULT_SUPERUSER.to_string()), + locale: DEFAULT_LOCALE.to_string(), page_cache_size: (DEFAULT_PAGE_CACHE_SIZE), max_file_descriptors: (DEFAULT_MAX_FILE_DESCRIPTORS), pg_distrib_dir: None, // Utf8PathBuf::from("./pg_install"), // TODO: formely, this was std::env::current_dir() diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh index 316ec8ed0d..93448369a0 100755 --- a/libs/utils/scripts/restore_from_wal.sh +++ b/libs/utils/scripts/restore_from_wal.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash set -euxo pipefail @@ -6,9 +6,44 @@ PG_BIN=$1 WAL_PATH=$2 DATA_DIR=$3 PORT=$4 +PG_VERSION=$5 SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-) + +# The way that initdb is invoked must match how the pageserver runs initdb. 
+function initdb_with_args { + local cmd=( + "$PG_BIN"/initdb + -E utf8 + -U cloud_admin + -D "$DATA_DIR" + --locale 'C.UTF-8' + --lc-collate 'C.UTF-8' + --lc-ctype 'C.UTF-8' + --lc-messages 'C.UTF-8' + --lc-monetary 'C.UTF-8' + --lc-numeric 'C.UTF-8' + --lc-time 'C.UTF-8' + --sysid="$SYSID" + ) + + case "$PG_VERSION" in + 14) + # Postgres 14 and below didn't support --locale-provider + ;; + 15 | 16) + cmd+=(--locale-provider 'libc') + ;; + *) + # Postgres 17 added the builtin provider + cmd+=(--locale-provider 'builtin') + ;; + esac + + eval env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "${cmd[*]}" +} + rm -fr "$DATA_DIR" -env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID" +initdb_with_args echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index d62066ac22..b694a43599 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -69,6 +69,7 @@ pub struct PageServerConf { pub wal_redo_timeout: Duration, pub superuser: String, + pub locale: String, pub page_cache_size: usize, pub max_file_descriptors: usize, @@ -301,6 +302,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, pg_distrib_dir, @@ -348,6 +350,7 @@ impl PageServerConf { wait_lsn_timeout, wal_redo_timeout, superuser, + locale, page_cache_size, max_file_descriptors, http_auth_type, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d45c99a41b..34ea6dae1f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4779,10 +4779,18 @@ async fn run_initdb( let _permit = INIT_DB_SEMAPHORE.acquire().await; - let initdb_command = tokio::process::Command::new(&initdb_bin_path) + let mut initdb_command = tokio::process::Command::new(&initdb_bin_path); + initdb_command .args(["--pgdata", initdb_target_dir.as_ref()]) .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) + .args(["--locale", &conf.locale]) + .args(["--lc-collate", &conf.locale]) + .args(["--lc-ctype", &conf.locale]) + .args(["--lc-messages", &conf.locale]) + .args(["--lc-monetary", &conf.locale]) + .args(["--lc-numeric", &conf.locale]) + .args(["--lc-time", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() @@ -4792,15 +4800,27 @@ async fn run_initdb( // stdout invocation produces the same output every time, we don't need it .stdout(std::process::Stdio::null()) // we would be interested in the stderr output, if there was any - .stderr(std::process::Stdio::piped()) - .spawn()?; + .stderr(std::process::Stdio::piped()); + + // Before version 14, only the libc provide was available. + if pg_version > 14 { + // Version 17 brought with it a builtin locale provider which only provides + // C and C.UTF-8. While being safer for collation purposes since it is + // guaranteed to be consistent throughout a major release, it is also more + // performant. + let locale_provider = if pg_version >= 17 { "builtin" } else { "libc" }; + + initdb_command.args(["--locale-provider", locale_provider]); + } + + let initdb_proc = initdb_command.spawn()?; // Ideally we'd select here with the cancellation token, but the problem is that // we can't safely terminate initdb: it launches processes of its own, and killing // initdb doesn't kill them. 
After we return from this function, we want the target // directory to be able to be cleaned up. // See https://github.com/neondatabase/neon/issues/6385 - let initdb_output = initdb_command.wait_with_output().await?; + let initdb_output = initdb_proc.wait_with_output().await?; if !initdb_output.status.success() { return Err(InitdbError::Failed( initdb_output.status, diff --git a/test_runner/regress/test_compute_locales.py b/test_runner/regress/test_compute_locales.py new file mode 100644 index 0000000000..00ef32fb5e --- /dev/null +++ b/test_runner/regress/test_compute_locales.py @@ -0,0 +1,61 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, cast + +from fixtures.pg_version import PgVersion + +if TYPE_CHECKING: + from collections.abc import Sequence + + from fixtures.neon_fixtures import NeonEnv + + +def test_default_locales(neon_simple_env: NeonEnv): + """ + Test that the default locales for compute databases is C.UTF-8. + """ + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + + domain_locales = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT current_setting('lc_messages') AS lc_messages," + + "current_setting('lc_monetary') AS lc_monetary," + + "current_setting('lc_numeric') AS lc_numeric," + + "current_setting('lc_time') AS lc_time" + )[0], + ) + for dl in domain_locales: + assert dl == "C.UTF-8" + + # Postgres 15 added the locale providers + if env.pg_version < PgVersion.V15: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + + datcollate = results[0] + datctype = results[1] + else: + results = cast( + "Sequence[str]", + endpoint.safe_psql( + "SELECT datlocprovider, datcollate, datctype FROM pg_database WHERE datname = current_database()" + )[0], + ) + datlocprovider = results[0] + datcollate = results[1] + datctype = results[2] + + if env.pg_version >= PgVersion.V17: + assert datlocprovider == "b", "The locale provider is not builtin" + else: + assert datlocprovider == "c", "The locale provider is not libc" + + assert datcollate == "C.UTF-8" + assert datctype == "C.UTF-8" diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index 05b6ad8a9b..c8e51fde13 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -64,6 +64,7 @@ def test_wal_restore( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() @@ -127,6 +128,7 @@ def test_wal_restore_initdb( ), str(data_dir), str(port), + env.pg_version, ] ) restored.start() From ecca62a45dfa0be134c41c113adc8e2519b827af Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 8 Nov 2024 13:44:00 -0500 Subject: [PATCH 199/239] feat(pageserver): more log lines around frozen layers (#9697) We saw pageserver OOMs https://github.com/neondatabase/cloud/issues/19715 for tenants doing large writes. Add log lines around in-memory layers to hopefully collect some info during my on-call shift next week. ## Summary of changes * Estimate in-memory size of an in-mem layer. * Print frozen layer number if there are too many layers accumulated in memory. 
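The new warning condition can be summarized as a small sketch of the same logic, with the default compaction threshold assumed to be 10:

```python
# Sketch of the warning condition added to the flush loop (default threshold assumed).
DEFAULT_COMPACTION_THRESHOLD = 10      # assumed pageserver default
FROZEN_SIZE_WARN_BYTES = 64_000_000    # ~64 MB threshold from the patch


def should_warn(num_frozen_layers: int, frozen_total_bytes: int, compaction_threshold: int) -> bool:
    return (
        num_frozen_layers > max(compaction_threshold, DEFAULT_COMPACTION_THRESHOLD)
        and frozen_total_bytes >= FROZEN_SIZE_WARN_BYTES
    )


# A backlog of 12 frozen layers totalling 128 MB trips the warning; 5 layers do not.
assert should_warn(12, 128_000_000, compaction_threshold=10)
assert not should_warn(5, 128_000_000, compaction_threshold=10)
```

The estimate itself is accumulated on each write from the sizes of the key, LSN, and index entry, as in the `inmemory_layer.rs` change below.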
--------- Signed-off-by: Alex Chi Z --- .../tenant/storage_layer/inmemory_layer.rs | 13 ++++++++++++ pageserver/src/tenant/timeline.rs | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 2ce26ed2eb..af6112d535 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -67,6 +67,8 @@ pub struct InMemoryLayer { /// The above fields never change, except for `end_lsn`, which is only set once. /// All other changing parts are in `inner`, and protected by a mutex. inner: RwLock, + + estimated_in_mem_size: AtomicU64, } impl std::fmt::Debug for InMemoryLayer { @@ -543,6 +545,10 @@ impl InMemoryLayer { Ok(inner.file.len()) } + pub fn estimated_in_mem_size(&self) -> u64 { + self.estimated_in_mem_size.load(AtomicOrdering::Relaxed) + } + /// Create a new, empty, in-memory layer pub async fn create( conf: &'static PageServerConf, @@ -572,6 +578,7 @@ impl InMemoryLayer { file, resource_units: GlobalResourceUnits::new(), }), + estimated_in_mem_size: AtomicU64::new(0), }) } @@ -642,6 +649,12 @@ impl InMemoryLayer { // because this case is unexpected, and we would like tests to fail if this happens. warn!("Key {} at {} written twice at same LSN", key, lsn); } + self.estimated_in_mem_size.fetch_add( + (std::mem::size_of::() + + std::mem::size_of::() + + std::mem::size_of::()) as u64, + AtomicOrdering::Relaxed, + ); } inner.resource_units.maybe_publish_size(new_size); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4d086df2d1..60cc689c5e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -23,6 +23,7 @@ use handle::ShardTimelineId; use offload::OffloadError; use once_cell::sync::Lazy; use pageserver_api::{ + config::tenant_conf_defaults::DEFAULT_COMPACTION_THRESHOLD, key::{ KEY_SIZE, METADATA_KEY_BEGIN_PREFIX, METADATA_KEY_END_PREFIX, NON_INHERITED_RANGE, NON_INHERITED_SPARSE_RANGE, @@ -3501,18 +3502,37 @@ impl Timeline { let timer = self.metrics.flush_time_histo.start_timer(); + let num_frozen_layers; + let frozen_layer_total_size; let layer_to_flush = { let guard = self.layers.read().await; let Ok(lm) = guard.layer_map() else { info!("dropping out of flush loop for timeline shutdown"); return; }; + num_frozen_layers = lm.frozen_layers.len(); + frozen_layer_total_size = lm + .frozen_layers + .iter() + .map(|l| l.estimated_in_mem_size()) + .sum::(); lm.frozen_layers.front().cloned() // drop 'layers' lock to allow concurrent reads and writes }; let Some(layer_to_flush) = layer_to_flush else { break Ok(()); }; + if num_frozen_layers + > std::cmp::max( + self.get_compaction_threshold(), + DEFAULT_COMPACTION_THRESHOLD, + ) + && frozen_layer_total_size >= /* 64 MB */ 64000000 + { + tracing::warn!( + "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", + ); + } match self.flush_frozen_layer(layer_to_flush, ctx).await { Ok(this_layer_to_lsn) => { flushed_to_lsn = std::cmp::max(flushed_to_lsn, this_layer_to_lsn); From ab47804d000addd668e2583275bfdeb8209502e4 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 8 Nov 2024 20:25:31 +0100 Subject: [PATCH 200/239] safekeeper: remove unused `WriteGuardSharedState::skip_update` (#9699) --- safekeeper/src/timeline.rs | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/safekeeper/src/timeline.rs 
b/safekeeper/src/timeline.rs index fa91241177..85add6bfea 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -108,16 +108,11 @@ pub type ReadGuardSharedState<'a> = RwLockReadGuard<'a, SharedState>; pub struct WriteGuardSharedState<'a> { tli: Arc, guard: RwLockWriteGuard<'a, SharedState>, - skip_update: bool, } impl<'a> WriteGuardSharedState<'a> { fn new(tli: Arc, guard: RwLockWriteGuard<'a, SharedState>) -> Self { - WriteGuardSharedState { - tli, - guard, - skip_update: false, - } + WriteGuardSharedState { tli, guard } } } @@ -159,12 +154,10 @@ impl Drop for WriteGuardSharedState<'_> { } }); - if !self.skip_update { - // send notification about shared state update - self.tli.shared_state_version_tx.send_modify(|old| { - *old += 1; - }); - } + // send notification about shared state update + self.tli.shared_state_version_tx.send_modify(|old| { + *old += 1; + }); } } From af8238ae52aaf81cb02fdc246f9a7914538ded7d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 8 Nov 2024 14:28:55 -0500 Subject: [PATCH 201/239] fix(pageserver): drain upload queue before offloading timeline (#9682) It is possible at the point we shutdown the timeline, there are still layer files we did not upload. ## Summary of changes * If the queue is not empty, avoid offloading. * Shutdown the timeline gracefully using the flush mode to ensure all local files are uploaded before deleting the timeline directory. --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 4 ++-- pageserver/src/tenant.rs | 3 ++- pageserver/src/tenant/timeline.rs | 10 +++++++--- pageserver/src/tenant/timeline/offload.rs | 2 +- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index d57bd98e95..dde9c5dd0b 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2002,9 +2002,9 @@ async fn timeline_offload_handler( "timeline has attached children".into(), )); } - if !timeline.can_offload() { + if let (false, reason) = timeline.can_offload() { return Err(ApiError::PreconditionFailed( - "Timeline::can_offload() returned false".into(), + format!("Timeline::can_offload() check failed: {}", reason) .into(), )); } offload_timeline(&tenant, &timeline) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 34ea6dae1f..903174680e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2493,7 +2493,8 @@ impl Tenant { timelines_to_compact_or_offload = timelines .iter() .filter_map(|(timeline_id, timeline)| { - let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); + let (is_active, (can_offload, _)) = + (timeline.is_active(), timeline.can_offload()); let has_no_unoffloaded_children = { !timelines .iter() diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 60cc689c5e..56faacbaee 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1570,12 +1570,16 @@ impl Timeline { /// /// This is neccessary but not sufficient for offloading of the timeline as it might have /// child timelines that are not offloaded yet. 
- pub(crate) fn can_offload(&self) -> bool { + pub(crate) fn can_offload(&self) -> (bool, &'static str) { if self.remote_client.is_archived() != Some(true) { - return false; + return (false, "the timeline is not archived"); + } + if !self.remote_client.no_pending_work() { + // if the remote client is still processing some work, we can't offload + return (false, "the upload queue is not drained yet"); } - true + (true, "ok") } /// Outermost timeline compaction operation; downloads needed layers. Returns whether we have pending diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs index 2dc461c28d..1394843467 100644 --- a/pageserver/src/tenant/timeline/offload.rs +++ b/pageserver/src/tenant/timeline/offload.rs @@ -58,7 +58,7 @@ pub(crate) async fn offload_timeline( } // Now that the Timeline is in Stopping state, request all the related tasks to shut down. - timeline.shutdown(super::ShutdownMode::Hard).await; + timeline.shutdown(super::ShutdownMode::Flush).await; // TODO extend guard mechanism above with method // to make deletions possible while offloading is in progress From ecde8d763257703f143e3fd74c024fc73ff9f13f Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 8 Nov 2024 14:43:15 -0600 Subject: [PATCH 202/239] Improve type safety according to pyright Pyright found many issues that mypy doesn't seem to want to catch or mypy isn't configured to catch. Signed-off-by: Tristan Partin --- test_runner/fixtures/benchmark_fixture.py | 6 ++++ test_runner/fixtures/compare_fixtures.py | 9 +++-- test_runner/fixtures/h2server.py | 36 +++++++++++++------ test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/fixtures/pageserver/http.py | 2 +- test_runner/fixtures/pageserver/utils.py | 4 +++ test_runner/fixtures/paths.py | 4 +-- test_runner/performance/test_copy.py | 5 ++- .../regress/test_pageserver_generations.py | 1 + test_runner/regress/test_proxy_websockets.py | 2 +- test_runner/regress/test_sharding.py | 1 + .../regress/test_storage_controller.py | 2 ++ test_runner/regress/test_storage_scrubber.py | 7 +++- test_runner/regress/test_tenant_size.py | 3 +- .../regress/test_threshold_based_eviction.py | 1 + test_runner/regress/test_wal_acceptor.py | 13 +++---- 16 files changed, 67 insertions(+), 31 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index 74fe39ef53..d3419bd8b1 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -80,7 +80,13 @@ class PgBenchRunResult: ): stdout_lines = stdout.splitlines() + number_of_clients = 0 + number_of_threads = 0 + number_of_transactions_actually_processed = 0 + latency_average = 0.0 latency_stddev = None + tps = 0.0 + scale = 0 # we know significant parts of these values from test input # but to be precise take them from output diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 2195ae8225..85b6e7a3b8 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -8,7 +8,7 @@ from contextlib import _GeneratorContextManager, contextmanager # Type-related stuff from pathlib import Path -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, final import pytest from _pytest.fixtures import FixtureRequest @@ -70,12 +70,12 @@ class PgCompare(ABC): @contextmanager @abstractmethod - def record_pageserver_writes(self, out_name: str): + def record_pageserver_writes(self, out_name: str) -> 
Iterator[None]: pass @contextmanager @abstractmethod - def record_duration(self, out_name: str): + def record_duration(self, out_name: str) -> Iterator[None]: pass @contextmanager @@ -105,6 +105,7 @@ class PgCompare(ABC): return results +@final class NeonCompare(PgCompare): """PgCompare interface for the neon stack.""" @@ -206,6 +207,7 @@ class NeonCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +@final class VanillaCompare(PgCompare): """PgCompare interface for vanilla postgres.""" @@ -271,6 +273,7 @@ class VanillaCompare(PgCompare): return self.zenbenchmark.record_duration(out_name) +@final class RemoteCompare(PgCompare): """PgCompare interface for a remote postgres instance.""" diff --git a/test_runner/fixtures/h2server.py b/test_runner/fixtures/h2server.py index 92783e1fb2..e890b2bcf1 100644 --- a/test_runner/fixtures/h2server.py +++ b/test_runner/fixtures/h2server.py @@ -4,11 +4,14 @@ https://python-hyper.org/projects/hyper-h2/en/stable/asyncio-example.html auth-broker -> local-proxy needs a h2 connection, so we need a h2 server :) """ +from __future__ import annotations + import asyncio import collections import io import json from collections.abc import AsyncIterable +from typing import TYPE_CHECKING, final import pytest_asyncio from h2.config import H2Configuration @@ -25,34 +28,45 @@ from h2.events import ( ) from h2.exceptions import ProtocolError, StreamClosedError from h2.settings import SettingCodes +from typing_extensions import override + +if TYPE_CHECKING: + from typing import Any, Optional + RequestData = collections.namedtuple("RequestData", ["headers", "data"]) +@final class H2Server: - def __init__(self, host, port) -> None: + def __init__(self, host: str, port: int) -> None: self.host = host self.port = port +@final class H2Protocol(asyncio.Protocol): def __init__(self): config = H2Configuration(client_side=False, header_encoding="utf-8") self.conn = H2Connection(config=config) - self.transport = None - self.stream_data = {} - self.flow_control_futures = {} + self.transport: Optional[asyncio.Transport] = None + self.stream_data: dict[int, RequestData] = {} + self.flow_control_futures: dict[int, asyncio.Future[Any]] = {} - def connection_made(self, transport: asyncio.Transport): # type: ignore[override] + @override + def connection_made(self, transport: asyncio.BaseTransport): + assert isinstance(transport, asyncio.Transport) self.transport = transport self.conn.initiate_connection() self.transport.write(self.conn.data_to_send()) - def connection_lost(self, _exc): + @override + def connection_lost(self, exc: Optional[Exception]): for future in self.flow_control_futures.values(): future.cancel() self.flow_control_futures = {} + @override def data_received(self, data: bytes): assert self.transport is not None try: @@ -77,7 +91,7 @@ class H2Protocol(asyncio.Protocol): self.window_updated(event.stream_id, event.delta) elif isinstance(event, RemoteSettingsChanged): if SettingCodes.INITIAL_WINDOW_SIZE in event.changed_settings: - self.window_updated(None, 0) + self.window_updated(0, 0) self.transport.write(self.conn.data_to_send()) @@ -123,7 +137,7 @@ class H2Protocol(asyncio.Protocol): else: stream_data.data.write(data) - def stream_reset(self, stream_id): + def stream_reset(self, stream_id: int): """ A stream reset was sent. Stop sending data. 
""" @@ -131,7 +145,7 @@ class H2Protocol(asyncio.Protocol): future = self.flow_control_futures.pop(stream_id) future.cancel() - async def send_data(self, data, stream_id): + async def send_data(self, data: bytes, stream_id: int): """ Send data according to the flow control rules. """ @@ -161,7 +175,7 @@ class H2Protocol(asyncio.Protocol): self.transport.write(self.conn.data_to_send()) data = data[chunk_size:] - async def wait_for_flow_control(self, stream_id): + async def wait_for_flow_control(self, stream_id: int): """ Waits for a Future that fires when the flow control window is opened. """ @@ -169,7 +183,7 @@ class H2Protocol(asyncio.Protocol): self.flow_control_futures[stream_id] = f await f - def window_updated(self, stream_id, delta): + def window_updated(self, stream_id: int, delta): """ A window update frame was received. Unblock some number of flow control Futures. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 83c68794c3..79baa8a32d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1857,7 +1857,7 @@ class NeonStorageController(MetricsGetter, LogUtils): shard_count: Optional[int] = None, shard_stripe_size: Optional[int] = None, tenant_config: Optional[dict[Any, Any]] = None, - placement_policy: Optional[Union[dict[Any, Any] | str]] = None, + placement_policy: Optional[Union[dict[Any, Any], str]] = None, ): """ Use this rather than pageserver_api() when you need to include shard parameters diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 57a5d6875e..d1a9b5921a 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -316,7 +316,7 @@ class PageserverHttpClient(requests.Session, MetricsGetter): def tenant_location_conf( self, tenant_id: Union[TenantId, TenantShardId], - location_conf=dict[str, Any], + location_conf: dict[str, Any], flush_ms=None, lazy: Optional[bool] = None, ): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 4c4306be9e..ac7497ee6c 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -56,6 +56,8 @@ def wait_for_upload( lsn: Lsn, ): """waits for local timeline upload up to specified lsn""" + + current_lsn = Lsn(0) for i in range(20): current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: @@ -203,6 +205,8 @@ def wait_for_last_record_lsn( lsn: Lsn, ) -> Lsn: """waits for pageserver to catch up to a certain lsn, returns the last observed lsn.""" + + current_lsn = Lsn(0) for i in range(1000): current_lsn = last_record_lsn(pageserver_http, tenant, timeline) if current_lsn >= lsn: diff --git a/test_runner/fixtures/paths.py b/test_runner/fixtures/paths.py index d950f2356d..60221573eb 100644 --- a/test_runner/fixtures/paths.py +++ b/test_runner/fixtures/paths.py @@ -112,7 +112,7 @@ def compatibility_snapshot_dir() -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_neon_binpath() -> Optional[Iterator[Path]]: +def compatibility_neon_binpath() -> Iterator[Optional[Path]]: if os.getenv("REMOTE_ENV"): return comp_binpath = None @@ -133,7 +133,7 @@ def pg_distrib_dir(base_dir: Path) -> Iterator[Path]: @pytest.fixture(scope="session") -def compatibility_pg_distrib_dir() -> Optional[Iterator[Path]]: +def compatibility_pg_distrib_dir() -> Iterator[Optional[Path]]: compat_distrib_dir = None if env_compat_postgres_bin := 
os.environ.get("COMPATIBILITY_POSTGRES_DISTRIB_DIR"): compat_distrib_dir = Path(env_compat_postgres_bin).resolve() diff --git a/test_runner/performance/test_copy.py b/test_runner/performance/test_copy.py index 743604a381..d571fab6b5 100644 --- a/test_runner/performance/test_copy.py +++ b/test_runner/performance/test_copy.py @@ -2,11 +2,13 @@ from __future__ import annotations from contextlib import closing from io import BufferedReader, RawIOBase -from typing import Optional +from typing import Optional, final from fixtures.compare_fixtures import PgCompare +from typing_extensions import override +@final class CopyTestData(RawIOBase): def __init__(self, rows: int): self.rows = rows @@ -14,6 +16,7 @@ class CopyTestData(RawIOBase): self.linebuf: Optional[bytes] = None self.ptr = 0 + @override def readable(self): return True diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 11ebb81023..8f6c9f16fd 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -656,6 +656,7 @@ def test_upgrade_generationless_local_file_paths( workload.write_rows(1000) attached_pageserver = env.get_tenant_pageserver(tenant_id) + assert attached_pageserver is not None secondary_pageserver = list([ps for ps in env.pageservers if ps.id != attached_pageserver.id])[ 0 ] diff --git a/test_runner/regress/test_proxy_websockets.py b/test_runner/regress/test_proxy_websockets.py index 071ca7c54e..ea01252ce4 100644 --- a/test_runner/regress/test_proxy_websockets.py +++ b/test_runner/regress/test_proxy_websockets.py @@ -37,7 +37,7 @@ async def test_websockets(static_proxy: NeonProxy): startup_message.extend(b"\0") length = (4 + len(startup_message)).to_bytes(4, byteorder="big") - await websocket.send([length, startup_message]) + await websocket.send([length, bytes(startup_message)]) startup_response = await websocket.recv() assert isinstance(startup_response, bytes) diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 3a249bbdb4..ec633e352c 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -256,6 +256,7 @@ def test_sharding_split_compaction( # Cleanup part 1: while layers are still in PITR window, we should only drop layers that are fully redundant for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None # Invoke compaction: this should drop any layers that don't overlap with the shard's key stripes detail_before = ps.http_client().timeline_detail(shard, timeline_id) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index c8de292588..a069e0d01c 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1237,6 +1237,7 @@ def test_storage_controller_tenant_deletion( # Assert attachments all have local content for shard_id in shard_ids: pageserver = env.get_tenant_pageserver(shard_id) + assert pageserver is not None assert pageserver.tenant_dir(shard_id).exists() # Assert all shards have some content in remote storage @@ -2745,6 +2746,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB # Upload but don't compact origin_pageserver = env.get_tenant_pageserver(tenant_id) + assert origin_pageserver is not None dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] origin_pageserver.http_client().timeline_checkpoint( 
tenant_id, timeline_id, wait_until_uploaded=True, compact=False diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 05db0fe977..11ad2173ae 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -245,6 +245,7 @@ def test_scrubber_physical_gc_ancestors( workload.write_rows(100, upload=False) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None log.info(f"Waiting for shard {shard} on pageserver {ps.id}") ps.http_client().timeline_checkpoint( shard, timeline_id, compact=False, wait_until_uploaded=True @@ -270,6 +271,7 @@ def test_scrubber_physical_gc_ancestors( workload.churn_rows(100) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None ps.http_client().timeline_compact(shard, timeline_id, force_image_layer_creation=True) ps.http_client().timeline_gc(shard, timeline_id, 0) @@ -336,12 +338,15 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder # Issue a deletion queue flush so that the parent shard can't leave behind layers # that will look like unexpected garbage to the scrubber - env.get_tenant_pageserver(tenant_id).http_client().deletion_queue_flush(execute=True) + ps = env.get_tenant_pageserver(tenant_id) + assert ps is not None + ps.http_client().deletion_queue_flush(execute=True) new_shard_count = 4 shards = env.storage_controller.tenant_shard_split(tenant_id, shard_count=new_shard_count) for shard in shards: ps = env.get_tenant_pageserver(shard) + assert ps is not None log.info(f"Waiting for shard {shard} on pageserver {ps.id}") ps.http_client().timeline_checkpoint( shard, timeline_id, compact=False, wait_until_uploaded=True diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index b41f1709bd..0c431fa453 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -315,6 +315,7 @@ def test_single_branch_get_tenant_size_grows( tenant_id: TenantId, timeline_id: TimelineId, ) -> tuple[Lsn, int]: + size = 0 consistent = False size_debug = None @@ -360,7 +361,7 @@ def test_single_branch_get_tenant_size_grows( collected_responses.append(("CREATE", current_lsn, size)) batch_size = 100 - + prev_size = 0 for i in range(3): with endpoint.cursor() as cur: cur.execute( diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 5f211ec4d4..68e9385035 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -146,6 +146,7 @@ def test_threshold_based_eviction( out += [f" {remote} {layer.layer_file_name}"] return "\n".join(out) + stable_for: float = 0 observation_window = 8 * eviction_threshold consider_stable_when_no_change_for_seconds = 3 * eviction_threshold poll_interval = eviction_threshold / 3 diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 157390c01c..e224d5eb01 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -1506,15 +1506,10 @@ class SafekeeperEnv: port=port.http, auth_token=None, ) - try: - safekeeper_process = start_in_background( - cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status - ) - return safekeeper_process - except Exception as e: - log.error(e) - safekeeper_process.kill() - raise Exception(f"Failed to start safekepeer as {cmd}, 
reason: {e}") from e + safekeeper_process = start_in_background( + cmd, safekeeper_dir, "safekeeper.log", safekeeper_client.check_status + ) + return safekeeper_process def get_safekeeper_connstrs(self): assert self.safekeepers is not None, "safekeepers are not initialized" From 2fcac0e66b1e4c5b6fb7adb01793c49850191f93 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Sat, 9 Nov 2024 01:02:54 +0000 Subject: [PATCH 203/239] CI(pre-merge-checks): add required checks (#9700) ## Problem The Merge queue doesn't work because it expects certain jobs, which we don't have in the `pre-merge-checks` workflow. But it turns out we can just create jobs/checks with the same names in any workflow that we run. ## Summary of changes - Add `conclusion` jobs - Create `neon-cloud-e2e` status check - Add a bunch of `if`s to handle cases with no relevant changes found and prepare the workflow to run rust checks in the future - List the workflow in `report-workflow-stats` to collect stats about it --- .github/workflows/pre-merge-checks.yml | 61 ++++++++++++++++++--- .github/workflows/report-workflow-stats.yml | 1 + 2 files changed, 55 insertions(+), 7 deletions(-) diff --git a/.github/workflows/pre-merge-checks.yml b/.github/workflows/pre-merge-checks.yml index 40ce644eb6..137faa7abc 100644 --- a/.github/workflows/pre-merge-checks.yml +++ b/.github/workflows/pre-merge-checks.yml @@ -1,10 +1,14 @@ -name: +name: Pre-merge checks on: merge_group: branches: - main +defaults: + run: + shell: bash -euxo pipefail {0} + # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -12,11 +16,11 @@ jobs: get-changed-files: runs-on: ubuntu-22.04 outputs: - any_changed: ${{ steps.src.outputs.any_changed }} + python-changed: ${{ steps.python-src.outputs.any_changed }} steps: - uses: actions/checkout@v4 - - uses: tj-actions/changed-files@c3a1bb2c992d77180ae65be6ae6c166cf40f857c # v45.0.3 - id: src + - uses: tj-actions/changed-files@4edd678ac3f81e2dc578756871e4d00c19191daf # v45.0.4 + id: python-src with: files: | .github/workflows/pre-merge-checks.yml @@ -26,10 +30,13 @@ jobs: - name: PRINT ALL CHANGED FILES FOR DEBUG PURPOSES env: - ALL_CHANGED_FILES: ${{ steps.src.outputs.all_changed_files }} - run: echo "${ALL_CHANGED_FILES}" + PYTHON_CHANGED_FILES: ${{ steps.python-src.outputs.all_changed_files }} + run: | + echo "${PYTHON_CHANGED_FILES}" check-build-tools-image: + if: needs.get-changed-files.outputs.python-changed == 'true' + needs: [ get-changed-files ] uses: ./.github/workflows/check-build-tools-image.yml build-build-tools-image: @@ -40,8 +47,48 @@ jobs: secrets: inherit check-codestyle-python: - needs: [ build-build-tools-image ] + if: needs.get-changed-files.outputs.python-changed == 'true' + needs: [ get-changed-files, build-build-tools-image ] uses: ./.github/workflows/_check-codestyle-python.yml with: build-tools-image: ${{ needs.build-build-tools-image.outputs.image }}-bookworm secrets: inherit + + # To get items from the merge queue merged into main we need to satisfy "Status checks that are required". 
+ # Currently we require 2 jobs (checks with exact name): + # - conclusion + # - neon-cloud-e2e + conclusion: + if: always() + permissions: + statuses: write # for `github.repos.createCommitStatus(...)` + needs: + - get-changed-files + - check-codestyle-python + runs-on: ubuntu-22.04 + steps: + - name: Create fake `neon-cloud-e2e` check + uses: actions/github-script@v7 + with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 + script: | + const { repo, owner } = context.repo; + const targetUrl = `${context.serverUrl}/${owner}/${repo}/actions/runs/${context.runId}`; + + await github.rest.repos.createCommitStatus({ + owner: owner, + repo: repo, + sha: context.sha, + context: `neon-cloud-e2e`, + state: `success`, + target_url: targetUrl, + description: `fake check for merge queue`, + }); + + - name: Fail the job if any of the dependencies do not succeed or skipped + run: exit 1 + if: | + (contains(needs.check-codestyle-python.result, 'skipped') && needs.get-changed-files.outputs.python-changed == 'true') + || contains(needs.*.result, 'failure') + || contains(needs.*.result, 'cancelled') diff --git a/.github/workflows/report-workflow-stats.yml b/.github/workflows/report-workflow-stats.yml index 6abeff7695..0d135a257c 100644 --- a/.github/workflows/report-workflow-stats.yml +++ b/.github/workflows/report-workflow-stats.yml @@ -23,6 +23,7 @@ on: - Test Postgres client libraries - Trigger E2E Tests - cleanup caches by a branch + - Pre-merge checks types: [completed] jobs: From ceaa80ffebca3050e06c6a5d75f184c6e637ef50 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Mon, 11 Nov 2024 09:58:41 +0000 Subject: [PATCH 204/239] storcon: add peer token for peer to peer communication (#9695) ## Problem We wish to stop using admin tokens in the infra repo, but step down requests use the admin token. ## Summary of Changes Introduce a new "ControllerPeer" scope and use it for step-down requests. --- libs/utils/src/auth.rs | 5 +++++ pageserver/src/auth.rs | 3 ++- safekeeper/src/auth.rs | 3 ++- storage_controller/src/http.rs | 2 +- 4 files changed, 10 insertions(+), 3 deletions(-) diff --git a/libs/utils/src/auth.rs b/libs/utils/src/auth.rs index 5bd6f4bedc..f7acc61ac1 100644 --- a/libs/utils/src/auth.rs +++ b/libs/utils/src/auth.rs @@ -40,6 +40,11 @@ pub enum Scope { /// Allows access to storage controller APIs used by the scrubber, to interrogate the state /// of a tenant & post scrub results. Scrubber, + + /// This scope is used for communication with other storage controller instances. + /// At the time of writing, this is only used for the step down request. + #[serde(rename = "controller_peer")] + ControllerPeer, } /// JWT payload. 
See docs/authentication.md for the format diff --git a/pageserver/src/auth.rs b/pageserver/src/auth.rs index 5c931fcfdb..4075427ab4 100644 --- a/pageserver/src/auth.rs +++ b/pageserver/src/auth.rs @@ -19,7 +19,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::SafekeeperData | Scope::GenerationsApi | Scope::Infra - | Scope::Scrubber, + | Scope::Scrubber + | Scope::ControllerPeer, _, ) => Err(AuthError( format!( diff --git a/safekeeper/src/auth.rs b/safekeeper/src/auth.rs index fdd0830b02..81c79fae30 100644 --- a/safekeeper/src/auth.rs +++ b/safekeeper/src/auth.rs @@ -20,7 +20,8 @@ pub fn check_permission(claims: &Claims, tenant_id: Option) -> Result< | Scope::PageServerApi | Scope::GenerationsApi | Scope::Infra - | Scope::Scrubber, + | Scope::Scrubber + | Scope::ControllerPeer, _, ) => Err(AuthError( format!( diff --git a/storage_controller/src/http.rs b/storage_controller/src/http.rs index f6ea1aedc6..9b5d4caf31 100644 --- a/storage_controller/src/http.rs +++ b/storage_controller/src/http.rs @@ -1033,7 +1033,7 @@ async fn handle_update_preferred_azs(req: Request) -> Result) -> Result, ApiError> { - check_permissions(&req, Scope::Admin)?; + check_permissions(&req, Scope::ControllerPeer)?; let req = match maybe_forward(req).await { ForwardOutcome::Forwarded(res) => { From f510647c7e97432adf31b301cb596e76a2213077 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Nov 2024 12:42:32 +0000 Subject: [PATCH 205/239] CI: retry `actions/github-script` for 5XX errors (#9703) ## Problem GitHub API can return error 500, and it fails jobs that use `actions/github-script` action. ## Summary of changes - Add `retry: 500` to all `actions/github-script` usage --- .github/actions/allure-report-generate/action.yml | 2 ++ .github/workflows/build_and_test.yml | 2 ++ .github/workflows/neon_extra_builds.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index 2bdb727719..16b6e71498 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -221,6 +221,8 @@ runs: REPORT_URL: ${{ steps.generate-report.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL, COMMIT_SHA } = process.env diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index d415e20db8..cc6f91d28e 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -497,6 +497,8 @@ jobs: REPORT_URL_NEW: ${{ steps.upload-coverage-report-new.outputs.report-url }} COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL_NEW, COMMIT_SHA } = process.env diff --git a/.github/workflows/neon_extra_builds.yml b/.github/workflows/neon_extra_builds.yml index 287c9ea281..cd5a665402 100644 --- a/.github/workflows/neon_extra_builds.yml +++ b/.github/workflows/neon_extra_builds.yml @@ -201,6 +201,8 @@ jobs: REPORT_URL: ${{ steps.upload-stats.outputs.report-url }} SHA: ${{ github.event.pull_request.head.sha || github.sha }} with: + # Retry script for 5XX server errors: https://github.com/actions/github-script#retries + retries: 5 script: | const { REPORT_URL, SHA } = 
process.env From 48c06d9f7b7a87fe7cd97bc83b5300f38bf8011e Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:13:46 -0500 Subject: [PATCH 206/239] fix(pageserver): increase frozen layer warning threshold; ignore in tests (#9705) Perf benchmarks produce a lot of layers. ## Summary of changes Bumping the threshold and ignore the warning. --------- Signed-off-by: Alex Chi Z --- pageserver/src/tenant/timeline.rs | 2 +- test_runner/fixtures/pageserver/allowed_errors.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 56faacbaee..09ddb19765 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3531,7 +3531,7 @@ impl Timeline { self.get_compaction_threshold(), DEFAULT_COMPACTION_THRESHOLD, ) - && frozen_layer_total_size >= /* 64 MB */ 64000000 + && frozen_layer_total_size >= /* 128 MB */ 128000000 { tracing::warn!( "too many frozen layers: {num_frozen_layers} layers with estimated in-mem size of {frozen_layer_total_size} bytes", diff --git a/test_runner/fixtures/pageserver/allowed_errors.py b/test_runner/fixtures/pageserver/allowed_errors.py index fa85563e35..d05704c8e0 100755 --- a/test_runner/fixtures/pageserver/allowed_errors.py +++ b/test_runner/fixtures/pageserver/allowed_errors.py @@ -93,6 +93,8 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = ( ".*WARN.*path=/v1/utilization .*request was dropped before completing", # Can happen during shutdown ".*scheduling deletion on drop failed: queue is in state Stopped.*", + # Too many frozen layers error is normal during intensive benchmarks + ".*too many frozen layers.*", ) From 54a16766803046a691141d3f11778d70df1c3fda Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 09:19:03 -0500 Subject: [PATCH 207/239] rfc: update aux file rfc to reflect latest optimizations (#9681) Reflects https://github.com/neondatabase/neon/pull/9631 in the RFC. Signed-off-by: Alex Chi Z --- docs/rfcs/038-aux-file-v2.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/rfcs/038-aux-file-v2.md b/docs/rfcs/038-aux-file-v2.md index 9c3c336008..dc8c5d8fc4 100644 --- a/docs/rfcs/038-aux-file-v2.md +++ b/docs/rfcs/038-aux-file-v2.md @@ -91,7 +91,7 @@ generating the basebackup by scanning the `REPL_ORIGIN_KEY_PREFIX` keyspace. There are two places we need to read the aux files from the pageserver: * On the write path, when the compute node adds an aux file to the pageserver, we will retrieve the key from the storage, append the file to the hashed key, and write it back. The current `get` API already supports that. -* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API will attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. +* We use the vectored get API to retrieve all aux files during generating the basebackup. Because we need to scan a sparse keyspace, we slightly modified the vectored get path. The vectorized API used to always attempt to retrieve every single key within the requested key range, and therefore, we modified it in a way that keys within `NON_INHERITED_SPARSE_RANGE` will not trigger missing key error. 
Furthermore, as aux file reads usually need all layer files intersecting with that key range within the branch and cover a big keyspace, it incurs large overhead for tracking keyspaces that have not been read. Therefore, for sparse keyspaces, we [do not track](https://github.com/neondatabase/neon/pull/9631) `ummapped_keyspace`. ## Compaction and Image Layer Generation From f63de5f5274ff86a478bfc8a1a00450d896d5ca6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 11 Nov 2024 17:55:50 +0100 Subject: [PATCH 208/239] safekeeper: add `initialize_segment` variant of `safekeeper_wal_storage_operation_seconds` (#9691) ## Problem We don't have a metric capturing the latency of segment initialization. This can be significant due to fsyncs. ## Summary of changes Add an `initialize_segment` variant of `safekeeper_wal_storage_operation_seconds`. --- safekeeper/src/metrics.rs | 2 +- safekeeper/src/wal_storage.rs | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index bb56e923f8..bbd2f86898 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -55,7 +55,7 @@ pub static WRITE_WAL_SECONDS: Lazy = Lazy::new(|| { pub static FLUSH_WAL_SECONDS: Lazy = Lazy::new(|| { register_histogram!( "safekeeper_flush_wal_seconds", - "Seconds spent syncing WAL to a disk", + "Seconds spent syncing WAL to a disk (excluding segment initialization)", DISK_FSYNC_SECONDS_BUCKETS.to_vec() ) .expect("Failed to register safekeeper_flush_wal_seconds histogram") diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 4e67940c51..11f372bceb 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -257,6 +257,9 @@ impl PhysicalStorage { // Try to open existing partial file Ok((file, true)) } else { + let _timer = WAL_STORAGE_OPERATION_SECONDS + .with_label_values(&["initialize_segment"]) + .start_timer(); // Create and fill new partial file // // We're using fdatasync during WAL writing, so file size must not @@ -274,8 +277,6 @@ impl PhysicalStorage { }); file.set_len(self.wal_seg_size as u64).await?; - // Note: this doesn't get into observe_flush_seconds metric. But - // segment init should be separate metric, if any. if let Err(e) = durable_rename(&tmp_path, &wal_file_partial_path, !self.no_sync).await { // Probably rename succeeded, but fsync of it failed. Remove // the file then to avoid using it. From 1aab34715a699e8532c49caa2bf1010e64f09a71 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Nov 2024 17:01:02 +0000 Subject: [PATCH 209/239] Remove checklist from the PR template (#9702) ## Problem Once we enable the merge queue for the `main` branch, it won't be possible to adjust the commit message right after pressing the "Squash and merge" button and the PR title + description will be used as is. To avoid extra noise in the commits in the `main` with the checklist leftovers, I propose removing the checklist from the PR template and keeping only the Problem / Summary of changes. ## Summary of changes - Remove the checklist from the PR template --- .github/pull_request_template.md | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 22c025dd89..89328f20ee 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -1,14 +1,3 @@ ## Problem ## Summary of changes - -## Checklist before requesting a review - -- [ ] I have performed a self-review of my code. 
-- [ ] If it is a core feature, I have added thorough tests. -- [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? -- [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. - -## Checklist before merging - -- [ ] Do not forget to reformat commit message to not include the above checklist From 8db84d99643b1c668c935a68610be59e8326ba63 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Mon, 11 Nov 2024 18:51:15 +0100 Subject: [PATCH 210/239] new ingest benchmark (#9711) ## Problem We have no specific benchmark testing project migration of postgresql project with existing data into Neon. Typical steps of such a project migration are - schema creation in the neon project - initial COPY of relations - creation of indexes and constraints - vacuum analyze ## Summary of changes Add a periodic benchmark running 9 AM UTC every day. In each run: - copy a 200 GiB project that has realistic schema, data, tables, indexes and constraints from another project into - a new Neon project (7 CU fixed) - an existing tenant, (but new branch and new database) that already has 4 TiB of data - use pgcopydb tool to automate all steps and parallelize COPY and index creation - parse pgcopydb output and report performance metrics in Neon performance test database ## Logs This benchmark has been tested first manually and then as part of benchmarking.yml workflow, example run see https://github.com/neondatabase/neon/actions/runs/11757679870 --- .github/actionlint.yml | 1 + .github/workflows/ingest_benchmark.yml | 372 +++++++++++++++++++++++++ 2 files changed, 373 insertions(+) create mode 100644 .github/workflows/ingest_benchmark.yml diff --git a/.github/actionlint.yml b/.github/actionlint.yml index 1b602883c5..29c4d18f4a 100644 --- a/.github/actionlint.yml +++ b/.github/actionlint.yml @@ -20,3 +20,4 @@ config-variables: - REMOTE_STORAGE_AZURE_REGION - SLACK_UPCOMING_RELEASE_CHANNEL_ID - DEV_AWS_OIDC_ROLE_ARN + - BENCHMARK_INGEST_TARGET_PROJECTID diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml new file mode 100644 index 0000000000..d770bb2bb5 --- /dev/null +++ b/.github/workflows/ingest_benchmark.yml @@ -0,0 +1,372 @@ +name: Benchmarking + +on: + # uncomment to run on push for debugging your PR + # push: + # branches: [ your branch ] + schedule: + # * is a special character in YAML so you have to quote this string + # ┌───────────── minute (0 - 59) + # │ ┌───────────── hour (0 - 23) + # │ │ ┌───────────── day of the month (1 - 31) + # │ │ │ ┌───────────── month (1 - 12 or JAN-DEC) + # │ │ │ │ ┌───────────── day of the week (0 - 6 or SUN-SAT) + - cron: '0 9 * * *' # run once a day, timezone is utc + workflow_dispatch: # adds ability to run this manually + +defaults: + run: + shell: bash -euxo pipefail {0} + +concurrency: + # Allow only one workflow globally because we need dedicated resources which only exist once + group: ingest-bench-workflow + cancel-in-progress: true + +jobs: + ingest: + strategy: + matrix: + target_project: [new_empty_project, large_existing_project] + permissions: + contents: write + statuses: write + id-token: write # aws-actions/configure-aws-credentials + env: + PG_CONFIG: /tmp/neon/pg_install/v16/bin/pg_config + PSQL: /tmp/neon/pg_install/v16/bin/psql + PG_16_LIB_PATH: /tmp/neon/pg_install/v16/lib + PGCOPYDB: /pgcopydb/bin/pgcopydb + PGCOPYDB_LIB_PATH: /pgcopydb/lib + runs-on: [ self-hosted, us-east-2, x64 ] + container: + image: 
neondatabase/build-tools:pinned-bookworm + credentials: + username: ${{ secrets.NEON_DOCKERHUB_USERNAME }} + password: ${{ secrets.NEON_DOCKERHUB_PASSWORD }} + options: --init + timeout-minutes: 1440 + + steps: + - uses: actions/checkout@v4 + + - name: Configure AWS credentials # necessary to download artefacts + uses: aws-actions/configure-aws-credentials@v4 + with: + aws-region: eu-central-1 + role-to-assume: ${{ vars.DEV_AWS_OIDC_ROLE_ARN }} + role-duration-seconds: 18000 # 5 hours is currently max associated with IAM role + + - name: Download Neon artifact + uses: ./.github/actions/download + with: + name: neon-${{ runner.os }}-${{ runner.arch }}-release-artifact + path: /tmp/neon/ + prefix: latest + + - name: Create Neon Project + if: ${{ matrix.target_project == 'new_empty_project' }} + id: create-neon-project-ingest-target + uses: ./.github/actions/neon-project-create + with: + region_id: aws-us-east-2 + postgres_version: 16 + compute_units: '[7, 7]' # we want to test large compute here to avoid compute-side bottleneck + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Initialize Neon project and retrieve current backpressure seconds + if: ${{ matrix.target_project == 'new_empty_project' }} + env: + NEW_PROJECT_CONNSTR: ${{ steps.create-neon-project-ingest-target.outputs.dsn }} + NEW_PROJECT_ID: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} + run: | + echo "Initializing Neon project with project_id: ${NEW_PROJECT_ID}" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV + echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + + - name: Create Neon Branch for large tenant + if: ${{ matrix.target_project == 'large_existing_project' }} + id: create-neon-branch-ingest-target + uses: ./.github/actions/neon-branch-create + with: + project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Initialize Neon project and retrieve current backpressure seconds + if: ${{ matrix.target_project == 'large_existing_project' }} + env: + NEW_PROJECT_CONNSTR: ${{ steps.create-neon-branch-ingest-target.outputs.dsn }} + NEW_BRANCH_ID: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} + run: | + echo "Initializing Neon branch with branch_id: ${NEW_BRANCH_ID}" + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + # Extract the part before the database name + base_connstr="${NEW_PROJECT_CONNSTR%/*}" + # Extract the query parameters (if any) after the database name + query_params="${NEW_PROJECT_CONNSTR#*\?}" + # Reconstruct the new connection string + if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + new_connstr="${base_connstr}/neondb?${query_params}" + else + new_connstr="${base_connstr}/neondb" + fi + ${PSQL} "${new_connstr}" -c "drop database ludicrous;" + ${PSQL} "${new_connstr}" -c "CREATE DATABASE ludicrous;" + if [ "$query_params" != "$NEW_PROJECT_CONNSTR" ]; then + NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous?${query_params}" + else + NEW_PROJECT_CONNSTR="${base_connstr}/ludicrous" + fi + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "CREATE EXTENSION IF NOT EXISTS neon; CREATE EXTENSION IF NOT EXISTS neon_utils;" + BACKPRESSURE_TIME_BEFORE_INGEST=$(${PSQL} 
"${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_BEFORE_INGEST=${BACKPRESSURE_TIME_BEFORE_INGEST}" >> $GITHUB_ENV + echo "NEW_PROJECT_CONNSTR=${NEW_PROJECT_CONNSTR}" >> $GITHUB_ENV + + + - name: Create pgcopydb filter file + run: | + cat << EOF > /tmp/pgcopydb_filter.txt + [include-only-table] + public.events + public.emails + public.email_transmissions + public.payments + public.editions + public.edition_modules + public.sp_content + public.email_broadcasts + public.user_collections + public.devices + public.user_accounts + public.lessons + public.lesson_users + public.payment_methods + public.orders + public.course_emails + public.modules + public.users + public.module_users + public.courses + public.payment_gateway_keys + public.accounts + public.roles + public.payment_gateways + public.management + public.event_names + EOF + + - name: Invoke pgcopydb + env: + BENCHMARK_INGEST_SOURCE_CONNSTR: ${{ secrets.BENCHMARK_INGEST_SOURCE_CONNSTR }} + run: | + export LD_LIBRARY_PATH=${PGCOPYDB_LIB_PATH}:${PG_16_LIB_PATH} + export PGCOPYDB_SOURCE_PGURI="${BENCHMARK_INGEST_SOURCE_CONNSTR}" + export PGCOPYDB_TARGET_PGURI="${NEW_PROJECT_CONNSTR}" + export PGOPTIONS="-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7" + ${PG_CONFIG} --bindir + ${PGCOPYDB} --version + ${PGCOPYDB} clone --skip-vacuum --no-owner --no-acl --skip-db-properties --table-jobs 4 \ + --index-jobs 4 --restore-jobs 4 --split-tables-larger-than 10GB --skip-extensions \ + --use-copy-binary --filters /tmp/pgcopydb_filter.txt 2>&1 | tee /tmp/pgcopydb_${{ matrix.target_project }}.log + + # create dummy pgcopydb log to test parsing + # - name: create dummy log for parser test + # run: | + # cat << EOF > /tmp/pgcopydb_${{ matrix.target_project }}.log + # 2024-11-04 18:00:53.433 500861 INFO main.c:136 Running pgcopydb version 0.17.10.g8361a93 from "/usr/lib/postgresql/17/bin/pgcopydb" + # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1225 [SOURCE] Copying database from "postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" + # 2024-11-04 18:00:53.434 500861 INFO cli_common.c:1226 [TARGET] Copying database into "postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60" + # 2024-11-04 18:00:53.442 500861 INFO copydb.c:105 Using work dir "/tmp/pgcopydb" + # 2024-11-04 18:00:53.541 500861 INFO snapshot.c:107 Exported snapshot "00000008-00000033-1" from the source database + # 2024-11-04 18:00:53.556 500865 INFO cli_clone_follow.c:543 STEP 1: fetch source database tables, indexes, and sequences + # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:716 Splitting source candidate tables larger than 10 GB + # 2024-11-04 18:00:54.570 500865 INFO copydb_schema.c:829 Table public.events is 96 GB large which is larger than --split-tables-larger-than 10 GB, and does not have a unique column of type integer: splitting by CTID + # 2024-11-04 18:01:05.538 500865 INFO copydb_schema.c:905 Table public.events is 96 GB large, 10 COPY processes will be used, partitioning on ctid. + # 2024-11-04 18:01:05.564 500865 INFO copydb_schema.c:905 Table public.email_transmissions is 27 GB large, 4 COPY processes will be used, partitioning on id. 
+ # 2024-11-04 18:01:05.584 500865 INFO copydb_schema.c:905 Table public.lessons is 25 GB large, 4 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:905 Table public.lesson_users is 16 GB large, 3 COPY processes will be used, partitioning on id. + # 2024-11-04 18:01:05.605 500865 INFO copydb_schema.c:761 Fetched information for 26 tables (including 4 tables split in 21 partitions total), with an estimated total of 907 million tuples and 175 GB on-disk + # 2024-11-04 18:01:05.687 500865 INFO copydb_schema.c:968 Fetched information for 57 indexes (supporting 25 constraints) + # 2024-11-04 18:01:05.753 500865 INFO sequences.c:78 Fetching information for 24 sequences + # 2024-11-04 18:01:05.903 500865 INFO copydb_schema.c:1122 Fetched information for 4 extensions + # 2024-11-04 18:01:06.178 500865 INFO copydb_schema.c:1538 Found 0 indexes (supporting 0 constraints) in the target database + # 2024-11-04 18:01:06.184 500865 INFO cli_clone_follow.c:584 STEP 2: dump the source database schema (pre/post data) + # 2024-11-04 18:01:06.186 500865 INFO pgcmd.c:468 /usr/lib/postgresql/16/bin/pg_dump -Fc --snapshot 00000008-00000033-1 --section=pre-data --section=post-data --file /tmp/pgcopydb/schema/schema.dump 'postgres://neondb_owner@ep-bitter-shape-w2c1ir0a.us-east-2.aws.neon.build/neondb?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' + # 2024-11-04 18:01:06.952 500865 INFO cli_clone_follow.c:592 STEP 3: restore the pre-data section to the target database + # 2024-11-04 18:01:07.004 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section pre-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/pre-filtered.list /tmp/pgcopydb/schema/schema.dump + # 2024-11-04 18:01:07.438 500874 INFO table-data.c:656 STEP 4: starting 4 table-data COPY processes + # 2024-11-04 18:01:07.451 500877 INFO vacuum.c:139 STEP 8: skipping VACUUM jobs per --skip-vacuum + # 2024-11-04 18:01:07.457 500875 INFO indexes.c:182 STEP 6: starting 4 CREATE INDEX processes + # 2024-11-04 18:01:07.457 500875 INFO indexes.c:183 STEP 7: constraints are built by the CREATE INDEX processes + # 2024-11-04 18:01:07.507 500865 INFO blobs.c:74 Skipping large objects: none found. 
+ # 2024-11-04 18:01:07.509 500865 INFO sequences.c:194 STEP 9: reset sequences values + # 2024-11-04 18:01:07.510 500886 INFO sequences.c:290 Set sequences values on the target database + # 2024-11-04 20:49:00.587 500865 INFO cli_clone_follow.c:608 STEP 10: restore the post-data section to the target database + # 2024-11-04 20:49:00.600 500865 INFO pgcmd.c:1001 /usr/lib/postgresql/16/bin/pg_restore --dbname 'postgres://neondb_owner@ep-icy-union-w25qd5pj.us-east-2.aws.neon.build/ludicrous?sslmode=require&keepalives=1&keepalives_idle=10&keepalives_interval=10&keepalives_count=60' --section post-data --jobs 4 --no-owner --no-acl --use-list /tmp/pgcopydb/schema/post-filtered.list /tmp/pgcopydb/schema/schema.dump + # 2024-11-05 10:50:58.508 500865 INFO cli_clone_follow.c:639 All step are now done, 16h49m elapsed + # 2024-11-05 10:50:58.508 500865 INFO summary.c:3155 Printing summary for 26 tables and 57 indexes + + # OID | Schema | Name | Parts | copy duration | transmitted bytes | indexes | create index duration + # ------+--------+----------------------+-------+---------------+-------------------+---------+---------------------- + # 24654 | public | events | 10 | 1d11h | 878 GB | 1 | 1h41m + # 24623 | public | email_transmissions | 4 | 4h46m | 99 GB | 3 | 2h04m + # 24665 | public | lessons | 4 | 4h42m | 161 GB | 4 | 1m11s + # 24661 | public | lesson_users | 3 | 2h46m | 49 GB | 3 | 39m35s + # 24631 | public | emails | 1 | 34m07s | 10 GB | 2 | 17s + # 24739 | public | payments | 1 | 5m47s | 1848 MB | 4 | 4m40s + # 24681 | public | module_users | 1 | 4m57s | 1610 MB | 3 | 1m50s + # 24694 | public | orders | 1 | 2m50s | 835 MB | 3 | 1m05s + # 24597 | public | devices | 1 | 1m45s | 498 MB | 2 | 40s + # 24723 | public | payment_methods | 1 | 1m24s | 548 MB | 2 | 31s + # 24765 | public | user_collections | 1 | 2m17s | 1005 MB | 2 | 968ms + # 24774 | public | users | 1 | 52s | 291 MB | 4 | 27s + # 24760 | public | user_accounts | 1 | 16s | 172 MB | 3 | 16s + # 24606 | public | edition_modules | 1 | 8s983 | 46 MB | 3 | 4s749 + # 24583 | public | course_emails | 1 | 8s526 | 26 MB | 2 | 996ms + # 24685 | public | modules | 1 | 1s592 | 21 MB | 3 | 1s696 + # 24610 | public | editions | 1 | 2s199 | 7483 kB | 2 | 1s032 + # 24755 | public | sp_content | 1 | 1s555 | 4177 kB | 0 | 0ms + # 24619 | public | email_broadcasts | 1 | 744ms | 2645 kB | 2 | 677ms + # 24590 | public | courses | 1 | 387ms | 1540 kB | 2 | 367ms + # 24704 | public | payment_gateway_keys | 1 | 1s972 | 164 kB | 2 | 27ms + # 24576 | public | accounts | 1 | 58ms | 24 kB | 1 | 14ms + # 24647 | public | event_names | 1 | 32ms | 397 B | 1 | 8ms + # 24716 | public | payment_gateways | 1 | 1s675 | 117 B | 1 | 11ms + # 24748 | public | roles | 1 | 71ms | 173 B | 1 | 8ms + # 24676 | public | management | 1 | 33ms | 40 B | 1 | 19ms + + + # Step Connection Duration Transfer Concurrency + # -------------------------------------------------- ---------- ---------- ---------- ------------ + # Catalog Queries (table ordering, filtering, etc) source 12s 1 + # Dump Schema source 765ms 1 + # Prepare Schema target 466ms 1 + # COPY, INDEX, CONSTRAINTS, VACUUM (wall clock) both 2h47m 12 + # COPY (cumulative) both 7h46m 1225 GB 4 + # CREATE INDEX (cumulative) target 4h36m 4 + # CONSTRAINTS (cumulative) target 8s493 4 + # VACUUM (cumulative) target 0ms 4 + # Reset Sequences both 60ms 1 + # Large Objects (cumulative) (null) 0ms 0 + # Finalize Schema both 14h01m 4 + # -------------------------------------------------- ---------- ---------- ---------- ------------ + 
# Total Wall Clock Duration both 16h49m 20 + + + # EOF + + + - name: show tables sizes and retrieve current backpressure seconds + run: | + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + ${PSQL} "${NEW_PROJECT_CONNSTR}" -c "\dt+" + BACKPRESSURE_TIME_AFTER_INGEST=$(${PSQL} "${NEW_PROJECT_CONNSTR}" -t -c "select backpressure_throttling_time()/1000000;") + echo "BACKPRESSURE_TIME_AFTER_INGEST=${BACKPRESSURE_TIME_AFTER_INGEST}" >> $GITHUB_ENV + + - name: Parse pgcopydb log and report performance metrics + env: + PERF_TEST_RESULT_CONNSTR: ${{ secrets.PERF_TEST_RESULT_CONNSTR }} + run: | + export LD_LIBRARY_PATH=${PG_16_LIB_PATH} + + # Define the log file path + LOG_FILE="/tmp/pgcopydb_${{ matrix.target_project }}.log" + + # Get the current git commit hash + git config --global --add safe.directory /__w/neon/neon + COMMIT_HASH=$(git rev-parse --short HEAD) + + # Define the platform and test suite + PLATFORM="pg16-${{ matrix.target_project }}-us-east-2-staging" + SUIT="pgcopydb_ingest_bench" + + # Function to convert time (e.g., "2h47m", "4h36m", "118ms", "8s493") to seconds + convert_to_seconds() { + local duration=$1 + local total_seconds=0 + + # Check for hours (h) + if [[ "$duration" =~ ([0-9]+)h ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 3600)) + fi + + # Check for seconds (s) + if [[ "$duration" =~ ([0-9]+)s ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0})) + fi + + # Check for milliseconds (ms) (if applicable) + if [[ "$duration" =~ ([0-9]+)ms ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} / 1000)) + duration=${duration/${BASH_REMATCH[0]}/} # need to remove it to avoid double counting with m + fi + + # Check for minutes (m) - must be checked after ms because m is contained in ms + if [[ "$duration" =~ ([0-9]+)m ]]; then + total_seconds=$((total_seconds + ${BASH_REMATCH[1]#0} * 60)) + fi + + echo $total_seconds + } + + # Calculate the backpressure difference in seconds + BACKPRESSURE_TIME_DIFF=$(awk "BEGIN {print $BACKPRESSURE_TIME_AFTER_INGEST - $BACKPRESSURE_TIME_BEFORE_INGEST}") + + # Insert the backpressure time difference into the performance database + if [ -n "$BACKPRESSURE_TIME_DIFF" ]; then + PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" + INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) + VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', 'backpressure_time', ${BACKPRESSURE_TIME_DIFF}, 'seconds', 'lower_is_better', now()); + \"" + echo "Inserting backpressure time difference: ${BACKPRESSURE_TIME_DIFF} seconds" + eval $PSQL_CMD + fi + + # Extract and process log lines + while IFS= read -r line; do + METRIC_NAME="" + # Match each desired line and extract the relevant information + if [[ "$line" =~ COPY,\ INDEX,\ CONSTRAINTS,\ VACUUM.* ]]; then + METRIC_NAME="COPY, INDEX, CONSTRAINTS, VACUUM (wall clock)" + elif [[ "$line" =~ COPY\ \(cumulative\).* ]]; then + METRIC_NAME="COPY (cumulative)" + elif [[ "$line" =~ CREATE\ INDEX\ \(cumulative\).* ]]; then + METRIC_NAME="CREATE INDEX (cumulative)" + elif [[ "$line" =~ CONSTRAINTS\ \(cumulative\).* ]]; then + METRIC_NAME="CONSTRAINTS (cumulative)" + elif [[ "$line" =~ Finalize\ Schema.* ]]; then + METRIC_NAME="Finalize Schema" + elif [[ "$line" =~ Total\ Wall\ Clock\ Duration.* ]]; then + METRIC_NAME="Total Wall Clock Duration" + fi + + # If a metric was matched, insert it into the performance database + if [ -n "$METRIC_NAME" ]; then + DURATION=$(echo "$line" | grep -oP 
'\d+h\d+m|\d+s|\d+ms|\d{1,2}h\d{1,2}m|\d+\.\d+s' | head -n 1) + METRIC_VALUE=$(convert_to_seconds "$DURATION") + PSQL_CMD="${PSQL} \"${PERF_TEST_RESULT_CONNSTR}\" -c \" + INSERT INTO public.perf_test_results (suit, revision, platform, metric_name, metric_value, metric_unit, metric_report_type, recorded_at_timestamp) + VALUES ('${SUIT}', '${COMMIT_HASH}', '${PLATFORM}', '${METRIC_NAME}', ${METRIC_VALUE}, 'seconds', 'lower_is_better', now()); + \"" + echo "Inserting ${METRIC_NAME} with value ${METRIC_VALUE} seconds" + eval $PSQL_CMD + fi + done < "$LOG_FILE" + + - name: Delete Neon Project + if: ${{ always() && matrix.target_project == 'new_empty_project' }} + uses: ./.github/actions/neon-project-delete + with: + project_id: ${{ steps.create-neon-project-ingest-target.outputs.project_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} + + - name: Delete Neon Branch for large tenant + if: ${{ always() && matrix.target_project == 'large_existing_project' }} + uses: ./.github/actions/neon-branch-delete + with: + project_id: ${{ vars.BENCHMARK_INGEST_TARGET_PROJECTID }} + branch_id: ${{ steps.create-neon-branch-ingest-target.outputs.branch_id }} + api_key: ${{ secrets.NEON_STAGING_API_KEY }} From e9dcfa2eb2950ff43a266238bb94cb2ec70fb8bc Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 11 Nov 2024 18:07:01 +0000 Subject: [PATCH 211/239] test_runner: skip more tests using decorator instead of pytest.skip (#9704) ## Problem Running `pytest.skip(...)` in a test body instead of marking the test with `@pytest.mark.skipif(...)` makes all fixtures to be initialised, which is not necessary if the test is going to be skipped anyway. Also, some tests are unnecessarily skipped (e.g. `test_layer_bloating` on Postgres 17, or `test_idle_reconnections` at all) or run (e.g. `test_parse_project_git_version_output_positive` more than on once configuration) according to comments. 
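For illustration only (this sketch is not part of the patch): a minimal example of the difference described above, using a hypothetical `expensive_env` fixture to stand in for the heavyweight Neon test fixtures. With `pytest.skip(...)` in the test body the fixture is built before the skip happens; with the `skipif` marker the skip is decided before fixture setup, so the fixture is never built.

```python
import os

import pytest


@pytest.fixture
def expensive_env():
    # Stand-in for a costly fixture such as a full test environment (hypothetical).
    return object()


def test_skip_in_body(expensive_env):
    # expensive_env has already been initialised by the time we decide to skip.
    if os.getenv("BUILD_TYPE", "debug") == "debug":
        pytest.skip("times out in debug builds")


@pytest.mark.skipif(
    os.getenv("BUILD_TYPE", "debug") == "debug",
    reason="times out in debug builds",
)
def test_skip_via_marker(expensive_env):
    # The skip is evaluated before fixture setup, so expensive_env is never built.
    assert expensive_env is not None
```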
## Summary of changes - Move `skip_on_postgres` / `xfail_on_postgres` / `run_only_on_default_postgres` decorators to `fixture.utils` - Add new `skip_in_debug_build` and `skip_on_ci` decorators - Replace `pytest.skip(...)` calls with decorators where possible --- test_runner/fixtures/pg_version.py | 31 ++------------ test_runner/fixtures/utils.py | 41 ++++++++++++++++++- ...er_max_throughput_getpage_at_latest_lsn.py | 13 +++--- test_runner/regress/test_branch_and_gc.py | 8 ++-- test_runner/regress/test_compaction.py | 5 +-- .../regress/test_download_extensions.py | 8 ++-- .../regress/test_ingestion_layer_size.py | 9 ++-- test_runner/regress/test_layer_bloating.py | 13 ++++-- test_runner/regress/test_layer_eviction.py | 11 ++--- test_runner/regress/test_logging.py | 3 +- test_runner/regress/test_neon_cli.py | 21 ++++------ .../regress/test_pageserver_layer_rolling.py | 12 ++---- .../regress/test_pageserver_restart.py | 10 ++--- .../regress/test_pageserver_secondary.py | 4 +- test_runner/regress/test_pg_regress.py | 4 +- test_runner/regress/test_replica_start.py | 11 +++-- test_runner/regress/test_sharding.py | 11 ++--- .../regress/test_storage_controller.py | 3 +- test_runner/regress/test_tenant_size.py | 5 +-- .../regress/test_timeline_detach_ancestor.py | 18 ++++---- test_runner/regress/test_wal_acceptor.py | 7 ++-- .../regress/test_wal_acceptor_async.py | 5 +-- 22 files changed, 123 insertions(+), 130 deletions(-) diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py index 01f0245665..4feab52c43 100644 --- a/test_runner/fixtures/pg_version.py +++ b/test_runner/fixtures/pg_version.py @@ -1,10 +1,8 @@ from __future__ import annotations import enum -import os from typing import TYPE_CHECKING -import pytest from typing_extensions import override if TYPE_CHECKING: @@ -18,12 +16,15 @@ This fixture is used to determine which version of Postgres to use for tests. 
# Inherit PgVersion from str rather than int to make it easier to pass as a command-line argument # TODO: use enum.StrEnum for Python >= 3.11 -@enum.unique class PgVersion(str, enum.Enum): V14 = "14" V15 = "15" V16 = "16" V17 = "17" + + # Default Postgres Version for tests that don't really depend on Postgres itself + DEFAULT = V16 + # Instead of making version an optional parameter in methods, we can use this fake entry # to explicitly rely on the default server version (could be different from pg_version fixture value) NOT_SET = "<-POSTRGRES VERSION IS NOT SET->" @@ -59,27 +60,3 @@ class PgVersion(str, enum.Enum): # Make mypy happy # See https://github.com/python/mypy/issues/3974 return None - - -DEFAULT_VERSION: PgVersion = PgVersion.V16 - - -def skip_on_postgres(version: PgVersion, reason: str): - return pytest.mark.skipif( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, - reason=reason, - ) - - -def xfail_on_postgres(version: PgVersion, reason: str): - return pytest.mark.xfail( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version, - reason=reason, - ) - - -def run_only_on_default_postgres(reason: str): - return pytest.mark.skipif( - PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is not DEFAULT_VERSION, - reason=reason, - ) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 01b7cf1026..96a651f0db 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -25,6 +25,7 @@ from fixtures.pageserver.common_types import ( parse_delta_layer, parse_image_layer, ) +from fixtures.pg_version import PgVersion if TYPE_CHECKING: from collections.abc import Iterable @@ -37,6 +38,7 @@ if TYPE_CHECKING: Fn = TypeVar("Fn", bound=Callable[..., Any]) + COMPONENT_BINARIES = { "storage_controller": ("storage_controller",), "storage_broker": ("storage_broker",), @@ -519,7 +521,7 @@ def assert_pageserver_backups_equal(left: Path, right: Path, skip_files: set[str This is essentially: lines=$(comm -3 \ - <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ + <(mkdir left && cd left && tar xf "$left" && find . -type f -print0 | xargs sha256sum | sort -k2) \ <(mkdir right && cd right && tar xf "$right" && find . 
-type f -print0 | xargs sha256sum | sort -k2) \ | wc -l) [ "$lines" = "0" ] @@ -643,3 +645,40 @@ def allpairs_versions(): ) ids.append(f"combination_{''.join(cur_id)}") return {"argnames": "combination", "argvalues": tuple(argvalues), "ids": ids} + + +def skip_on_postgres(version: PgVersion, reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, + reason=reason, + ) + + +def xfail_on_postgres(version: PgVersion, reason: str): + return pytest.mark.xfail( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is version, + reason=reason, + ) + + +def run_only_on_default_postgres(reason: str): + return pytest.mark.skipif( + PgVersion(os.getenv("DEFAULT_PG_VERSION", PgVersion.DEFAULT)) is not PgVersion.DEFAULT, + reason=reason, + ) + + +def skip_in_debug_build(reason: str): + return pytest.mark.skipif( + os.getenv("BUILD_TYPE", "debug") == "debug", + reason=reason, + ) + + +def skip_on_ci(reason: str): + # `CI` variable is always set to `true` on GitHub + # Ref: https://docs.github.com/en/actions/writing-workflows/choosing-what-your-workflow-does/store-information-in-variables#default-environment-variables + return pytest.mark.skipif( + os.getenv("CI", "false") == "true", + reason=reason, + ) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index c038fc3fd2..3dbbb197f4 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -1,7 +1,6 @@ from __future__ import annotations import json -import os from pathlib import Path from typing import TYPE_CHECKING @@ -14,7 +13,7 @@ from fixtures.neon_fixtures import ( PgBin, wait_for_last_flush_lsn, ) -from fixtures.utils import get_scale_for_db, humantime_to_ms +from fixtures.utils import get_scale_for_db, humantime_to_ms, skip_on_ci from performance.pageserver.util import ( setup_pageserver_with_tenants, @@ -38,9 +37,8 @@ if TYPE_CHECKING: @pytest.mark.parametrize("pgbench_scale", [get_scale_for_db(200)]) @pytest.mark.parametrize("n_tenants", [500]) @pytest.mark.timeout(10000) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +@skip_on_ci( + "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" ) def test_pageserver_characterize_throughput_with_n_tenants( neon_env_builder: NeonEnvBuilder, @@ -66,9 +64,8 @@ def test_pageserver_characterize_throughput_with_n_tenants( @pytest.mark.parametrize("n_clients", [1, 64]) @pytest.mark.parametrize("n_tenants", [1]) @pytest.mark.timeout(2400) -@pytest.mark.skipif( - os.getenv("CI", "false") == "true", - reason="This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI", +@skip_on_ci( + "This test needs lot of resources and should run on dedicated HW, not in github action runners as part of CI" ) def test_pageserver_characterize_latencies_with_1_client_and_throughput_with_many_clients_one_tenant( neon_env_builder: NeonEnvBuilder, diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 6d1565c5e5..fccfbc7f09 100644 --- 
a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -8,7 +8,7 @@ from fixtures.common_types import Lsn, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.http import TimelineCreate406 -from fixtures.utils import query_scalar +from fixtures.utils import query_scalar, skip_in_debug_build # Test the GC implementation when running with branching. @@ -48,10 +48,8 @@ from fixtures.utils import query_scalar # Because the delta layer D covering lsn1 is corrupted, creating a branch # starting from lsn1 should return an error as follows: # could not find data for key ... at LSN ..., for request at LSN ... -def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_branch_and_gc(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 420055ac3a..370df3c379 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -2,7 +2,6 @@ from __future__ import annotations import enum import json -import os import time from typing import TYPE_CHECKING @@ -13,7 +12,7 @@ from fixtures.neon_fixtures import ( generate_uploads_and_deletions, ) from fixtures.pageserver.http import PageserverApiException -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -32,7 +31,7 @@ AGGRESIVE_COMPACTION_TENANT_CONF = { } -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): """ This is a smoke test that compaction kicks in. 
The workload repeatedly churns diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 0134f80769..b2e19ad713 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -12,6 +12,7 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, ) from fixtures.pg_version import PgVersion +from fixtures.utils import skip_on_postgres from pytest_httpserver import HTTPServer from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -41,17 +42,14 @@ def neon_env_builder_local( return neon_env_builder +@skip_on_postgres(PgVersion.V16, reason="TODO: PG16 extension building") +@skip_on_postgres(PgVersion.V17, reason="TODO: PG17 extension building") def test_remote_extensions( httpserver: HTTPServer, neon_env_builder_local: NeonEnvBuilder, httpserver_listen_address, pg_version, ): - if pg_version == PgVersion.V16: - pytest.skip("TODO: PG16 extension building") - if pg_version == PgVersion.V17: - pytest.skip("TODO: PG17 extension building") - # setup mock http server # that expects request for anon.tar.zst # and returns the requested file diff --git a/test_runner/regress/test_ingestion_layer_size.py b/test_runner/regress/test_ingestion_layer_size.py index 646dac8e6e..2916748925 100644 --- a/test_runner/regress/test_ingestion_layer_size.py +++ b/test_runner/regress/test_ingestion_layer_size.py @@ -4,25 +4,22 @@ from collections.abc import Iterable from dataclasses import dataclass from typing import TYPE_CHECKING -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder, wait_for_last_flush_lsn from fixtures.pageserver.http import HistoricLayerInfo, LayerMapInfo -from fixtures.utils import human_bytes +from fixtures.utils import human_bytes, skip_in_debug_build if TYPE_CHECKING: from typing import Union -def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder, build_type: str): +@skip_in_debug_build("debug run is unnecessarily slow") +def test_ingesting_large_batches_of_images(neon_env_builder: NeonEnvBuilder): """ Build a non-small GIN index which includes similarly batched up images in WAL stream as does pgvector to show that we no longer create oversized layers. 
""" - if build_type == "debug": - pytest.skip("debug run is unnecessarily slow") - minimum_initdb_size = 20 * 1024**2 checkpoint_distance = 32 * 1024**2 minimum_good_layer_size = checkpoint_distance * 0.9 diff --git a/test_runner/regress/test_layer_bloating.py b/test_runner/regress/test_layer_bloating.py index a08d522fc2..d9043fef7f 100644 --- a/test_runner/regress/test_layer_bloating.py +++ b/test_runner/regress/test_layer_bloating.py @@ -2,7 +2,6 @@ from __future__ import annotations import os -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -10,12 +9,18 @@ from fixtures.neon_fixtures import ( wait_for_last_flush_lsn, ) from fixtures.pg_version import PgVersion +from fixtures.utils import skip_on_postgres +@skip_on_postgres( + PgVersion.V14, + reason="pg_log_standby_snapshot() function is available since Postgres 16", +) +@skip_on_postgres( + PgVersion.V15, + reason="pg_log_standby_snapshot() function is available since Postgres 16", +) def test_layer_bloating(neon_env_builder: NeonEnvBuilder, vanilla_pg): - if neon_env_builder.pg_version != PgVersion.V16: - pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - env = neon_env_builder.init_start( initial_tenant_conf={ "gc_period": "0s", diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index c49ac6893e..2eb38c49b2 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -2,7 +2,6 @@ from __future__ import annotations import time -import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -12,17 +11,13 @@ from fixtures.neon_fixtures import ( from fixtures.pageserver.common_types import parse_layer_file_name from fixtures.pageserver.utils import wait_for_upload from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import skip_in_debug_build # Crates a few layers, ensures that we can evict them (removing locally but keeping track of them anyway) # and then download them back. 
-def test_basic_eviction( - neon_env_builder: NeonEnvBuilder, - build_type: str, -): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_basic_eviction(neon_env_builder: NeonEnvBuilder): neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start( diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index 9a3fdd835d..f6fbdcabfd 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -5,8 +5,7 @@ import uuid import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder -from fixtures.pg_version import run_only_on_default_postgres -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until @pytest.mark.parametrize("level", ["trace", "debug", "info", "warn", "error"]) diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 783fb813cf..72db72f2b9 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os import subprocess from pathlib import Path from typing import cast @@ -15,7 +14,7 @@ from fixtures.neon_fixtures import ( parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient -from fixtures.pg_version import PgVersion, skip_on_postgres +from fixtures.utils import run_only_on_default_postgres, skip_in_debug_build def helper_compare_timeline_list( @@ -195,10 +194,8 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): res.check_returncode() -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_parse_project_git_version_output_positive(): commit = "b6f77b5816cf1dba12a3bc8747941182ce220846" @@ -217,10 +214,8 @@ def test_parse_project_git_version_output_positive(): assert parse_project_git_version_output(example) == commit -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="unit test for test support, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_parse_project_git_version_output_local_docker(): """ Makes sure the tests don't accept the default version in Dockerfile one gets without providing @@ -234,10 +229,8 @@ def test_parse_project_git_version_output_local_docker(): assert input in str(e) -@skip_on_postgres(PgVersion.V14, reason="does not use postgres") -@pytest.mark.skipif( - os.environ.get("BUILD_TYPE") == "debug", reason="cli api sanity, either build works" -) +@run_only_on_default_postgres(reason="does not use postgres") +@skip_in_debug_build("unit test for test support, either build works") def test_binaries_version_parses(neon_binpath: Path): """ Ensures that we can parse the actual outputs of --version from a set of binaries. 
diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index c0eb598891..200a323a3a 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -1,7 +1,6 @@ from __future__ import annotations import asyncio -import os import time from typing import TYPE_CHECKING @@ -16,7 +15,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until if TYPE_CHECKING: from typing import Optional @@ -227,12 +226,9 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): assert get_dirty_bytes(env) >= dirty_after_write -@pytest.mark.skipif( - # We have to use at least ~100MB of data to hit the lowest limit we can configure, which is - # prohibitively slow in debug mode - os.getenv("BUILD_TYPE") == "debug", - reason="Avoid running bulkier ingest tests in debug mode", -) +# We have to use at least ~100MB of data to hit the lowest limit we can configure, which is +# prohibitively slow in debug mode +@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode") def test_total_size_limit(neon_env_builder: NeonEnvBuilder): """ Test that checkpoints are done based on total ephemeral layer size, even if no one timeline is diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index f7c42fc893..fb6050689c 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -8,7 +8,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.remote_storage import s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until # Test restarting page server, while safekeeper and compute node keep @@ -155,12 +155,8 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): # safekeeper and compute node keep running. @pytest.mark.timeout(540) @pytest.mark.parametrize("shard_count", [None, 4]) -def test_pageserver_chaos( - neon_env_builder: NeonEnvBuilder, build_type: str, shard_count: Optional[int] -): - if build_type == "debug": - pytest.skip("times out in debug builds") - +@skip_in_debug_build("times out in debug builds") +def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder, shard_count: Optional[int]): # same rationale as with the immediate stop; we might leave orphan layers behind. 
neon_env_builder.disable_scrub_on_exit() neon_env_builder.enable_pageserver_remote_storage(s3_storage()) diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 705b4ff054..d4aef96735 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -17,7 +17,7 @@ from fixtures.pageserver.utils import ( wait_for_upload_queue_empty, ) from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind, S3Storage, s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from werkzeug.wrappers.request import Request from werkzeug.wrappers.response import Response @@ -765,7 +765,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): assert download_rate < expect_download_rate * 2 -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") @pytest.mark.parametrize("via_controller", [True, False]) def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controller: bool): """ diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py index b97fccddf5..f4698191eb 100644 --- a/test_runner/regress/test_pg_regress.py +++ b/test_runner/regress/test_pg_regress.py @@ -3,7 +3,6 @@ # from __future__ import annotations -import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path from typing import TYPE_CHECKING, cast @@ -19,6 +18,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pg_version import PgVersion from fixtures.remote_storage import s3_storage +from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: from typing import Optional @@ -329,7 +329,7 @@ def test_sql_regress( post_checks(env, test_output_dir, DBNAME, endpoint) -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_tx_abort_with_many_relations( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index e81e7dad76..8e7c01f950 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -30,7 +30,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import NeonEnv, wait_for_last_flush_lsn, wait_replica_caughtup from fixtures.pg_version import PgVersion -from fixtures.utils import query_scalar, wait_until +from fixtures.utils import query_scalar, skip_on_postgres, wait_until CREATE_SUBXACTS_FUNC = """ create or replace function create_subxacts(n integer) returns void as $$ @@ -137,6 +137,12 @@ def test_replica_start_scan_clog_crashed_xids(neon_simple_env: NeonEnv): assert secondary_cur.fetchone() == (1,) +@skip_on_postgres( + PgVersion.V14, reason="pg_log_standby_snapshot() function is available since Postgres 16" +) +@skip_on_postgres( + PgVersion.V15, reason="pg_log_standby_snapshot() function is available since Postgres 16" +) def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): """ Test that starting a replica works right after the primary has @@ -149,9 +155,6 @@ def test_replica_start_at_running_xacts(neon_simple_env: NeonEnv, pg_version): """ env = neon_simple_env - if env.pg_version == PgVersion.V14 or env.pg_version == PgVersion.V15: - 
pytest.skip("pg_log_standby_snapshot() function is available only in PG16") - primary = env.endpoints.create_start(branch_name="main", endpoint_id="primary") primary_conn = primary.connect() primary_cur = primary_conn.cursor() diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index ec633e352c..0a4a53356d 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -20,7 +20,7 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty from fixtures.remote_storage import s3_storage -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload from pytest_httpserver import HTTPServer from typing_extensions import override @@ -853,12 +853,9 @@ def test_sharding_split_stripe_size( wait_until(10, 1, assert_restart_notification) -@pytest.mark.skipif( - # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're - # validating in this test don't benefit much from debug assertions. - os.getenv("BUILD_TYPE") == "debug", - reason="Avoid running bulkier ingest tests in debug mode", -) +# The quantity of data isn't huge, but debug can be _very_ slow, and the things we're +# validating in this test don't benefit much from debug assertions. +@skip_in_debug_build("Avoid running bulkier ingest tests in debug mode") def test_sharding_ingest_layer_sizes( neon_env_builder: NeonEnvBuilder, ): diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index a069e0d01c..2c3d79b18a 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -36,11 +36,12 @@ from fixtures.pageserver.utils import ( remote_storage_delete_key, timeline_delete_wait_completed, ) -from fixtures.pg_version import PgVersion, run_only_on_default_postgres +from fixtures.pg_version import PgVersion from fixtures.port_distributor import PortDistributor from fixtures.remote_storage import RemoteStorageKind, s3_storage from fixtures.storage_controller_proxy import StorageControllerProxy from fixtures.utils import ( + run_only_on_default_postgres, run_pg_bench_small, subprocess_capture, wait_until, diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 0c431fa453..8b733da0c6 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -1,6 +1,5 @@ from __future__ import annotations -import os from concurrent.futures import ThreadPoolExecutor from pathlib import Path @@ -21,7 +20,7 @@ from fixtures.pageserver.utils import ( wait_until_tenant_active, ) from fixtures.pg_version import PgVersion -from fixtures.utils import wait_until +from fixtures.utils import skip_in_debug_build, wait_until def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): @@ -279,7 +278,7 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa size_debug_file.write(size_debug) -@pytest.mark.skipif(os.environ.get("BUILD_TYPE") == "debug", reason="only run with release build") +@skip_in_debug_build("only run with release build") def test_single_branch_get_tenant_size_grows( neon_env_builder: NeonEnvBuilder, test_output_dir: Path, pg_version: PgVersion ): diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 0e8519e07b..ef0eb05612 100644 --- 
a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -869,8 +869,17 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): assert count == 10000 -@pytest.mark.parametrize("mode", ["delete_timeline", "delete_tenant"]) -@pytest.mark.parametrize("sharded", [False, True]) +@pytest.mark.parametrize( + "mode, sharded", + [ + ("delete_timeline", False), + ("delete_timeline", True), + ("delete_tenant", False), + # the shared/exclusive lock for tenant is blocking this: + # timeline detach ancestor takes shared, delete tenant takes exclusive + # ("delete_tenant", True) + ], +) def test_timeline_detach_ancestor_interrupted_by_deletion( neon_env_builder: NeonEnvBuilder, mode: str, sharded: bool ): @@ -885,11 +894,6 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( - shutdown winning over complete, see test_timeline_is_deleted_before_timeline_detach_ancestor_completes """ - if sharded and mode == "delete_tenant": - # the shared/exclusive lock for tenant is blocking this: - # timeline detach ancestor takes shared, delete tenant takes exclusive - pytest.skip("tenant deletion while timeline ancestor detach is underway cannot happen") - shard_count = 2 if sharded else 1 neon_env_builder.num_pageservers = shard_count diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index e224d5eb01..0676b3dd9a 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -54,6 +54,8 @@ from fixtures.utils import ( PropagatingThread, get_dir_size, query_scalar, + run_only_on_default_postgres, + skip_in_debug_build, start_in_background, wait_until, ) @@ -2104,10 +2106,9 @@ def test_pull_timeline_while_evicted(neon_env_builder: NeonEnvBuilder): # The only way to verify this without manipulating time is to sleep for a while. # In this test we sleep for 60 seconds, so this test takes at least 1 minute to run. # This is longer than most other tests, we run it only for v16 to save CI resources. 
+@run_only_on_default_postgres("run only on release build to save CI resources") +@skip_in_debug_build("run only on release build to save CI resources") def test_idle_reconnections(neon_env_builder: NeonEnvBuilder): - if os.environ.get("PYTEST_CURRENT_TEST", "").find("[debug-pg16]") == -1: - pytest.skip("run only on debug postgres v16 to save CI resources") - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index f328974264..d3e989afa8 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -14,6 +14,7 @@ from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import getLogger from fixtures.neon_fixtures import Endpoint, NeonEnv, NeonEnvBuilder, Safekeeper from fixtures.remote_storage import RemoteStorageKind +from fixtures.utils import skip_in_debug_build if TYPE_CHECKING: from typing import Optional @@ -760,10 +761,8 @@ async def run_wal_lagging(env: NeonEnv, endpoint: Endpoint, test_output_dir: Pat # The test takes more than default 5 minutes on Postgres 16, # see https://github.com/neondatabase/neon/issues/5305 @pytest.mark.timeout(600) +@skip_in_debug_build("times out in debug builds") def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, build_type: str): - if build_type == "debug": - pytest.skip("times out in debug builds") - neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() From 2d9652c434642b852ebaae6969f87ec4d93e3014 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 13:53:12 -0600 Subject: [PATCH 212/239] Clean up C.UTF-8 locale changes Removes some unnecessary initdb arguments, and fixes Neon for macOS, which doesn't ship a C.UTF-8 locale.
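For reference, the change boils down to selecting the locale at compile time with a `cfg!`-gated constant. A minimal, self-contained sketch of the approach (the `main` wrapper is illustrative only; the real constant lives in `libs/pageserver_api/src/config.rs`, with an equivalent branch in `compute_tools/src/config.rs`):

```rust
/// Default locale used for initdb and the compute's postgresql.conf.
/// macOS does not ship a C.UTF-8 locale, so fall back to plain "C" there.
/// `cfg!(target_os = "macos")` expands to a boolean literal, so the branch
/// is resolved entirely at compile time.
pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") {
    "C"
} else {
    "C.UTF-8"
};

fn main() {
    // Prints "C.UTF-8" on Linux builds and "C" on macOS builds.
    println!("default locale: {}", DEFAULT_LOCALE);
}
```

On Linux images nothing changes: C.UTF-8 remains the default there.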
Signed-off-by: Tristan Partin --- compute_tools/src/config.rs | 15 +++++++++++---- libs/pageserver_api/src/config.rs | 6 +++++- pageserver/src/tenant.rs | 6 ------ 3 files changed, 16 insertions(+), 11 deletions(-) diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 50e2a95e9d..d4e413034e 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -74,10 +74,17 @@ pub fn write_postgres_conf( } // Locales - writeln!(file, "lc_messages='C.UTF-8'")?; - writeln!(file, "lc_monetary='C.UTF-8'")?; - writeln!(file, "lc_time='C.UTF-8'")?; - writeln!(file, "lc_numeric='C.UTF-8'")?; + if cfg!(target_os = "macos") { + writeln!(file, "lc_messages='C'")?; + writeln!(file, "lc_monetary='C'")?; + writeln!(file, "lc_time='C'")?; + writeln!(file, "lc_numeric='C'")?; + } else { + writeln!(file, "lc_messages='C.UTF-8'")?; + writeln!(file, "lc_monetary='C.UTF-8'")?; + writeln!(file, "lc_time='C.UTF-8'")?; + writeln!(file, "lc_numeric='C.UTF-8'")?; + } match spec.mode { ComputeMode::Primary => {} diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 4272181954..f48c1febb5 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -277,7 +277,11 @@ pub mod defaults { pub const DEFAULT_WAL_REDO_TIMEOUT: &str = "60 s"; pub const DEFAULT_SUPERUSER: &str = "cloud_admin"; - pub const DEFAULT_LOCALE: &str = "C.UTF-8"; + pub const DEFAULT_LOCALE: &str = if cfg!(target_os = "macos") { + "C" + } else { + "C.UTF-8" + }; pub const DEFAULT_PAGE_CACHE_SIZE: usize = 8192; pub const DEFAULT_MAX_FILE_DESCRIPTORS: usize = 100; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 903174680e..774672aed6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -4786,12 +4786,6 @@ async fn run_initdb( .args(["--username", &conf.superuser]) .args(["--encoding", "utf8"]) .args(["--locale", &conf.locale]) - .args(["--lc-collate", &conf.locale]) - .args(["--lc-ctype", &conf.locale]) - .args(["--lc-messages", &conf.locale]) - .args(["--lc-monetary", &conf.locale]) - .args(["--lc-numeric", &conf.locale]) - .args(["--lc-time", &conf.locale]) .arg("--no-instructions") .arg("--no-sync") .env_clear() From 5a138d08a3ab3c7cd79a81783ed1836e0a3dc14f Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." 
<4198311+skyzh@users.noreply.github.com> Date: Mon, 11 Nov 2024 15:30:32 -0500 Subject: [PATCH 213/239] feat(pageserver): support partial gc-compaction for delta layers (#9611) The final patch for partial compaction, part of https://github.com/neondatabase/neon/issues/9114, close https://github.com/neondatabase/neon/issues/8921 (note that we didn't implement parallel compaction or compaction scheduler for partial compaction -- currently this needs to be scheduled by using a Python script to split the keyspace, and in the future, automatically split based on the key partitioning when the pageserver wants to trigger a gc-compaction) ## Summary of changes * Update the layer selection algorithm to use the same selection as full compaction (everything intersect/below gc horizon) * Update the layer selection algorithm to also generate a list of delta layers that need to be rewritten * Add the logic to rewrite delta layers and add them back to the layer map * Update test case to do partial compaction on deltas --------- Signed-off-by: Alex Chi Z --- pageserver/compaction/src/helpers.rs | 9 + pageserver/src/tenant.rs | 235 +++++++--- .../src/tenant/storage_layer/delta_layer.rs | 4 + .../tenant/storage_layer/filter_iterator.rs | 25 +- .../tenant/storage_layer/merge_iterator.rs | 82 +++- pageserver/src/tenant/timeline/compaction.rs | 412 +++++++++++------- 6 files changed, 521 insertions(+), 246 deletions(-) diff --git a/pageserver/compaction/src/helpers.rs b/pageserver/compaction/src/helpers.rs index 9dbb6ecedf..6b739d85a7 100644 --- a/pageserver/compaction/src/helpers.rs +++ b/pageserver/compaction/src/helpers.rs @@ -35,6 +35,15 @@ pub fn overlaps_with(a: &Range, b: &Range) -> bool { !(a.end <= b.start || b.end <= a.start) } +/// Whether a fully contains b, example as below +/// ```plain +/// | a | +/// | b | +/// ``` +pub fn fully_contains(a: &Range, b: &Range) -> bool { + a.start <= b.start && a.end >= b.end +} + pub fn union_to_keyspace(a: &mut CompactionKeySpace, b: CompactionKeySpace) { let x = std::mem::take(a); let mut all_ranges_iter = [x.into_iter(), b.into_iter()] diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 774672aed6..e7c258d829 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -9223,6 +9223,23 @@ mod tests { Ok(()) } + fn sort_layer_key(k1: &PersistentLayerKey, k2: &PersistentLayerKey) -> std::cmp::Ordering { + ( + k1.is_delta, + k1.key_range.start, + k1.key_range.end, + k1.lsn_range.start, + k1.lsn_range.end, + ) + .cmp(&( + k2.is_delta, + k2.key_range.start, + k2.key_range.end, + k2.lsn_range.start, + k2.lsn_range.end, + )) + } + async fn inspect_and_sort( tline: &Arc, filter: Option>, @@ -9231,25 +9248,30 @@ mod tests { if let Some(filter) = filter { all_layers.retain(|layer| overlaps_with(&layer.key_range, &filter)); } - all_layers.sort_by(|k1, k2| { - ( - k1.is_delta, - k1.key_range.start, - k1.key_range.end, - k1.lsn_range.start, - k1.lsn_range.end, - ) - .cmp(&( - k2.is_delta, - k2.key_range.start, - k2.key_range.end, - k2.lsn_range.start, - k2.lsn_range.end, - )) - }); + all_layers.sort_by(sort_layer_key); all_layers } + #[cfg(feature = "testing")] + fn check_layer_map_key_eq( + mut left: Vec, + mut right: Vec, + ) { + left.sort_by(sort_layer_key); + right.sort_by(sort_layer_key); + if left != right { + eprintln!("---LEFT---"); + for left in left.iter() { + eprintln!("{}", left); + } + eprintln!("---RIGHT---"); + for right in right.iter() { + eprintln!("{}", right); + } + assert_eq!(left, right); + } + } + #[cfg(feature = 
"testing")] #[tokio::test] async fn test_simple_partial_bottom_most_compaction() -> anyhow::Result<()> { @@ -9342,127 +9364,206 @@ mod tests { let cancel = CancellationToken::new(); - // Do a partial compaction on key range 0..4, we should generate a image layer; no other layers - // can be removed because they might be used for other key ranges. + // Do a partial compaction on key range 0..2 tline - .partial_compact_with_gc(Some(get_key(0)..get_key(4)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ + // newly-generated image layer for the partial compaction range 0-2 PersistentLayerKey { - key_range: get_key(0)..get_key(4), + key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), - is_delta: false + is_delta: false, }, + // delta1 is split and the second part is rewritten PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], ); - // Do a partial compaction on key range 4..10 + // Do a partial compaction on key range 2..4 tline - .partial_compact_with_gc(Some(get_key(4)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ PersistentLayerKey { - key_range: get_key(0)..get_key(4), + key_range: get_key(0)..get_key(2), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { - // if (in the future) GC kicks in, this layer will be removed key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x10)..Lsn(0x11), - is_delta: false + is_delta: false, }, + // image layer generated for the compaction range 2-4 PersistentLayerKey { - key_range: get_key(4)..get_key(10), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, + // we have key2/key3 above the retain_lsn, so we still need this delta layer PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(5)..get_key(7), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], + ); + + // Do a partial compaction on key range 4..9 + tline + .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey 
{ + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true, + }, + // image layer generated for this compaction range + PersistentLayerKey { + key_range: get_key(4)..get_key(9), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true, + }, + ], + ); + + // Do a partial compaction on key range 9..10 + tline + .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx) + .await + .unwrap(); + let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; + check_layer_map_key_eq( + all_layers, + vec![ + PersistentLayerKey { + key_range: get_key(0)..get_key(2), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(0)..get_key(10), + lsn_range: Lsn(0x10)..Lsn(0x11), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(2)..get_key(4), + lsn_range: Lsn(0x20)..Lsn(0x48), + is_delta: true, + }, + PersistentLayerKey { + key_range: get_key(4)..get_key(9), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + // image layer generated for the compaction range + PersistentLayerKey { + key_range: get_key(9)..get_key(10), + lsn_range: Lsn(0x20)..Lsn(0x21), + is_delta: false, + }, + PersistentLayerKey { + key_range: get_key(8)..get_key(10), + lsn_range: Lsn(0x48)..Lsn(0x50), + is_delta: true, + }, + ], ); // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. tline - .partial_compact_with_gc(Some(get_key(0)..get_key(10)), &cancel, EnumSet::new(), &ctx) + .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; - assert_eq!( + check_layer_map_key_eq( all_layers, vec![ - PersistentLayerKey { - key_range: get_key(0)..get_key(4), - lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false - }, + // aha, we removed all unnecessary image/delta layers and got a very clean layer map! 
PersistentLayerKey { key_range: get_key(0)..get_key(10), lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false + is_delta: false, }, PersistentLayerKey { - key_range: get_key(4)..get_key(10), - lsn_range: Lsn(0x20)..Lsn(0x21), - is_delta: false - }, - PersistentLayerKey { - key_range: get_key(1)..get_key(4), + key_range: get_key(2)..get_key(4), lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true - }, - PersistentLayerKey { - key_range: get_key(5)..get_key(7), - lsn_range: Lsn(0x20)..Lsn(0x48), - is_delta: true + is_delta: true, }, PersistentLayerKey { key_range: get_key(8)..get_key(10), lsn_range: Lsn(0x48)..Lsn(0x50), - is_delta: true - } - ] + is_delta: true, + }, + ], ); Ok(()) diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 664c00a6b1..fec8a0a16c 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -653,6 +653,10 @@ impl DeltaLayerWriter { }) } + pub fn is_empty(&self) -> bool { + self.inner.as_ref().unwrap().num_keys == 0 + } + /// /// Append a key-value pair to the file. /// diff --git a/pageserver/src/tenant/storage_layer/filter_iterator.rs b/pageserver/src/tenant/storage_layer/filter_iterator.rs index ccfcf68e8f..8660be1fcc 100644 --- a/pageserver/src/tenant/storage_layer/filter_iterator.rs +++ b/pageserver/src/tenant/storage_layer/filter_iterator.rs @@ -1,4 +1,4 @@ -use std::ops::Range; +use std::{ops::Range, sync::Arc}; use anyhow::bail; use pageserver_api::{ @@ -9,7 +9,10 @@ use utils::lsn::Lsn; use pageserver_api::value::Value; -use super::merge_iterator::MergeIterator; +use super::{ + merge_iterator::{MergeIterator, MergeIteratorItem}, + PersistentLayerKey, +}; /// A filter iterator over merge iterators (and can be easily extended to other types of iterators). /// @@ -48,10 +51,10 @@ impl<'a> FilterIterator<'a> { }) } - pub async fn next(&mut self) -> anyhow::Result> { - while let Some(item) = self.inner.next().await? { + async fn next_inner(&mut self) -> anyhow::Result> { + while let Some(item) = self.inner.next_inner::().await? 
{ while self.current_filter_idx < self.retain_key_filters.len() - && item.0 >= self.retain_key_filters[self.current_filter_idx].end + && item.key_lsn_value().0 >= self.retain_key_filters[self.current_filter_idx].end { // [filter region] [filter region] [filter region] // ^ item @@ -68,7 +71,7 @@ impl<'a> FilterIterator<'a> { // ^ current filter (nothing) return Ok(None); } - if self.retain_key_filters[self.current_filter_idx].contains(&item.0) { + if self.retain_key_filters[self.current_filter_idx].contains(&item.key_lsn_value().0) { // [filter region] [filter region] [filter region] // ^ item // ^ current filter @@ -81,6 +84,16 @@ impl<'a> FilterIterator<'a> { } Ok(None) } + + pub async fn next(&mut self) -> anyhow::Result> { + self.next_inner().await + } + + pub async fn next_with_trace( + &mut self, + ) -> anyhow::Result)>> { + self.next_inner().await + } } #[cfg(test)] diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 980202f12c..2667d130f5 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -1,6 +1,7 @@ use std::{ cmp::Ordering, collections::{binary_heap, BinaryHeap}, + sync::Arc, }; use anyhow::bail; @@ -13,10 +14,11 @@ use pageserver_api::value::Value; use super::{ delta_layer::{DeltaLayerInner, DeltaLayerIterator}, image_layer::{ImageLayerInner, ImageLayerIterator}, + PersistentLayerDesc, PersistentLayerKey, }; #[derive(Clone, Copy)] -enum LayerRef<'a> { +pub(crate) enum LayerRef<'a> { Image(&'a ImageLayerInner), Delta(&'a DeltaLayerInner), } @@ -62,18 +64,20 @@ impl LayerIterRef<'_> { /// 1. Unified iterator for image and delta layers. /// 2. `Ord` for use in [`MergeIterator::heap`] (for the k-merge). /// 3. Lazy creation of the real delta/image iterator. -enum IteratorWrapper<'a> { +pub(crate) enum IteratorWrapper<'a> { NotLoaded { ctx: &'a RequestContext, first_key_lower_bound: (Key, Lsn), layer: LayerRef<'a>, + source_desc: Arc, }, Loaded { iter: PeekableLayerIterRef<'a>, + source_desc: Arc, }, } -struct PeekableLayerIterRef<'a> { +pub(crate) struct PeekableLayerIterRef<'a> { iter: LayerIterRef<'a>, peeked: Option<(Key, Lsn, Value)>, // None == end } @@ -151,6 +155,12 @@ impl<'a> IteratorWrapper<'a> { layer: LayerRef::Image(image_layer), first_key_lower_bound: (image_layer.key_range().start, image_layer.lsn()), ctx, + source_desc: PersistentLayerKey { + key_range: image_layer.key_range().clone(), + lsn_range: PersistentLayerDesc::image_layer_lsn_range(image_layer.lsn()), + is_delta: false, + } + .into(), } } @@ -162,12 +172,18 @@ impl<'a> IteratorWrapper<'a> { layer: LayerRef::Delta(delta_layer), first_key_lower_bound: (delta_layer.key_range().start, delta_layer.lsn_range().start), ctx, + source_desc: PersistentLayerKey { + key_range: delta_layer.key_range().clone(), + lsn_range: delta_layer.lsn_range().clone(), + is_delta: true, + } + .into(), } } fn peek_next_key_lsn_value(&self) -> Option<(&Key, Lsn, Option<&Value>)> { match self { - Self::Loaded { iter } => iter + Self::Loaded { iter, .. 
} => iter .peek() .as_ref() .map(|(key, lsn, val)| (key, *lsn, Some(val))), @@ -191,6 +207,7 @@ impl<'a> IteratorWrapper<'a> { ctx, first_key_lower_bound, layer, + source_desc, } = self else { unreachable!() @@ -206,7 +223,10 @@ impl<'a> IteratorWrapper<'a> { ); } } - *self = Self::Loaded { iter }; + *self = Self::Loaded { + iter, + source_desc: source_desc.clone(), + }; Ok(()) } @@ -220,11 +240,19 @@ impl<'a> IteratorWrapper<'a> { /// The public interfaces to use are [`crate::tenant::storage_layer::delta_layer::DeltaLayerIterator`] and /// [`crate::tenant::storage_layer::image_layer::ImageLayerIterator`]. async fn next(&mut self) -> anyhow::Result> { - let Self::Loaded { iter } = self else { + let Self::Loaded { iter, .. } = self else { panic!("must load the iterator before using") }; iter.next().await } + + /// Get the persistent layer key corresponding to this iterator + fn trace_source(&self) -> Arc { + match self { + Self::Loaded { source_desc, .. } => source_desc.clone(), + Self::NotLoaded { source_desc, .. } => source_desc.clone(), + } + } } /// A merge iterator over delta/image layer iterators. @@ -242,6 +270,32 @@ pub struct MergeIterator<'a> { heap: BinaryHeap>, } +pub(crate) trait MergeIteratorItem { + fn new(item: (Key, Lsn, Value), iterator: &IteratorWrapper<'_>) -> Self; + + fn key_lsn_value(&self) -> &(Key, Lsn, Value); +} + +impl MergeIteratorItem for (Key, Lsn, Value) { + fn new(item: (Key, Lsn, Value), _: &IteratorWrapper<'_>) -> Self { + item + } + + fn key_lsn_value(&self) -> &(Key, Lsn, Value) { + self + } +} + +impl MergeIteratorItem for ((Key, Lsn, Value), Arc) { + fn new(item: (Key, Lsn, Value), iter: &IteratorWrapper<'_>) -> Self { + (item, iter.trace_source().clone()) + } + + fn key_lsn_value(&self) -> &(Key, Lsn, Value) { + &self.0 + } +} + impl<'a> MergeIterator<'a> { pub fn create( deltas: &[&'a DeltaLayerInner], @@ -260,7 +314,7 @@ impl<'a> MergeIterator<'a> { } } - pub async fn next(&mut self) -> anyhow::Result> { + pub(crate) async fn next_inner(&mut self) -> anyhow::Result> { while let Some(mut iter) = self.heap.peek_mut() { if !iter.is_loaded() { // Once we load the iterator, we can know the real first key-value pair in the iterator. @@ -275,10 +329,22 @@ impl<'a> MergeIterator<'a> { binary_heap::PeekMut::pop(iter); continue; }; - return Ok(Some(item)); + return Ok(Some(R::new(item, &iter))); } Ok(None) } + + /// Get the next key-value pair from the iterator. + pub async fn next(&mut self) -> anyhow::Result> { + self.next_inner().await + } + + /// Get the next key-value pair from the iterator, and trace where the key comes from. + pub async fn next_with_trace( + &mut self, + ) -> anyhow::Result)>> { + self.next_inner().await + } } #[cfg(test)] diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 01c2803881..e6ef1aae2b 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -4,7 +4,7 @@ //! //! The old legacy algorithm is implemented directly in `timeline.rs`. 
-use std::collections::{BinaryHeap, HashSet}; +use std::collections::{BinaryHeap, HashMap, HashSet}; use std::ops::{Deref, Range}; use std::sync::Arc; @@ -56,7 +56,7 @@ use pageserver_api::value::Value; use utils::lsn::Lsn; -use pageserver_compaction::helpers::overlaps_with; +use pageserver_compaction::helpers::{fully_contains, overlaps_with}; use pageserver_compaction::interface::*; use super::CompactionError; @@ -64,6 +64,23 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; +pub struct GcCompactionJobDescription { + /// All layers to read in the compaction job + selected_layers: Vec, + /// GC cutoff of the job + gc_cutoff: Lsn, + /// LSNs to retain for the job + retain_lsns_below_horizon: Vec, + /// Maximum layer LSN processed in this compaction + max_layer_lsn: Lsn, + /// Only compact layers overlapping with this range + compaction_key_range: Range, + /// When partial compaction is enabled, these layers need to be rewritten to ensure no overlap. + /// This field is here solely for debugging. The field will not be read once the compaction + /// description is generated. + rewrite_layers: Vec>, +} + /// The result of bottom-most compaction for a single key at each LSN. #[derive(Debug)] #[cfg_attr(test, derive(PartialEq))] @@ -1722,7 +1739,8 @@ impl Timeline { flags: EnumSet, ctx: &RequestContext, ) -> anyhow::Result<()> { - self.partial_compact_with_gc(None, cancel, flags, ctx).await + self.partial_compact_with_gc(Key::MIN..Key::MAX, cancel, flags, ctx) + .await } /// An experimental compaction building block that combines compaction with garbage collection. @@ -1732,12 +1750,15 @@ impl Timeline { /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. /// - /// If `key_range`, it will only compact the keys within the range, aka partial compaction. This functionality - /// is not complete yet, and if it is set, only image layers will be generated. - /// + /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction. + /// Partial compaction will read and process all layers overlapping with the key range, even if it might + /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained + /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing + /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not + /// part of the range. pub(crate) async fn partial_compact_with_gc( self: &Arc, - compaction_key_range: Option>, + compaction_key_range: Range, cancel: &CancellationToken, flags: EnumSet, ctx: &RequestContext, @@ -1762,9 +1783,8 @@ impl Timeline { .await?; let dry_run = flags.contains(CompactFlags::DryRun); - let partial_compaction = compaction_key_range.is_some(); - if let Some(ref compaction_key_range) = compaction_key_range { + if compaction_key_range == (Key::MIN..Key::MAX) { info!("running enhanced gc bottom-most compaction, dry_run={dry_run}, compaction_key_range={}..{}", compaction_key_range.start, compaction_key_range.end); } else { info!("running enhanced gc bottom-most compaction, dry_run={dry_run}"); @@ -1780,7 +1800,7 @@ impl Timeline { // The layer selection has the following properties: // 1. If a layer is in the selection, all layers below it are in the selection. // 2. 
Inferred from (1), for each key in the layer selection, the value can be reconstructed only with the layers in the layer selection. - let (layer_selection, gc_cutoff, retain_lsns_below_horizon) = if !partial_compaction { + let job_desc = { let guard = self.layers.read().await; let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); @@ -1810,9 +1830,21 @@ impl Timeline { }; // Then, pick all the layers that are below the max_layer_lsn. This is to ensure we can pick all single-key // layers to compact. + let mut rewrite_layers = Vec::new(); for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().end <= max_layer_lsn { + if desc.get_lsn_range().end <= max_layer_lsn + && overlaps_with(&desc.get_key_range(), &compaction_key_range) + { + // If the layer overlaps with the compaction key range, we need to read it to obtain all keys within the range, + // even if it might contain extra keys selected_layers.push(guard.get_from_desc(&desc)); + // If the layer is not fully contained within the key range, we need to rewrite it if it's a delta layer (it's fine + // to overlap image layers) + if desc.is_delta() + && !fully_contains(&compaction_key_range, &desc.get_key_range()) + { + rewrite_layers.push(desc); + } } } if selected_layers.is_empty() { @@ -1820,82 +1852,59 @@ impl Timeline { return Ok(()); } retain_lsns_below_horizon.sort(); - (selected_layers, gc_cutoff, retain_lsns_below_horizon) - } else { - // In case of partial compaction, we currently only support generating image layers, and therefore, - // we pick all layers that are below the lowest retain_lsn and does not intersect with any of the layers. - let guard = self.layers.read().await; - let layers = guard.layer_map()?; - let gc_info = self.gc_info.read().unwrap(); - let mut min_lsn = gc_info.cutoffs.select_min(); - for (lsn, _, _) in &gc_info.retain_lsns { - if lsn < &min_lsn { - min_lsn = *lsn; - } + GcCompactionJobDescription { + selected_layers, + gc_cutoff, + retain_lsns_below_horizon, + max_layer_lsn, + compaction_key_range, + rewrite_layers, } - for lsn in gc_info.leases.keys() { - if lsn < &min_lsn { - min_lsn = *lsn; - } - } - let mut selected_layers = Vec::new(); - drop(gc_info); - // |-------| |-------| |-------| - // | Delta | | Delta | | Delta | -- min_lsn could be intersecting with the layers - // |-------| |-------| |-------| <- we want to pick all the layers below min_lsn, so that - // | Delta | | Delta | | Delta | ...we can remove them after compaction - // |-------| |-------| |-------| - // Pick all the layers intersect or below the min_lsn, get the largest LSN in the selected layers. 
- let Some(compaction_key_range) = compaction_key_range.as_ref() else { - unreachable!() - }; - for desc in layers.iter_historic_layers() { - if desc.get_lsn_range().end <= min_lsn - && overlaps_with(&desc.key_range, compaction_key_range) - { - selected_layers.push(guard.get_from_desc(&desc)); - } - } - if selected_layers.is_empty() { - info!("no layers to compact with gc"); - return Ok(()); - } - (selected_layers, min_lsn, Vec::new()) }; let lowest_retain_lsn = if self.ancestor_timeline.is_some() { - if partial_compaction { - warn!("partial compaction cannot run on child branches (for now)"); - return Ok(()); - } Lsn(self.ancestor_lsn.0 + 1) } else { - let res = retain_lsns_below_horizon + let res = job_desc + .retain_lsns_below_horizon .first() .copied() - .unwrap_or(gc_cutoff); + .unwrap_or(job_desc.gc_cutoff); if cfg!(debug_assertions) { assert_eq!( res, - retain_lsns_below_horizon + job_desc + .retain_lsns_below_horizon .iter() .min() .copied() - .unwrap_or(gc_cutoff) + .unwrap_or(job_desc.gc_cutoff) ); } res }; info!( - "picked {} layers for compaction with gc_cutoff={} lowest_retain_lsn={}", - layer_selection.len(), - gc_cutoff, - lowest_retain_lsn + "picked {} layers for compaction ({} layers need rewriting) with max_layer_lsn={} gc_cutoff={} lowest_retain_lsn={}, key_range={}..{}", + job_desc.selected_layers.len(), + job_desc.rewrite_layers.len(), + job_desc.max_layer_lsn, + job_desc.gc_cutoff, + lowest_retain_lsn, + job_desc.compaction_key_range.start, + job_desc.compaction_key_range.end ); - self.check_compaction_space(&layer_selection).await?; + for layer in &job_desc.selected_layers { + debug!("read layer: {}", layer.layer_desc().key()); + } + for layer in &job_desc.rewrite_layers { + debug!("rewrite layer: {}", layer.key()); + } + + self.check_compaction_space(&job_desc.selected_layers) + .await?; // Generate statistics for the compaction - for layer in &layer_selection { + for layer in &job_desc.selected_layers { let desc = layer.layer_desc(); if desc.is_delta() { stat.visit_delta_layer(desc.file_size()); @@ -1906,25 +1915,25 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let layer_names: Vec = layer_selection + let layer_names = job_desc + .selected_layers .iter() .map(|layer| layer.layer_desc().layer_name()) .collect_vec(); if let Some(err) = check_valid_layermap(&layer_names) { - bail!("cannot run gc-compaction because {}", err); + warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); } // The maximum LSN we are processing in this compaction loop - let end_lsn = layer_selection + let end_lsn = job_desc + .selected_layers .iter() .map(|l| l.layer_desc().lsn_range.end) .max() .unwrap(); - // We don't want any of the produced layers to cover the full key range (i.e., MIN..MAX) b/c it will then be recognized - // as an L0 layer. let mut delta_layers = Vec::new(); let mut image_layers = Vec::new(); let mut downloaded_layers = Vec::new(); - for layer in &layer_selection { + for layer in &job_desc.selected_layers { let resident_layer = layer.download_and_keep_resident().await?; downloaded_layers.push(resident_layer); } @@ -1943,8 +1952,8 @@ impl Timeline { dense_ks, sparse_ks, )?; - // Step 2: Produce images+deltas. TODO: ensure newly-produced delta does not overlap with other deltas. - // Data of the same key. + + // Step 2: Produce images+deltas. 
let mut accumulated_values = Vec::new(); let mut last_key: Option = None; @@ -1956,10 +1965,7 @@ impl Timeline { self.conf, self.timeline_id, self.tenant_shard_id, - compaction_key_range - .as_ref() - .map(|x| x.start) - .unwrap_or(Key::MIN), + job_desc.compaction_key_range.start, lowest_retain_lsn, self.get_compaction_target_size(), ctx, @@ -1979,6 +1985,13 @@ impl Timeline { ) .await?; + #[derive(Default)] + struct RewritingLayers { + before: Option, + after: Option, + } + let mut delta_layer_rewriters = HashMap::, RewritingLayers>::new(); + /// Returns None if there is no ancestor branch. Throw an error when the key is not found. /// /// Currently, we always get the ancestor image for each key in the child branch no matter whether the image @@ -2004,10 +2017,51 @@ impl Timeline { // the key and LSN range are determined. However, to keep things simple here, we still // create this writer, and discard the writer in the end. - while let Some((key, lsn, val)) = merge_iter.next().await? { + while let Some(((key, lsn, val), desc)) = merge_iter.next_with_trace().await? { if cancel.is_cancelled() { return Err(anyhow!("cancelled")); // TODO: refactor to CompactionError and pass cancel error } + if !job_desc.compaction_key_range.contains(&key) { + if !desc.is_delta { + continue; + } + let rewriter = delta_layer_rewriters.entry(desc.clone()).or_default(); + let rewriter = if key < job_desc.compaction_key_range.start { + if rewriter.before.is_none() { + rewriter.before = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + desc.key_range.start, + desc.lsn_range.clone(), + ctx, + ) + .await?, + ); + } + rewriter.before.as_mut().unwrap() + } else if key >= job_desc.compaction_key_range.end { + if rewriter.after.is_none() { + rewriter.after = Some( + DeltaLayerWriter::new( + self.conf, + self.timeline_id, + self.tenant_shard_id, + job_desc.compaction_key_range.end, + desc.lsn_range.clone(), + ctx, + ) + .await?, + ); + } + rewriter.after.as_mut().unwrap() + } else { + unreachable!() + }; + rewriter.put_value(key, lsn, val, ctx).await?; + continue; + } match val { Value::Image(_) => stat.visit_image_key(&val), Value::WalRecord(_) => stat.visit_wal_key(&val), @@ -2018,35 +2072,27 @@ impl Timeline { } accumulated_values.push((key, lsn, val)); } else { - let last_key = last_key.as_mut().unwrap(); - stat.on_unique_key_visited(); - let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { - !compaction_key_range.contains(last_key) - } else { - false - }; - if !skip_adding_key { - let retention = self - .generate_key_retention( - *last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, *last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. 
- retention - .pipe_to( - *last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; - } + let last_key: &mut Key = last_key.as_mut().unwrap(); + stat.on_unique_key_visited(); // TODO: adjust statistics for partial compaction + let retention = self + .generate_key_retention( + *last_key, + &accumulated_values, + job_desc.gc_cutoff, + &job_desc.retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, *last_key, ctx).await?, + ) + .await?; + retention + .pipe_to( + *last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; accumulated_values.clear(); *last_key = key; accumulated_values.push((key, lsn, val)); @@ -2057,35 +2103,43 @@ impl Timeline { let last_key = last_key.expect("no keys produced during compaction"); stat.on_unique_key_visited(); - let skip_adding_key = if let Some(ref compaction_key_range) = compaction_key_range { - !compaction_key_range.contains(&last_key) - } else { - false - }; - if !skip_adding_key { - let retention = self - .generate_key_retention( - last_key, - &accumulated_values, - gc_cutoff, - &retain_lsns_below_horizon, - COMPACTION_DELTA_THRESHOLD, - get_ancestor_image(self, last_key, ctx).await?, - ) - .await?; - // Put the image into the image layer. Currently we have a single big layer for the compaction. - retention - .pipe_to( - last_key, - &mut delta_layer_writer, - image_layer_writer.as_mut(), - &mut stat, - ctx, - ) - .await?; - } + let retention = self + .generate_key_retention( + last_key, + &accumulated_values, + job_desc.gc_cutoff, + &job_desc.retain_lsns_below_horizon, + COMPACTION_DELTA_THRESHOLD, + get_ancestor_image(self, last_key, ctx).await?, + ) + .await?; + retention + .pipe_to( + last_key, + &mut delta_layer_writer, + image_layer_writer.as_mut(), + &mut stat, + ctx, + ) + .await?; // end: move the above part to the loop body + let mut rewrote_delta_layers = Vec::new(); + for (key, writers) in delta_layer_rewriters { + if let Some(delta_writer_before) = writers.before { + let (desc, path) = delta_writer_before + .finish(job_desc.compaction_key_range.start, ctx) + .await?; + let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + rewrote_delta_layers.push(layer); + } + if let Some(delta_writer_after) = writers.after { + let (desc, path) = delta_writer_after.finish(key.key_range.end, ctx).await?; + let layer = Layer::finish_creating(self.conf, self, desc, &path)?; + rewrote_delta_layers.push(layer); + } + } + let discard = |key: &PersistentLayerKey| { let key = key.clone(); async move { KeyHistoryRetention::discard_key(&key, self, dry_run).await } @@ -2093,10 +2147,7 @@ impl Timeline { let produced_image_layers = if let Some(writer) = image_layer_writer { if !dry_run { - let end_key = compaction_key_range - .as_ref() - .map(|x| x.end) - .unwrap_or(Key::MAX); + let end_key = job_desc.compaction_key_range.end; writer .finish_with_discard_fn(self, ctx, end_key, discard) .await? @@ -2117,10 +2168,8 @@ impl Timeline { Vec::new() }; - if partial_compaction && !produced_delta_layers.is_empty() { - bail!("implementation error: partial compaction should not be producing delta layers (for now)"); - } - + // TODO: make image/delta/rewrote_delta layers generation atomic. At this point, we already generated resident layers, and if + // compaction is cancelled at this point, we might have some layers that are not cleaned up. 
let mut compact_to = Vec::new(); let mut keep_layers = HashSet::new(); let produced_delta_layers_len = produced_delta_layers.len(); @@ -2128,52 +2177,84 @@ impl Timeline { for action in produced_delta_layers { match action { BatchWriterResult::Produced(layer) => { + if cfg!(debug_assertions) { + info!("produced delta layer: {}", layer.layer_desc().key()); + } stat.produce_delta_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { + if cfg!(debug_assertions) { + info!("discarded delta layer: {}", l); + } keep_layers.insert(l); stat.discard_delta_layer(); } } } + for layer in &rewrote_delta_layers { + debug!( + "produced rewritten delta layer: {}", + layer.layer_desc().key() + ); + } + compact_to.extend(rewrote_delta_layers); for action in produced_image_layers { match action { BatchWriterResult::Produced(layer) => { + debug!("produced image layer: {}", layer.layer_desc().key()); stat.produce_image_layer(layer.layer_desc().file_size()); compact_to.push(layer); } BatchWriterResult::Discarded(l) => { + debug!("discarded image layer: {}", l); keep_layers.insert(l); stat.discard_image_layer(); } } } - let mut layer_selection = layer_selection; - layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); - if let Some(ref compaction_key_range) = compaction_key_range { - // Partial compaction might select more data than it processes, e.g., if - // the compaction_key_range only partially overlaps: - // - // [---compaction_key_range---] - // [---A----][----B----][----C----][----D----] - // - // A,B,C,D are all in the `layer_selection`. The created image layers contain - // whatever is needed from B, C, and from `----]` of A, and from `[--` of D. - // - // In contrast, `[--A-` and `--D----]` have not been processed, so, we must - // keep that data. - // - // The solution for now is to keep A and D completely. - // (layer_selection is what we'll remove from the layer map, so, - // retain what is _not_ fully covered by compaction_key_range). - layer_selection.retain(|x| { - let key_range = &x.layer_desc().key_range; - key_range.start >= compaction_key_range.start - && key_range.end <= compaction_key_range.end - }); + + let mut layer_selection = job_desc.selected_layers; + + // Partial compaction might select more data than it processes, e.g., if + // the compaction_key_range only partially overlaps: + // + // [---compaction_key_range---] + // [---A----][----B----][----C----][----D----] + // + // For delta layers, we will rewrite the layers so that it is cut exactly at + // the compaction key range, so we can always discard them. However, for image + // layers, as we do not rewrite them for now, we need to handle them differently. + // Assume image layers A, B, C, D are all in the `layer_selection`. + // + // The created image layers contain whatever is needed from B, C, and from + // `----]` of A, and from `[---` of D. + // + // In contrast, `[---A` and `D----]` have not been processed, so, we must + // keep that data. + // + // The solution for now is to keep A and D completely if they are image layers. + // (layer_selection is what we'll remove from the layer map, so, retain what + // is _not_ fully covered by compaction_key_range). 
+ for layer in &layer_selection { + if !layer.layer_desc().is_delta() { + if !overlaps_with( + &layer.layer_desc().key_range, + &job_desc.compaction_key_range, + ) { + bail!("violated constraint: image layer outside of compaction key range"); + } + if !fully_contains( + &job_desc.compaction_key_range, + &layer.layer_desc().key_range, + ) { + keep_layers.insert(layer.layer_desc().key()); + } + } } + layer_selection.retain(|x| !keep_layers.contains(&x.layer_desc().key())); + info!( "gc-compaction statistics: {}", serde_json::to_string(&stat)? @@ -2192,6 +2273,7 @@ impl Timeline { // Step 3: Place back to the layer map. { + // TODO: sanity check if the layer map is valid (i.e., should not have overlaps) let mut guard = self.layers.write().await; guard .open_mut()? From fde16f86140deeefd300cf8bf3fc17dd93cfa22d Mon Sep 17 00:00:00 2001 From: Fedor Dikarev Date: Mon, 11 Nov 2024 21:33:29 +0100 Subject: [PATCH 214/239] use batch gh-workflow-stats-action with separate table (#9722) We found that exporting GH Workflow Runs in batches is more efficient due to - better utilisation of the GitHub API - GitHub runner usage being rounded to minutes: even when an ad-hoc export finishes in 5-10 seconds, we are billed for a full minute of usage. So now we introduce batch exporting, using version v0.2.x of the GitHub workflow stats exporter. How it's expected to work now: - every 15 minutes we query for the workflow runs created in the last 2 hours - to avoid missing workflows that ran for more than 2 hours, every night (00:25) we also query workflows created in the past 24 hours and export them as well - should we query for even longer periods? Let's see how the current schedule works; longer periods like days or weeks may require adjusting the query logic and concurrency, so for now we use the simpler version. --- .../workflows/report-workflow-stats-batch.yml | 29 +++++++++++++++++++ 1 file changed, 29 insertions(+) create mode 100644 .github/workflows/report-workflow-stats-batch.yml diff --git a/.github/workflows/report-workflow-stats-batch.yml b/.github/workflows/report-workflow-stats-batch.yml new file mode 100644 index 0000000000..98e394a3c2 --- /dev/null +++ b/.github/workflows/report-workflow-stats-batch.yml @@ -0,0 +1,29 @@ +name: Report Workflow Stats Batch + +on: + schedule: + - cron: '*/15 * * * *' + - cron: '25 0 * * *' + +jobs: + gh-workflow-stats-batch: + name: GitHub Workflow Stats Batch + runs-on: ubuntu-22.04 + permissions: + actions: read + steps: + - name: Export Workflow Run for the past 2 hours + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_batch_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '2h' + - name: Export Workflow Run for the past 24 hours + if: github.event.schedule == '25 0 * * *' + uses: neondatabase/gh-workflow-stats-action@v0.2.1 + with: + db_uri: ${{ secrets.GH_REPORT_STATS_DB_RW_CONNSTR }} + db_table: "gh_workflow_stats_batch_neon" + gh_token: ${{ secrets.GITHUB_TOKEN }} + duration: '24h' From 4b075db7ea69ebd666d65a80d49c5178c37e9607 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 14:49:37 -0600 Subject: [PATCH 215/239] Add a postgres_exporter config file This exporter logs an ERROR if a file called `postgres_exporter.yml` is not located in its current working directory. We can silence it by adding an empty config file and pointing the exporter at it.
Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 2 ++ compute/etc/postgres_exporter.yml | 0 compute/vm-image-spec-bookworm.yaml | 2 +- compute/vm-image-spec-bullseye.yaml | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 compute/etc/postgres_exporter.yml diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 6efef9e969..a3e80223eb 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1475,6 +1475,8 @@ RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter +COPY --chown=postgres compute/etc/postgres_exporter.yml /etc/postgres_exporter.yml + COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter.yml /etc/sql_exporter.yml COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/neon_collector.yml /etc/neon_collector.yml COPY --from=sql_exporter_preprocessor --chmod=0644 /home/nonroot/compute/etc/sql_exporter_autoscaling.yml /etc/sql_exporter_autoscaling.yml diff --git a/compute/etc/postgres_exporter.yml b/compute/etc/postgres_exporter.yml new file mode 100644 index 0000000000..e69de29bb2 diff --git a/compute/vm-image-spec-bookworm.yaml b/compute/vm-image-spec-bookworm.yaml index 79f894c289..ac9f5c6904 100644 --- a/compute/vm-image-spec-bookworm.yaml +++ b/compute/vm-image-spec-bookworm.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: sql-exporter user: nobody sysvInitAction: respawn diff --git a/compute/vm-image-spec-bullseye.yaml b/compute/vm-image-spec-bullseye.yaml index ff04b9e4c6..0d178e1c24 100644 --- a/compute/vm-image-spec-bullseye.yaml +++ b/compute/vm-image-spec-bullseye.yaml @@ -26,7 +26,7 @@ commands: - name: postgres-exporter user: nobody sysvInitAction: respawn - shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter' + shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres application_name=postgres-exporter" /bin/postgres_exporter --config.file=/etc/postgres_exporter.yml' - name: sql-exporter user: nobody sysvInitAction: respawn From b018bc7da89c9adf889829e2ef684fae34012fc6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 11 Nov 2024 23:29:21 +0100 Subject: [PATCH 216/239] Add a retain_lsn test (#9599) Add a test that ensures the `retain_lsn` functionality works. Right now, there is not a single test that is broken if offloaded or non-offloaded timelines don't get registered at their parents, preventing gc from discarding the ancestor_lsns of the children. This PR fills that gap. The test has four modes: * `offloaded`: offload the child timeline, run compaction on the parent timeline, unarchive the child timeline, then try reading from it. hopefully the data is still there. * `offloaded-corrupted`: offload the child timeline, corrupts the manifest in a way that the pageserver believes the timeline was flattened. 
This is the closest we can get to pretend the `retain_lsn` mechanism doesn't exist for offloaded timelines, so we can avoid adding endpoints to the pageserver that do this manually for tests. The test then checks that indeed data is corrupted and the endpoint can't be started. That way we know that the test is actually working, and actually tests the `retain_lsn` mechanism, instead of say the lsn lease mechanism, or one of the many other mechanisms that impede gc. * `archived`: the child timeline gets archived but doesn't get offloaded. this currently matches the `None` case but we might have refactors in the future that make archived timelines sufficiently different from non-archived ones. * `None`: the child timeline doesn't even get archived. this tests that normal timelines participate in `retain_lsn`. I've made them locally not participate in `retain_lsn` (via commenting out the respective `ancestor_children.push` statement in tenant.rs) and ran the testsuite, and not a single test failed. So this test is first of its kind. Part of #8088. --- test_runner/regress/test_timeline_archive.py | 154 ++++++++++++++++++- 1 file changed, 152 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 3e9812c38a..d3839e3d2c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -1,15 +1,22 @@ from __future__ import annotations +import json +from typing import Optional + import pytest from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, last_flush_lsn_upload, ) from fixtures.pageserver.http import PageserverApiException -from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty -from fixtures.remote_storage import s3_storage +from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty, list_prefix +from fixtures.remote_storage import S3Storage, s3_storage from fixtures.utils import wait_until +from mypy_boto3_s3.type_defs import ( + ObjectTypeDef, +) @pytest.mark.parametrize("shard_count", [0, 4]) @@ -369,3 +376,146 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel neon_env_builder.pageserver_remote_storage, prefix=f"tenants/{str(tenant_id)}/", ) + + +@pytest.mark.parametrize("offload_child", ["offload", "offload-corrupt", "archive", None]) +def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Optional[str]): + """ + Ensure that retain_lsn functionality for timelines works, both for offloaded and non-offloaded ones + """ + if offload_child == "offload-corrupt": + # Our corruption code only works with S3 compatible storage + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + # small checkpointing and compaction targets to ensure we generate many upload operations + "checkpoint_distance": 128 * 1024, + "compaction_threshold": 1, + "compaction_target_size": 128 * 1024, + # set small image creation thresholds so that gc deletes data + "image_creation_threshold": 2, + # disable background compaction and GC. We invoke it manually when we want it to happen. 
+ "gc_period": "0s", + "compaction_period": "0s", + # Disable pitr, we only want the latest lsn + "pitr_interval": "0s", + # Don't rely on endpoint lsn leases + "lsn_lease_length": "0s", + } + ) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(v int, key serial primary key, t text default 'data_content')", + "SELECT setseed(0.4321)", + "INSERT INTO foo SELECT v FROM (SELECT generate_series(1,2048), (random() * 409600)::int as v) as random_numbers", + ] + ) + pre_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + log.info(f"Pre branch sum: {pre_branch_sum}") + last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id) + + # Create a branch and write some additional data to the parent + child_timeline_id = env.create_branch("test_archived_branch", tenant_id) + + with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: + # Do some churn of the data. This is important so that we can overwrite image layers. + for i in range(10): + endpoint.safe_psql_many( + [ + f"SELECT setseed(0.23{i})", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 2", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 1", + "UPDATE foo SET v=(random() * 409600)::int WHERE v % 3 = 0", + ] + ) + post_branch_sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + log.info(f"Post branch sum: {post_branch_sum}") + last_flush_lsn_upload(env, endpoint, tenant_id, root_timeline_id) + + if offload_child is not None: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + leaf_detail = ps_http.timeline_detail( + tenant_id, + child_timeline_id, + ) + assert leaf_detail["is_archived"] is True + if "offload" in offload_child: + ps_http.timeline_offload(tenant_id, child_timeline_id) + + # Do a restart to get rid of any in-memory objects (we only init gc info once, at attach) + env.pageserver.stop() + if offload_child == "offload-corrupt": + assert isinstance(env.pageserver_remote_storage, S3Storage) + listing = list_prefix( + env.pageserver_remote_storage, f"tenants/{str(tenant_id)}/tenant-manifest" + ) + objects: list[ObjectTypeDef] = listing.get("Contents", []) + assert len(objects) > 0 + remote_key: str = str(objects[0].get("Key", [])) + local_path = str(env.repo_dir / "tenant-manifest.json") + + log.info(f"Downloading {remote_key} -> {local_path}") + env.pageserver_remote_storage.client.download_file( + env.pageserver_remote_storage.bucket_name, remote_key, local_path + ) + + log.info(f"Corrupting {local_path}") + with open(local_path) as manifest_json_file: + manifest_json = json.load(manifest_json_file) + for offloaded_timeline in manifest_json["offloaded_timelines"]: + offloaded_timeline["ancestor_retain_lsn"] = None + with open(local_path, "w") as manifest_json_file: + json.dump(manifest_json, manifest_json_file) + + log.info(f"Uploading {local_path} -> {remote_key}") + env.pageserver_remote_storage.client.upload_file( + local_path, env.pageserver_remote_storage.bucket_name, remote_key + ) + # The point of our earlier efforts was to provoke these + env.pageserver.allowed_errors.extend( + [ + ".*initial size calculation failed: PageRead.MissingKey.could not find data for key.*", + ".*page_service_conn_main.*could not find data for key.*", + ] + ) + env.pageserver.start() + + # Do an agressive gc and compaction of the parent branch + ps_http.timeline_gc(tenant_id=tenant_id, timeline_id=root_timeline_id, 
gc_horizon=0) + ps_http.timeline_checkpoint( + tenant_id, + root_timeline_id, + force_l0_compaction=True, + force_repartition=True, + wait_until_uploaded=True, + compact=True, + ) + + if offload_child is not None: + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + # Now, after unarchival, the child timeline should still have its data accessible (or corrupted) + if offload_child == "offload-corrupt": + with pytest.raises(RuntimeError, match=".*failed to get basebackup.*"): + env.endpoints.create_start( + "test_archived_branch", tenant_id=tenant_id, basebackup_request_tries=1 + ) + else: + with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint: + sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") + assert sum == pre_branch_sum From 5be6b07cf169665bb99548c16af084971ccd7ec5 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 17:36:45 -0600 Subject: [PATCH 217/239] Improve typing related to regress/test_logical_replication.py (#9725) Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_fixtures.py | 4 +- .../regress/test_logical_replication.py | 50 ++++++++++++------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 79baa8a32d..0728a33a63 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -286,7 +286,7 @@ class PgProtocol: return self.safe_psql_many([query], **kwargs)[0] def safe_psql_many( - self, queries: Iterable[str], log_query=True, **kwargs: Any + self, queries: Iterable[str], log_query: bool = True, **kwargs: Any ) -> list[list[tuple[Any, ...]]]: """ Execute queries against the node and return all rows. @@ -306,7 +306,7 @@ class PgProtocol: result.append(cur.fetchall()) return result - def safe_psql_scalar(self, query, log_query=True) -> Any: + def safe_psql_scalar(self, query: str, log_query: bool = True) -> Any: """ Execute query returning single row with single column. 
""" diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 30027463df..df83ca1c44 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -4,24 +4,31 @@ import time from functools import partial from random import choice from string import ascii_lowercase +from typing import TYPE_CHECKING, cast -from fixtures.common_types import Lsn +from fixtures.common_types import Lsn, TenantId, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( - NeonEnv, - NeonEnvBuilder, - PgProtocol, logical_replication_sync, wait_for_last_flush_lsn, ) from fixtures.utils import wait_until +if TYPE_CHECKING: + from fixtures.neon_fixtures import ( + Endpoint, + NeonEnv, + NeonEnvBuilder, + PgProtocol, + VanillaPostgres, + ) + def random_string(n: int): return "".join([choice(ascii_lowercase) for _ in range(n)]) -def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): +def test_logical_replication(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env tenant_id = env.initial_tenant @@ -160,10 +167,10 @@ COMMIT; # Test that neon.logical_replication_max_snap_files works -def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg): - def slot_removed(ep): +def test_obsolete_slot_drop(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): + def slot_removed(ep: Endpoint): assert ( - endpoint.safe_psql( + ep.safe_psql( "select count(*) from pg_replication_slots where slot_name = 'stale_slot'" )[0][0] == 0 @@ -254,7 +261,7 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # Tests that walsender correctly blocks until WAL is downloaded from safekeepers -def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): +def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg: VanillaPostgres): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() @@ -336,13 +343,13 @@ FROM generate_series(1, 16384) AS seq; -- Inserts enough rows to exceed 16MB of # # Most pages start with a contrecord, so we don't do anything special # to ensure that. -def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): +def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env env.create_branch("init") endpoint = env.endpoints.create_start("init") - tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] - timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] + tenant_id = TenantId(cast("str", endpoint.safe_psql("show neon.tenant_id")[0][0])) + timeline_id = TimelineId(cast("str", endpoint.safe_psql("show neon.timeline_id")[0][0])) cur = endpoint.connect().cursor() cur.execute("create table t(key int, value text)") @@ -380,7 +387,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): # logical replication bug as such, but without logical replication, # records passed ot the WAL redo process are never large enough to hit # the bug. -def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): +def test_large_records(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env env.create_branch("init") @@ -522,15 +529,20 @@ def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: because for some WAL records like vacuum subscriber won't get any data at all. 
""" - publisher_flush_lsn = Lsn(publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + publisher_flush_lsn = Lsn( + cast("str", publisher.safe_psql("SELECT pg_current_wal_flush_lsn()")[0][0]) + ) def check_caughtup(): - res = publisher.safe_psql( - """ + res = cast( + "tuple[str, str, str]", + publisher.safe_psql( + """ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication sr, pg_replication_slots s where s.active_pid = sr.pid and s.slot_type = 'logical'; """ - )[0] + )[0], + ) sent_lsn, flush_lsn, curr_publisher_flush_lsn = Lsn(res[0]), Lsn(res[1]), Lsn(res[2]) log.info( f"sent_lsn={sent_lsn}, flush_lsn={flush_lsn}, publisher_flush_lsn={curr_publisher_flush_lsn}, waiting flush_lsn to reach {publisher_flush_lsn}" @@ -545,7 +557,7 @@ select sent_lsn, flush_lsn, pg_current_wal_flush_lsn() from pg_stat_replication # flush_lsn reporting to publisher. Without this, subscriber may ack too far, # losing data on restart because publisher implicitly advances positition given # in START_REPLICATION to the confirmed_flush_lsn of the slot. -def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): +def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg: VanillaPostgres): env = neon_simple_env # use vanilla as publisher to allow writes on it when safekeeper is down vanilla_pg.configure( @@ -593,7 +605,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): # logical_replication_wait_flush_lsn_sync is expected to hang while # safekeeper is down. vanilla_pg.safe_psql("checkpoint;") - assert sub.safe_psql_scalar("SELECT count(*) FROM t") == 1000 + assert cast("int", sub.safe_psql_scalar("SELECT count(*) FROM t")) == 1000 # restart subscriber and ensure it can catch up lost tail again sub.stop(mode="immediate") From cc8029c4c83b35c5750e435cd2833e0c6321e000 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 11 Nov 2024 20:10:53 -0600 Subject: [PATCH 218/239] Update pg_cron to 1.6.4 This comes with PG 17 support. Signed-off-by: Tristan Partin --- compute/compute-node.Dockerfile | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index a3e80223eb..32405ece86 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -624,16 +624,12 @@ FROM build-deps AS pg-cron-pg-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# 1.6.4 available, supports v17 # This is an experimental extension that we do not support on prod yet. # !Do not remove! # We set it in shared_preload_libraries and computes will fail to start if library is not found. ENV PATH="/usr/local/pgsql/bin/:$PATH" -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.0.tar.gz -O pg_cron.tar.gz && \ - echo "383a627867d730222c272bfd25cd5e151c578d73f696d32910c7db8c665cc7db pg_cron.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/citusdata/pg_cron/archive/refs/tags/v1.6.4.tar.gz -O pg_cron.tar.gz && \ + echo "52d1850ee7beb85a4cb7185731ef4e5a90d1de216709d8988324b0d02e76af61 pg_cron.tar.gz" | sha256sum --check && \ mkdir pg_cron-src && cd pg_cron-src && tar xzf ../pg_cron.tar.gz --strip-components=1 -C . 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From 6b19867410a92084a448d5058ca4329eafb01be8 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 12 Nov 2024 16:17:03 +0100 Subject: [PATCH 219/239] safekeeper: don't flush control file on WAL ingest path (#9698) ## Problem The control file is flushed on the WAL ingest path when the commit LSN advances by one segment, to bound the amount of recovery work in case of a crash. This involves 3 additional fsyncs, which can have a significant impact on WAL ingest throughput. This is to some extent mitigated by `AppendResponse` not being emitted on segment bound flushes, since this will prevent commit LSN advancement, which will be addressed separately. ## Summary of changes Don't flush the control file on the WAL ingest path at all. Instead, leave that responsibility to the timeline manager, but ask it to flush eagerly if the control file lags the in-memory commit LSN by more than one segment. This should not cause more than `REFRESH_INTERVAL` (300 ms) additional latency before flushing the control file, which is negligible. --- libs/utils/src/lsn.rs | 5 +++++ safekeeper/src/safekeeper.rs | 12 ++---------- safekeeper/src/timeline_manager.rs | 7 ++++++- 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/libs/utils/src/lsn.rs b/libs/utils/src/lsn.rs index 524f3604a1..f188165600 100644 --- a/libs/utils/src/lsn.rs +++ b/libs/utils/src/lsn.rs @@ -138,6 +138,11 @@ impl Lsn { self.0.checked_sub(other).map(Lsn) } + /// Subtract a number, saturating at numeric bounds instead of overflowing. + pub fn saturating_sub>(self, other: T) -> Lsn { + Lsn(self.0.saturating_sub(other.into())) + } + /// Subtract a number, returning the difference as i128 to avoid overflow. pub fn widening_sub>(self, other: T) -> i128 { let other: u64 = other.into(); diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs index cf41d7a0ab..f4983d44d0 100644 --- a/safekeeper/src/safekeeper.rs +++ b/safekeeper/src/safekeeper.rs @@ -979,7 +979,8 @@ where self.wal_store.flush_wal().await?; } - // Update commit_lsn. + // Update commit_lsn. It will be flushed to the control file regularly by the timeline + // manager, off of the WAL ingest hot path. if msg.h.commit_lsn != Lsn(0) { self.update_commit_lsn(msg.h.commit_lsn).await?; } @@ -992,15 +993,6 @@ where self.state.inmem.peer_horizon_lsn = max(self.state.inmem.peer_horizon_lsn, msg.h.truncate_lsn); - // Update truncate and commit LSN in control file. - // To avoid negative impact on performance of extra fsync, do it only - // when commit_lsn delta exceeds WAL segment size. - if self.state.commit_lsn + (self.state.server.wal_seg_size as u64) - < self.state.inmem.commit_lsn - { - self.state.flush().await?; - } - trace!( "processed AppendRequest of len {}, begin_lsn={}, end_lsn={:?}, commit_lsn={:?}, truncate_lsn={:?}, flushed={:?}", msg.wal_data.len(), diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 79200fff8d..e9fed21bf5 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -515,7 +515,12 @@ impl Manager { return; } - if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval { + if state.cfile_last_persist_at.elapsed() > self.conf.control_file_save_interval + // If the control file's commit_lsn lags more than one segment behind the current + // commit_lsn, flush immediately to limit recovery time in case of a crash. 
We don't do + // this on the WAL ingest hot path since it incurs fsync latency. + || state.commit_lsn.saturating_sub(state.cfile_commit_lsn).0 >= self.wal_seg_size as u64 + { let mut write_guard = self.tli.write_shared_state().await; // it should be done in the background because it blocks manager task, but flush() should // be fast enough not to be a problem now From cef165818c6aa38a6fb29e0b592b1d27a071c81d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Tue, 12 Nov 2024 10:37:31 -0500 Subject: [PATCH 220/239] test(pageserver): add gc-compaction tests with delta will_init (#9724) I had an impression that gc-compaction didn't test the case where the first record of the key history is will_init because of there are some code path that will panic in this case. Luckily it got fixed in https://github.com/neondatabase/neon/pull/9026 so we can now implement such tests. Part of https://github.com/neondatabase/neon/issues/9114 ## Summary of changes * Randomly changed some images into will_init neon wal record * Split `test_simple_bottom_most_compaction_deltas` into two test cases, one of them has the bottom layer as delta layer with will_init flags, while the other is the original one with image layers. --------- Signed-off-by: Alex Chi Z --- libs/pageserver_api/src/record.rs | 8 +- pageserver/src/tenant.rs | 116 ++++++++++++++---- .../tenant/storage_layer/merge_iterator.rs | 4 +- pageserver/src/walredo/apply_neon.rs | 4 + 4 files changed, 104 insertions(+), 28 deletions(-) diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index b80ed2f203..5c3f3deb82 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -80,18 +80,18 @@ impl NeonWalRecord { } #[cfg(feature = "testing")] - pub fn wal_clear() -> Self { + pub fn wal_clear(s: impl AsRef) -> Self { Self::Test { - append: "".to_string(), + append: s.as_ref().to_string(), clear: true, will_init: false, } } #[cfg(feature = "testing")] - pub fn wal_init() -> Self { + pub fn wal_init(s: impl AsRef) -> Self { Self::Test { - append: "".to_string(), + append: s.as_ref().to_string(), clear: true, will_init: true, } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index e7c258d829..d0a96e78a6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -7757,13 +7757,13 @@ mod tests { ( get_key(3), Lsn(0x20), - Value::WalRecord(NeonWalRecord::wal_clear()), + Value::WalRecord(NeonWalRecord::wal_clear("c")), ), (get_key(4), Lsn(0x10), Value::Image("0x10".into())), ( get_key(4), Lsn(0x20), - Value::WalRecord(NeonWalRecord::wal_init()), + Value::WalRecord(NeonWalRecord::wal_init("i")), ), ]; let image1 = vec![(get_key(1), "0x10".into())]; @@ -7912,8 +7912,30 @@ mod tests { #[cfg(feature = "testing")] #[tokio::test] - async fn test_simple_bottom_most_compaction_deltas() -> anyhow::Result<()> { - let harness = TenantHarness::create("test_simple_bottom_most_compaction_deltas").await?; + async fn test_simple_bottom_most_compaction_deltas_1() -> anyhow::Result<()> { + test_simple_bottom_most_compaction_deltas_helper( + "test_simple_bottom_most_compaction_deltas_1", + false, + ) + .await + } + + #[cfg(feature = "testing")] + #[tokio::test] + async fn test_simple_bottom_most_compaction_deltas_2() -> anyhow::Result<()> { + test_simple_bottom_most_compaction_deltas_helper( + "test_simple_bottom_most_compaction_deltas_2", + true, + ) + .await + } + + #[cfg(feature = "testing")] + async fn test_simple_bottom_most_compaction_deltas_helper( 
+ test_name: &'static str, + use_delta_bottom_layer: bool, + ) -> anyhow::Result<()> { + let harness = TenantHarness::create(test_name).await?; let (tenant, ctx) = harness.load().await; fn get_key(id: u32) -> Key { @@ -7944,6 +7966,16 @@ mod tests { let img_layer = (0..10) .map(|id| (get_key(id), Bytes::from(format!("value {id}@0x10")))) .collect_vec(); + // or, delta layer at 0x10 if `use_delta_bottom_layer` is true + let delta4 = (0..10) + .map(|id| { + ( + get_key(id), + Lsn(0x08), + Value::WalRecord(NeonWalRecord::wal_init(format!("value {id}@0x10"))), + ) + }) + .collect_vec(); let delta1 = vec![ ( @@ -7997,21 +8029,61 @@ mod tests { ), ]; - let tline = tenant - .create_test_timeline_with_layers( - TIMELINE_ID, - Lsn(0x10), - DEFAULT_PG_VERSION, - &ctx, - vec![ - DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta1), - DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x10)..Lsn(0x48), delta2), - DeltaLayerTestDesc::new_with_inferred_key_range(Lsn(0x48)..Lsn(0x50), delta3), - ], // delta layers - vec![(Lsn(0x10), img_layer)], // image layers - Lsn(0x50), - ) - .await?; + let tline = if use_delta_bottom_layer { + tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x08), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x08)..Lsn(0x10), + delta4, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x48), + delta1, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x20)..Lsn(0x48), + delta2, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x48)..Lsn(0x50), + delta3, + ), + ], // delta layers + vec![], // image layers + Lsn(0x50), + ) + .await? + } else { + tenant + .create_test_timeline_with_layers( + TIMELINE_ID, + Lsn(0x10), + DEFAULT_PG_VERSION, + &ctx, + vec![ + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x48), + delta1, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x10)..Lsn(0x48), + delta2, + ), + DeltaLayerTestDesc::new_with_inferred_key_range( + Lsn(0x48)..Lsn(0x50), + delta3, + ), + ], // delta layers + vec![(Lsn(0x10), img_layer)], // image layers + Lsn(0x50), + ) + .await? 
+ }; { // Update GC info let mut guard = tline.gc_info.write().unwrap(); @@ -8121,7 +8193,7 @@ mod tests { ( key, Lsn(0x10), - Value::Image(Bytes::copy_from_slice(b"0x10")), + Value::WalRecord(NeonWalRecord::wal_init("0x10")), ), ( key, @@ -8183,7 +8255,7 @@ mod tests { Lsn(0x20), KeyLogAtLsn(vec![( Lsn(0x20), - Value::Image(Bytes::copy_from_slice(b"0x10;0x20")), + Value::Image(Bytes::from_static(b"0x10;0x20")), )]), ), ( @@ -9165,7 +9237,7 @@ mod tests { let will_init = will_init_keys.contains(&i); if will_init { - delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init()))); + delta_layer_spec.push((key, lsn, Value::WalRecord(NeonWalRecord::wal_init("")))); expected_key_values.insert(key, "".to_string()); } else { diff --git a/pageserver/src/tenant/storage_layer/merge_iterator.rs b/pageserver/src/tenant/storage_layer/merge_iterator.rs index 2667d130f5..19cfcb0867 100644 --- a/pageserver/src/tenant/storage_layer/merge_iterator.rs +++ b/pageserver/src/tenant/storage_layer/merge_iterator.rs @@ -562,7 +562,7 @@ mod tests { ( get_key(0), Lsn(0x10), - Value::WalRecord(NeonWalRecord::wal_init()), + Value::WalRecord(NeonWalRecord::wal_init("")), ), ( get_key(0), @@ -572,7 +572,7 @@ mod tests { ( get_key(5), Lsn(0x10), - Value::WalRecord(NeonWalRecord::wal_init()), + Value::WalRecord(NeonWalRecord::wal_init("")), ), ( get_key(5), diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index d712d8bf5e..78601d87af 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -253,6 +253,10 @@ pub(crate) fn apply_in_neon( use bytes::BufMut; if *will_init { assert!(*clear, "init record must be clear to ensure correctness"); + assert!( + page.is_empty(), + "init record must be the first entry to ensure correctness" + ); } if *clear { page.clear(); From 05381a48f05873f2bc64d116720c51b579f97a58 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 12 Nov 2024 18:57:31 +0100 Subject: [PATCH 221/239] utils: remove unnecessary fsync in `durable_rename()` (#9686) ## Problem WAL segment fsyncs significantly affect WAL ingestion throughput. `durable_rename()` is used when initializing every 16 MB segment, and issues 3 fsyncs of which 1 was unnecessary. ## Summary of changes Remove an fsync in `durable_rename` which is unnecessary with Linux and ext4 (which we currently use). This improves WAL ingestion throughput by up to 23% with large appends on my MacBook. --- libs/utils/src/crashsafe.rs | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/libs/utils/src/crashsafe.rs b/libs/utils/src/crashsafe.rs index b97c6c7a45..5241ab183c 100644 --- a/libs/utils/src/crashsafe.rs +++ b/libs/utils/src/crashsafe.rs @@ -123,15 +123,27 @@ pub async fn fsync_async_opt( Ok(()) } -/// Like postgres' durable_rename, renames file issuing fsyncs do make it -/// durable. After return, file and rename are guaranteed to be persisted. +/// Like postgres' durable_rename, renames a file and issues fsyncs to make it durable. After +/// returning, both the file and rename are guaranteed to be persisted. Both paths must be on the +/// same file system. /// -/// Unlike postgres, it only does fsyncs to 1) file to be renamed to make -/// contents durable; 2) its directory entry to make rename durable 3) again to -/// already renamed file, which is not required by standards but postgres does -/// it, let's stick to that. 
Postgres additionally fsyncs newpath *before* -/// rename if it exists to ensure that at least one of the files survives, but -/// current callers don't need that. +/// Unlike postgres, it only fsyncs 1) the file to make contents durable, and 2) the directory to +/// make the rename durable. This sequence ensures the target file will never be incomplete. +/// +/// Postgres also: +/// +/// * Fsyncs the target file, if it exists, before the rename, to ensure either the new or existing +/// file survives a crash. Current callers don't need this as it should already be fsynced if +/// durability is needed. +/// +/// * Fsyncs the file after the rename. This can be required with certain OSes or file systems (e.g. +/// NFS), but not on Linux with most common file systems like ext4 (which we currently use). +/// +/// An audit of 8 other databases found that none fsynced the file after a rename: +/// +/// +/// eBPF probes confirmed that this is sufficient with ext4, XFS, and ZFS, but possibly not Btrfs: +/// /// /// virtual_file.rs has similar code, but it doesn't use vfs. /// @@ -149,9 +161,6 @@ pub async fn durable_rename( // Time to do the real deal. tokio::fs::rename(old_path.as_ref(), new_path.as_ref()).await?; - // Postgres'ish fsync of renamed file. - fsync_async_opt(new_path.as_ref(), do_fsync).await?; - // Now fsync the parent let parent = match new_path.as_ref().parent() { Some(p) => p, From a61d81bbc77ebe9635e7dd52fe738638a92141a3 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 12 Nov 2024 13:12:08 -0600 Subject: [PATCH 222/239] Calculate compute_backpressure_throttling_seconds correctly The original value that we get is measured in microseconds. It comes from a calculation using Postgres' GetCurrentTimestamp(), whihc is implemented in terms of gettimeofday(2). Signed-off-by: Tristan Partin --- .../sql_exporter/compute_backpressure_throttling_seconds.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql index 459c586d18..d97d625d4c 100644 --- a/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql +++ b/compute/etc/sql_exporter/compute_backpressure_throttling_seconds.sql @@ -1 +1 @@ -SELECT neon.backpressure_throttling_time()::float8 / 1000 AS throttled; +SELECT (neon.backpressure_throttling_time()::float8 / 1000000) AS throttled; From 3f80af8b1d185bde338cb2598e71972624153aef Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 12 Nov 2024 13:13:28 -0600 Subject: [PATCH 223/239] Add neon.logical_replication_max_logicalsnapdir_size This GUC will drop replication slots if the size of the pg_logical/snapshots directory (not including temp snapshot files) becomes larger than the specified size. Keeping the size of this directory smaller will help with basebackup size from the pageserver. 
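As a rough usage illustration (a sketch only, not part of this change: the DSN is a placeholder and the 4MB value is arbitrary), the limit can be tightened at runtime, since the GUC is reloadable (PGC_SIGHUP) and takes kilobyte units:

```python
# Minimal sketch, assuming a superuser connection; the DSN is a placeholder.
import psycopg2

conn = psycopg2.connect("postgresql://cloud_admin@localhost/postgres")
conn.autocommit = True  # ALTER SYSTEM cannot run inside a transaction block
with conn.cursor() as cur:
    # Drop slots once pg_logical/snapshots exceeds ~4 MB instead of the 8 MB default.
    cur.execute(
        "ALTER SYSTEM SET neon.logical_replication_max_logicalsnapdir_size = '4MB'"
    )
    cur.execute("SELECT pg_reload_conf()")  # SIGHUP-level GUC: a reload suffices
conn.close()
```

As with neon.logical_replication_max_snap_files, setting the new GUC to -1 disables the size-based limit.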
Part-of: https://github.com/neondatabase/neon/issues/8619 Signed-off-by: Tristan Partin --- pgxn/neon/logical_replication_monitor.c | 151 ++++++++++++++++++------ 1 file changed, 116 insertions(+), 35 deletions(-) diff --git a/pgxn/neon/logical_replication_monitor.c b/pgxn/neon/logical_replication_monitor.c index 2de429b83d..1badbbed21 100644 --- a/pgxn/neon/logical_replication_monitor.c +++ b/pgxn/neon/logical_replication_monitor.c @@ -1,7 +1,8 @@ +#include #include #include -#include #include +#include #include "postgres.h" @@ -21,17 +22,35 @@ static int logical_replication_max_snap_files = 300; +/* + * According to Chi (shyzh), the pageserver _should_ be good with 10 MB worth of + * snapshot files. Let's use 8 MB since 8 is a power of 2. + */ +static int logical_replication_max_logicalsnapdir_size = 8000; + +/* + * A primitive description of a logical snapshot file including the LSN of the + * file and its size. + */ +typedef struct SnapDesc { + XLogRecPtr lsn; + off_t sz; +} SnapDesc; + PGDLLEXPORT void LogicalSlotsMonitorMain(Datum main_arg); +/* + * Sorts an array of snapshot descriptors by their LSN. + */ static int -LsnDescComparator(const void *a, const void *b) +SnapDescComparator(const void *a, const void *b) { - XLogRecPtr lsn1 = *((const XLogRecPtr *) a); - XLogRecPtr lsn2 = *((const XLogRecPtr *) b); + const SnapDesc *desc1 = a; + const SnapDesc *desc2 = b; - if (lsn1 < lsn2) + if (desc1->lsn < desc2->lsn) return 1; - else if (lsn1 == lsn2) + else if (desc1->lsn == desc2->lsn) return 0; else return -1; @@ -43,28 +62,39 @@ LsnDescComparator(const void *a, const void *b) * slots having lower restart_lsn should be dropped. */ static XLogRecPtr -get_num_snap_files_lsn_threshold(void) +get_snapshots_cutoff_lsn(void) { - DIR *dirdesc; - struct dirent *de; - char *snap_path = "pg_logical/snapshots/"; - int lsns_allocated = 1024; - int lsns_num = 0; - XLogRecPtr *lsns; - XLogRecPtr cutoff; +/* PG 18 has a constant defined for this, PG_LOGICAL_SNAPSHOTS_DIR */ +#define SNAPDIR "pg_logical/snapshots" - if (logical_replication_max_snap_files < 0) + DIR *dirdesc; + int dirdesc_fd; + struct dirent *de; + size_t snapshot_index = 0; + SnapDesc *snapshot_descriptors; + size_t descriptors_allocated = 1024; + XLogRecPtr cutoff = 0; + off_t logicalsnapdir_size = 0; + const int logical_replication_max_logicalsnapdir_size_bytes = logical_replication_max_logicalsnapdir_size * 1000; + + if (logical_replication_max_snap_files < 0 && logical_replication_max_logicalsnapdir_size < 0) return 0; - lsns = palloc(sizeof(XLogRecPtr) * lsns_allocated); + snapshot_descriptors = palloc(sizeof(*snapshot_descriptors) * descriptors_allocated); + + dirdesc = AllocateDir(SNAPDIR); + dirdesc_fd = dirfd(dirdesc); + if (dirdesc_fd == -1) + ereport(ERROR, errmsg("failed to get a file descriptor for " SNAPDIR ": %m")); /* find all .snap files and get their lsns */ - dirdesc = AllocateDir(snap_path); - while ((de = ReadDir(dirdesc, snap_path)) != NULL) + while ((de = ReadDir(dirdesc, SNAPDIR)) != NULL) { - XLogRecPtr lsn; uint32 hi; uint32 lo; + struct stat st; + XLogRecPtr lsn; + SnapDesc *desc; if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0) @@ -79,28 +109,69 @@ get_num_snap_files_lsn_threshold(void) lsn = ((uint64) hi) << 32 | lo; elog(DEBUG5, "found snap file %X/%X", LSN_FORMAT_ARGS(lsn)); - if (lsns_allocated == lsns_num) + + if (fstatat(dirdesc_fd, de->d_name, &st, 0) == -1) + ereport(ERROR, errmsg("failed to get the size of " SNAPDIR "/%s: %m", de->d_name)); + + if (descriptors_allocated == 
snapshot_index) { - lsns_allocated *= 2; - lsns = repalloc(lsns, sizeof(XLogRecPtr) * lsns_allocated); + descriptors_allocated *= 2; + snapshot_descriptors = repalloc(snapshot_descriptors, sizeof(*snapshot_descriptors) * descriptors_allocated); } - lsns[lsns_num++] = lsn; + + desc = &snapshot_descriptors[snapshot_index++]; + desc->lsn = lsn; + desc->sz = st.st_size; } - /* sort by lsn desc */ - qsort(lsns, lsns_num, sizeof(XLogRecPtr), LsnDescComparator); - /* and take cutoff at logical_replication_max_snap_files */ - if (logical_replication_max_snap_files > lsns_num) - cutoff = 0; - /* have less files than cutoff */ - else + + qsort(snapshot_descriptors, snapshot_index, sizeof(*snapshot_descriptors), SnapDescComparator); + + /* Are there more snapshot files than specified? */ + if (logical_replication_max_snap_files <= snapshot_index) { - cutoff = lsns[logical_replication_max_snap_files - 1]; - elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %d .snap files, limit is %d", - LSN_FORMAT_ARGS(cutoff), lsns_num, logical_replication_max_snap_files); + cutoff = snapshot_descriptors[logical_replication_max_snap_files - 1].lsn; + elog(LOG, + "ls_monitor: dropping logical slots with restart_lsn lower %X/%X, found %zu snapshot files, limit is %d", + LSN_FORMAT_ARGS(cutoff), snapshot_index, logical_replication_max_snap_files); } - pfree(lsns); + + /* Is the size of the logical snapshots directory larger than specified? + * + * It's possible we could hit both thresholds, so remove any extra files + * first, and then truncate based on size of the remaining files. + */ + if (logicalsnapdir_size > logical_replication_max_logicalsnapdir_size_bytes) + { + /* Unfortunately, iterating the directory does not guarantee any order + * so we can't cache an index in the preceding loop. + */ + + off_t sz; + const XLogRecPtr original = cutoff; + + sz = snapshot_descriptors[0].sz; + for (size_t i = 1; i < logical_replication_max_snap_files; ++i) + { + if (sz > logical_replication_max_logicalsnapdir_size_bytes) + { + cutoff = snapshot_descriptors[i - 1].lsn; + break; + } + + sz += snapshot_descriptors[i].sz; + } + + if (cutoff != original) + elog(LOG, "ls_monitor: dropping logical slots with restart_lsn lower than %X/%X, " SNAPDIR " is larger than %d KB", + LSN_FORMAT_ARGS(cutoff), logical_replication_max_logicalsnapdir_size); + } + + pfree(snapshot_descriptors); FreeDir(dirdesc); + return cutoff; + +#undef SNAPDIR } void @@ -118,6 +189,16 @@ InitLogicalReplicationMonitor(void) 0, NULL, NULL, NULL); + DefineCustomIntVariable( + "neon.logical_replication_max_logicalsnapdir_size", + "Maximum allowed size of the pg_logical/snapshots directory (KB). When exceeded, slots are dropped until the limit is met. -1 disables the limit.", + NULL, + &logical_replication_max_logicalsnapdir_size, + 8000, -1, INT_MAX, + PGC_SIGHUP, + GUC_UNIT_KB, + NULL, NULL, NULL); + memset(&bgw, 0, sizeof(bgw)); bgw.bgw_flags = BGWORKER_SHMEM_ACCESS; bgw.bgw_start_time = BgWorkerStart_RecoveryFinished; @@ -162,7 +243,7 @@ LogicalSlotsMonitorMain(Datum main_arg) * If there are too many .snap files, just drop all logical slots to * prevent aux files bloat. 
*/ - cutoff_lsn = get_num_snap_files_lsn_threshold(); + cutoff_lsn = get_snapshots_cutoff_lsn(); if (cutoff_lsn > 0) { for (int i = 0; i < max_replication_slots; i++) From 2256a5727a296c2cf90df6fde615aebcb454021c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 12 Nov 2024 21:35:44 +0100 Subject: [PATCH 224/239] safekeeper: use `WAL_SEGMENT_SIZE` for empty timeline state (#9734) ## Problem `TimelinePersistentState::empty()`, used for tests and benchmarks, had a hardcoded 16 MB WAL segment size. This caused confusion when attempting to change the global segment size. ## Summary of changes Inherit from `WAL_SEGMENT_SIZE` in `TimelinePersistentState::empty()`. --- safekeeper/src/state.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/state.rs b/safekeeper/src/state.rs index b8925d785e..941b7e67d0 100644 --- a/safekeeper/src/state.rs +++ b/safekeeper/src/state.rs @@ -4,6 +4,7 @@ use std::{cmp::max, ops::Deref}; use anyhow::{bail, Result}; +use postgres_ffi::WAL_SEGMENT_SIZE; use safekeeper_api::models::TimelineTermBumpResponse; use serde::{Deserialize, Serialize}; use utils::{ @@ -144,7 +145,7 @@ impl TimelinePersistentState { ServerInfo { pg_version: 170000, /* Postgres server version (major * 10000) */ system_id: 0, /* Postgres system identifier */ - wal_seg_size: 16 * 1024 * 1024, + wal_seg_size: WAL_SEGMENT_SIZE as u32, }, vec![], Lsn::INVALID, From d8f5d435499447077e2519ac55664590835ee7e8 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Tue, 12 Nov 2024 15:48:19 -0600 Subject: [PATCH 225/239] Fix autocommit footguns in performance tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit psycopg2 has the following warning related to autocommit: > By default, any query execution, including a simple SELECT will start > a transaction: for long-running programs, if no further action is > taken, the session will remain “idle in transaction”, an undesirable > condition for several reasons (locks are held by the session, tables > bloat…). For long lived scripts, either ensure to terminate a > transaction as soon as possible or use an autocommit connection. In the 2.9 release notes, psycopg2 also made the following change: > `with connection` starts a transaction on autocommit transactions too Some of these connections are indeed long-lived, so we were retaining tons of WAL on the endpoints because we had a transaction pinned in the past. 
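To make the footgun concrete, here is a minimal sketch (the connection string is a placeholder, not code from this repo) contrasting the pattern the tests used before with the one this patch switches them to:

```python
# Minimal sketch; connstr is a placeholder.
import psycopg2

connstr = "postgresql://user@host/db"  # placeholder

# Before: with psycopg2 >= 2.9 the `with connection` block wraps a transaction
# even on autocommit connections, and without autocommit every execute() opens
# one anyway -- a long-lived session then sits "idle in transaction".
with psycopg2.connect(connstr) as conn:
    with conn.cursor() as cur:
        cur.execute("SELECT pg_current_wal_flush_lsn()")
        # ... connection held open for the rest of a long benchmark run ...

# After: explicit autocommit, no `with connection`, and close promptly.
conn = psycopg2.connect(connstr)
conn.autocommit = True
with conn.cursor() as cur:
    cur.execute("SELECT pg_current_wal_flush_lsn()")
    flush_lsn = cur.fetchone()[0]
conn.close()
```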
Link: https://www.psycopg.org/docs/news.html#what-s-new-in-psycopg-2-9 Link: https://github.com/psycopg/psycopg2/issues/941 Signed-off-by: Tristan Partin --- .../performance/test_logical_replication.py | 111 ++++++++++-------- .../performance/test_physical_replication.py | 61 ++++++---- 2 files changed, 98 insertions(+), 74 deletions(-) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 91d7e3446e..050c09c1e5 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -149,12 +149,16 @@ def test_subscriber_lag( check_pgbench_still_running(pub_workload, "pub") check_pgbench_still_running(sub_workload, "sub") - with ( - psycopg2.connect(pub_connstr) as pub_conn, - psycopg2.connect(sub_connstr) as sub_conn, - ): - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn.close() + sub_conn.close() log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) @@ -206,6 +210,7 @@ def test_publisher_restart( sub_conn = psycopg2.connect(sub_connstr) pub_conn.autocommit = True sub_conn.autocommit = True + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: pub_cur.execute("SELECT 1 FROM pg_catalog.pg_publication WHERE pubname = 'pub1'") pub_exists = len(pub_cur.fetchall()) != 0 @@ -222,6 +227,7 @@ def test_publisher_restart( sub_cur.execute(f"create subscription sub1 connection '{pub_connstr}' publication pub1") initial_sync_lag = measure_logical_replication_lag(sub_cur, pub_cur) + pub_conn.close() sub_conn.close() @@ -248,12 +254,17 @@ def test_publisher_restart( ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=pub_env, ) - with ( - psycopg2.connect(pub_connstr) as pub_conn, - psycopg2.connect(sub_connstr) as sub_conn, - ): - with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: - lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn = psycopg2.connect(pub_connstr) + sub_conn = psycopg2.connect(sub_connstr) + pub_conn.autocommit = True + sub_conn.autocommit = True + + with pub_conn.cursor() as pub_cur, sub_conn.cursor() as sub_cur: + lag = measure_logical_replication_lag(sub_cur, pub_cur) + + pub_conn.close() + sub_conn.close() log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) @@ -288,58 +299,56 @@ def test_snap_files( env = benchmark_project_pub.pgbench_env connstr = benchmark_project_pub.connstr - with psycopg2.connect(connstr) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") - is_super = cast("bool", cur.fetchall()[0][0]) - assert is_super, "This benchmark won't work if we don't have superuser" + conn = psycopg2.connect(connstr) + conn.autocommit = True + + with conn.cursor() as cur: + cur.execute("SELECT rolsuper FROM pg_roles WHERE rolname = 'neondb_owner'") + is_super = cast("bool", cur.fetchall()[0][0]) + assert is_super, "This benchmark won't work if we don't have superuser" + + conn.close() 
pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s100"], env=env) conn = psycopg2.connect(connstr) conn.autocommit = True - cur = conn.cursor() - cur.execute("ALTER SYSTEM SET neon.logical_replication_max_snap_files = -1") - with psycopg2.connect(connstr) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cur.execute("SELECT pg_reload_conf()") - - with psycopg2.connect(connstr) as conn: - conn.autocommit = True - with conn.cursor() as cur: - cur.execute( - """ - DO $$ - BEGIN - IF EXISTS ( - SELECT 1 - FROM pg_replication_slots - WHERE slot_name = 'slotter' - ) THEN - PERFORM pg_drop_replication_slot('slotter'); - END IF; - END $$; + with conn.cursor() as cur: + cur.execute( """ - ) - cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_replication_slots + WHERE slot_name = 'slotter' + ) THEN + PERFORM pg_drop_replication_slot('slotter'); + END IF; + END $$; + """ + ) + cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + + conn.close() workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env) try: start = time.time() prev_measurement = time.time() while time.time() - start < test_duration_min * 60: - with psycopg2.connect(connstr) as conn: - with conn.cursor() as cur: - cur.execute( - "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" - ) - check_pgbench_still_running(workload) - cur.execute( - "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())" - ) + conn = psycopg2.connect(connstr) + conn.autocommit = True + + with conn.cursor() as cur: + cur.execute( + "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" + ) + check_pgbench_still_running(workload) + cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())") + + conn.close() # Measure storage if time.time() - prev_measurement > test_interval_min * 60: diff --git a/test_runner/performance/test_physical_replication.py b/test_runner/performance/test_physical_replication.py index 8b368977df..d56f6dce09 100644 --- a/test_runner/performance/test_physical_replication.py +++ b/test_runner/performance/test_physical_replication.py @@ -102,15 +102,21 @@ def test_ro_replica_lag( check_pgbench_still_running(master_workload) check_pgbench_still_running(replica_workload) time.sleep(sync_interval_min * 60) + + conn_master = psycopg2.connect(master_connstr) + conn_replica = psycopg2.connect(replica_connstr) + conn_master.autocommit = True + conn_replica.autocommit = True + with ( - psycopg2.connect(master_connstr) as conn_master, - psycopg2.connect(replica_connstr) as conn_replica, + conn_master.cursor() as cur_master, + conn_replica.cursor() as cur_replica, ): - with ( - conn_master.cursor() as cur_master, - conn_replica.cursor() as cur_replica, - ): - lag = measure_replication_lag(cur_master, cur_replica) + lag = measure_replication_lag(cur_master, cur_replica) + + conn_master.close() + conn_replica.close() + log.info(f"Replica lagged behind master by {lag} seconds") zenbenchmark.record("replica_lag", lag, "s", MetricReport.LOWER_IS_BETTER) finally: @@ -219,11 +225,15 @@ def test_replication_start_stop( pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10"], env=master_env) # Sync replicas - with psycopg2.connect(master_connstr) as conn_master: - with conn_master.cursor() as cur_master: - for i in range(num_replicas): - conn_replica = 
psycopg2.connect(replica_connstr[i]) - measure_replication_lag(cur_master, conn_replica.cursor()) + conn_master = psycopg2.connect(master_connstr) + conn_master.autocommit = True + + with conn_master.cursor() as cur_master: + for i in range(num_replicas): + conn_replica = psycopg2.connect(replica_connstr[i]) + measure_replication_lag(cur_master, conn_replica.cursor()) + + conn_master.close() master_pgbench = pg_bin.run_nonblocking( [ @@ -277,17 +287,22 @@ def test_replication_start_stop( time.sleep(configuration_test_time_sec) - with psycopg2.connect(master_connstr) as conn_master: - with conn_master.cursor() as cur_master: - for ireplica in range(num_replicas): - replica_conn = psycopg2.connect(replica_connstr[ireplica]) - lag = measure_replication_lag(cur_master, replica_conn.cursor()) - zenbenchmark.record( - f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER - ) - log.info( - f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" - ) + conn_master = psycopg2.connect(master_connstr) + conn_master.autocommit = True + + with conn_master.cursor() as cur_master: + for ireplica in range(num_replicas): + replica_conn = psycopg2.connect(replica_connstr[ireplica]) + lag = measure_replication_lag(cur_master, replica_conn.cursor()) + zenbenchmark.record( + f"Replica {ireplica} lag", lag, "s", MetricReport.LOWER_IS_BETTER + ) + log.info( + f"Replica {ireplica} lagging behind master by {lag} seconds after configuration {iconfig:>b}" + ) + + conn_master.close() + master_pgbench.terminate() except Exception as e: error_occurred = True From 1ff5333a1bff07ec13f3c5c1def2dbb161371c3f Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Wed, 13 Nov 2024 08:50:01 +0200 Subject: [PATCH 226/239] Do not wallog AUX files at replica (#9457) ## Problem Attempt to persist LR stuff at replica cause cannot make new WAL entries during recovery` error. See https://neondb.slack.com/archives/C07S7RBFVRA/p1729280401283389 ## Summary of changes Do not wallog AUX files at replica. Related Postgres PRs: https://github.com/neondatabase/postgres/pull/517 https://github.com/neondatabase/postgres/pull/516 https://github.com/neondatabase/postgres/pull/515 https://github.com/neondatabase/postgres/pull/514 ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- .../test_physical_and_logical_replicaiton.py | 53 +++++++++++++++++-- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 8 +-- 6 files changed, 58 insertions(+), 11 deletions(-) diff --git a/test_runner/regress/test_physical_and_logical_replicaiton.py b/test_runner/regress/test_physical_and_logical_replicaiton.py index ec14e08a14..ad2d0871b8 100644 --- a/test_runner/regress/test_physical_and_logical_replicaiton.py +++ b/test_runner/regress/test_physical_and_logical_replicaiton.py @@ -5,7 +5,8 @@ import time from fixtures.neon_fixtures import NeonEnv, logical_replication_sync -def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): +def test_physical_and_logical_replication_slot_not_copied(neon_simple_env: NeonEnv, vanilla_pg): + """Test read replica of a primary which has a logical replication publication""" env = neon_simple_env n_records = 100000 @@ -13,7 +14,6 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): primary = env.endpoints.create_start( branch_name="main", endpoint_id="primary", - config_lines=["min_wal_size=32MB", "max_wal_size=64MB"], ) p_con = primary.connect() p_cur = p_con.cursor() @@ -30,7 +30,6 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): secondary = env.endpoints.new_replica_start( origin=primary, endpoint_id="secondary", - config_lines=["min_wal_size=32MB", "max_wal_size=64MB"], ) s_con = secondary.connect() @@ -48,3 +47,51 @@ def test_physical_and_logical_replication(neon_simple_env: NeonEnv, vanilla_pg): # Check that LR slot is not copied to replica s_cur.execute("select count(*) from pg_replication_slots") assert s_cur.fetchall()[0][0] == 0 + + +def test_aux_not_logged_at_replica(neon_simple_env: NeonEnv, vanilla_pg): + """Test that AUX files are not saved at replica""" + env = neon_simple_env + + n_records = 20000 + + primary = env.endpoints.create_start( + branch_name="main", + endpoint_id="primary", + ) + p_con = primary.connect() + p_cur = p_con.cursor() + p_cur.execute("CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))") + p_cur.execute("create publication pub1 for table t") + + # start subscriber + vanilla_pg.start() + vanilla_pg.safe_psql("CREATE TABLE t(pk bigint primary key, payload text)") + connstr = primary.connstr().replace("'", "''") + vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") + + for pk in range(n_records): + p_cur.execute("insert into t (pk) values (%s)", (pk,)) + + # LR snapshot is stored each 15 seconds + time.sleep(16) + + # start replica + secondary = env.endpoints.new_replica_start( + origin=primary, + endpoint_id="secondary", + ) + + s_con = secondary.connect() + s_cur = s_con.cursor() + + logical_replication_sync(vanilla_pg, primary) + + assert vanilla_pg.safe_psql("select count(*) from t")[0][0] == n_records + s_cur.execute("select count(*) from t") + assert s_cur.fetchall()[0][0] == n_records + + vanilla_pg.stop() + secondary.stop() + primary.stop() + assert not secondary.log_contains("cannot make new WAL entries during recovery") diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 2199b83fb7..de0a000daf 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 
2199b83fb72680001ce0f43bf6187a21dfb8f45d +Subproject commit de0a000dafc2e66ce2e39282d3aa1c704fe0390e diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 22e580fe9f..fd631a9590 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 22e580fe9ffcea7e02592110b1c9bf426d83cada +Subproject commit fd631a959049dfe2b82f67409c8b8b0d3e0016d1 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index e131a9c027..03b43900ed 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit e131a9c027b202ce92bd7b9cf2569d48a6f9948e +Subproject commit 03b43900edc5d8d6eecec460bfc89aec7174bd84 diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index 9ad2f3c5c3..ae4cc30dba 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit 9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb +Subproject commit ae4cc30dba24f3910533e5a48e8103c3f2fff300 diff --git a/vendor/revisions.json b/vendor/revisions.json index 18bde18359..8d5885d07a 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ "17.0", - "9ad2f3c5c37c08069a01c1e3f6b7cf275437e0cb" + "ae4cc30dba24f3910533e5a48e8103c3f2fff300" ], "v16": [ "16.4", - "e131a9c027b202ce92bd7b9cf2569d48a6f9948e" + "03b43900edc5d8d6eecec460bfc89aec7174bd84" ], "v15": [ "15.8", - "22e580fe9ffcea7e02592110b1c9bf426d83cada" + "fd631a959049dfe2b82f67409c8b8b0d3e0016d1" ], "v14": [ "14.13", - "2199b83fb72680001ce0f43bf6187a21dfb8f45d" + "de0a000dafc2e66ce2e39282d3aa1c704fe0390e" ] } From 7595d3afe63035f7508c029b9ee152dbe1962dc4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 13 Nov 2024 09:17:26 +0000 Subject: [PATCH 227/239] pageserver: add `no_sync` for use in regression tests (2/2) (#9678) ## Problem Followup to https://github.com/neondatabase/neon/pull/9677 which enables `no_sync` in tests. This can be merged once the next release has happened. ## Summary of changes - Always run pageserver with `no_sync = true` in tests. 
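If an individual test ever needs real fsync behaviour back, the existing `NeonEnvBuilder.pageserver_config_override` hook should be enough; a hypothetical sketch (the override string format is an assumption, not something this patch adds):

```python
# Hypothetical sketch: opt one test back into real disk syncs via the existing
# per-test config override hook; "no_sync=false" is assumed TOML-style syntax.
from fixtures.neon_fixtures import NeonEnvBuilder


def test_needs_real_syncs(neon_env_builder: NeonEnvBuilder):
    # Revert the new default for this test only.
    neon_env_builder.pageserver_config_override = "no_sync=false"
    env = neon_env_builder.init_start()
    # ... exercise whatever durability-sensitive behaviour the test needs ...
    env.pageserver.stop()
```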
--- test_runner/fixtures/neon_fixtures.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 0728a33a63..990db1aed0 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1065,6 +1065,9 @@ class NeonEnv: "http_auth_type": http_auth_type, # Default which can be overriden with `NeonEnvBuilder.pageserver_config_override` "availability_zone": "us-east-2a", + # Disable pageserver disk syncs in tests: when running tests concurrently, this avoids + # the pageserver taking a long time to start up due to syncfs flushing other tests' data + "no_sync": True, } if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine From 080d585b22e516914c94c05ab82b4c8b0cfc0671 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 13 Nov 2024 10:36:48 +0100 Subject: [PATCH 228/239] Add installed_extensions prometheus metric (#9608) and add /metrics endpoint to compute_ctl to expose such metrics metric format example for extension pg_rag with versions 1.2.3 and 1.4.2 installed in 3 and 1 databases respectively: neon_extensions_installed{extension="pg_rag", version="1.2.3"} = 3 neon_extensions_installed{extension="pg_rag", version="1.4.2"} = 1 ------ infra part: https://github.com/neondatabase/flux-fleet/pull/251 --------- Co-authored-by: Tristan Partin --- Cargo.lock | 3 + compute_tools/Cargo.toml | 3 + compute_tools/src/http/api.rs | 25 ++++++++ compute_tools/src/http/openapi_spec.yaml | 15 +++++ compute_tools/src/installed_extensions.rs | 31 +++++++++- test_runner/fixtures/endpoint/http.py | 5 ++ .../regress/test_installed_extensions.py | 61 ++++++++++++++++++- 7 files changed, 138 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 00d58be2d5..64231ed11c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1229,12 +1229,15 @@ dependencies = [ "flate2", "futures", "hyper 0.14.30", + "metrics", "nix 0.27.1", "notify", "num_cpus", + "once_cell", "opentelemetry", "opentelemetry_sdk", "postgres", + "prometheus", "regex", "remote_storage", "reqwest 0.12.4", diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index 91e0b9d5b8..0bf4ed53d6 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -18,9 +18,11 @@ clap.workspace = true flate2.workspace = true futures.workspace = true hyper0 = { workspace = true, features = ["full"] } +metrics.workspace = true nix.workspace = true notify.workspace = true num_cpus.workspace = true +once_cell.workspace = true opentelemetry.workspace = true opentelemetry_sdk.workspace = true postgres.workspace = true @@ -39,6 +41,7 @@ tracing-subscriber.workspace = true tracing-utils.workspace = true thiserror.workspace = true url.workspace = true +prometheus.workspace = true compute_api.workspace = true utils.workspace = true diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index af35f71bf2..3677582c11 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -9,6 +9,7 @@ use crate::catalog::SchemaDumpError; use crate::catalog::{get_database_schema, get_dbs_and_roles}; use crate::compute::forward_termination_signal; use crate::compute::{ComputeNode, ComputeState, ParsedSpec}; +use crate::installed_extensions; use compute_api::requests::{ConfigurationRequest, ExtensionInstallRequest, SetRoleGrantsRequest}; use compute_api::responses::{ ComputeStatus, ComputeStatusResponse, ExtensionInstallResult, 
GenericAPIError, @@ -19,6 +20,8 @@ use anyhow::Result; use hyper::header::CONTENT_TYPE; use hyper::service::{make_service_fn, service_fn}; use hyper::{Body, Method, Request, Response, Server, StatusCode}; +use metrics::Encoder; +use metrics::TextEncoder; use tokio::task; use tracing::{debug, error, info, warn}; use tracing_utils::http::OtelName; @@ -65,6 +68,28 @@ async fn routes(req: Request, compute: &Arc) -> Response { + debug!("serving /metrics GET request"); + + let mut buffer = vec![]; + let metrics = installed_extensions::collect(); + let encoder = TextEncoder::new(); + encoder.encode(&metrics, &mut buffer).unwrap(); + + match Response::builder() + .status(StatusCode::OK) + .header(CONTENT_TYPE, encoder.format_type()) + .body(Body::from(buffer)) + { + Ok(response) => response, + Err(err) => { + let msg = format!("error handling /metrics request: {err}"); + error!(msg); + render_json_error(&msg, StatusCode::INTERNAL_SERVER_ERROR) + } + } + } // Collect Postgres current usage insights (&Method::GET, "/insights") => { info!("serving /insights GET request"); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 11eee6ccfd..7b9a62c545 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -37,6 +37,21 @@ paths: schema: $ref: "#/components/schemas/ComputeMetrics" + /metrics + get: + tags: + - Info + summary: Get compute node metrics in text format. + description: "" + operationId: getComputeMetrics + responses: + 200: + description: ComputeMetrics + content: + text/plain: + schema: + type: string + description: Metrics in text format. /insights: get: tags: diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs index 877f99bff7..6dd55855db 100644 --- a/compute_tools/src/installed_extensions.rs +++ b/compute_tools/src/installed_extensions.rs @@ -1,4 +1,5 @@ use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use metrics::proto::MetricFamily; use std::collections::HashMap; use std::collections::HashSet; use tracing::info; @@ -8,6 +9,10 @@ use anyhow::Result; use postgres::{Client, NoTls}; use tokio::task; +use metrics::core::Collector; +use metrics::{register_uint_gauge_vec, UIntGaugeVec}; +use once_cell::sync::Lazy; + /// We don't reuse get_existing_dbs() just for code clarity /// and to make database listing query here more explicit. 
/// @@ -59,6 +64,12 @@ pub async fn get_installed_extensions(connstr: Url) -> Result Result Result<()> { "[NEON_EXT_STAT] {}", serde_json::to_string(&result).expect("failed to serialize extensions list") ); - Ok(()) } + +static INSTALLED_EXTENSIONS: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "installed_extensions", + "Number of databases where the version of extension is installed", + &["extension_name", "version"] + ) + .expect("failed to define a metric") +}); + +pub fn collect() -> Vec { + INSTALLED_EXTENSIONS.collect() +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index ea8291c1e0..db3723b7cc 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -46,3 +46,8 @@ class EndpointHttpClient(requests.Session): ) res.raise_for_status() return res.json() + + def metrics(self) -> str: + res = self.get(f"http://localhost:{self.port}/metrics") + res.raise_for_status() + return res.text diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py index 4700db85ee..54ce7c8340 100644 --- a/test_runner/regress/test_installed_extensions.py +++ b/test_runner/regress/test_installed_extensions.py @@ -1,6 +1,14 @@ -from logging import info +from __future__ import annotations -from fixtures.neon_fixtures import NeonEnv +import time +from logging import info +from typing import TYPE_CHECKING + +from fixtures.log_helper import log +from fixtures.metrics import parse_metrics + +if TYPE_CHECKING: + from fixtures.neon_fixtures import NeonEnv def test_installed_extensions(neon_simple_env: NeonEnv): @@ -85,3 +93,52 @@ def test_installed_extensions(neon_simple_env: NeonEnv): assert ext["n_databases"] == 2 ext["versions"].sort() assert ext["versions"] == ["1.2", "1.3"] + + # check that /metrics endpoint is available + # ensure that we see the metric before and after restart + res = client.metrics() + info("Metrics: %s", res) + m = parse_metrics(res) + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 2 + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 + + endpoint.stop() + endpoint.start() + + timeout = 10 + while timeout > 0: + try: + res = client.metrics() + timeout = -1 + if len(parse_metrics(res).query_all("installed_extensions")) < 4: + # Assume that not all metrics that are collected yet + time.sleep(1) + timeout -= 1 + continue + except Exception: + log.exception("failed to get metrics, assume they are not collected yet") + time.sleep(1) + timeout -= 1 + continue + + assert ( + len(parse_metrics(res).query_all("installed_extensions")) >= 4 + ), "Not all metrics are collected" + + info("After restart metrics: %s", res) + m = parse_metrics(res) + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.2"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 + + neon_m = m.query_all("installed_extensions", {"extension_name": "neon", "version": "1.3"}) + assert len(neon_m) == 1 + for sample in neon_m: + assert sample.value == 1 From d5435b1a81ae8889c6ef02b395627fffe8651aa5 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 13 Nov 2024 12:20:32 +0200 Subject: [PATCH 229/239] tests: Increase timeout in test_create_churn_during_restart (#9736) This test was seen to be 
flaky, e.g. at: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-9457/11804246485/index.html#suites/ec4311502db344eee91f1354e9dc839b/982bd121ea698414/. If I _reduce_ the timeout from 10s to 8s on my laptop, it reliably hits that timeout and fails. That suggests that the test is pretty close to the edge even when it passes. Let's bump up the timeout to 30 s to make it more robust. See also https://github.com/neondatabase/neon/issues/9730, although the error message is different there. --- test_runner/regress/test_tenants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 03cb79fc1d..5a499ea98b 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -427,7 +427,7 @@ def test_create_churn_during_restart(neon_env_builder: NeonEnvBuilder): env.pageserver.start() for f in futs: - f.result(timeout=10) + f.result(timeout=30) # The tenant should end up active wait_until_tenant_active(env.pageserver.http_client(), tenant_id, iterations=10, period=1) From 10aaa3677d6edda474d7d2ee6967d7bdc82610f8 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 13 Nov 2024 15:08:58 +0200 Subject: [PATCH 230/239] PostgreSQL minor version updates (17.1, 16.5, 15.9, 14.14) (#9727) This includes a patch to temporarily disable one test in the pg_anon test suite. It is an upstream issue, the test started failing with the new PostgreSQL minor versions because of a change in the default timezone used in tests. We don't want to block the release for this, so just disable the test for now. See https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 Corresponding postgres repository PRs: https://github.com/neondatabase/postgres/pull/524 https://github.com/neondatabase/postgres/pull/525 https://github.com/neondatabase/postgres/pull/526 https://github.com/neondatabase/postgres/pull/527 --- compute/patches/pg_anon.patch | 42 +++++++++++++++++++++++++++++++++++ vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/postgres-v16 | 2 +- vendor/postgres-v17 | 2 +- vendor/revisions.json | 16 ++++++------- 6 files changed, 54 insertions(+), 12 deletions(-) diff --git a/compute/patches/pg_anon.patch b/compute/patches/pg_anon.patch index 15dfd3c5a0..e2b4b292e4 100644 --- a/compute/patches/pg_anon.patch +++ b/compute/patches/pg_anon.patch @@ -1,3 +1,45 @@ +commit 00aa659afc9c7336ab81036edec3017168aabf40 +Author: Heikki Linnakangas +Date: Tue Nov 12 16:59:19 2024 +0200 + + Temporarily disable test that depends on timezone + +diff --git a/tests/expected/generalization.out b/tests/expected/generalization.out +index 23ef5fa..9e60deb 100644 +--- a/ext-src/pg_anon-src/tests/expected/generalization.out ++++ b/ext-src/pg_anon-src/tests/expected/generalization.out +@@ -284,12 +284,9 @@ SELECT anon.generalize_tstzrange('19041107','century'); + ["Tue Jan 01 00:00:00 1901 PST","Mon Jan 01 00:00:00 2001 PST") + (1 row) + +-SELECT anon.generalize_tstzrange('19041107','millennium'); +- generalize_tstzrange +------------------------------------------------------------------ +- ["Thu Jan 01 00:00:00 1001 PST","Mon Jan 01 00:00:00 2001 PST") +-(1 row) +- ++-- temporarily disabled, see: ++-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 ++--SELECT anon.generalize_tstzrange('19041107','millennium'); + -- generalize_daterange + SELECT anon.generalize_daterange('19041107'); + 
generalize_daterange +diff --git a/tests/sql/generalization.sql b/tests/sql/generalization.sql +index b868344..b4fc977 100644 +--- a/ext-src/pg_anon-src/tests/sql/generalization.sql ++++ b/ext-src/pg_anon-src/tests/sql/generalization.sql +@@ -61,7 +61,9 @@ SELECT anon.generalize_tstzrange('19041107','month'); + SELECT anon.generalize_tstzrange('19041107','year'); + SELECT anon.generalize_tstzrange('19041107','decade'); + SELECT anon.generalize_tstzrange('19041107','century'); +-SELECT anon.generalize_tstzrange('19041107','millennium'); ++-- temporarily disabled, see: ++-- https://gitlab.com/dalibo/postgresql_anonymizer/-/commit/199f0a392b37c59d92ae441fb8f037e094a11a52#note_2148017485 ++--SELECT anon.generalize_tstzrange('19041107','millennium'); + + -- generalize_daterange + SELECT anon.generalize_daterange('19041107'); + commit 7dd414ee75f2875cffb1d6ba474df1f135a6fc6f Author: Alexey Masterov Date: Fri May 31 06:34:26 2024 +0000 diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index de0a000daf..c5e0d642ef 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit de0a000dafc2e66ce2e39282d3aa1c704fe0390e +Subproject commit c5e0d642efb02e4bfedc283b0a7707fe6c79cc89 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index fd631a9590..1feff6b60f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit fd631a959049dfe2b82f67409c8b8b0d3e0016d1 +Subproject commit 1feff6b60f07cb71b665d0f5ead71a4320a71743 diff --git a/vendor/postgres-v16 b/vendor/postgres-v16 index 03b43900ed..b0b693ea29 160000 --- a/vendor/postgres-v16 +++ b/vendor/postgres-v16 @@ -1 +1 @@ -Subproject commit 03b43900edc5d8d6eecec460bfc89aec7174bd84 +Subproject commit b0b693ea298454e95e6b154780d1fd586a244dfd diff --git a/vendor/postgres-v17 b/vendor/postgres-v17 index ae4cc30dba..aa2e29f2b6 160000 --- a/vendor/postgres-v17 +++ b/vendor/postgres-v17 @@ -1 +1 @@ -Subproject commit ae4cc30dba24f3910533e5a48e8103c3f2fff300 +Subproject commit aa2e29f2b6952140dfe51876bbd11054acae776f diff --git a/vendor/revisions.json b/vendor/revisions.json index 8d5885d07a..a1f2bc5dd1 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,18 +1,18 @@ { "v17": [ - "17.0", - "ae4cc30dba24f3910533e5a48e8103c3f2fff300" + "17.1", + "aa2e29f2b6952140dfe51876bbd11054acae776f" ], "v16": [ - "16.4", - "03b43900edc5d8d6eecec460bfc89aec7174bd84" + "16.5", + "b0b693ea298454e95e6b154780d1fd586a244dfd" ], "v15": [ - "15.8", - "fd631a959049dfe2b82f67409c8b8b0d3e0016d1" + "15.9", + "1feff6b60f07cb71b665d0f5ead71a4320a71743" ], "v14": [ - "14.13", - "de0a000dafc2e66ce2e39282d3aa1c704fe0390e" + "14.14", + "c5e0d642efb02e4bfedc283b0a7707fe6c79cc89" ] } From b4e00b8b220b6443c117fbaf9746ab4ef9c38e55 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 13 Nov 2024 18:07:39 +0000 Subject: [PATCH 231/239] pageserver: refuse to load tenants with suspiciously old indices in old generations (#9719) ## Problem Historically, if a control component passed a pageserver "generation: 1" this could be a quick way to corrupt a tenant by loading a historic index. Follows https://github.com/neondatabase/neon/pull/9383 Closes #6951 ## Summary of changes - Introduce a Fatal variant to DownloadError, to enable index downloads to signal when they have encountered a scary enough situation that we shouldn't proceed to load the tenant. 
- Handle this variant by putting the tenant into a broken state (no matter which timeline within the tenant reported it) - Add a test for this case In the event that this behavior fires when we don't want it to, we have ways to intervene: - "Touch" an affected index to update its mtime (download+upload S3 object) - If this behavior is triggered, it indicates we're attaching in some old generation, so we should be able to fix that by manually bumping generation numbers in the storage controller database (this should never happen, but it's an option if it does) --- libs/remote_storage/src/error.rs | 6 +- pageserver/src/tenant.rs | 6 ++ .../src/tenant/remote_timeline_client.rs | 8 ++- .../regress/test_pageserver_generations.py | 68 ++++++++++++++++++- 4 files changed, 85 insertions(+), 3 deletions(-) diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 17790e9f70..ec9f868998 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -15,6 +15,9 @@ pub enum DownloadError { /// /// Concurrency control is not timed within timeout. Timeout, + /// Some integrity/consistency check failed during download. This is used during + /// timeline loads to cancel the load of a tenant if some timeline detects fatal corruption. + Fatal(String), /// The file was found in the remote storage, but the download failed. Other(anyhow::Error), } @@ -29,6 +32,7 @@ impl std::fmt::Display for DownloadError { DownloadError::Unmodified => write!(f, "File was not modified"), DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::Timeout => write!(f, "timeout"), + DownloadError::Fatal(why) => write!(f, "Fatal read error: {why}"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), } } @@ -41,7 +45,7 @@ impl DownloadError { pub fn is_permanent(&self) -> bool { use DownloadError::*; match self { - BadInput(_) | NotFound | Unmodified | Cancelled => true, + BadInput(_) | NotFound | Unmodified | Fatal(_) | Cancelled => true, Timeout | Other(_) => false, } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index d0a96e78a6..61bb1fe40c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -1433,6 +1433,12 @@ impl Tenant { info!(%timeline_id, "index_part not found on remote"); continue; } + Err(DownloadError::Fatal(why)) => { + // If, while loading one remote timeline, we saw an indication that our generation + // number is likely invalid, then we should not load the whole tenant. + error!(%timeline_id, "Fatal error loading timeline: {why}"); + anyhow::bail!(why.to_string()); + } Err(e) => { // Some (possibly ephemeral) error happened during index_part download. // Pretend the timeline exists to not delete the timeline directory, diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index b37c16e133..600583f6b5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -574,12 +574,18 @@ impl RemoteTimelineClient { if latest_index_generation > index_generation { // Unexpected! Why are we loading such an old index if a more recent one exists? - tracing::warn!( + // We will refuse to proceed, as there is no reasonable scenario where this should happen, but + // there _is_ a clear bug/corruption scenario where it would happen (controller sets the generation + // backwards). 
+ tracing::error!( ?index_generation, ?latest_index_generation, ?latest_index_mtime, "Found a newer index while loading an old one" ); + return Err(DownloadError::Fatal( + "Index age exceeds threshold and a newer index exists".into(), + )); } } diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 8f6c9f16fd..4f59efb8b3 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -35,9 +35,10 @@ from fixtures.pageserver.utils import ( wait_for_upload, ) from fixtures.remote_storage import ( + LocalFsStorage, RemoteStorageKind, ) -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from fixtures.workload import Workload if TYPE_CHECKING: @@ -728,3 +729,68 @@ def test_upgrade_generationless_local_file_paths( ) # We should download into the same local path we started with assert os.path.exists(victim_path) + + +@run_only_on_default_postgres("Only tests index logic") +def test_old_index_time_threshold( + neon_env_builder: NeonEnvBuilder, +): + """ + Exercise pageserver's detection of trying to load an ancient non-latest index. + (see https://github.com/neondatabase/neon/issues/6951) + """ + + # Run with local_fs because we will interfere with mtimes by local filesystem access + neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) + env = neon_env_builder.init_start() + tenant_id = env.initial_tenant + timeline_id = env.initial_timeline + + workload = Workload(env, tenant_id, timeline_id) + workload.init() + workload.write_rows(32) + + # Remember generation 1's index path + assert isinstance(env.pageserver_remote_storage, LocalFsStorage) + index_path = env.pageserver_remote_storage.index_path(tenant_id, timeline_id) + + # Increment generation by detaching+attaching, and write+flush some data to get a new remote index + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.tenant_policy_update(tenant_id, {"placement": {"Attached": 0}}) + env.storage_controller.reconcile_until_idle() + workload.churn_rows(32) + + # A new index should have been written + assert env.pageserver_remote_storage.index_path(tenant_id, timeline_id) != index_path + + # Hack the mtime on the generation 1 index + log.info(f"Setting old mtime on {index_path}") + os.utime(index_path, times=(time.time(), time.time() - 30 * 24 * 3600)) + env.pageserver.allowed_errors.extend( + [ + ".*Found a newer index while loading an old one.*", + ".*Index age exceeds threshold and a newer index exists.*", + ] + ) + + # Detach from storage controller + attach in an old generation directly on the pageserver. + workload.stop() + env.storage_controller.tenant_policy_update(tenant_id, {"placement": "Detached"}) + env.storage_controller.reconcile_until_idle() + env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Stop"}) + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy") + + # The controller would not do this (attach in an old generation): we are doing it to simulate + # a hypothetical profound bug in the controller. 
+ env.pageserver.http_client().tenant_location_conf( + tenant_id, {"generation": 1, "mode": "AttachedSingle", "tenant_conf": {}} + ) + + # The pageserver should react to this situation by refusing to attach the tenant and putting + # it into Broken state + env.pageserver.allowed_errors.append(".*tenant is broken.*") + with pytest.raises( + PageserverApiException, + match="tenant is broken: Index age exceeds threshold and a newer index exists", + ): + env.pageserver.http_client().timeline_detail(tenant_id, timeline_id) From 1280b708f1636034cfe99038faab1ae628dd4b2d Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Wed, 13 Nov 2024 20:35:48 -0600 Subject: [PATCH 232/239] Improve error handling for NeonAPI fixture Move error handling to the common request function and add a debug log. Signed-off-by: Tristan Partin --- test_runner/fixtures/neon_api.py | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/test_runner/fixtures/neon_api.py b/test_runner/fixtures/neon_api.py index 89c1f324b4..9de6681beb 100644 --- a/test_runner/fixtures/neon_api.py +++ b/test_runner/fixtures/neon_api.py @@ -5,6 +5,8 @@ from typing import TYPE_CHECKING, cast, final import requests +from fixtures.log_helper import log + if TYPE_CHECKING: from typing import Any, Literal, Optional @@ -30,7 +32,11 @@ class NeonAPI: kwargs["headers"] = {} kwargs["headers"]["Authorization"] = f"Bearer {self.__neon_api_key}" - return requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + resp = requests.request(method, f"{self.__neon_api_base_url}{endpoint}", **kwargs) + log.debug("%s %s returned a %d: %s", method, endpoint, resp.status_code, resp.text) + resp.raise_for_status() + + return resp def create_project( self, @@ -66,8 +72,6 @@ class NeonAPI: json=data, ) - assert resp.status_code == 201 - return cast("dict[str, Any]", resp.json()) def get_project_details(self, project_id: str) -> dict[str, Any]: @@ -79,7 +83,7 @@ class NeonAPI: "Content-Type": "application/json", }, ) - assert resp.status_code == 200 + return cast("dict[str, Any]", resp.json()) def delete_project( @@ -95,8 +99,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def start_endpoint( @@ -112,8 +114,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def suspend_endpoint( @@ -129,8 +129,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def restart_endpoint( @@ -146,8 +144,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def create_endpoint( @@ -178,8 +174,6 @@ class NeonAPI: json=data, ) - assert resp.status_code == 201 - return cast("dict[str, Any]", resp.json()) def get_connection_uri( @@ -206,8 +200,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def get_branches(self, project_id: str) -> dict[str, Any]: @@ -219,8 +211,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def get_endpoints(self, project_id: str) -> dict[str, Any]: @@ -232,8 +222,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def get_operations(self, project_id: str) -> dict[str, Any]: @@ -246,8 +234,6 @@ class NeonAPI: }, ) - assert resp.status_code == 200 - return cast("dict[str, Any]", resp.json()) def wait_for_operation_to_finish(self, project_id: str): From 
d06bf4b0fe6865e6bd4fadcc443945df72b6e162 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Thu, 14 Nov 2024 13:06:42 +0300 Subject: [PATCH 233/239] safekeeper: fix atomicity of WAL truncation (#9685) If WAL truncation fails in the middle it might leave some data on disk above the write/flush LSN. In theory, concatenated with previous records it might form bogus WAL (though very unlikely in practice because CRC would protect from that). To protect from that, set pending_wal_truncation flag: means before any WAL writes truncation must be retried until it succeeds. We already did that in case of safekeeper restart, now extend this mechanism for failures without restart. Also, importantly, reset LSNs in the beginning of the operation, not in the end, because once on disk deletion starts previous pointers are wrong. All this most likely haven't created any problems in practice because CRC protects from the consequences. Tests for this are hard; simulation infrastructure might be useful here in the future, but not yet. --- safekeeper/src/wal_storage.rs | 66 +++++++++++++++++++++++++---------- 1 file changed, 47 insertions(+), 19 deletions(-) diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs index 11f372bceb..c3bb6cd12c 100644 --- a/safekeeper/src/wal_storage.rs +++ b/safekeeper/src/wal_storage.rs @@ -127,23 +127,29 @@ pub struct PhysicalStorage { /// - doesn't point to the end of the segment file: Option, - /// When false, we have just initialized storage using the LSN from find_end_of_wal(). - /// In this case, [`write_lsn`] can be less than actually written WAL on disk. In particular, - /// there can be a case with unexpected .partial file. + /// When true, WAL truncation potentially has been interrupted and we need + /// to finish it before allowing WAL writes; see truncate_wal for details. + /// In this case [`write_lsn`] can be less than actually written WAL on + /// disk. In particular, there can be a case with unexpected .partial file. /// /// Imagine the following: /// - 000000010000000000000001 - /// - it was fully written, but the last record is split between 2 segments - /// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in the end of this segment - /// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were initialized to 0/1FFFFF0 + /// - it was fully written, but the last record is split between 2 + /// segments + /// - after restart, `find_end_of_wal()` returned 0/1FFFFF0, which is in + /// the end of this segment + /// - `write_lsn`, `write_record_lsn` and `flush_record_lsn` were + /// initialized to 0/1FFFFF0 /// - 000000010000000000000002.partial - /// - it has only 1 byte written, which is not enough to make a full WAL record + /// - it has only 1 byte written, which is not enough to make a full WAL + /// record /// - /// Partial segment 002 has no WAL records, and it will be removed by the next truncate_wal(). - /// This flag will be set to true after the first truncate_wal() call. + /// Partial segment 002 has no WAL records, and it will be removed by the + /// next truncate_wal(). This flag will be set to true after the first + /// truncate_wal() call. 
/// /// [`write_lsn`]: Self::write_lsn - is_truncated_after_restart: bool, + pending_wal_truncation: bool, } impl PhysicalStorage { @@ -208,7 +214,7 @@ impl PhysicalStorage { flush_record_lsn: flush_lsn, decoder: WalStreamDecoder::new(write_lsn, state.server.pg_version / 10000), file: None, - is_truncated_after_restart: false, + pending_wal_truncation: true, }) } @@ -405,6 +411,13 @@ impl Storage for PhysicalStorage { startpos ); } + if self.pending_wal_truncation { + bail!( + "write_wal called with pending WAL truncation, write_lsn={}, startpos={}", + self.write_lsn, + startpos + ); + } let write_seconds = time_io_closure(self.write_exact(startpos, buf)).await?; // WAL is written, updating write metrics @@ -479,15 +492,34 @@ impl Storage for PhysicalStorage { ); } - // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on - // disk (this happens on each connect). - if self.is_truncated_after_restart + // Quick exit if nothing to do and we know that the state is clean to + // avoid writing up to 16 MiB of zeros on disk (this happens on each + // connect). + if !self.pending_wal_truncation && end_pos == self.write_lsn && end_pos == self.flush_record_lsn { return Ok(()); } + // Atomicity: we start with LSNs reset because once on disk deletion is + // started it can't be reversed. However, we might crash/error in the + // middle, leaving garbage above the truncation point. In theory, + // concatenated with previous records it might form bogus WAL (though + // very unlikely in practice because CRC would guard from that). To + // protect, set pending_wal_truncation flag before beginning: it means + // truncation must be retried and WAL writes are prohibited until it + // succeeds. Flag is also set on boot because we don't know if the last + // state was clean. + // + // Protocol (HandleElected before first AppendRequest) ensures we'll + // always try to ensure clean truncation before any writes. + self.pending_wal_truncation = true; + + self.write_lsn = end_pos; + self.write_record_lsn = end_pos; + self.flush_record_lsn = end_pos; + // Close previously opened file, if any if let Some(unflushed_file) = self.file.take() { self.fdatasync_file(&unflushed_file).await?; @@ -513,11 +545,7 @@ impl Storage for PhysicalStorage { fs::rename(wal_file_path, wal_file_partial_path).await?; } - // Update LSNs - self.write_lsn = end_pos; - self.write_record_lsn = end_pos; - self.flush_record_lsn = end_pos; - self.is_truncated_after_restart = true; + self.pending_wal_truncation = false; Ok(()) } From 21282aa1134dba24aeb74bcde79a550b5b02f108 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Thu, 14 Nov 2024 15:16:43 +0000 Subject: [PATCH 234/239] cargo: use neon branch of rust-postgres (#9757) ## Problem We are pining our fork of rust-postgres to a commit hash and that prevents us from making further changes to it. The latest commit in rust-postgres requires https://github.com/neondatabase/neon/pull/8747, but that seems to have gone stale. I reverted rust-postgres `neon` branch to the pinned commit in https://github.com/neondatabase/rust-postgres/pull/31. ## Summary of changes Switch back to using the `neon` branch of the rust-postgres fork. 
--- Cargo.lock | 8 ++++---- Cargo.toml | 21 +++++---------------- workspace_hack/Cargo.toml | 4 ++-- 3 files changed, 11 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 64231ed11c..f6e3f9ddb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4009,7 +4009,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" dependencies = [ "bytes", "fallible-iterator", @@ -4022,7 +4022,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" dependencies = [ "base64 0.20.0", "byteorder", @@ -4041,7 +4041,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" dependencies = [ "bytes", "fallible-iterator", @@ -6227,7 +6227,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=20031d7a9ee1addeae6e0968e3899ae6bf01cee2#20031d7a9ee1addeae6e0968e3899ae6bf01cee2" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#a130197713830a0ea0004b539b1f51a66b4c3e18" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 8207726caa..706d742f1b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -203,21 +203,10 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed - -# We want to use the 'neon' branch for these, but there's currently one -# incompatible change on the branch. See: -# -# - PR #8076 which contained changes that depended on the new changes in -# the rust-postgres crate, and -# - PR #8654 which reverted those changes and made the code in proxy incompatible -# with the tip of the 'neon' branch again. -# -# When those proxy changes are re-applied (see PR #8747), we can switch using -# the tip of the 'neon' branch again. 
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ## Local libraries compute_api = { version = "0.1", path = "./libs/compute_api/" } @@ -255,7 +244,7 @@ tonic-build = "0.12" [patch.crates-io] # Needed to get `tokio-postgres-rustls` to depend on our fork. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon" } ################# Binary contents sections diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index ae4018a884..d6773987ea 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -58,7 +58,7 @@ num-integer = { version = "0.1", features = ["i128"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", default-features = false, features = ["with-serde_json-1"] } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", default-features = false, features = ["with-serde_json-1"] } prost = { version = "0.13", features = ["prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } @@ -78,7 +78,7 @@ sync_wrapper = { version = "0.1", default-features = false, features = ["futures tikv-jemalloc-sys = { version = "0.5" } time = { version = "0.3", features = ["macros", "serde-well-known"] } tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "test-util"] } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev = "20031d7a9ee1addeae6e0968e3899ae6bf01cee2", features = ["with-serde_json-1"] } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch = "neon", features = ["with-serde_json-1"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } tokio-stream = { version = "0.1", features = ["net"] } tokio-util = { version = "0.7", features = ["codec", "compat", "io", "rt"] } From f70611c8df719a45f23abffdbc5b60a803e4f87e Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Thu, 14 Nov 2024 17:19:13 +0200 Subject: [PATCH 235/239] Correctly truncate VM (#9342) ## Problem https://github.com/neondatabase/neon/issues/9240 ## Summary of changes Correctly truncate VM page instead just replacing it with zero page. 
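For reference, a small Python illustration (not part of the patch; the helper names are invented for this sketch) of the arithmetic in the hunks below: how the truncation point inside the visibility map is computed and how the `TruncateVisibilityMap` redo record clears the tail bits, mirroring the `VM_*` constants added to pg_constants.rs and the logic in walingest.rs/apply_neon.rs, assuming the usual 8 KiB block size and a 24-byte maxaligned page header:

```python
# Illustrative sketch only; mirrors the VM_* constants and the redo logic
# introduced by this patch.
BLCKSZ = 8192
PAGE_HEADER = 24  # maxaligned page header size assumed here
VM_BITS_PER_HEAPBLOCK = 2
VM_HEAPBLOCKS_PER_BYTE = 8 // VM_BITS_PER_HEAPBLOCK            # 4
VM_HEAPBLOCKS_PER_PAGE = (BLCKSZ - PAGE_HEADER) * VM_HEAPBLOCKS_PER_BYTE


def vm_truncate_point(new_heap_blocks: int) -> tuple[int, int, int]:
    """Last remaining VM page, byte within it, and bit offset for a heap truncated to new_heap_blocks."""
    vm_page_no = new_heap_blocks // VM_HEAPBLOCKS_PER_PAGE
    trunc_byte = new_heap_blocks % VM_HEAPBLOCKS_PER_PAGE // VM_HEAPBLOCKS_PER_BYTE
    trunc_offs = new_heap_blocks % VM_HEAPBLOCKS_PER_BYTE * VM_BITS_PER_HEAPBLOCK
    return vm_page_no, trunc_byte, trunc_offs


def clear_truncated_bits(vm_map: bytearray, trunc_byte: int, trunc_offs: int) -> None:
    """Zero every map byte past trunc_byte and mask the partial byte (page header already stripped)."""
    vm_map[trunc_byte + 1:] = bytes(len(vm_map) - trunc_byte - 1)
    vm_map[trunc_byte] &= (1 << trunc_offs) - 1


# Example: a heap truncated to 10 blocks keeps 2 full map bytes plus the low 4 bits of byte 2.
assert vm_truncate_point(10) == (0, 2, 4)
```

The bit masking in `clear_truncated_bits` is the substance of the fix: the previous behaviour replaced the whole trailing VM page with a zero page, which also discarded the bits for heap blocks that survive the truncation.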
## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik Co-authored-by: Heikki Linnakangas --- libs/pageserver_api/src/record.rs | 5 ++++ libs/postgres_ffi/src/pg_constants.rs | 7 ++++-- pageserver/src/walingest.rs | 28 +++++++++++++++++---- pageserver/src/walredo/apply_neon.rs | 28 +++++++++++++++++++++ test_runner/regress/test_vm_truncate.py | 33 +++++++++++++++++++++++++ 5 files changed, 94 insertions(+), 7 deletions(-) create mode 100644 test_runner/regress/test_vm_truncate.py diff --git a/libs/pageserver_api/src/record.rs b/libs/pageserver_api/src/record.rs index 5c3f3deb82..bb62b35d36 100644 --- a/libs/pageserver_api/src/record.rs +++ b/libs/pageserver_api/src/record.rs @@ -41,6 +41,11 @@ pub enum NeonWalRecord { file_path: String, content: Option, }, + // Truncate visibility map page + TruncateVisibilityMap { + trunc_byte: usize, + trunc_offs: usize, + }, /// A testing record for unit testing purposes. It supports append data to an existing image, or clear it. #[cfg(feature = "testing")] diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs index 497d011d7a..e343473d77 100644 --- a/libs/postgres_ffi/src/pg_constants.rs +++ b/libs/postgres_ffi/src/pg_constants.rs @@ -243,8 +243,11 @@ const FSM_LEAF_NODES_PER_PAGE: usize = FSM_NODES_PER_PAGE - FSM_NON_LEAF_NODES_P pub const SLOTS_PER_FSM_PAGE: u32 = FSM_LEAF_NODES_PER_PAGE as u32; /* From visibilitymap.c */ -pub const VM_HEAPBLOCKS_PER_PAGE: u32 = - (BLCKSZ as usize - SIZEOF_PAGE_HEADER_DATA) as u32 * (8 / 2); // MAPSIZE * (BITS_PER_BYTE / BITS_PER_HEAPBLOCK) + +pub const VM_MAPSIZE: usize = BLCKSZ as usize - MAXALIGN_SIZE_OF_PAGE_HEADER_DATA; +pub const VM_BITS_PER_HEAPBLOCK: usize = 2; +pub const VM_HEAPBLOCKS_PER_BYTE: usize = 8 / VM_BITS_PER_HEAPBLOCK; +pub const VM_HEAPBLOCKS_PER_PAGE: usize = VM_MAPSIZE * VM_HEAPBLOCKS_PER_BYTE; /* From origin.c */ pub const REPLICATION_STATE_MAGIC: u32 = 0x1257DADE; diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index c3ccd8a2e4..84e553f330 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -587,11 +587,29 @@ impl WalIngest { forknum: VISIBILITYMAP_FORKNUM, }; - let mut vm_page_no = blkno / pg_constants::VM_HEAPBLOCKS_PER_PAGE; - if blkno % pg_constants::VM_HEAPBLOCKS_PER_PAGE != 0 { - // Tail of last remaining vm page has to be zeroed. - // We are not precise here and instead of digging in VM bitmap format just clear the whole page. - modification.put_rel_page_image_zero(rel, vm_page_no)?; + // last remaining block, byte, and bit + let mut vm_page_no = blkno / (pg_constants::VM_HEAPBLOCKS_PER_PAGE as u32); + let trunc_byte = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_PAGE + / pg_constants::VM_HEAPBLOCKS_PER_BYTE; + let trunc_offs = blkno as usize % pg_constants::VM_HEAPBLOCKS_PER_BYTE + * pg_constants::VM_BITS_PER_HEAPBLOCK; + + // Unless the new size is exactly at a visibility map page boundary, the + // tail bits in the last remaining map page, representing truncated heap + // blocks, need to be cleared. 
This is not only tidy, but also necessary + // because we don't get a chance to clear the bits if the heap is extended + // again. + if (trunc_byte != 0 || trunc_offs != 0) + && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) + { + modification.put_rel_wal_record( + rel, + vm_page_no, + NeonWalRecord::TruncateVisibilityMap { + trunc_byte, + trunc_offs, + }, + )?; vm_page_no += 1; } let nblocks = get_relsize(modification, rel, ctx).await?; diff --git a/pageserver/src/walredo/apply_neon.rs b/pageserver/src/walredo/apply_neon.rs index 78601d87af..d62e325310 100644 --- a/pageserver/src/walredo/apply_neon.rs +++ b/pageserver/src/walredo/apply_neon.rs @@ -42,6 +42,34 @@ pub(crate) fn apply_in_neon( } => { anyhow::bail!("tried to pass postgres wal record to neon WAL redo"); } + // + // Code copied from PostgreSQL `visibilitymap_prepare_truncate` function in `visibilitymap.c` + // + NeonWalRecord::TruncateVisibilityMap { + trunc_byte, + trunc_offs, + } => { + // sanity check that this is modifying the correct relation + let (rel, _) = key.to_rel_block().context("invalid record")?; + assert!( + rel.forknum == VISIBILITYMAP_FORKNUM, + "TruncateVisibilityMap record on unexpected rel {}", + rel + ); + let map = &mut page[pg_constants::MAXALIGN_SIZE_OF_PAGE_HEADER_DATA..]; + map[*trunc_byte + 1..].fill(0u8); + /*---- + * Mask out the unwanted bits of the last remaining byte. + * + * ((1 << 0) - 1) = 00000000 + * ((1 << 1) - 1) = 00000001 + * ... + * ((1 << 6) - 1) = 00111111 + * ((1 << 7) - 1) = 01111111 + *---- + */ + map[*trunc_byte] &= (1 << *trunc_offs) - 1; + } NeonWalRecord::ClearVisibilityMapFlags { new_heap_blkno, old_heap_blkno, diff --git a/test_runner/regress/test_vm_truncate.py b/test_runner/regress/test_vm_truncate.py new file mode 100644 index 0000000000..43b4f2d8b1 --- /dev/null +++ b/test_runner/regress/test_vm_truncate.py @@ -0,0 +1,33 @@ +from fixtures.neon_fixtures import NeonEnv + + +# +# Test that VM is properly truncated +# +def test_vm_truncate(neon_simple_env: NeonEnv): + env = neon_simple_env + + endpoint = env.endpoints.create_start("main") + con = endpoint.connect() + cur = con.cursor() + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute("CREATE EXTENSION pageinspect") + + cur.execute( + "create table t(pk integer primary key, counter integer default 0, filler text default repeat('?', 200))" + ) + cur.execute("insert into t (pk) values (generate_series(1,1000))") + cur.execute("delete from t where pk>10") + cur.execute("vacuum t") # truncates the relation, including its VM and FSM + # get image of the first block of the VM excluding the page header. It's expected + # to still be in the buffer cache. + # ignore page header (24 bytes, 48 - it's hex representation) + cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") + pg_bitmap = cur.fetchall()[0][0] + # flush shared buffers + cur.execute("SELECT clear_buffer_cache()") + # now download the first block of the VM from the pageserver ... + cur.execute("select substr(encode(get_raw_page('t', 'vm', 0), 'hex'), 48)") + ps_bitmap = cur.fetchall()[0][0] + # and check that content of bitmaps are equal, i.e. 
PS is producing the same VM page as Postgres + assert pg_bitmap == ps_bitmap From 8cde37bc0beb0db91c0bd15908f1a4b0f7cf6dbd Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Thu, 14 Nov 2024 10:26:58 -0500 Subject: [PATCH 236/239] test: disable test_readonly_node_gc until proper fix (#9755) ## Problem After investigation, we think to make `test_readonly_node_gc` less flaky, we need to make a proper fix (likely involving persisting part of the lease state). See https://github.com/neondatabase/neon/issues/9754 for details. ## Summary of changes - skip the test until proper fix. Signed-off-by: Yuchen Liang --- test_runner/regress/test_readonly_node.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index f257f0853b..826136d5f9 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -122,6 +122,7 @@ def test_readonly_node(neon_simple_env: NeonEnv): ) +@pytest.mark.skip("See https://github.com/neondatabase/neon/issues/9754") def test_readonly_node_gc(neon_env_builder: NeonEnvBuilder): """ Test static endpoint is protected from GC by acquiring and renewing lsn leases. From 49b599c1138e2cb35fa87974309dabf189e3bf84 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 14 Nov 2024 10:59:15 -0600 Subject: [PATCH 237/239] Remove the replication slot in test_snap_files at the end of the test Analysis of the LR benchmarking tests indicates that in the duration of test_subscriber_lag, a leftover 'slotter' replication slot can lead to retained WAL growing on the publisher. This replication slot is not used by any subscriber. The only purpose of the slot is to generate snapshot files for the puspose of test_snap_files. Signed-off-by: Tristan Partin --- .../performance/test_logical_replication.py | 117 +++++++++++------- 1 file changed, 74 insertions(+), 43 deletions(-) diff --git a/test_runner/performance/test_logical_replication.py b/test_runner/performance/test_logical_replication.py index 050c09c1e5..9d653d1a1e 100644 --- a/test_runner/performance/test_logical_replication.py +++ b/test_runner/performance/test_logical_replication.py @@ -1,6 +1,8 @@ from __future__ import annotations import time +from collections.abc import Iterator +from contextlib import contextmanager from typing import TYPE_CHECKING, cast import psycopg2 @@ -18,7 +20,7 @@ if TYPE_CHECKING: from fixtures.benchmark_fixture import NeonBenchmarker from fixtures.neon_api import NeonApiEndpoint from fixtures.neon_fixtures import NeonEnv, PgBin, VanillaPostgres - from psycopg2.extensions import cursor + from psycopg2.extensions import connection, cursor @pytest.mark.timeout(1000) @@ -292,6 +294,48 @@ def test_snap_files( then runs pgbench inserts while generating large numbers of snapfiles. Then restarts the node and tries to peek the replication changes. """ + + @contextmanager + def replication_slot(conn: connection, slot_name: str) -> Iterator[None]: + """ + Make sure that the replication slot doesn't outlive the test. Normally + we wouldn't want this behavior, but since the test creates and drops + the replication slot, we do. + + We've had problems in the past where this slot sticking around caused + issues with the publisher retaining WAL during the execution of the + other benchmarks in this suite. 
+ """ + + def __drop_replication_slot(c: cursor) -> None: + c.execute( + """ + DO $$ + BEGIN + IF EXISTS ( + SELECT 1 + FROM pg_replication_slots + WHERE slot_name = %(slot_name)s + ) THEN + PERFORM pg_drop_replication_slot(%(slot_name)s); + END IF; + END $$; + """, + {"slot_name": slot_name}, + ) + + with conn.cursor() as c: + __drop_replication_slot(c) + c.execute( + "SELECT pg_create_logical_replication_slot(%(slot_name)s, 'test_decoding')", + {"slot_name": slot_name}, + ) + + yield + + with conn.cursor() as c: + __drop_replication_slot(c) + test_duration_min = 60 test_interval_min = 5 pgbench_duration = f"-T{test_duration_min * 60 * 2}" @@ -314,48 +358,35 @@ def test_snap_files( conn = psycopg2.connect(connstr) conn.autocommit = True - with conn.cursor() as cur: - cur.execute( - """ - DO $$ - BEGIN - IF EXISTS ( - SELECT 1 - FROM pg_replication_slots - WHERE slot_name = 'slotter' - ) THEN - PERFORM pg_drop_replication_slot('slotter'); - END IF; - END $$; - """ + with replication_slot(conn, "slotter"): + workload = pg_bin.run_nonblocking( + ["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env ) - cur.execute("SELECT pg_create_logical_replication_slot('slotter', 'test_decoding')") + try: + start = time.time() + prev_measurement = time.time() + while time.time() - start < test_duration_min * 60: + conn = psycopg2.connect(connstr) + conn.autocommit = True + + with conn.cursor() as cur: + cur.execute( + "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" + ) + check_pgbench_still_running(workload) + cur.execute( + "SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())" + ) + + conn.close() + + # Measure storage + if time.time() - prev_measurement > test_interval_min * 60: + storage = benchmark_project_pub.get_synthetic_storage_size() + zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER) + prev_measurement = time.time() + time.sleep(test_interval_min * 60 / 3) + finally: + workload.terminate() conn.close() - - workload = pg_bin.run_nonblocking(["pgbench", "-c10", pgbench_duration, "-Mprepared"], env=env) - try: - start = time.time() - prev_measurement = time.time() - while time.time() - start < test_duration_min * 60: - conn = psycopg2.connect(connstr) - conn.autocommit = True - - with conn.cursor() as cur: - cur.execute( - "SELECT count(*) FROM (SELECT pg_log_standby_snapshot() FROM generate_series(1, 10000) g) s" - ) - check_pgbench_still_running(workload) - cur.execute("SELECT pg_replication_slot_advance('slotter', pg_current_wal_lsn())") - - conn.close() - - # Measure storage - if time.time() - prev_measurement > test_interval_min * 60: - storage = benchmark_project_pub.get_synthetic_storage_size() - zenbenchmark.record("storage", storage, "B", MetricReport.LOWER_IS_BETTER) - prev_measurement = time.time() - time.sleep(test_interval_min * 60 / 3) - - finally: - workload.terminate() From 93939f123fcd76105543f7a251541712991b5a7a Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 14 Nov 2024 17:31:35 +0000 Subject: [PATCH 238/239] tests: add test_timeline_archival_chaos (#9609) ## Problem - We lack test coverage of cases where multiple timelines fight for updates to the same manifest (https://github.com/neondatabase/neon/pull/9557), and in timeline archival changes while dual-attached (https://github.com/neondatabase/neon/pull/9555) ## Summary of changes - Add a chaos test for timeline creation->archival->offload->deletion --- pageserver/src/http/routes.rs | 1 + pageserver/src/tenant.rs | 4 + 
.../src/tenant/remote_timeline_client.rs | 40 ++- pageserver/src/tenant/timeline/delete.rs | 44 +-- storage_controller/src/service.rs | 1 + test_runner/fixtures/neon_fixtures.py | 22 +- test_runner/regress/test_timeline_archive.py | 261 +++++++++++++++++- 7 files changed, 340 insertions(+), 33 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index dde9c5dd0b..ab170679ba 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -324,6 +324,7 @@ impl From for ApiError { .into_boxed_str(), ), a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()), + Cancelled => ApiError::ResourceUnavailable("shutting down".into()), Other(e) => ApiError::InternalServerError(e), } } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 61bb1fe40c..c6fc3bfe6c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -700,6 +700,9 @@ pub enum DeleteTimelineError { #[error("Timeline deletion is already in progress")] AlreadyInProgress(Arc>), + #[error("Cancelled")] + Cancelled, + #[error(transparent)] Other(#[from] anyhow::Error), } @@ -710,6 +713,7 @@ impl Debug for DeleteTimelineError { Self::NotFound => write!(f, "NotFound"), Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(), Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(), + Self::Cancelled => f.debug_tuple("Cancelled").finish(), Self::Other(e) => f.debug_tuple("Other").field(e).finish(), } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 600583f6b5..94f42c7827 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -243,7 +243,7 @@ use self::index::IndexPart; use super::metadata::MetadataUpdate; use super::storage_layer::{Layer, LayerName, ResidentLayer}; use super::upload_queue::{NotInitialized, SetDeletedFlagProgress}; -use super::Generation; +use super::{DeleteTimelineError, Generation}; pub(crate) use download::{ download_index_part, download_tenant_manifest, is_temp_download_file, @@ -1550,15 +1550,17 @@ impl RemoteTimelineClient { /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set. /// The function deletes layer files one by one, then lists the prefix to see if we leaked something /// deletes leaked files if any and proceeds with deletion of index file at the end. - pub(crate) async fn delete_all(self: &Arc) -> anyhow::Result<()> { + pub(crate) async fn delete_all(self: &Arc) -> Result<(), DeleteTimelineError> { debug_assert_current_span_has_tenant_and_timeline_id(); let layers: Vec = { let mut locked = self.upload_queue.lock().unwrap(); - let stopped = locked.stopped_mut()?; + let stopped = locked.stopped_mut().map_err(DeleteTimelineError::Other)?; if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) { - anyhow::bail!("deleted_at is not set") + return Err(DeleteTimelineError::Other(anyhow::anyhow!( + "deleted_at is not set" + ))); } debug_assert!(stopped.upload_queue_for_deletion.no_pending_work()); @@ -1593,7 +1595,10 @@ impl RemoteTimelineClient { }; let layer_deletion_count = layers.len(); - self.deletion_queue_client.push_immediate(layers).await?; + self.deletion_queue_client + .push_immediate(layers) + .await + .map_err(|_| DeleteTimelineError::Cancelled)?; // Delete the initdb.tar.zst, which is not always present, but deletion attempts of // inexistant objects are not considered errors. 
@@ -1601,7 +1606,8 @@ impl RemoteTimelineClient { remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &self.timeline_id); self.deletion_queue_client .push_immediate(vec![initdb_path]) - .await?; + .await + .map_err(|_| DeleteTimelineError::Cancelled)?; // Do not delete index part yet, it is needed for possible retry. If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage @@ -1609,7 +1615,9 @@ impl RemoteTimelineClient { // Execute all pending deletions, so that when we proceed to do a listing below, we aren't // taking the burden of listing all the layers that we already know we should delete. - self.flush_deletion_queue().await?; + self.flush_deletion_queue() + .await + .map_err(|_| DeleteTimelineError::Cancelled)?; let cancel = shutdown_token(); @@ -1672,28 +1680,32 @@ impl RemoteTimelineClient { if !remaining_layers.is_empty() { self.deletion_queue_client .push_immediate(remaining_layers) - .await?; + .await + .map_err(|_| DeleteTimelineError::Cancelled)?; } fail::fail_point!("timeline-delete-before-index-delete", |_| { - Err(anyhow::anyhow!( + Err(DeleteTimelineError::Other(anyhow::anyhow!( "failpoint: timeline-delete-before-index-delete" - ))? + )))? }); debug!("enqueuing index part deletion"); self.deletion_queue_client .push_immediate([latest_index].to_vec()) - .await?; + .await + .map_err(|_| DeleteTimelineError::Cancelled)?; // Timeline deletion is rare and we have probably emitted a reasonably number of objects: wait // for a flush to a persistent deletion list so that we may be sure deletion will occur. - self.flush_deletion_queue().await?; + self.flush_deletion_queue() + .await + .map_err(|_| DeleteTimelineError::Cancelled)?; fail::fail_point!("timeline-delete-after-index-delete", |_| { - Err(anyhow::anyhow!( + Err(DeleteTimelineError::Other(anyhow::anyhow!( "failpoint: timeline-delete-after-index-delete" - ))? + )))? }); info!(prefix=%timeline_storage_path, referenced=layer_deletion_count, not_referenced=%not_referenced_count, "done deleting in timeline prefix, including index_part.json"); diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 5a4c2d9da3..69001a6c40 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -5,6 +5,7 @@ use std::{ use anyhow::Context; use pageserver_api::{models::TimelineState, shard::TenantShardId}; +use remote_storage::DownloadError; use tokio::sync::OwnedMutexGuard; use tracing::{error, info, info_span, instrument, Instrument}; use utils::{crashsafe, fs_ext, id::TimelineId, pausable_failpoint}; @@ -16,7 +17,7 @@ use crate::{ metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, CreateTimelineCause, DeleteTimelineError, MaybeDeletedIndexPart, Tenant, - TimelineOrOffloaded, + TenantManifestError, TimelineOrOffloaded, }, virtual_file::MaybeFatalIo, }; @@ -110,13 +111,6 @@ pub(super) async fn delete_local_timeline_directory( info!("finished deleting layer files, releasing locks"); } -/// Removes remote layers and an index file after them. -async fn delete_remote_layers_and_index( - remote_client: &Arc, -) -> anyhow::Result<()> { - remote_client.delete_all().await.context("delete_all") -} - /// It is important that this gets called when DeletionGuard is being held. 
/// For more context see comments in [`DeleteTimelineFlow::prepare`] async fn remove_maybe_offloaded_timeline_from_tenant( @@ -221,11 +215,24 @@ impl DeleteTimelineFlow { None => { let remote_client = tenant .build_timeline_client(timeline.timeline_id(), tenant.remote_storage.clone()); - let result = remote_client + let result = match remote_client .download_index_file(&tenant.cancel) .instrument(info_span!("download_index_file")) .await - .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!("error: {:?}", e)))?; + { + Ok(r) => r, + Err(DownloadError::NotFound) => { + // Deletion is already complete + tracing::info!("Timeline already deleted in remote storage"); + return Ok(()); + } + Err(e) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!( + "error: {:?}", + e + ))); + } + }; let index_part = match result { MaybeDeletedIndexPart::Deleted(p) => { tracing::info!("Timeline already set as deleted in remote index"); @@ -406,7 +413,12 @@ impl DeleteTimelineFlow { "timeline_delete", async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline, remote_client).await { - error!("Error: {err:#}"); + // Only log as an error if it's not a cancellation. + if matches!(err, DeleteTimelineError::Cancelled) { + info!("Shutdown during timeline deletion"); + }else { + error!("Error: {err:#}"); + } if let TimelineOrOffloaded::Timeline(timeline) = timeline { timeline.set_broken(format!("{err:#}")) } @@ -438,7 +450,7 @@ impl DeleteTimelineFlow { Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? }); - delete_remote_layers_and_index(&remote_client).await?; + remote_client.delete_all().await?; pausable_failpoint!("in_progress_delete"); @@ -449,10 +461,10 @@ impl DeleteTimelineFlow { // So indeed, the tenant manifest might refer to an offloaded timeline which has already been deleted. // However, we handle this case in tenant loading code so the next time we attach, the issue is // resolved. - tenant - .store_tenant_manifest() - .await - .map_err(|e| DeleteTimelineError::Other(anyhow::anyhow!(e)))?; + tenant.store_tenant_manifest().await.map_err(|e| match e { + TenantManifestError::Cancelled => DeleteTimelineError::Cancelled, + _ => DeleteTimelineError::Other(e.into()), + })?; *guard = Self::Finished; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index e3a147bc06..3b85da6665 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -3642,6 +3642,7 @@ impl Service { match res { Ok(ok) => Ok(ok), Err(mgmt_api::Error::ApiError(StatusCode::CONFLICT, _)) => Ok(StatusCode::CONFLICT), + Err(mgmt_api::Error::ApiError(StatusCode::SERVICE_UNAVAILABLE, msg)) => Err(ApiError::ResourceUnavailable(msg.into())), Err(e) => { Err( ApiError::InternalServerError(anyhow::anyhow!( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 990db1aed0..205a47a9d5 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2379,6 +2379,17 @@ class NeonPageserver(PgProtocol, LogUtils): # # The entries in the list are regular experessions. self.allowed_errors: list[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS) + # Store persistent failpoints that should be reapplied on each start + self._persistent_failpoints: dict[str, str] = {} + + def add_persistent_failpoint(self, name: str, action: str): + """ + Add a failpoint that will be automatically reapplied each time the pageserver starts. 
+ The failpoint will be set immediately if the pageserver is running. + """ + self._persistent_failpoints[name] = action + if self.running: + self.http_client().configure_failpoints([(name, action)]) def timeline_dir( self, @@ -2446,6 +2457,15 @@ class NeonPageserver(PgProtocol, LogUtils): """ assert self.running is False + if self._persistent_failpoints: + # Tests shouldn't use this mechanism _and_ set FAILPOINTS explicitly + assert extra_env_vars is None or "FAILPOINTS" not in extra_env_vars + if extra_env_vars is None: + extra_env_vars = {} + extra_env_vars["FAILPOINTS"] = ",".join( + f"{k}={v}" for (k, v) in self._persistent_failpoints.items() + ) + storage = self.env.pageserver_remote_storage if isinstance(storage, S3Storage): s3_env_vars = storage.access_env_vars() @@ -4522,7 +4542,7 @@ def pytest_addoption(parser: Parser): SMALL_DB_FILE_NAME_REGEX: re.Pattern[str] = re.compile( - r"config-v1|heatmap-v1|metadata|.+\.(?:toml|pid|json|sql|conf)" + r"config-v1|heatmap-v1|tenant-manifest|metadata|.+\.(?:toml|pid|json|sql|conf)" ) diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index d3839e3d2c..c447535e10 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -1,10 +1,14 @@ from __future__ import annotations import json +import random +import threading +import time from typing import Optional import pytest -from fixtures.common_types import TenantId, TimelineArchivalState, TimelineId +import requests +from fixtures.common_types import TenantId, TenantShardId, TimelineArchivalState, TimelineId from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, @@ -12,8 +16,9 @@ from fixtures.neon_fixtures import ( ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import assert_prefix_empty, assert_prefix_not_empty, list_prefix +from fixtures.pg_version import PgVersion from fixtures.remote_storage import S3Storage, s3_storage -from fixtures.utils import wait_until +from fixtures.utils import run_only_on_default_postgres, wait_until from mypy_boto3_s3.type_defs import ( ObjectTypeDef, ) @@ -378,6 +383,258 @@ def test_timeline_offload_persist(neon_env_builder: NeonEnvBuilder, delete_timel ) +@run_only_on_default_postgres("this test isn't sensitive to the contents of timelines") +def test_timeline_archival_chaos(neon_env_builder: NeonEnvBuilder): + """ + A general consistency check on archival/offload timeline state, and its intersection + with tenant migrations and timeline deletions. 
+ """ + + # Offloading is off by default at time of writing: remove this line when it's on by default + neon_env_builder.pageserver_config_override = "timeline_offloading = true" + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + # We will exercise migrations, so need multiple pageservers + neon_env_builder.num_pageservers = 2 + + env = neon_env_builder.init_start( + initial_tenant_conf={ + "compaction_period": "1s", + } + ) + tenant_id = env.initial_tenant + tenant_shard_id = TenantShardId(tenant_id, 0, 0) + + # Unavailable pageservers during timeline CRUD operations can be logged as errors on the storage controller + env.storage_controller.allowed_errors.append(".*error sending request.*") + + for ps in env.pageservers: + # We will do unclean restarts, which results in these messages when cleaning up files + ps.allowed_errors.extend( + [ + ".*removing local file.*because it has unexpected length.*", + ".*__temp.*", + # FIXME: there are still anyhow::Error paths in timeline creation/deletion which + # generate 500 results when called during shutdown + ".*InternalServerError.*", + # FIXME: there are still anyhow::Error paths in timeline deletion that generate + # log lines at error severity + ".*delete_timeline.*Error", + ] + ) + + class TimelineState: + def __init__(self): + self.timeline_id = TimelineId.generate() + self.created = False + self.archived = False + self.offloaded = False + self.deleted = False + + controller_ps_api = env.storage_controller.pageserver_api() + + shutdown = threading.Event() + + violations = [] + + timelines_deleted = [] + + def list_timelines(tenant_id) -> tuple[set[TimelineId], set[TimelineId]]: + """Get the list of active and offloaded TimelineId""" + listing = controller_ps_api.timeline_and_offloaded_list(tenant_id) + active_ids = set([TimelineId(t["timeline_id"]) for t in listing.timelines]) + offloaded_ids = set([TimelineId(t["timeline_id"]) for t in listing.offloaded]) + + return (active_ids, offloaded_ids) + + def timeline_objects(tenant_shard_id, timeline_id): + response = list_prefix( + env.pageserver_remote_storage, # type: ignore + prefix="/".join( + ( + "tenants", + str(tenant_shard_id), + "timelines", + str(timeline_id), + ) + ) + + "/", + ) + + return [k["Key"] for k in response.get("Contents", [])] + + def worker(): + """ + Background thread which drives timeline lifecycle operations, and checks that between steps + it obeys invariants. This should detect errors in pageserver persistence and in errors in + concurrent operations on different timelines when it is run many times in parallel. + """ + state = TimelineState() + + # Jitter worker startup, we're not interested in exercising lots of concurrent creations + # as we know that's I/O bound. + shutdown.wait(random.random() * 10) + + while not shutdown.is_set(): + # A little wait between actions to jitter out the API calls rather than having them + # all queue up at once + shutdown.wait(random.random()) + + try: + if not state.created: + log.info(f"Creating timeline {state.timeline_id}") + controller_ps_api.timeline_create( + PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=state.timeline_id + ) + state.created = True + + if ( + timeline_objects( + tenant_shard_id=tenant_shard_id, timeline_id=state.timeline_id + ) + == [] + ): + msg = f"Timeline {state.timeline_id} unexpectedly not present in remote storage" + violations.append(msg) + + elif state.deleted: + # Try to confirm its deletion completed. 
+ # Deleted timeline should not appear in listing API, either as offloaded or active + (active_ids, offloaded_ids) = list_timelines(tenant_id) + if state.timeline_id in active_ids or state.timeline_id in offloaded_ids: + msg = f"Timeline {state.timeline_id} appeared in listing after deletion was acked" + violations.append(msg) + raise RuntimeError(msg) + + objects = timeline_objects(tenant_shard_id, state.timeline_id) + if len(objects) == 0: + log.info(f"Confirmed deletion of timeline {state.timeline_id}") + timelines_deleted.append(state.timeline_id) + state = TimelineState() # A new timeline ID to create on next iteration + else: + # Deletion of objects doesn't have to be synchronous, we will keep polling + log.info(f"Timeline {state.timeline_id} objects still exist: {objects}") + shutdown.wait(random.random()) + else: + # The main lifetime of a timeline: proceed active->archived->offloaded->deleted + if not state.archived: + log.info(f"Archiving timeline {state.timeline_id}") + controller_ps_api.timeline_archival_config( + tenant_id, state.timeline_id, TimelineArchivalState.ARCHIVED + ) + state.archived = True + elif state.archived and not state.offloaded: + log.info(f"Waiting for offload of timeline {state.timeline_id}") + # Wait for offload: this should happen fast because we configured a short compaction interval + while not shutdown.is_set(): + (active_ids, offloaded_ids) = list_timelines(tenant_id) + if state.timeline_id in active_ids: + log.info(f"Timeline {state.timeline_id} is still active") + shutdown.wait(0.5) + elif state.timeline_id in offloaded_ids: + log.info(f"Timeline {state.timeline_id} is now offloaded") + state.offloaded = True + break + else: + # Timeline is neither offloaded nor active, this is unexpected: the pageserver + # should ensure that the timeline appears in either the offloaded list or main list + msg = f"Timeline {state.timeline_id} disappeared!" + violations.append(msg) + raise RuntimeError(msg) + elif state.offloaded: + # Once it's offloaded it should only be in offloaded or deleted state: check + # it didn't revert back to active. This tests that the manfiest is doing its + # job to suppress loading of offloaded timelines as active. + (active_ids, offloaded_ids) = list_timelines(tenant_id) + if state.timeline_id in active_ids: + msg = f"Timeline {state.timeline_id} is active, should be offloaded or deleted" + violations.append(msg) + raise RuntimeError(msg) + + log.info(f"Deleting timeline {state.timeline_id}") + controller_ps_api.timeline_delete(tenant_id, state.timeline_id) + state.deleted = True + else: + raise RuntimeError("State should be unreachable") + except PageserverApiException as e: + # This is expected: we are injecting chaos, API calls will sometimes fail. + # TODO: can we narrow this to assert we are getting friendly 503s? 
+ log.info(f"Iteration error, will retry: {e}") + shutdown.wait(random.random()) + except requests.exceptions.RetryError as e: + # Retryable error repeated more times than `requests` is configured to tolerate, this + # is expected when a pageserver remains unavailable for a couple seconds + log.info(f"Iteration error, will retry: {e}") + shutdown.wait(random.random()) + except Exception as e: + log.warning( + f"Unexpected worker exception (current timeline {state.timeline_id}): {e}" + ) + else: + # In the non-error case, use a jitterd but small wait, we want to keep + # a high rate of operations going + shutdown.wait(random.random() * 0.1) + + n_workers = 4 + threads = [] + for _i in range(0, n_workers): + t = threading.Thread(target=worker) + t.start() + threads.append(t) + + # Set delay failpoints so that deletions and migrations take some time, and have a good + # chance to interact with other concurrent timeline mutations. + env.storage_controller.configure_failpoints( + [("reconciler-live-migrate-pre-await-lsn", "sleep(1)")] + ) + for ps in env.pageservers: + ps.add_persistent_failpoint("in_progress_delete", "sleep(1)") + + # Generate some chaos, while our workers are trying to complete their timeline operations + rng = random.Random() + try: + chaos_rounds = 48 + for _i in range(0, chaos_rounds): + action = rng.choice([0, 1]) + if action == 0: + # Pick a random pageserver to gracefully restart + pageserver = rng.choice(env.pageservers) + + # Whether to use a graceful shutdown or SIGKILL + immediate = random.choice([True, False]) + log.info(f"Restarting pageserver {pageserver.id}, immediate={immediate}") + + t1 = time.time() + pageserver.restart(immediate=immediate) + restart_duration = time.time() - t1 + + # Make sure we're up for as long as we spent restarting, to ensure operations can make progress + log.info(f"Staying alive for {restart_duration}s") + time.sleep(restart_duration) + else: + # Migrate our tenant between pageservers + origin_ps = env.get_tenant_pageserver(tenant_shard_id) + dest_ps = rng.choice([ps for ps in env.pageservers if ps.id != origin_ps.id]) + log.info(f"Migrating {tenant_shard_id} {origin_ps.id}->{dest_ps.id}") + env.storage_controller.tenant_shard_migrate( + tenant_shard_id=tenant_shard_id, dest_ps_id=dest_ps.id + ) + + log.info(f"Full timeline lifecycles so far: {len(timelines_deleted)}") + finally: + shutdown.set() + + for thread in threads: + thread.join() + + # Sanity check that during our run we did exercise some full timeline lifecycles, in case + # one of our workers got stuck + assert len(timelines_deleted) > 10 + + # That no invariant-violations were reported by workers + assert violations == [] + + @pytest.mark.parametrize("offload_child", ["offload", "offload-corrupt", "archive", None]) def test_timeline_retain_lsn(neon_env_builder: NeonEnvBuilder, offload_child: Optional[str]): """ From 38563de7dd2aa910c4d5564a4ad8c67ab62334e3 Mon Sep 17 00:00:00 2001 From: John Spray Date: Thu, 14 Nov 2024 19:41:10 +0000 Subject: [PATCH 239/239] storcon: exclude non-Active tenants from shard autosplitting (#9743) ## Problem We didn't have a neat way to prevent auto-splitting of tenants. This could be useful during incidents or for testing. 
Closes https://github.com/neondatabase/neon/issues/9332 ## Summary of changes - Filter splitting candidates by scheduling policy --- storage_controller/src/service.rs | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 3b85da6665..446c476b99 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6356,6 +6356,19 @@ impl Service { // Pick the biggest tenant to split first top_n.sort_by_key(|i| i.resident_size); + + // Filter out tenants in a prohibiting scheduling mode + { + let locked = self.inner.read().unwrap(); + top_n.retain(|i| { + if let Some(shard) = locked.tenants.get(&i.id) { + matches!(shard.get_scheduling_policy(), ShardSchedulingPolicy::Active) + } else { + false + } + }); + } + let Some(split_candidate) = top_n.into_iter().next() else { tracing::debug!("No split-elegible shards found"); return;
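
For anyone who wants to exercise the new filter end to end, a minimal regression-test sketch could look like the following. It is not part of this patch: the `tenant_policy_update` and `tenant_describe` helpers, the `"Pause"` scheduling value, and the response shape are assumptions about the existing test fixtures and should be checked against the real fixture API before use.

```python
# Hypothetical sketch (not included in this patch series): marks one tenant as
# non-schedulable and checks that the autosplit background loop leaves it alone.
import time


def assert_paused_tenant_is_not_split(env, tenant_id):
    # Exclude this tenant from background optimization, including autosplits
    # ("Pause" is an assumed serialization of ShardSchedulingPolicy).
    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Pause"})

    # (Data ingest that would normally push the tenant over split_threshold
    # is elided here.)

    # Let a few iterations of the background loop run, then verify the shard
    # count is unchanged: the new retain() filter should have skipped the tenant.
    time.sleep(10)
    shards = env.storage_controller.tenant_describe(tenant_id)["shards"]
    assert len(shards) == 1
```

Piggybacking on the shard scheduling policy keeps the opt-out consistent with how operators already quiesce a tenant during incidents, rather than introducing a separate "no autosplit" flag.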